From 84027bc0c77043d91324126414e1a9bd0b539fb3 Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Fri, 6 Jul 2018 15:33:52 -0400 Subject: [PATCH 01/59] adding robust errors to coxph --- lifelines/datasets/__init__.py | 3 +- lifelines/datasets/regression.csv | 400 +++++++++++++++--------------- lifelines/fitters/coxph_fitter.py | 79 +++++- tests/test_estimation.py | 29 ++- 4 files changed, 299 insertions(+), 212 deletions(-) diff --git a/lifelines/datasets/__init__.py b/lifelines/datasets/__init__.py index 87686de35..35e01f656 100644 --- a/lifelines/datasets/__init__.py +++ b/lifelines/datasets/__init__.py @@ -270,7 +270,8 @@ def load_rossi(**kwargs): def load_regression_dataset(**kwargs): """ - Artificial regression dataset + Artificial regression dataset. Useful since there are no ties in this dataset. + Slightly edit in v0.15.0 to achieve this, however. Size: (200,5) Example: diff --git a/lifelines/datasets/regression.csv b/lifelines/datasets/regression.csv index 01d39937a..2afc66848 100644 --- a/lifelines/datasets/regression.csv +++ b/lifelines/datasets/regression.csv @@ -1,201 +1,201 @@ var1,var2,var3,T,E -0.59517,1.143472,1.5710790000000001,14.785479,1 -0.209325,0.184677,0.35698,7.336734,1 -0.693919,0.071893,0.55796,5.271527,1 -0.443804,1.3646459999999998,0.374221,11.684168,1 -1.613324,0.125566,1.921325,7.637764,1 -0.065636,0.098375,0.237896,12.678268,1 -0.386294,1.6630919999999998,0.7903140000000001,6.601660000000001,1 -0.946688,1.345394,3.209113,11.369137,1 -0.11374000000000001,0.40986000000000006,0.064934,14.680468,1 -0.7777930000000001,0.33499,0.411055,10.585059,1 -0.04428,0.305158,0.17648,19.370936999999998,1 -1.03545,3.304733,0.997323,5.558555999999999,1 -0.22919499999999998,0.5813550000000001,0.48479399999999995,11.292129,1 -0.055970000000000006,2.6741349999999997,0.355279,9.919992,0 -1.236583,1.796598,0.179952,9.884988,1 -1.162835,0.46475900000000003,2.028854,6.265626999999999,1 -0.14943599999999999,2.949291,0.277801,13.812381,1 -0.399475,0.822413,0.673405,6.433643,1 -0.762121,0.050407,1.2851629999999998,6.979698,1 -1.239718,1.869215,0.020202,7.742774000000001,1 -0.019221000000000002,1.435543,0.255689,4.70447,1 -0.090253,0.211037,0.372809,11.236124,1 -0.20584899999999998,0.048722,0.00253,6.664666,1 -0.088185,1.319679,0.201675,10.718072,1 -4.629747,0.36352199999999996,1.08207,11.593159,1 -1.6028360000000001,1.217881,0.350837,8.463846,1 -0.014804,0.684737,0.493267,5.432543,1 -0.08402000000000001,1.432093,0.456541,10.277028,1 -2.260223,1.2389299999999999,5.541837999999999,7.217722,1 -0.6219680000000001,0.6844279999999999,0.135933,13.217322,1 -0.013219999999999999,3.280555,1.193551,8.029803,1 -0.070651,1.430517,0.0052049999999999996,9.807981,1 -0.20598000000000002,0.29064,0.096565,2.632263,1 -1.389882,0.14313299999999998,0.821257,6.160616,1 -0.104143,2.072924,0.449696,6.265626999999999,1 -0.42848100000000006,0.06573899999999999,3.00755,4.606461,0 -1.785151,1.572282,0.475059,11.124112,1 -0.22889299999999999,0.429025,0.60805,6.083608,1 -0.640837,0.311084,3.165658,9.219922,1 -1.450683,0.8470219999999999,2.5211770000000002,7.119712,0 -0.469459,0.318871,0.164498,13.056306,1 -0.36617,0.23499099999999998,0.678709,4.949495,1 -1.4325700000000001,2.668335,0.558046,4.858486,1 -2.696463,0.244077,1.3151110000000001,8.505851,1 -0.152624,0.37950100000000003,0.330164,7.035703999999999,1 -0.27739899999999995,0.871603,1.555185,10.837083999999999,1 -0.353633,0.294236,0.928573,9.919992,1 -0.6209560000000001,0.021884,3.2057599999999997,5.292529,1 -0.0007570000000000001,1.216615,0.8610690000000001,20.981098,1 -0.497674,2.744032,0.47358900000000004,5.810581,1 -1.213709,0.072756,0.09842000000000001,11.292129,1 -0.426255,2.550392,0.16762,8.19782,1 -0.008408,1.132205,1.234917,7.469747,1 -1.207833,0.13335,0.528231,10.669067,1 -0.036975,0.040631,0.2664,10.543054,1 -0.789439,0.669067,1.332697,6.2866290000000005,1 -1.482055,0.627205,0.738271,9.079908,1 -1.028671,0.21520999999999998,0.457692,14.183418,1 -0.521986,2.282683,0.31597600000000003,21.940194,1 -0.26262199999999997,0.345999,0.9210969999999999,8.883888,1 -0.360319,1.001364,0.237533,9.982998,1 -0.362587,0.110046,2.486691,9.555956,1 -1.793598,0.310001,0.26306599999999997,8.659866000000001,1 -0.419275,0.11430799999999999,1.124784,5.642564,1 -1.4702469999999999,0.289054,0.331833,10.9981,1 -0.27476,0.523508,2.139204,8.050805,1 -0.119805,0.7337739999999999,0.21205700000000002,11.250125,1 -0.369294,0.609847,0.89402,10.214021,0 -1.01825,2.119666,0.716002,12.335234,1 -0.607065,2.3501119999999998,0.031389999999999994,15.723572,1 -4.169830999999999,0.316285,0.16935,11.831183,0 -1.483383,2.242744,0.26543,7.364736,1 -0.32359699999999997,0.165159,0.97204,12.517252000000001,1 -1.82716,0.32779400000000003,0.9415389999999999,8.512851,1 -0.104104,0.9233020000000001,1.22007,12.79728,1 -0.392766,0.42279399999999995,3.4826080000000004,8.540854,1 -2.579629,0.109011,2.2800279999999997,3.5633559999999997,1 -0.775092,0.974519,2.2236990000000003,9.576958,1 -0.5075930000000001,0.917278,0.103131,9.646965,1 -0.13843699999999998,2.474084,1.6350049999999998,11.789178999999999,1 -1.120586,1.480593,0.6382439999999999,4.648465,0 -0.001842,0.6014520000000001,0.40551,14.162416,1 -0.9969790000000001,0.44859,0.782013,7.490749,1 -1.0672629999999999,0.304582,0.795276,10.739074,1 -1.123429,1.4093149999999999,0.090895,8.239824,1 -0.139158,3.203523,0.28734899999999997,7.630763000000001,1 -1.276529,1.039313,1.217827,7.819782000000001,1 -0.175824,1.371635,1.7854880000000002,12.419242,1 -0.24130100000000002,4.048806,0.423415,10.564055999999999,1 -1.8644439999999998,0.821839,0.426364,6.293629,1 -0.34029499999999996,0.727143,0.341437,12.405241,1 -5.130831,0.074513,0.8015260000000001,10.79508,1 -1.404635,0.039251,0.785162,17.09571,1 -0.07394400000000001,0.053314999999999994,0.18626199999999998,15.464545999999999,0 -1.271488,0.10678,0.291883,9.611961,1 -0.781452,1.229076,0.069747,14.407441,1 -0.3909,0.35690700000000003,0.23058,10.088009,1 -2.193825,0.6211840000000001,0.466925,5.817582,1 -2.942882,0.16383,1.040333,7.987799000000001,1 -0.705527,0.592699,0.923248,11.831183,1 -1.662925,2.1851700000000003,0.664273,11.873187,1 -0.407842,1.011611,0.485592,4.144414,1 -0.091321,0.281593,0.153947,8.288829,1 -3.5385089999999995,1.80715,1.336961,4.326433,1 -0.661027,1.171563,0.30091,14.246425,1 -0.106552,0.121843,0.257878,5.663566,1 -0.104327,1.513503,0.314581,7.9177919999999995,1 -0.811837,1.6833240000000003,0.061925,11.845185,1 -0.402495,0.43151999999999996,0.489576,11.075108,1 -1.322155,0.521161,1.859989,6.888689,1 -0.647954,3.243631,0.034075,8.344834,1 -0.851476,0.21736599999999998,0.29733000000000004,4.473447,1 -0.14999400000000002,3.027889,0.5427489999999999,9.247925,1 -0.381276,1.146927,0.22583000000000003,11.215122000000001,1 -0.019479,1.374707,1.5665950000000002,8.288829,1 -0.806793,0.60941,1.903648,10.074007,1 -0.9268280000000001,1.062158,0.048544,14.484448,1 -0.998282,0.385911,1.403305,11.026102999999999,1 -0.198755,1.668675,0.182337,6.251625,1 -1.668232,0.717113,0.39318000000000003,15.884588,1 -0.903388,0.34757,0.796215,11.341134,1 -3.094217,0.764497,3.063756,7.644764,1 -0.565765,0.8556440000000001,2.4122220000000003,8.365836999999999,1 -0.600544,0.019666,2.356107,11.90119,1 -0.453201,0.24214899999999998,0.7611140000000001,9.912991,1 -0.441605,0.271366,0.9775219999999999,8.323832000000001,1 -0.41135,0.029483999999999996,1.8434580000000003,11.971197,1 -0.5351199999999999,0.045629,0.16006700000000001,11.124112,1 -0.47211899999999996,2.239749,0.148828,6.153615,1 -0.485754,1.464013,0.380293,8.911891,1 -5.353937,0.855298,0.001006,4.879487999999999,1 -0.000974,0.35496500000000003,0.698741,20.666067,1 -0.36145700000000003,2.792862,1.503787,11.082108,1 -1.2026729999999999,1.825852,0.391339,8.008801,1 -0.8530770000000001,0.22137600000000002,1.6355389999999999,9.779978,1 -1.646959,3.3371690000000003,1.262672,5.663566,1 -0.050491,1.0423879999999999,0.040406,10.50105,1 -0.693033,0.067717,1.6319299999999999,9.968997,1 -3.8753610000000003,1.206579,0.6567850000000001,4.837484,1 -0.401754,1.526443,0.449621,7.952795,1 -2.112141,0.994604,0.12592799999999998,4.445444999999999,1 -2.358111,1.411174,4.747023,6.930692999999999,1 -0.406167,0.7479359999999999,1.240233,11.971197,1 -0.9833120000000001,1.330699,0.931057,12.769277,1 -2.8028560000000002,0.141768,0.96447,6.153615,1 -0.22598000000000001,0.156969,0.771678,9.093909,1 -1.0202120000000001,1.338747,1.485407,9.70297,1 -0.737183,0.21196700000000002,1.479703,10.417042,1 -0.694596,0.13306500000000002,1.612199,13.182317999999999,1 -1.614919,1.628414,3.3395629999999996,2.576258,1 -1.263567,0.041625999999999996,0.13448800000000002,7.091709,1 -1.81759,0.89371,0.256831,5.740574,1 -0.221442,1.00047,0.13556500000000002,12.923292,1 -0.388571,2.331312,0.048117,12.874286999999999,1 -1.365461,0.44473,0.26388,4.725473,1 -0.017446,1.50251,1.859648,9.835984,1 -0.803217,0.259678,0.305695,6.062606,1 -1.153738,2.357565,0.264925,8.092808999999999,1 -0.546425,0.516525,0.05980599999999999,8.043804,1 -0.061367,2.453071,0.234816,8.715872000000001,1 -0.42113599999999995,0.295455,1.117664,13.287329000000001,1 -1.5747790000000002,0.7411220000000001,0.533676,10.515052,1 -1.3943510000000001,0.877793,1.637652,6.426643,1 -0.923441,1.1076139999999999,0.78291,3.640364,1 -0.231346,0.620135,1.8213549999999998,4.746475,1 -0.7357060000000001,3.4050540000000002,3.457625,11.677168,1 -1.748839,1.132628,0.812584,11.558156,1 -0.280291,1.664837,0.051460000000000006,8.757876,1 -0.150857,2.545696,1.456119,12.825283,1 -1.5516809999999999,0.125114,0.148355,15.618561999999999,0 -0.746388,0.267458,0.42003599999999996,11.943194,1 -0.068177,0.19378800000000002,2.693533,7.952795,1 -0.305141,0.858988,3.883753,12.356236,1 -3.614956,0.659784,1.013164,3.5633559999999997,1 -1.9810330000000003,0.7379720000000001,0.272071,8.561856,1 -0.19708,1.164958,0.8204870000000001,4.207421,1 -0.027854000000000004,0.6533260000000001,0.08022,21.030103,1 -1.8066659999999999,3.535072,2.176759,5.810581,1 -0.16528800000000002,1.6233950000000001,1.9945509999999997,8.79988,1 -1.617063,0.49479799999999996,0.131597,7.798780000000001,0 -1.298794,1.778036,0.453693,12.657266,1 -0.707968,1.081388,0.477484,14.30243,1 -0.246455,0.11361800000000001,0.407209,13.329332999999998,1 -0.282453,0.731784,0.002421,6.1256129999999995,1 -0.133855,0.096552,0.152854,4.935494,0 -0.025306,0.07387,0.163927,6.314630999999999,1 -1.017839,0.737884,3.126409,6.573657000000001,0 -0.847491,1.142187,1.342932,8.610861,1 -0.9420930000000001,0.161735,1.388318,9.997,1 -0.38300100000000004,0.006451,0.901114,7.749775,1 -0.011165999999999999,0.220669,0.6917909999999999,7.3437339999999995,1 -1.5435020000000002,1.472249,0.830817,6.986699000000001,1 -0.168033,3.052163,0.035085000000000005,18.131813,1 -2.1599459999999997,0.001644,1.443158,4.382438,1 -0.249142,0.628992,2.3185130000000003,8.743874,1 -0.137399,0.107748,0.354812,11.446145,1 -0.6373409999999999,2.847188,1.4591370000000001,7.623761999999999,1 -1.109732,0.405561,0.018856,10.634063000000001,1 -0.031865,1.753759,0.25204,8.519852,1 -1.631269,1.5886209999999998,3.7098989999999996,4.480448,1 +0.59517,1.143472,1.571079,14.7856515748,1 +0.209325,0.184677,0.35698,7.33584583652,1 +0.693919,0.071893,0.55796,5.26979701571,1 +0.443804,1.364646,0.374221,11.6840920212,1 +1.613324,0.125566,1.921325,7.63949212526,1 +0.065636,0.098375,0.237896,12.6784581817,1 +0.386294,1.663092,0.790314,6.60166572026,1 +0.946688,1.345394,3.209113,11.3670916491,1 +0.11374,0.40986,0.064934,14.6805866317,1 +0.777793,0.33499,0.411055,10.5854086595,1 +0.04428,0.305158,0.17648,19.3721173864,1 +1.03545,3.304733,0.997323,5.55904466985,1 +0.229195,0.581355,0.484794,11.2924891948,1 +0.05597,2.674135,0.355279,9.92047433529,0 +1.236583,1.796598,0.179952,9.88652411916,1 +1.162835,0.464759,2.028854,6.26643301257,1 +0.149436,2.949291,0.277801,13.8127296,1 +0.399475,0.822413,0.673405,6.43309776107,1 +0.762121,0.050407,1.285163,6.97979741031,1 +1.239718,1.869215,0.020202,7.74300832502,1 +0.019221,1.435543,0.255689,4.70530329608,1 +0.090253,0.211037,0.372809,11.2335841641,1 +0.205849,0.048722,0.00253,6.66273101972,1 +0.088185,1.319679,0.201675,10.7174137318,1 +4.629747,0.363522,1.08207,11.5938047533,1 +1.602836,1.217881,0.350837,8.46420655566,1 +0.014804,0.684737,0.493267,5.43255855841,1 +0.08402,1.432093,0.456541,10.276593667,1 +2.260223,1.23893,5.541838,7.21736226987,1 +0.621968,0.684428,0.135933,13.2176584654,1 +0.01322,3.280555,1.193551,8.0299335416,1 +0.070651,1.430517,0.005205,9.80826804874,1 +0.20598,0.29064,0.096565,2.63226375911,1 +1.389882,0.143133,0.821257,6.16269524116,1 +0.104143,2.072924,0.449696,6.26182607469,1 +0.428481,0.065739,3.00755,4.6048190364,0 +1.785151,1.572282,0.475059,11.1239077928,1 +0.228893,0.429025,0.60805,6.08263253911,1 +0.640837,0.311084,3.165658,9.22065563383,1 +1.450683,0.847022,2.521177,7.12012891282,0 +0.469459,0.318871,0.164498,13.0551806715,1 +0.36617,0.234991,0.678709,4.95133583707,1 +1.43257,2.668335,0.558046,4.85889534209,1 +2.696463,0.244077,1.315111,8.50543244101,1 +0.152624,0.379501,0.330164,7.03794878748,1 +0.277399,0.871603,1.555185,10.8353931443,1 +0.353633,0.294236,0.928573,9.92109564592,1 +0.620956,0.021884,3.20576,5.29165212342,1 +0.000757,1.216615,0.861069,20.9813809356,1 +0.497674,2.744032,0.473589,5.80982311606,1 +1.213709,0.072756,0.09842,11.2908076197,1 +0.426255,2.550392,0.16762,8.19779319557,1 +0.008408,1.132205,1.234917,7.47213923892,1 +1.207833,0.13335,0.528231,10.670073183,1 +0.036975,0.040631,0.2664,10.543203936,1 +0.789439,0.669067,1.332697,6.28852320244,1 +1.482055,0.627205,0.738271,9.080334859,1 +1.028671,0.21521,0.457692,14.1822947115,1 +0.521986,2.282683,0.315976,21.9399783806,1 +0.262622,0.345999,0.921097,8.88229093093,1 +0.360319,1.001364,0.237533,9.98342539597,1 +0.362587,0.110046,2.486691,9.55638379129,1 +1.793598,0.310001,0.263066,8.65928769028,1 +0.419275,0.114308,1.124784,5.6434659532,1 +1.470247,0.289054,0.331833,10.9977552573,1 +0.27476,0.523508,2.139204,8.05076874352,1 +0.119805,0.733774,0.212057,11.2503684762,1 +0.369294,0.609847,0.89402,10.2129272163,0 +1.01825,2.119666,0.716002,12.3366190173,1 +0.607065,2.350112,0.03139,15.7231628796,1 +4.169831,0.316285,0.16935,11.8302258308,0 +1.483383,2.242744,0.26543,7.36252640465,1 +0.323597,0.165159,0.97204,12.5169357575,1 +1.82716,0.327794,0.941539,8.51322372304,1 +0.104104,0.923302,1.22007,12.7973318626,1 +0.392766,0.422794,3.482608,8.54052212332,1 +2.579629,0.109011,2.280028,3.56358063047,1 +0.775092,0.974519,2.223699,9.57756677618,1 +0.507593,0.917278,0.103131,9.64881133972,1 +0.138437,2.474084,1.635005,11.789052178,1 +1.120586,1.480593,0.638244,4.6478831396,0 +0.001842,0.601452,0.40551,14.1629885287,1 +0.996979,0.44859,0.782013,7.49170332816,1 +1.067263,0.304582,0.795276,10.7391507856,1 +1.123429,1.409315,0.090895,8.23925481073,1 +0.139158,3.203523,0.287349,7.63092764446,1 +1.276529,1.039313,1.217827,7.8203073558,1 +0.175824,1.371635,1.785488,12.4208671834,1 +0.241301,4.048806,0.423415,10.5633022553,1 +1.864444,0.821839,0.426364,6.29314423772,1 +0.340295,0.727143,0.341437,12.4052831947,1 +5.130831,0.074513,0.801526,10.7964133954,1 +1.404635,0.039251,0.785162,17.0955898105,1 +0.073944,0.053315,0.186262,15.4629258426,0 +1.271488,0.10678,0.291883,9.61273262267,1 +0.781452,1.229076,0.069747,14.4093766173,1 +0.3909,0.356907,0.23058,10.0869453587,1 +2.193825,0.621184,0.466925,5.81874836313,1 +2.942882,0.16383,1.040333,7.98833687105,1 +0.705527,0.592699,0.923248,11.8312524366,1 +1.662925,2.18517,0.664273,11.8731640313,1 +0.407842,1.011611,0.485592,4.14387121547,1 +0.091321,0.281593,0.153947,8.2907236782,1 +3.538509,1.80715,1.336961,4.32535868496,1 +0.661027,1.171563,0.30091,14.2454199869,1 +0.106552,0.121843,0.257878,5.66506177628,1 +0.104327,1.513503,0.314581,7.91846164924,1 +0.811837,1.683324,0.061925,11.8443153255,1 +0.402495,0.43152,0.489576,11.0761501869,1 +1.322155,0.521161,1.859989,6.88916431026,1 +0.647954,3.243631,0.034075,8.34491489657,1 +0.851476,0.217366,0.29733,4.47399978584,1 +0.149994,3.027889,0.542749,9.24722217844,1 +0.381276,1.146927,0.22583,11.2146024447,1 +0.019479,1.374707,1.566595,8.28623107033,1 +0.806793,0.60941,1.903648,10.0738349202,1 +0.926828,1.062158,0.048544,14.4854236558,1 +0.998282,0.385911,1.403305,11.0245383745,1 +0.198755,1.668675,0.182337,6.25197764519,1 +1.668232,0.717113,0.39318,15.8836989034,1 +0.903388,0.34757,0.796215,11.34167126,1 +3.094217,0.764497,3.063756,7.6440979558,1 +0.565765,0.855644,2.412222,8.36665621446,1 +0.600544,0.019666,2.356107,11.9012822052,1 +0.453201,0.242149,0.761114,9.91242335863,1 +0.441605,0.271366,0.977522,8.32289768512,1 +0.41135,0.029484,1.843458,11.9717347284,1 +0.53512,0.045629,0.160067,11.1247094819,1 +0.472119,2.239749,0.148828,6.15413791004,1 +0.485754,1.464013,0.380293,8.91144648631,1 +5.353937,0.855298,0.001006,4.88104290364,1 +0.000974,0.354965,0.698741,20.6652251814,1 +0.361457,2.792862,1.503787,11.0822662185,1 +1.202673,1.825852,0.391339,8.00771353063,1 +0.853077,0.221376,1.635539,9.78044985255,1 +1.646959,3.337169,1.262672,5.66532639351,1 +0.050491,1.042388,0.040406,10.5017493224,1 +0.693033,0.067717,1.63193,9.96821890209,1 +3.875361,1.206579,0.656785,4.83539084237,1 +0.401754,1.526443,0.449621,7.95252897093,1 +2.112141,0.994604,0.125928,4.44530592881,1 +2.358111,1.411174,4.747023,6.92942222099,1 +0.406167,0.747936,1.240233,11.9701980352,1 +0.983312,1.330699,0.931057,12.7687578904,1 +2.802856,0.141768,0.96447,6.1536973616,1 +0.22598,0.156969,0.771678,9.09300617059,1 +1.020212,1.338747,1.485407,9.70355645693,1 +0.737183,0.211967,1.479703,10.4173960087,1 +0.694596,0.133065,1.612199,13.1829276562,1 +1.614919,1.628414,3.339563,2.57553764605,1 +1.263567,0.041626,0.134488,7.09141708798,1 +1.81759,0.89371,0.256831,5.73937524269,1 +0.221442,1.00047,0.135565,12.9241740121,1 +0.388571,2.331312,0.048117,12.8735020088,1 +1.365461,0.44473,0.26388,4.72642629428,1 +0.017446,1.50251,1.859648,9.83594179776,1 +0.803217,0.259678,0.305695,6.06237621553,1 +1.153738,2.357565,0.264925,8.09338513449,1 +0.546425,0.516525,0.059806,8.04406734742,1 +0.061367,2.453071,0.234816,8.71594708122,1 +0.421136,0.295455,1.117664,13.2869538904,1 +1.574779,0.741122,0.533676,10.5135003978,1 +1.394351,0.877793,1.637652,6.42775744203,1 +0.923441,1.107614,0.78291,3.63878040929,1 +0.231346,0.620135,1.821355,4.74775344975,1 +0.735706,3.405054,3.457625,11.6770199376,1 +1.748839,1.132628,0.812584,11.5594329742,1 +0.280291,1.664837,0.05146,8.75824998324,1 +0.150857,2.545696,1.456119,12.8268461929,1 +1.551681,0.125114,0.148355,15.6204061094,0 +0.746388,0.267458,0.420036,11.9422735844,1 +0.068177,0.193788,2.693533,7.95449616061,1 +0.305141,0.858988,3.883753,12.3573764607,1 +3.614956,0.659784,1.013164,3.56383007199,1 +1.981033,0.737972,0.272071,8.5619748224,1 +0.19708,1.164958,0.820487,4.20656850475,1 +0.027854,0.653326,0.08022,21.0318230188,1 +1.806666,3.535072,2.176759,5.81052910695,1 +0.165288,1.623395,1.994551,8.79849009986,1 +1.617063,0.494798,0.131597,7.79923023169,0 +1.298794,1.778036,0.453693,12.6551650347,1 +0.707968,1.081388,0.477484,14.3014540711,1 +0.246455,0.113618,0.407209,13.3297030877,1 +0.282453,0.731784,0.002421,6.12506421389,1 +0.133855,0.096552,0.152854,4.93564074908,0 +0.025306,0.07387,0.163927,6.3156952171,1 +1.017839,0.737884,3.126409,6.57321280053,0 +0.847491,1.142187,1.342932,8.61060494656,1 +0.942093,0.161735,1.388318,9.9956084953,1 +0.383001,0.006451,0.901114,7.74825868839,1 +0.011166,0.220669,0.691791,7.34226786253,1 +1.543502,1.472249,0.830817,6.98633720723,1 +0.168033,3.052163,0.035085,18.1313105791,1 +2.159946,0.001644,1.443158,4.38165504789,1 +0.249142,0.628992,2.318513,8.74257448673,1 +0.137399,0.107748,0.354812,11.4454572735,1 +0.637341,2.847188,1.459137,7.62462675408,1 +1.109732,0.405561,0.018856,10.6346199544,1 +0.031865,1.753759,0.25204,8.51971771151,1 +1.631269,1.588621,3.709899,4.47895208711,1 diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py index d93149bc7..2e65fa99a 100644 --- a/lifelines/fitters/coxph_fitter.py +++ b/lifelines/fitters/coxph_fitter.py @@ -8,7 +8,8 @@ import pandas as pd from numpy import dot, exp -from numpy.linalg import solve, norm, inv +from numpy.linalg import norm, inv +from scipy.linalg import solve as spsolve from scipy.integrate import trapz import scipy.stats as stats @@ -57,7 +58,8 @@ def __init__(self, alpha=0.95, tie_method='Efron', penalizer=0.0, strata=None): def fit(self, df, duration_col, event_col=None, show_progress=False, initial_beta=None, - strata=None, step_size=None, weights_col=None): + strata=None, step_size=None, weights_col=None, + robust=False): """ Fit the Cox Propertional Hazard model to a dataset. Tied survival times are handled using Efron's tie-method. @@ -83,9 +85,12 @@ def fit(self, df, duration_col, event_col=None, is used similar to the `strata` expression in R. See http://courses.washington.edu/b515/l17.pdf. step_size: set an initial step size for the fitting algorithm. + robust: Compute the robust errors using the Huber sandwich estimator, aka Wei-Lin estimate. This does not handle + ties, so if there are high number of ties, results may significantly differ. See + "The Robust Inference for the Cox Proportional Hazards Model", Journal of the American Statistical Association, Vol. 84, No. 408 (Dec., 1989), pp. 1074- 1078 Returns: - self, with additional properties: hazards_ + self, with additional properties: hazards_, confidence_intervals_, baseline_survival_, etc. """ @@ -94,6 +99,7 @@ def fit(self, df, duration_col, event_col=None, # Sort on time df = df.sort_values(by=duration_col) + self.robust = robust self._n_examples = df.shape[0] self.strata = coalesce(strata, self.strata) if self.strata is not None: @@ -143,14 +149,18 @@ def fit(self, df, duration_col, event_col=None, step_size=step_size) self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns, index=['coef']) / self._norm_std + + self.standard_errors_ = self._compute_standard_errors(normalize(df, self._norm_mean, self._norm_std), T, E) self.confidence_intervals_ = self._compute_confidence_intervals() + self.baseline_hazard_ = self._compute_baseline_hazards(df, T, E) self.baseline_cumulative_hazard_ = self._compute_baseline_cumulative_hazard() self.baseline_survival_ = self._compute_baseline_survival() self.score_ = concordance_index(self.durations, -self.predict_partial_hazard(df).values.ravel(), self.event_observed) + self._train_log_partial_hazard = self.predict_log_partial_hazard(self._norm_mean.to_frame().T) return self @@ -224,7 +234,7 @@ def _newton_rhaphson(self, X, T, E, weights=None, initial_beta=None, step_size=N g -= self.penalizer * beta.T h.flat[::d + 1] -= self.penalizer - delta = solve(-h, step_size * g.T) + delta = spsolve(-h, step_size * g.T, sym_pos=True) if np.any(np.isnan(delta)): raise ValueError("""delta contains nan value(s). Convergence halted. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-model @@ -380,21 +390,72 @@ def _check_values(df, T, E): def _compute_confidence_intervals(self): alpha2 = inv_normal_cdf((1. + self.alpha) / 2.) - se = self._compute_standard_errors() + se = self.standard_errors_ hazards = self.hazards_.values return pd.DataFrame(np.r_[hazards - alpha2 * se, hazards + alpha2 * se], index=['lower-bound', 'upper-bound'], columns=self.hazards_.columns) - def _compute_standard_errors(self): - se = np.sqrt(inv(-self._hessian_).diagonal()) / self._norm_std + def _compute_sandwich_estimator(self, X, T, E): + + n, d = X.shape + + # Init risk and tie sums to zero + risk_phi = 0 + risk_phi_x = np.zeros((1, d)) + + # need to store these histories, as we access them often + risk_phi_history = np.zeros((n,)) + risk_phi_x_history = np.zeros((n, d)) + + score_covariance = np.zeros((d, d)) + + # we already unnormalized the betas in `fit`, so we need normalize them again since X is + # normalized. + beta = self.hazards_.values[0] * self._norm_std + + # Iterate backwards to utilize recursive relationship + for i in range(n - 1, -1, -1): + # Doing it like this to preserve shape + ei = E[i] + xi = X[i:i + 1] + + phi_i = exp(dot(xi, beta)) + phi_x_i = phi_i * xi + + risk_phi += phi_i + risk_phi_x += phi_x_i + + risk_phi_history[i] = risk_phi + risk_phi_x_history[i] = risk_phi_x + + # Iterate forwards + for i in range(0, n): + # Doing it like this to preserve shape + xi = X[i:i + 1] + phi_i = exp(dot(xi, beta)) + + correction_term = sum(E[j] * phi_i / risk_phi_history[j] * (xi - risk_phi_x_history[j] / risk_phi_history[j]) for j in range(0, i+1)) + + score = E[i] * (xi - risk_phi_x_history[i] / risk_phi_history[i]) - correction_term + score_covariance += (score.T).dot(score) + + # TODO: need a faster way to invert these matrices + sandwich_estimator = inv(self._hessian_).dot(score_covariance).dot(inv(self._hessian_)) + return sandwich_estimator + + def _compute_standard_errors(self, df, T, E): + if self.robust: + se = np.sqrt(self._compute_sandwich_estimator(df.values, T.values, E.values).diagonal()) / self._norm_std + else: + se = np.sqrt(-inv(self._hessian_).diagonal()) / self._norm_std return pd.DataFrame(se[None, :], index=['se'], columns=self.hazards_.columns) def _compute_z_values(self): return (self.hazards_.loc['coef'] / - self._compute_standard_errors().loc['se']) + self.standard_errors_.loc['se']) def _compute_p_values(self): U = self._compute_z_values() ** 2 @@ -413,7 +474,7 @@ def summary(self): df = pd.DataFrame(index=self.hazards_.columns) df['coef'] = self.hazards_.loc['coef'].values df['exp(coef)'] = exp(self.hazards_.loc['coef'].values) - df['se(coef)'] = self._compute_standard_errors().loc['se'].values + df['se(coef)'] = self.standard_errors_.loc['se'].values df['z'] = self._compute_z_values() df['p'] = self._compute_p_values() df['lower %.2f' % self.alpha] = self.confidence_intervals_.loc['lower-bound'].values diff --git a/tests/test_estimation.py b/tests/test_estimation.py index 30a9f6bed..52df6b4c0 100644 --- a/tests/test_estimation.py +++ b/tests/test_estimation.py @@ -991,9 +991,9 @@ def test_coef_output_against_R_super_accurate(self, rossi): library(survival) rossi <- read.csv('.../lifelines/datasets/rossi.csv') - mod.allison <- coxph(Surv(week, arrest) ~ fin + age + race + wexp + mar + paro + prio, + r <- coxph(Surv(week, arrest) ~ fin + age + race + wexp + mar + paro + prio, data=rossi) - cat(round(mod.allison$coefficients, 4), sep=", ") + cat(round(r$coefficients, 4), sep=", ") """ expected = np.array([[-0.3794, -0.0574, 0.3139, -0.1498, -0.4337, -0.0849, 0.0915]]) cf = CoxPHFitter() @@ -1379,6 +1379,30 @@ def test_all_okay_with_non_trivial_index_in_dataframe(self, rossi): assert_frame_equal(cp2.summary, cp1.summary) + def test_robust_errors_against_R_no_ties(self, regression_dataset): + df = regression_dataset.copy() + cph = CoxPHFitter() + cph.fit(df, 'T', 'E', robust=True) + expected = pd.Series({'var1': 0.0879, 'var2': 0.0847, 'var3': 0.0655}) + assert_series_equal(cph.standard_errors_.loc['se'], expected, check_less_precise=2, check_names=False) + + + def test_robust_errors_with_strata_doesnt_break(self, rossi): + """ + rossi <- read.csv('.../lifelines/datasets/rossi.csv') + r = coxph(formula = Surv(week, arrest) ~ fin + age + strata(race, + paro, mar, wexp) + prio, data = rossi, robust=TRUE) + """ + cf = CoxPHFitter() + cf.fit(rossi, duration_col='week', event_col='arrest', strata=['race', 'paro', 'mar', 'wexp'], robust=True) + + + def test_robust_errors_against_R_with_ties(self,): + pass + + + + class TestAalenAdditiveFitter(): @@ -1514,6 +1538,7 @@ def test_predict_cumulative_hazard_inputs(self, data_pred1): assert_frame_equal(y_df, y_np) + class TestCoxTimeVaryingFitter(): @pytest.fixture() From f20dfba55c0025fdc581d32c9139fd2954c32ec4 Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Mon, 30 Jul 2018 17:36:11 -0400 Subject: [PATCH 02/59] no idea what these changes are --- CHANGELOG.md | 4 ++ lifelines/fitters/cox_time_varying_fitter.py | 68 ++++++++++++++++++-- lifelines/fitters/coxph_fitter.py | 55 ++++++++++------ lifelines/statistics.py | 11 +++- tests/test_estimation.py | 47 ++++++++++++-- 5 files changed, 152 insertions(+), 33 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3aec7afef..f430fabeb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ ### Changelogs +#### 0.15.0 + - adding `robust` params to Cox models' `fit`. This enables atleast i) using non-integer weights in the model (these could be sampling weights like IPTW), and ii) misspecified models (ex: non-propotional hazards). Under the hood it's a sandwich estimator. This does not handle ties, so if there are high number of ties, results may significantly differ from other software. + - `standard_errors_` is now a property on fitted Cox models. + #### 0.14.6 - fix for n > 2 groups in `multivariate_logrank_test` (again). - fix bug for when `event_observed` column was not boolean. diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py index 611959a93..1aa0cb0be 100644 --- a/lifelines/fitters/cox_time_varying_fitter.py +++ b/lifelines/fitters/cox_time_varying_fitter.py @@ -37,7 +37,7 @@ def __init__(self, alpha=0.95, penalizer=0.0): self.alpha = alpha self.penalizer = penalizer - def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_progress=False, step_size=None): + def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_progress=False, step_size=None, robust=False): """ Fit the Cox Propertional Hazard model to a time varying dataset. Tied survival times are handled using Efron's tie-method. @@ -56,6 +56,10 @@ def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_pr show_progress: since the fitter is iterative, show convergence diagnostics. step_size: set an initial step size for the fitting algorithm. + robust: Compute the robust errors using the Huber sandwich estimator, aka Wei-Lin estimate. This does not handle + ties, so if there are high number of ties, results may significantly differ. See + "The Robust Inference for the Cox Proportional Hazards Model", Journal of the American Statistical Association, Vol. 84, No. 408 (Dec., 1989), pp. 1074- 1078 + Returns: self, with additional properties: hazards_ @@ -83,6 +87,7 @@ def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_pr step_size=step_size) self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns, index=['coef']) / self._norm_std + self.standard_errors_ = self._compute_standard_errors(normalize(df, self._norm_mean, self._norm_std), stop_times_events) self.confidence_intervals_ = self._compute_confidence_intervals() self.baseline_cumulative_hazard_ = self._compute_cumulative_baseline_hazard(df, stop_times_events) self.baseline_survival_ = self._compute_baseline_survival() @@ -103,14 +108,65 @@ def _check_values(df, stop_times_events): check_for_immediate_deaths(stop_times_events) check_for_instantaneous_events(stop_times_events) - def _compute_standard_errors(self): - se = np.sqrt(inv(-self._hessian_).diagonal()) / self._norm_std + def _compute_sandwich_estimator(self, df, stop_times_events): + + n, d = df.shape + + # Init risk and tie sums to zero + risk_phi = 0 + risk_phi_x = np.zeros((1, d)) + + # need to store these histories, as we access them often + risk_phi_history = pd.DataFrame(np.zeros((n,)), index=df.index) + risk_phi_x_history = pd.DataFrame(np.zeros((n, d)), index=df.index) + + score_covariance = np.zeros((d, d)) + + # we already unnormalized the betas in `fit`, so we need normalize them again since X is + # normalized. + beta = self.hazards_.values[0] * self._norm_std + + # Iterate backwards to utilize recursive relationship + for i in range(n - 1, -1, -1): + # Doing it like this to preserve shape + ei = E[i] + xi = X[i:i + 1] + + phi_i = exp(dot(xi, beta)) + phi_x_i = phi_i * xi + + risk_phi += phi_i + risk_phi_x += phi_x_i + + risk_phi_history[i] = risk_phi + risk_phi_x_history[i] = risk_phi_x + + # Iterate forwards + for i in range(0, n): + # Doing it like this to preserve shape + xi = X[i:i + 1] + phi_i = exp(dot(xi, beta)) + + correction_term = sum(E[j] * phi_i / risk_phi_history[j] * (xi - risk_phi_x_history[j] / risk_phi_history[j]) for j in range(0, i+1)) + + score = E[i] * (xi - risk_phi_x_history[i] / risk_phi_history[i]) - correction_term + score_covariance += (score.T).dot(score) + + # TODO: need a faster way to invert these matrices + sandwich_estimator = inv(self._hessian_).dot(score_covariance).dot(inv(self._hessian_)) + return sandwich_estimator + + def _compute_standard_errors(self, df, stop_times_events): + if self.robust: + se = np.sqrt(self._compute_sandwich_estimator(df, stop_times_events).diagonal()) / self._norm_std + else: + se = np.sqrt(-inv(self._hessian_).diagonal()) / self._norm_std return pd.DataFrame(se[None, :], index=['se'], columns=self.hazards_.columns) def _compute_z_values(self): return (self.hazards_.loc['coef'] / - self._compute_standard_errors().loc['se']) + self.standard_errors_.loc['se']) def _compute_p_values(self): U = self._compute_z_values() ** 2 @@ -118,7 +174,7 @@ def _compute_p_values(self): def _compute_confidence_intervals(self): alpha2 = inv_normal_cdf((1. + self.alpha) / 2.) - se = self._compute_standard_errors() + se = self.standard_errors_ hazards = self.hazards_.values return pd.DataFrame(np.r_[hazards - alpha2 * se, hazards + alpha2 * se], @@ -137,7 +193,7 @@ def summary(self): df = pd.DataFrame(index=self.hazards_.columns) df['coef'] = self.hazards_.loc['coef'].values df['exp(coef)'] = exp(self.hazards_.loc['coef'].values) - df['se(coef)'] = self._compute_standard_errors().loc['se'].values + df['se(coef)'] = self.standard_errors_.loc['se'].values df['z'] = self._compute_z_values() df['p'] = self._compute_p_values() df['lower %.2f' % self.alpha] = self.confidence_intervals_.loc['lower-bound'].values diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py index 2e65fa99a..3f9b995fe 100644 --- a/lifelines/fitters/coxph_fitter.py +++ b/lifelines/fitters/coxph_fitter.py @@ -76,6 +76,9 @@ def fit(self, df, duration_col, event_col=None, weights_col: an optional column in the dataframe that denotes the weight per subject. This column is expelled and not used as a covariate, but as a weight in the final regression. Default weight is 1. + This can be used for case-weights. For example, a weight of 2 means there were two subjects with + identical observations. + This can be used for sampling weights. In that case, use `robust=True` to get more accurate standard errors. show_progress: since the fitter is iterative, show convergence diagnostics. initial_beta: initialize the starting point of the iterative @@ -117,14 +120,15 @@ def fit(self, df, duration_col, event_col=None, if weights_col: weights = df.pop(weights_col) - if (weights.astype(int) != weights).any(): - warnings.warn("""It looks like your weights are not integers, possibly propensity scores then? -It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to + if (weights.astype(int) != weights).any() and not self.robust: + warnings.warn("""It appears your weights are not integers, possibly propensity or sampling scores then? +It's important to know that the naive variance estimates of the coefficients are biased. Instead a) set `robust=True` in the call to `fit`, or b) use Monte Carlo to estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis" - """, RuntimeWarning) - +""", RuntimeWarning) else: - weights = pd.DataFrame(np.ones((self._n_examples, 1)), index=df.index) + weights = pd.Series(np.ones((self._n_examples,)), index=df.index) + + self._replication_weights = (weights.astype(int) == weights).all() self._check_values(df, T, E) df = df.astype(float) @@ -280,15 +284,22 @@ def _newton_rhaphson(self, X, T, E, weights=None, initial_beta=None, step_size=N def _get_efron_values(self, X, beta, T, E, weights): """ - Calculates the first and second order vector differentials, - with respect to beta. + Calculates the first and second order vector differentials, with respect to beta. Note that X, T, E are assumed to be sorted on T! + + A good explaination for Efron. Consider three of five subjects who fail at the time. + As it is not known a priori that who is the first to fail, so one-third of + (φ1 + φ2 + φ3) is adjusted from sum_j^{5} φj after one fails. Similarly two-third + of (φ1 + φ2 + φ3) is adjusted after first two individuals fail, etc. + + Parameters: X: (n,d) numpy array of observations. beta: (1, d) numpy array of coefficients. T: (n) numpy array representing observed durations. E: (n) numpy array representing death events. weights: (n) an array representing weights per observation. + Returns: hessian: (d, d) numpy array, gradient: (1, d) numpy array @@ -335,7 +346,7 @@ def _get_efron_values(self, X, beta, T, E, weights): tie_phi_x_x += phi_x_x_i # Keep track of count - tie_count += int(w) + tie_count += 1 if i > 0 and T[i - 1] == ti: # There are more ties/members of the risk set @@ -348,22 +359,27 @@ def _get_efron_values(self, X, beta, T, E, weights): partial_gradient = np.zeros((1, d)) for l in range(tie_count): - c = l / tie_count - - denom = (risk_phi - c * tie_phi) - z = (risk_phi_x - c * tie_phi_x) + """ + A good explaination for Efron. Consider three of five subjects who fail at the time. + As it is not known a priori that who is the first to fail, so one-third of + (φ1 + φ2 + φ3) is adjusted from sum_j^{5} φj after one fails. Similarly two-third + of (φ1 + φ2 + φ3) is adjusted after first two individuals fail, etc. + """ + numer = (risk_phi_x - l * tie_phi_x / tie_count) + denom = (risk_phi - l * tie_phi / tie_count) # Gradient - partial_gradient += z / denom + partial_gradient += w * numer / denom # Hessian - a1 = (risk_phi_x_x - c * tie_phi_x_x) / denom - # In case z and denom both are really small numbers, + a1 = (risk_phi_x_x - l * tie_phi_x_x / tie_count) / denom + # In case numer and denom both are really small numbers, # make sure to do division before multiplications - a2 = dot(z.T / denom, z / denom) + a2 = dot(numer.T / denom, numer / denom) - hessian -= (a1 - a2) + hessian -= w * (a1 - a2) + + log_lik -= w * np.log(denom[0][0]) - log_lik -= np.log(denom[0][0]) # Values outside tie sum gradient += x_tie_sum - partial_gradient @@ -418,7 +434,6 @@ def _compute_sandwich_estimator(self, X, T, E): # Iterate backwards to utilize recursive relationship for i in range(n - 1, -1, -1): # Doing it like this to preserve shape - ei = E[i] xi = X[i:i + 1] phi_i = exp(dot(xi, beta)) diff --git a/lifelines/statistics.py b/lifelines/statistics.py index 1cac53495..84f2b2b19 100644 --- a/lifelines/statistics.py +++ b/lifelines/statistics.py @@ -79,11 +79,15 @@ def logrank_test(event_times_A, event_times_B, event_observed_A=None, event_obse H_0: both event series are from the same generating processes H_A: the event series are from different generating processes. - See Survival and Event Analysis, page 108. This implicitly uses the log-rank weights. + + This implicitly uses the log-rank weights. + + See also `multivariate_logrank_test` for a more general function. + Parameters: - event_times_foo: a (nx1) array of event durations (birth to death,...) for the population. - censorship_bar: a (nx1) array of censorship flags, 1 if observed, 0 if not. Default assumes all observed. + event_times_foo: a (n,) list-like of event durations (birth to death,...) for the population. + censorship_bar: a (n,) list-like of censorship flags, 1 if observed, 0 if not. Default assumes all observed. t_0: the period under observation, -1 for all time. alpha: the level of signifiance kwargs: add keywords and meta-data to the experiment summary @@ -91,6 +95,7 @@ def logrank_test(event_times_A, event_times_B, event_observed_A=None, event_obse Returns: results: a StatisticalResult object with properties 'p_value', 'summary', 'test_statistic', 'test_result' + See Survival and Event Analysis, page 108. """ event_times_A, event_times_B = np.array(event_times_A), np.array(event_times_B) diff --git a/tests/test_estimation.py b/tests/test_estimation.py index 52df6b4c0..ad09dcdf3 100644 --- a/tests/test_estimation.py +++ b/tests/test_estimation.py @@ -1000,7 +1000,7 @@ def test_coef_output_against_R_super_accurate(self, rossi): cf.fit(rossi, duration_col='week', event_col='arrest') npt.assert_array_almost_equal(cf.hazards_.values, expected, decimal=4) - def test_coef_output_against_R_using_non_trivial_weights(self, rossi): + def test_coef_output_against_R_using_non_trivial_but_integer_weights(self, rossi): rossi_ = rossi.copy() rossi_['weights'] = 1. rossi_ = rossi_.groupby(rossi.columns.tolist())['weights'].sum()\ @@ -1011,7 +1011,36 @@ def test_coef_output_against_R_using_non_trivial_weights(self, rossi): cf.fit(rossi_, duration_col='week', event_col='arrest', weights_col='weights') npt.assert_array_almost_equal(cf.hazards_.values, expected, decimal=4) - def test_adding_non_integer_weights_raises_a_warning(self, rossi): + def test_robust_errors_with_weights_is_the_same_as_R(self, regression_dataset): + """ + rossi <- read.csv('.../lifelines/datasets/rossi.csv') + r = coxph(formula = Surv(week, arrest) ~ fin + age + strata(race, + paro, mar, wexp) + prio, data = rossi, robust=TRUE) + """ + df = regression_dataset + df['var3'] = np.round(df['var3'] + 1) + cph = CoxPHFitter() + cph.fit(df.head(5), 'T', 'E', robust=True, weights_col='var3', show_progress=True) + expected = pd.Series({'var1': -2.23662, 'var2': -5.75105}) + assert_series_equal(cph.hazards_.T['coef'], expected, check_less_precise=2, check_names=False) + + + def test_summary_output_using_non_trivial_but_integer_weights(self, rossi): + rossi_weights = rossi.copy() + rossi_weights['weights'] = 1. + rossi_weights = rossi_weights.groupby(rossi.columns.tolist())['weights'].sum()\ + .reset_index() + + cf1 = CoxPHFitter() + cf1.fit(rossi_weights, duration_col='week', event_col='arrest', weights_col='weights') + + cf2 = CoxPHFitter() + cf2.fit(rossi, duration_col='week', event_col='arrest') + + assert_frame_equal(cf1.summary, cf2.summary, check_like=True) + + + def test_adding_non_integer_weights_without_robust_flag_raises_a_warning(self, rossi): rossi['weights'] = np.random.exponential(1, rossi.shape[0]) cox = CoxPHFitter() @@ -1024,6 +1053,16 @@ def test_adding_non_integer_weights_raises_a_warning(self, rossi): assert "naive variance estimates" in str(w[0].message) + def test_adding_non_integer_weights_is_fine_if_robust_is_on(self, rossi): + rossi['weights'] = np.random.exponential(1, rossi.shape[0]) + + cox = CoxPHFitter() + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + cox.fit(rossi, 'week', 'arrest', weights_col='weights', robust=True) + assert len(w) == 0 + def test_standard_error_coef_output_against_R(self, rossi): """ from http://cran.r-project.org/doc/contrib/Fox-Companion/appendix-cox-regression.pdf @@ -1115,7 +1154,7 @@ def test_se_against_Survival_Analysis_by_John_Klein_and_Melvin_Moeschberger(self cf.fit(df, duration_col='time', event_col='death') # standard errors - actual_se = cf._compute_standard_errors().values + actual_se = cf._compute_standard_errors(None, None, None).values expected_se = np.array([[0.0143, 0.4623, 0.3561, 0.4222]]) npt.assert_array_almost_equal(actual_se, expected_se, decimal=3) @@ -1380,7 +1419,7 @@ def test_all_okay_with_non_trivial_index_in_dataframe(self, rossi): assert_frame_equal(cp2.summary, cp1.summary) def test_robust_errors_against_R_no_ties(self, regression_dataset): - df = regression_dataset.copy() + df = regression_dataset cph = CoxPHFitter() cph.fit(df, 'T', 'E', robust=True) expected = pd.Series({'var1': 0.0879, 'var2': 0.0847, 'var3': 0.0655}) From 57ee62463d12eeb44ce304e8da296a67afd40c37 Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Mon, 3 Sep 2018 14:04:44 -0400 Subject: [PATCH 03/59] I have made progress. I still can't get weights + robust to work though --- lifelines/fitters/coxph_fitter.py | 43 +++++--- tests/test_estimation.py | 168 ++++++++++++++++++++++++++++-- 2 files changed, 188 insertions(+), 23 deletions(-) diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py index 3f9b995fe..e23f40b94 100644 --- a/lifelines/fitters/coxph_fitter.py +++ b/lifelines/fitters/coxph_fitter.py @@ -125,11 +125,11 @@ def fit(self, df, duration_col, event_col=None, It's important to know that the naive variance estimates of the coefficients are biased. Instead a) set `robust=True` in the call to `fit`, or b) use Monte Carlo to estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis" """, RuntimeWarning) + if (weights <= 0).any(): + raise ValueError("values in weights_col must be positive.") else: weights = pd.Series(np.ones((self._n_examples,)), index=df.index) - self._replication_weights = (weights.astype(int) == weights).all() - self._check_values(df, T, E) df = df.astype(float) @@ -154,7 +154,7 @@ def fit(self, df, duration_col, event_col=None, self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns, index=['coef']) / self._norm_std - self.standard_errors_ = self._compute_standard_errors(normalize(df, self._norm_mean, self._norm_std), T, E) + self.standard_errors_ = self._compute_standard_errors(normalize(df, self._norm_mean, self._norm_std), T, E, weights) self.confidence_intervals_ = self._compute_confidence_intervals() @@ -317,7 +317,8 @@ def _get_efron_values(self, X, beta, T, E, weights): risk_phi_x, tie_phi_x = np.zeros((1, d)), np.zeros((1, d)) risk_phi_x_x, tie_phi_x_x = np.zeros((d, d)), np.zeros((d, d)) - # Init number of ties + # Init number of ties and weights + weight_count = 0.0 tie_count = 0 # Iterate backwards to utilize recursive relationship @@ -347,6 +348,7 @@ def _get_efron_values(self, X, beta, T, E, weights): # Keep track of count tie_count += 1 + weight_count += w if i > 0 and T[i - 1] == ti: # There are more ties/members of the risk set @@ -357,6 +359,7 @@ def _get_efron_values(self, X, beta, T, E, weights): # There was atleast one event and no more ties remain. Time to sum. partial_gradient = np.zeros((1, d)) + weighted_average = weight_count / tie_count for l in range(tie_count): """ @@ -369,16 +372,17 @@ def _get_efron_values(self, X, beta, T, E, weights): denom = (risk_phi - l * tie_phi / tie_count) # Gradient - partial_gradient += w * numer / denom + partial_gradient += weighted_average * numer / denom # Hessian a1 = (risk_phi_x_x - l * tie_phi_x_x / tie_count) / denom + # In case numer and denom both are really small numbers, # make sure to do division before multiplications a2 = dot(numer.T / denom, numer / denom) - hessian -= w * (a1 - a2) + hessian -= weighted_average * (a1 - a2) - log_lik -= w * np.log(denom[0][0]) + log_lik -= weighted_average * np.log(denom[0][0]) # Values outside tie sum @@ -387,6 +391,7 @@ def _get_efron_values(self, X, beta, T, E, weights): # reset tie values tie_count = 0 + weight_count = 0.0 x_tie_sum = np.zeros((1, d)) tie_phi = 0 tie_phi_x = np.zeros((1, d)) @@ -413,7 +418,7 @@ def _compute_confidence_intervals(self): index=['lower-bound', 'upper-bound'], columns=self.hazards_.columns) - def _compute_sandwich_estimator(self, X, T, E): + def _compute_sandwich_estimator(self, X, T, E, weights): n, d = X.shape @@ -430,39 +435,45 @@ def _compute_sandwich_estimator(self, X, T, E): # we already unnormalized the betas in `fit`, so we need normalize them again since X is # normalized. beta = self.hazards_.values[0] * self._norm_std + weight_count = 0.0 # Iterate backwards to utilize recursive relationship for i in range(n - 1, -1, -1): # Doing it like this to preserve shape xi = X[i:i + 1] + w = weights[i] - phi_i = exp(dot(xi, beta)) + phi_i = w * exp(dot(xi, beta)) phi_x_i = phi_i * xi risk_phi += phi_i risk_phi_x += phi_x_i - risk_phi_history[i] = risk_phi - risk_phi_x_history[i] = risk_phi_x + risk_phi_history[i] = risk_phi # denom + risk_phi_x_history[i] = risk_phi_x # a[i] # Iterate forwards for i in range(0, n): # Doing it like this to preserve shape + # doesn't handle ties. xi = X[i:i + 1] - phi_i = exp(dot(xi, beta)) + w = weights[i] + phi_i = w * exp(dot(xi, beta)) - correction_term = sum(E[j] * phi_i / risk_phi_history[j] * (xi - risk_phi_x_history[j] / risk_phi_history[j]) for j in range(0, i+1)) + score = -sum(E[j] * phi_i / risk_phi_history[j] * (xi - risk_phi_x_history[j] / risk_phi_history[j]) for j in range(0, i+1)) - score = E[i] * (xi - risk_phi_x_history[i] / risk_phi_history[i]) - correction_term + score = score + E[i] * (xi - risk_phi_x_history[i] / risk_phi_history[i]) score_covariance += (score.T).dot(score) # TODO: need a faster way to invert these matrices + import pdb + pdb.set_trace() sandwich_estimator = inv(self._hessian_).dot(score_covariance).dot(inv(self._hessian_)) return sandwich_estimator - def _compute_standard_errors(self, df, T, E): + def _compute_standard_errors(self, df, T, E, weights): if self.robust: - se = np.sqrt(self._compute_sandwich_estimator(df.values, T.values, E.values).diagonal()) / self._norm_std + se = np.sqrt(self._compute_sandwich_estimator(df.values, T.values, E.values, weights).diagonal()) / self._norm_std else: se = np.sqrt(-inv(self._hessian_).diagonal()) / self._norm_std return pd.DataFrame(se[None, :], diff --git a/tests/test_estimation.py b/tests/test_estimation.py index ad09dcdf3..00f907f88 100644 --- a/tests/test_estimation.py +++ b/tests/test_estimation.py @@ -1011,19 +1011,173 @@ def test_coef_output_against_R_using_non_trivial_but_integer_weights(self, rossi cf.fit(rossi_, duration_col='week', event_col='arrest', weights_col='weights') npt.assert_array_almost_equal(cf.hazards_.values, expected, decimal=4) - def test_robust_errors_with_weights_is_the_same_as_R(self, regression_dataset): + def test_robust_errors_with_less_trival_weights_is_the_same_as_R(self, regression_dataset): """ - rossi <- read.csv('.../lifelines/datasets/rossi.csv') - r = coxph(formula = Surv(week, arrest) ~ fin + age + strata(race, - paro, mar, wexp) + prio, data = rossi, robust=TRUE) + df <- data.frame( + "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294), + "var2" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092), + "T" = c( 7.335846, 5.269797, 11.684092, 12.678458, 6.601666) + ) + df['E'] = 1 + df['var3'] = 0.75 + df[1, 'var3'] = 1.75 + coxph(formula=Surv(T, E) ~ var1 + var2, data=df, weights=var3, robust=TRUE) + """ + + df = pd.DataFrame({ + "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294], + "var2": [0.184677, 0.071893, 1.364646, 0.098375, 1.663092], + "T": [7.335846, 5.269797, 11.684092, 12.678458, 6.601666], + 'var3': [1.75, 0.75, 0.75, 0.75, 0.75] + }) + df['E'] = 1 + + print(df) + cph = CoxPHFitter() + cph.fit(df, 'T', 'E', robust=True, weights_col='var3', show_progress=True) + cph.print_summary() + expected = pd.Series({'var1': 7.995, 'var2': -1.154}) + assert_series_equal(cph.hazards_.T['coef'], expected, check_less_precise=2, check_names=False) + + expected = pd.Series({'var1': 2.931, 'var2': 1.117}) + assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False) + + + def test_robust_errors_with_trival_weights_is_the_same_as_R(self, regression_dataset): + """ + df <- data.frame( + "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294), + "var2" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092), + "T" = c( 7.335846, 5.269797, 11.684092, 12.678458, 6.601666) + ) + df['E'] = 1 + df['var3'] = 0.75 + coxph(formula=Surv(T, E) ~ var1 + var2, data=df, weights=var3, robust=TRUE) + """ + + df = pd.DataFrame({ + "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294], + "var2": [0.184677, 0.071893, 1.364646, 0.098375, 1.663092], + "T": [7.335846, 5.269797, 11.684092, 12.678458, 6.601666], + 'var3': [0.75, 0.75, 0.75, 0.75, 0.75] + }) + df['E'] = 1 + + print(df) + cph = CoxPHFitter() + cph.fit(df, 'T', 'E', robust=True, weights_col='var3', show_progress=True) + cph.print_summary() + expected = pd.Series({'var1': 7.680, 'var2': -0.915}) + assert_series_equal(cph.hazards_.T['coef'], expected, check_less_precise=2, check_names=False) + + expected = pd.Series({'var1': 2.097, 'var2': 0.827}) + assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False) + + + + + def test_robust_errors_is_the_same_as_R(self, regression_dataset): + """ + df <- data.frame( + "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294), + "var2" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092), + "T" = c( 7.335846, 5.269797, 11.684092, 12.678458, 6.601666) + ) + df['E'] = 1 + + coxph(formula=Surv(T, E) ~ var1 + var2, data=df, robust=TRUE) + """ + + df = pd.DataFrame({ + "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294], + "var2": [0.184677, 0.071893, 1.364646, 0.098375, 1.663092], + "T": [7.335846, 5.269797, 11.684092, 12.678458, 6.601666] + }) + df['E'] = 1 + + cph = CoxPHFitter() + cph.fit(df, 'T', 'E', robust=True, show_progress=True) + expected = pd.Series({'var1': 7.680, 'var2': -0.915}) + assert_series_equal(cph.hazards_.T['coef'], expected, check_less_precise=2, check_names=False) + + expected = pd.Series({'var1': 2.097, 'var2': 0.827}) + assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False) + + + def test_trival_float_weights_with_no_ties_is_the_same_as_R(self, regression_dataset): + """ + df <- data.frame( + "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294), + "var2" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092), + "T" = c( 7.335846, 5.269797, 11.684092, 12.678458, 6.601666) + ) + df['E'] = 1 + df['var3'] = 0.75 + + coxph(formula=Surv(T, E) ~ var1 + var2, data=df, weights=var3) """ df = regression_dataset - df['var3'] = np.round(df['var3'] + 1) + ix = df['var3'] < 1. + df = df.loc[ix].head() + df['var3'] = [0.75] * 5 + cph = CoxPHFitter() - cph.fit(df.head(5), 'T', 'E', robust=True, weights_col='var3', show_progress=True) - expected = pd.Series({'var1': -2.23662, 'var2': -5.75105}) + + cph.fit(df, 'T', 'E', weights_col='var3', show_progress=True) + + expected_coef = pd.Series({'var1': 7.680, 'var2': -0.915}) + assert_series_equal(cph.hazards_.T['coef'], expected_coef, check_less_precise=2, check_names=False) + + expected_std = pd.Series({'var1': 6.641, 'var2': 1.650}) + assert_series_equal(cph.summary['se(coef)'], expected_std, check_less_precise=2, check_names=False) + + expected_ll = -1.142397 + assert abs(cph._log_likelihood - expected_ll) < 0.001 + + def test_less_trival_float_weights_with_no_ties_is_the_same_as_R(self, regression_dataset): + """ + df <- data.frame( + "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294), + "var2" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092), + "T" = c( 7.335846, 5.269797, 11.684092, 12.678458, 6.601666) + ) + df['E'] = 1 + df['var3'] = 0.75 + df[1, 'var3'] = 1.75 + + coxph(formula=Surv(T, E) ~ var1 + var2, data=df, weights=var3) + """ + df = regression_dataset + ix = df['var3'] < 1. + df = df.loc[ix].head() + df['var3'] = [1.75] + [0.75] * 4 + + cph = CoxPHFitter() + + cph.fit(df, 'T', 'E', weights_col='var3', show_progress=True) + expected = pd.Series({'var1': 7.995, 'var2': -1.154}) + assert_series_equal(cph.hazards_.T['coef'], expected, check_less_precise=2, check_names=False) + + expected = pd.Series({'var1': 6.690, 'var2': 1.614}) + assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False) + + + def test_non_trival_float_weights_with_no_ties_is_the_same_as_R(self, regression_dataset): + """ + df <- read.csv('.../lifelines/datasets/regression.csv') + coxph(formula=Surv(T, E) ~ var1 + var2, data=df, weights=var3) + """ + df = regression_dataset + + cph = CoxPHFitter() + + cph.fit(df, 'T', 'E', weights_col='var3', show_progress=True) + expected = pd.Series({'var1': 0.3268, 'var2': 0.0775}) assert_series_equal(cph.hazards_.T['coef'], expected, check_less_precise=2, check_names=False) + expected = pd.Series({'var1': 0.0697, 'var2': 0.0861}) + assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False) + def test_summary_output_using_non_trivial_but_integer_weights(self, rossi): rossi_weights = rossi.copy() From 9952e68486797a4aae1a4d77859b5285915701b4 Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Mon, 3 Sep 2018 17:33:18 -0400 Subject: [PATCH 04/59] can't get weights + robust to work. What is true is that R and my hessian matrices are the same, but R is doing something else I can't find. --- CHANGELOG.md | 1 + lifelines/fitters/coxph_fitter.py | 9 +++--- tests/test_estimation.py | 51 ++++++++++++++++--------------- 3 files changed, 33 insertions(+), 28 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f430fabeb..b8c33e0a3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ #### 0.15.0 - adding `robust` params to Cox models' `fit`. This enables atleast i) using non-integer weights in the model (these could be sampling weights like IPTW), and ii) misspecified models (ex: non-propotional hazards). Under the hood it's a sandwich estimator. This does not handle ties, so if there are high number of ties, results may significantly differ from other software. - `standard_errors_` is now a property on fitted Cox models. + - `variance_matrix_` is now a property on fitted `CoxPHFitter` which describes the variance matrix of the coefficients. #### 0.14.6 - fix for n > 2 groups in `multivariate_logrank_test` (again). diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py index e23f40b94..8327b371b 100644 --- a/lifelines/fitters/coxph_fitter.py +++ b/lifelines/fitters/coxph_fitter.py @@ -127,6 +127,7 @@ def fit(self, df, duration_col, event_col=None, """, RuntimeWarning) if (weights <= 0).any(): raise ValueError("values in weights_col must be positive.") + else: weights = pd.Series(np.ones((self._n_examples,)), index=df.index) @@ -157,7 +158,6 @@ def fit(self, df, duration_col, event_col=None, self.standard_errors_ = self._compute_standard_errors(normalize(df, self._norm_mean, self._norm_std), T, E, weights) self.confidence_intervals_ = self._compute_confidence_intervals() - self.baseline_hazard_ = self._compute_baseline_hazards(df, T, E) self.baseline_cumulative_hazard_ = self._compute_baseline_cumulative_hazard() self.baseline_survival_ = self._compute_baseline_survival() @@ -466,16 +466,17 @@ def _compute_sandwich_estimator(self, X, T, E, weights): score_covariance += (score.T).dot(score) # TODO: need a faster way to invert these matrices - import pdb - pdb.set_trace() sandwich_estimator = inv(self._hessian_).dot(score_covariance).dot(inv(self._hessian_)) return sandwich_estimator def _compute_standard_errors(self, df, T, E, weights): + if self.robust: se = np.sqrt(self._compute_sandwich_estimator(df.values, T.values, E.values, weights).diagonal()) / self._norm_std + #self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std) else: - se = np.sqrt(-inv(self._hessian_).diagonal()) / self._norm_std + self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std) + se = np.sqrt(self.variance_matrix_.diagonal()) return pd.DataFrame(se[None, :], index=['se'], columns=self.hazards_.columns) diff --git a/tests/test_estimation.py b/tests/test_estimation.py index 00f907f88..7827e385b 100644 --- a/tests/test_estimation.py +++ b/tests/test_estimation.py @@ -1011,7 +1011,7 @@ def test_coef_output_against_R_using_non_trivial_but_integer_weights(self, rossi cf.fit(rossi_, duration_col='week', event_col='arrest', weights_col='weights') npt.assert_array_almost_equal(cf.hazards_.values, expected, decimal=4) - def test_robust_errors_with_less_trival_weights_is_the_same_as_R(self, regression_dataset): + def test_robust_errors_with_trival_weights_is_the_different_than_R(self, regression_dataset): """ df <- data.frame( "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294), @@ -1020,30 +1020,31 @@ def test_robust_errors_with_less_trival_weights_is_the_same_as_R(self, regressio ) df['E'] = 1 df['var3'] = 0.75 - df[1, 'var3'] = 1.75 - coxph(formula=Surv(T, E) ~ var1 + var2, data=df, weights=var3, robust=TRUE) + r = coxph(formula=Surv(T, E) ~ var1 + var2, data=df, weights=var3, robust=TRUE) + r$var + r$naive.var """ + w = 0.75 df = pd.DataFrame({ "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294], "var2": [0.184677, 0.071893, 1.364646, 0.098375, 1.663092], "T": [7.335846, 5.269797, 11.684092, 12.678458, 6.601666], - 'var3': [1.75, 0.75, 0.75, 0.75, 0.75] }) df['E'] = 1 + df['var3'] = w - print(df) cph = CoxPHFitter() cph.fit(df, 'T', 'E', robust=True, weights_col='var3', show_progress=True) - cph.print_summary() - expected = pd.Series({'var1': 7.995, 'var2': -1.154}) + expected = pd.Series({'var1': 7.680, 'var2': -0.915}) assert_series_equal(cph.hazards_.T['coef'], expected, check_less_precise=2, check_names=False) - expected = pd.Series({'var1': 2.931, 'var2': 1.117}) - assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False) - + expected_cov = np.array([[33.079106, -5.964652], [-5.964652, 2.040642]]) + npt.assert_array_almost_equal( + w * cph.variance_matrix_, expected_cov, + decimal=2) - def test_robust_errors_with_trival_weights_is_the_same_as_R(self, regression_dataset): + def test_robust_errors_with_less_trival_weights_is_the_same_as_R(self, regression_dataset): """ df <- data.frame( "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294), @@ -1052,28 +1053,34 @@ def test_robust_errors_with_trival_weights_is_the_same_as_R(self, regression_dat ) df['E'] = 1 df['var3'] = 0.75 - coxph(formula=Surv(T, E) ~ var1 + var2, data=df, weights=var3, robust=TRUE) + df[1, 'var3'] = 1.75 + r = coxph(formula=Surv(T, E) ~ var1 + var2, data=df, weights=var3, robust=TRUE) + r$var + r$naive.var """ df = pd.DataFrame({ "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294], "var2": [0.184677, 0.071893, 1.364646, 0.098375, 1.663092], "T": [7.335846, 5.269797, 11.684092, 12.678458, 6.601666], - 'var3': [0.75, 0.75, 0.75, 0.75, 0.75] + 'var3': [1.75, 0.75, 0.75, 0.75, 0.75] }) df['E'] = 1 - print(df) cph = CoxPHFitter() cph.fit(df, 'T', 'E', robust=True, weights_col='var3', show_progress=True) - cph.print_summary() - expected = pd.Series({'var1': 7.680, 'var2': -0.915}) + expected = pd.Series({'var1': 7.995, 'var2': -1.154}) assert_series_equal(cph.hazards_.T['coef'], expected, check_less_precise=2, check_names=False) - expected = pd.Series({'var1': 2.097, 'var2': 0.827}) - assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False) + variance_matrix = -np.linalg.inv(cph._hessian_) / np.outer(cph._norm_std, cph._norm_std) + expected_cov = np.array([[44.758444, -8.781867], [-8.781867, 2.606589]]) + npt.assert_array_almost_equal( + variance_matrix, expected_cov, + decimal=2) + expected = pd.Series({'var1': 2.931, 'var2': 1.117}) + assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False) def test_robust_errors_is_the_same_as_R(self, regression_dataset): @@ -1308,7 +1315,7 @@ def test_se_against_Survival_Analysis_by_John_Klein_and_Melvin_Moeschberger(self cf.fit(df, duration_col='time', event_col='death') # standard errors - actual_se = cf._compute_standard_errors(None, None, None).values + actual_se = cf._compute_standard_errors(None, None, None, None).values expected_se = np.array([[0.0143, 0.4623, 0.3561, 0.4222]]) npt.assert_array_almost_equal(actual_se, expected_se, decimal=3) @@ -1586,15 +1593,11 @@ def test_robust_errors_with_strata_doesnt_break(self, rossi): r = coxph(formula = Surv(week, arrest) ~ fin + age + strata(race, paro, mar, wexp) + prio, data = rossi, robust=TRUE) """ + assert False cf = CoxPHFitter() cf.fit(rossi, duration_col='week', event_col='arrest', strata=['race', 'paro', 'mar', 'wexp'], robust=True) - def test_robust_errors_against_R_with_ties(self,): - pass - - - class TestAalenAdditiveFitter(): From 108f3f8af9e730243611a694ccc443f773bd947c Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Mon, 3 Sep 2018 21:00:55 -0400 Subject: [PATCH 05/59] adding a sample algo for the perfect correlation check --- lifelines/utils/__init__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py index 0aa7b3896..0477cfda7 100644 --- a/lifelines/utils/__init__.py +++ b/lifelines/utils/__init__.py @@ -1074,9 +1074,15 @@ def check_complete_separation_low_variance(df, events): def check_complete_separation_close_to_perfect_correlation(df, durations): # slow for many columns THRESHOLD = 0.99 + n, _ = df.shape + if n > 1000: + # let's sample to speed this n**2 algo up. + df = df.sample(n=1000, random_state=15).copy() + durations = durations.sample(n=1000, random_state=15).copy() + for col, series in df.iteritems(): if abs(stats.spearmanr(series, durations).correlation) >= THRESHOLD: - warning_text = "Column %s has high correlation with the duration column. This may harm convergence. This could be a form of 'complete separation'. \ + warning_text = "Column %s has high sample correlation with the duration column. This may harm convergence. This could be a form of 'complete separation'. \ See https://stats.idre.ucla.edu/other/mult-pkg/faq/general/faqwhat-is-complete-or-quasi-complete-separation-in-logisticprobit-regression-and-how-do-we-deal-with-them/ " % (col) warnings.warn(warning_text, ConvergenceWarning) From 75cdaa85dbe0903bc74c813cfeda5f9deb0b5bd3 Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Mon, 3 Sep 2018 21:05:37 -0400 Subject: [PATCH 06/59] fix #507 --- lifelines/utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py index 0477cfda7..09be09a8a 100644 --- a/lifelines/utils/__init__.py +++ b/lifelines/utils/__init__.py @@ -896,7 +896,7 @@ def _concordance_index(event_times, predicted_event_times, event_observed): times_to_compare = _BTree(np.unique(died_pred)) num_pairs = 0 num_correct = 0 - num_tied = 0 + num_tied = np.int64(0) def handle_pairs(truth, pred, first_ix): """ From cb10e01126e72894d4f7d34ab1be9852b348be5c Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Mon, 3 Sep 2018 21:39:40 -0400 Subject: [PATCH 07/59] this new newton-decrement convergence is very promising, and I don't see a decrease in coeffitient accuracy --- CHANGELOG.md | 3 +++ lifelines/fitters/cox_time_varying_fitter.py | 10 ++++++++-- lifelines/fitters/coxph_fitter.py | 13 +++++++++++-- tests/test_estimation.py | 2 +- 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3aec7afef..177cce062 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ ### Changelogs +#### 0.15.0 + - New criteria for convergence of `CoxPHFitter` and `CoxTimeVaryingFitter` called the Newton-decrement. Tests show it is as accurate (w.r.t to previous coefficients) and typically shaves off a single step, resulting in generally faster convergence. + #### 0.14.6 - fix for n > 2 groups in `multivariate_logrank_test` (again). - fix bug for when `event_observed` column was not boolean. diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py index 611959a93..3beebdd5d 100644 --- a/lifelines/fitters/cox_time_varying_fitter.py +++ b/lifelines/fitters/cox_time_varying_fitter.py @@ -186,7 +186,10 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size g -= self.penalizer * beta.T h.flat[::d + 1] -= self.penalizer - delta = solve(-h, step_size * g.T) + # reusing a piece to make g * inv(h) * g.T faster later + inv_h_dot_g_T = spsolve(-h, g.T, sym_pos=True) + delta = step_size * inv_h_dot_g_T + if np.any(np.isnan(delta)): raise ValueError("""delta contains nan value(s). Convergence halted. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-model @@ -194,15 +197,18 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size # Save these as pending result hessian, gradient = h, g norm_delta = norm(delta) + newton_decrement = g.dot(inv_h_dot_g_T)/2 if show_progress: - print("Iteration %d: norm_delta = %.6f, step_size = %.3f, ll = %.6f, seconds_since_start = %.1f" % (i, norm_delta, step_size, ll, time.time() - start)) + print("Iteration %d: norm_delta = %.5f, step_size = %.5f, ll = %.5f, newton_decrement = %.5f, seconds_since_start = %.1f" % (i, norm_delta, step_size, ll, newton_decrement, time.time() - start)) # convergence criteria if norm_delta < precision: converging, completed = False, True elif abs(ll - previous_ll) < precision: converging, completed = False, True + if newton_decrement < precision: + converging, completed = False, True elif i >= max_steps: # 50 iterations steps with N-R is a lot. # Expected convergence is ~10 steps diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py index 2e65fa99a..17977acfa 100644 --- a/lifelines/fitters/coxph_fitter.py +++ b/lifelines/fitters/coxph_fitter.py @@ -234,7 +234,10 @@ def _newton_rhaphson(self, X, T, E, weights=None, initial_beta=None, step_size=N g -= self.penalizer * beta.T h.flat[::d + 1] -= self.penalizer - delta = spsolve(-h, step_size * g.T, sym_pos=True) + # reusing a piece to make g * inv(h) * g.T faster later + inv_h_dot_g_T = spsolve(-h, g.T, sym_pos=True) + delta = step_size * inv_h_dot_g_T + if np.any(np.isnan(delta)): raise ValueError("""delta contains nan value(s). Convergence halted. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-model @@ -244,11 +247,17 @@ def _newton_rhaphson(self, X, T, E, weights=None, initial_beta=None, step_size=N hessian, gradient = h, g norm_delta = norm(delta) + # reusing an above piece to make g * inv(h) * g.T faster. + newton_decrement = g.dot(inv_h_dot_g_T)/2 + if show_progress: - print("Iteration %d: norm_delta = %.5f, step_size = %.5f, ll = %.5f, seconds_since_start = %.1f" % (i, norm_delta, step_size, ll, time.time() - start)) + print("Iteration %d: norm_delta = %.5f, step_size = %.5f, ll = %.5f, newton_decrement = %.5f, seconds_since_start = %.1f" % (i, norm_delta, step_size, ll, newton_decrement, time.time() - start)) + # convergence criteria if norm_delta < precision: converging, completed = False, True + if newton_decrement < precision: + converging, completed = False, True elif abs(ll - previous_ll) < precision: converging, completed = False, True elif i >= max_steps: diff --git a/tests/test_estimation.py b/tests/test_estimation.py index 52df6b4c0..866409d27 100644 --- a/tests/test_estimation.py +++ b/tests/test_estimation.py @@ -997,7 +997,7 @@ def test_coef_output_against_R_super_accurate(self, rossi): """ expected = np.array([[-0.3794, -0.0574, 0.3139, -0.1498, -0.4337, -0.0849, 0.0915]]) cf = CoxPHFitter() - cf.fit(rossi, duration_col='week', event_col='arrest') + cf.fit(rossi, duration_col='week', event_col='arrest', show_progress=True) npt.assert_array_almost_equal(cf.hazards_.values, expected, decimal=4) def test_coef_output_against_R_using_non_trivial_weights(self, rossi): From d3e407b635c4d84a7379612ec4c0e9c978668a38 Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Mon, 3 Sep 2018 21:43:33 -0400 Subject: [PATCH 08/59] this new newton-decrement convergence is very promising, and I don't see a decrease in coeffitient accuracy --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 177cce062..269b4296d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ ### Changelogs #### 0.15.0 - - New criteria for convergence of `CoxPHFitter` and `CoxTimeVaryingFitter` called the Newton-decrement. Tests show it is as accurate (w.r.t to previous coefficients) and typically shaves off a single step, resulting in generally faster convergence. + - New criteria for convergence of `CoxPHFitter` and `CoxTimeVaryingFitter` called the Newton-decrement. Tests show it is as accurate (w.r.t to previous coefficients) and typically shaves off a single step, resulting in generally faster convergence. See https://www.cs.cmu.edu/~pradeepr/convexopt/Lecture_Slides/Newton_methods.pdf. Details about the Newton-decrement are added to the `show_progress` statements. #### 0.14.6 - fix for n > 2 groups in `multivariate_logrank_test` (again). From be8ecf5fd54a4b4fe4f9aa6ef43155877f59f855 Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Mon, 10 Sep 2018 15:06:07 -0400 Subject: [PATCH 09/59] lost again --- lifelines/fitters/cox_time_varying_fitter.py | 8 +++++--- lifelines/fitters/coxph_fitter.py | 9 ++++++--- lifelines/utils/__init__.py | 4 ++-- tests/test_estimation.py | 11 ++++++++--- 4 files changed, 21 insertions(+), 11 deletions(-) diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py index b291f0608..9b2c9acbb 100644 --- a/lifelines/fitters/cox_time_varying_fitter.py +++ b/lifelines/fitters/cox_time_varying_fitter.py @@ -10,6 +10,7 @@ from numpy import dot, exp from numpy.linalg import solve, norm, inv +from scipy.linalg import solve as spsolve from lifelines.fitters import BaseFitter from lifelines.fitters.coxph_fitter import CoxPHFitter from lifelines.statistics import chisq_test @@ -157,7 +158,7 @@ def _compute_sandwich_estimator(self, df, stop_times_events): return sandwich_estimator def _compute_standard_errors(self, df, stop_times_events): - if self.robust: + if self.robust: # TODO se = np.sqrt(self._compute_sandwich_estimator(df, stop_times_events).diagonal()) / self._norm_std else: se = np.sqrt(-inv(self._hessian_).diagonal()) / self._norm_std @@ -261,9 +262,10 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size # convergence criteria if norm_delta < precision: converging, completed = False, True - elif abs(ll - previous_ll) < precision: + elif abs(ll - previous_ll) / (-previous_ll) < 1e-09: + # this is what R uses by default converging, completed = False, True - if newton_decrement < precision: + elif newton_decrement < 10e-8: converging, completed = False, True elif i >= max_steps: # 50 iterations steps with N-R is a lot. diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py index cce7998bc..4a526ef1a 100644 --- a/lifelines/fitters/coxph_fitter.py +++ b/lifelines/fitters/coxph_fitter.py @@ -260,9 +260,10 @@ def _newton_rhaphson(self, X, T, E, weights=None, initial_beta=None, step_size=N # convergence criteria if norm_delta < precision: converging, completed = False, True - if newton_decrement < precision: + elif previous_ll != 0 and abs(ll - previous_ll) / (-previous_ll) < 1e-09: + # this is what R uses by default converging, completed = False, True - elif abs(ll - previous_ll) < precision: + elif newton_decrement < precision: converging, completed = False, True elif i >= max_steps: # 50 iterations steps with N-R is a lot. @@ -428,6 +429,8 @@ def _compute_confidence_intervals(self): columns=self.hazards_.columns) def _compute_sandwich_estimator(self, X, T, E, weights): + # https://www.stat.tamu.edu/~carroll/ftp/gk001.pdf + # lin1989 n, d = X.shape @@ -472,7 +475,7 @@ def _compute_sandwich_estimator(self, X, T, E, weights): score = -sum(E[j] * phi_i / risk_phi_history[j] * (xi - risk_phi_x_history[j] / risk_phi_history[j]) for j in range(0, i+1)) score = score + E[i] * (xi - risk_phi_x_history[i] / risk_phi_history[i]) - score_covariance += (score.T).dot(score) + score_covariance += w * (score.T).dot(score) # TODO: need a faster way to invert these matrices sandwich_estimator = inv(self._hessian_).dot(score_covariance).dot(inv(self._hessian_)) diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py index 09be09a8a..d0658b939 100644 --- a/lifelines/utils/__init__.py +++ b/lifelines/utils/__init__.py @@ -68,9 +68,9 @@ def qth_survival_times(q, survival_functions, cdf=False): # Typically, one would expect that the output should equal the "height" of q. # An issue can arise if the Series q contains duplicate values. We handle this un-eligantly. if q.duplicated().any(): - return pd.DataFrame.from_items([ + return pd.DataFrame.from_dict(dict([ (_q, survival_functions.apply(lambda s: qth_survival_time(_q, s))) for i, _q in enumerate(q) - ], orient='index', columns=survival_functions.columns) + ]), orient='index', columns=survival_functions.columns) else: return pd.DataFrame({_q: survival_functions.apply(lambda s: qth_survival_time(_q, s)) for _q in q}).T diff --git a/tests/test_estimation.py b/tests/test_estimation.py index ab3267f5e..46b92c285 100644 --- a/tests/test_estimation.py +++ b/tests/test_estimation.py @@ -1039,10 +1039,15 @@ def test_robust_errors_with_trival_weights_is_the_different_than_R(self, regress expected = pd.Series({'var1': 7.680, 'var2': -0.915}) assert_series_equal(cph.hazards_.T['coef'], expected, check_less_precise=2, check_names=False) + variance_matrix = -np.linalg.inv(cph._hessian_) / np.outer(cph._norm_std, cph._norm_std) expected_cov = np.array([[33.079106, -5.964652], [-5.964652, 2.040642]]) npt.assert_array_almost_equal( - w * cph.variance_matrix_, expected_cov, - decimal=2) + w * variance_matrix_, expected_cov, + decimal=1) + + expected = pd.Series({'var1': 2.931, 'var2': 1.117}) + assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False) + def test_robust_errors_with_less_trival_weights_is_the_same_as_R(self, regression_dataset): """ @@ -1077,7 +1082,7 @@ def test_robust_errors_with_less_trival_weights_is_the_same_as_R(self, regressio expected_cov = np.array([[44.758444, -8.781867], [-8.781867, 2.606589]]) npt.assert_array_almost_equal( variance_matrix, expected_cov, - decimal=2) + decimal=1) # not as precise because matrix inversion will accumulate estimation errors. expected = pd.Series({'var1': 2.931, 'var2': 1.117}) assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False) From 1c905582448be4efede37dd7391de03250e0a57d Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Thu, 20 Sep 2018 21:20:58 -0400 Subject: [PATCH 10/59] great! I've gotten weights + robust errors to work - took way to long. Next is to test for censorship can be included --- lifelines/fitters/coxph_fitter.py | 32 +++++++++-------- tests/test_estimation.py | 58 +++++++++++++++++++++++-------- 2 files changed, 62 insertions(+), 28 deletions(-) diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py index 4a526ef1a..f216aa9ad 100644 --- a/lifelines/fitters/coxph_fitter.py +++ b/lifelines/fitters/coxph_fitter.py @@ -431,34 +431,37 @@ def _compute_confidence_intervals(self): def _compute_sandwich_estimator(self, X, T, E, weights): # https://www.stat.tamu.edu/~carroll/ftp/gk001.pdf # lin1989 + # https://www.ics.uci.edu/~dgillen/STAT255/Handouts/lecture10.pdf n, d = X.shape # Init risk and tie sums to zero risk_phi = 0 risk_phi_x = np.zeros((1, d)) + running_weight_sum = 0 # need to store these histories, as we access them often risk_phi_history = np.zeros((n,)) risk_phi_x_history = np.zeros((n, d)) - score_covariance = np.zeros((d, d)) # we already unnormalized the betas in `fit`, so we need normalize them again since X is # normalized. beta = self.hazards_.values[0] * self._norm_std - weight_count = 0.0 + + score_residuals = np.zeros((n, d)) # Iterate backwards to utilize recursive relationship for i in range(n - 1, -1, -1): # Doing it like this to preserve shape xi = X[i:i + 1] + w = weights[i] phi_i = w * exp(dot(xi, beta)) phi_x_i = phi_i * xi - risk_phi += phi_i + risk_phi += phi_i risk_phi_x += phi_x_i risk_phi_history[i] = risk_phi # denom @@ -466,28 +469,29 @@ def _compute_sandwich_estimator(self, X, T, E, weights): # Iterate forwards for i in range(0, n): - # Doing it like this to preserve shape # doesn't handle ties. xi = X[i:i + 1] - w = weights[i] - phi_i = w * exp(dot(xi, beta)) - - score = -sum(E[j] * phi_i / risk_phi_history[j] * (xi - risk_phi_x_history[j] / risk_phi_history[j]) for j in range(0, i+1)) + phi_i = exp(dot(xi, beta)) + score = -sum( + E[j] * (phi_i * weights[j]) / risk_phi_history[j] * (xi - risk_phi_x_history[j] / risk_phi_history[j]) for j in range(0, i+1) + ) score = score + E[i] * (xi - risk_phi_x_history[i] / risk_phi_history[i]) - score_covariance += w * (score.T).dot(score) + score *= weights[i] + score_residuals[i, :] = score + - # TODO: need a faster way to invert these matrices - sandwich_estimator = inv(self._hessian_).dot(score_covariance).dot(inv(self._hessian_)) + naive_var = inv(self._hessian_) / self._norm_std.values + delta_betas = score_residuals.dot(naive_var) + sandwich_estimator = delta_betas.T.dot(delta_betas) return sandwich_estimator def _compute_standard_errors(self, df, T, E, weights): + self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std) if self.robust: - se = np.sqrt(self._compute_sandwich_estimator(df.values, T.values, E.values, weights).diagonal()) / self._norm_std - #self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std) + se = np.sqrt(self._compute_sandwich_estimator(df.values, T.values, E.values, weights.values).diagonal()) # / self._norm_std else: - self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std) se = np.sqrt(self.variance_matrix_.diagonal()) return pd.DataFrame(se[None, :], index=['se'], columns=self.hazards_.columns) diff --git a/tests/test_estimation.py b/tests/test_estimation.py index 46b92c285..f6c79a900 100644 --- a/tests/test_estimation.py +++ b/tests/test_estimation.py @@ -1011,7 +1011,7 @@ def test_coef_output_against_R_using_non_trivial_but_integer_weights(self, rossi cf.fit(rossi_, duration_col='week', event_col='arrest', weights_col='weights') npt.assert_array_almost_equal(cf.hazards_.values, expected, decimal=4) - def test_robust_errors_with_trival_weights_is_the_different_than_R(self, regression_dataset): + def test_robust_errors_with_trival_weights_is_the_same_than_R(self, regression_dataset): """ df <- data.frame( "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294), @@ -1039,13 +1039,12 @@ def test_robust_errors_with_trival_weights_is_the_different_than_R(self, regress expected = pd.Series({'var1': 7.680, 'var2': -0.915}) assert_series_equal(cph.hazards_.T['coef'], expected, check_less_precise=2, check_names=False) - variance_matrix = -np.linalg.inv(cph._hessian_) / np.outer(cph._norm_std, cph._norm_std) expected_cov = np.array([[33.079106, -5.964652], [-5.964652, 2.040642]]) npt.assert_array_almost_equal( - w * variance_matrix_, expected_cov, + w * cph.variance_matrix_, expected_cov, decimal=1) - expected = pd.Series({'var1': 2.931, 'var2': 1.117}) + expected = pd.Series({'var1': 2.097, 'var2': 0.827}) assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False) @@ -1054,39 +1053,70 @@ def test_robust_errors_with_less_trival_weights_is_the_same_as_R(self, regressio df <- data.frame( "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294), "var2" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092), - "T" = c( 7.335846, 5.269797, 11.684092, 12.678458, 6.601666) + "T" = c(1, 2, 3, 4, 5) ) df['E'] = 1 - df['var3'] = 0.75 - df[1, 'var3'] = 1.75 + df['var3'] = 2 + df[4, 'var3'] = 1 r = coxph(formula=Surv(T, E) ~ var1 + var2, data=df, weights=var3, robust=TRUE) r$var r$naive.var + residuals(r, type='dfbeta') """ df = pd.DataFrame({ "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294], "var2": [0.184677, 0.071893, 1.364646, 0.098375, 1.663092], - "T": [7.335846, 5.269797, 11.684092, 12.678458, 6.601666], - 'var3': [1.75, 0.75, 0.75, 0.75, 0.75] + "T": [1, 2, 3, 4, 5], + 'var3': [2, 2, 2, 1, 2] }) df['E'] = 1 cph = CoxPHFitter() cph.fit(df, 'T', 'E', robust=True, weights_col='var3', show_progress=True) - expected = pd.Series({'var1': 7.995, 'var2': -1.154}) + expected = pd.Series({'var1': 1.431, 'var2': -1.277}) assert_series_equal(cph.hazards_.T['coef'], expected, check_less_precise=2, check_names=False) - variance_matrix = -np.linalg.inv(cph._hessian_) / np.outer(cph._norm_std, cph._norm_std) - expected_cov = np.array([[44.758444, -8.781867], [-8.781867, 2.606589]]) + expected_cov = np.array([[3.5439245, -0.3549099], [-0.3549099, 0.4499553]]) npt.assert_array_almost_equal( - variance_matrix, expected_cov, + cph.variance_matrix_, expected_cov, decimal=1) # not as precise because matrix inversion will accumulate estimation errors. - expected = pd.Series({'var1': 2.931, 'var2': 1.117}) + expected = pd.Series({'var1': 2.094, 'var2': 0.452}) assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False) + def test_robust_errors_with_non_trivial_weights_is_the_same_as_R(self, regression_dataset): + """ + df <- data.frame( + "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294), + "var2" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092), + "var3" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092), + "T" = c( 7.335846, 5.269797, 11.684092, 12.678458, 6.601666) + ) + df['E'] = 1 + r = coxph(formula=Surv(T, E) ~ var1 + var2, data=df, weights=var3, robust=TRUE) + r$var + r$naive.var + """ + + df = pd.DataFrame({ + "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294], + "var2": [0.184677, 0.071893, 1.364646, 0.098375, 1.663092], + 'var3': [0.184677, 0.071893, 1.364646, 0.098375, 1.663092], + "T": [7.335846, 5.269797, 11.684092, 12.678458, 6.601666], + }) + df['E'] = 1 + + cph = CoxPHFitter() + cph.fit(df, 'T', 'E', robust=True, weights_col='var3', show_progress=True) + expected = pd.Series({'var1': -5.16231, 'var2': 1.71924}) + assert_series_equal(cph.hazards_.T['coef'], expected, check_less_precise=1, check_names=False) + + expected = pd.Series({'var1': 9.97730, 'var2': 2.45648}) + assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False) + + def test_robust_errors_is_the_same_as_R(self, regression_dataset): """ From 25b9c7a00d1e34b3cb0241032f68f947128d00d8 Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Fri, 21 Sep 2018 21:20:43 -0400 Subject: [PATCH 11/59] more performant robust --- CHANGELOG.md | 2 +- lifelines/fitters/coxph_fitter.py | 58 +++++++++++-------------------- tests/test_estimation.py | 32 ++++++++++++++++- 3 files changed, 53 insertions(+), 39 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6f5b7627b..6fac768ae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ ### Changelogs #### 0.15.0 - - adding `robust` params to Cox models' `fit`. This enables atleast i) using non-integer weights in the model (these could be sampling weights like IPTW), and ii) misspecified models (ex: non-propotional hazards). Under the hood it's a sandwich estimator. This does not handle ties, so if there are high number of ties, results may significantly differ from other software. + - adding `robust` params to Cox models' `fit`. This enables atleast i) using non-integer weights in the model (these could be sampling weights like IPTW), and ii) mis-specified models (ex: non-proportional hazards). Under the hood it's a sandwich estimator. This does not handle ties, so if there are high number of ties, results may significantly differ from other software. - `standard_errors_` is now a property on fitted Cox models. - `variance_matrix_` is now a property on fitted `CoxPHFitter` which describes the variance matrix of the coefficients. - new criteria for convergence of `CoxPHFitter` and `CoxTimeVaryingFitter` called the Newton-decrement. Tests show it is as accurate (w.r.t to previous coefficients) and typically shaves off a single step, resulting in generally faster convergence. See https://www.cs.cmu.edu/~pradeepr/convexopt/Lecture_Slides/Newton_methods.pdf. Details about the Newton-decrement are added to the `show_progress` statements. diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py index f216aa9ad..c12782911 100644 --- a/lifelines/fitters/coxph_fitter.py +++ b/lifelines/fitters/coxph_fitter.py @@ -155,6 +155,7 @@ def fit(self, df, duration_col, event_col=None, self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns, index=['coef']) / self._norm_std + self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std) self.standard_errors_ = self._compute_standard_errors(normalize(df, self._norm_mean, self._norm_std), T, E, weights) self.confidence_intervals_ = self._compute_confidence_intervals() @@ -432,63 +433,46 @@ def _compute_sandwich_estimator(self, X, T, E, weights): # https://www.stat.tamu.edu/~carroll/ftp/gk001.pdf # lin1989 # https://www.ics.uci.edu/~dgillen/STAT255/Handouts/lecture10.pdf + # doesn't handle ties. n, d = X.shape - # Init risk and tie sums to zero - risk_phi = 0 - risk_phi_x = np.zeros((1, d)) - running_weight_sum = 0 - - # need to store these histories, as we access them often - risk_phi_history = np.zeros((n,)) - risk_phi_x_history = np.zeros((n, d)) - - # we already unnormalized the betas in `fit`, so we need normalize them again since X is # normalized. beta = self.hazards_.values[0] * self._norm_std + E = E.astype(int) score_residuals = np.zeros((n, d)) - # Iterate backwards to utilize recursive relationship - for i in range(n - 1, -1, -1): - # Doing it like this to preserve shape - xi = X[i:i + 1] - - w = weights[i] - - phi_i = w * exp(dot(xi, beta)) - phi_x_i = phi_i * xi - - risk_phi += phi_i - risk_phi_x += phi_x_i + phi_s = exp(dot(X, beta)) - risk_phi_history[i] = risk_phi # denom - risk_phi_x_history[i] = risk_phi_x # a[i] + # need to store these histories, as we access them often + # this is a reverse cumulative sum. See original code in https://github.com/CamDavidsonPilon/lifelines/pull/496/files#diff-81ee0759dbae0770e1a02cf17f4cfbb1R431 + risk_phi_x_history = (X * (weights * phi_s)[:, None])[::-1].cumsum(0)[::-1] + risk_phi_history = (weights * phi_s) [::-1].cumsum() [::-1][:, None] # Iterate forwards for i in range(0, n): - # doesn't handle ties. + xi = X[i:i + 1] - phi_i = exp(dot(xi, beta)) + phi_i = phi_s[i] - score = -sum( - E[j] * (phi_i * weights[j]) / risk_phi_history[j] * (xi - risk_phi_x_history[j] / risk_phi_history[j]) for j in range(0, i+1) - ) - score = score + E[i] * (xi - risk_phi_x_history[i] / risk_phi_history[i]) - score *= weights[i] - score_residuals[i, :] = score + score = - phi_i * ( + (E[:i+1] * weights[:i+1] / risk_phi_history[:i+1].T).T # this is constant-ish, and could be cached + * (xi - risk_phi_x_history[:i+1] / risk_phi_history[:i+1]) + ).sum(0) + + if E[i]: + score = score + (xi - risk_phi_x_history[i] / risk_phi_history[i]) + score_residuals[i, :] = score - naive_var = inv(self._hessian_) / self._norm_std.values - delta_betas = score_residuals.dot(naive_var) - sandwich_estimator = delta_betas.T.dot(delta_betas) + naive_var = inv(self._hessian_) + delta_betas = score_residuals.dot(naive_var) * weights[:, None] + sandwich_estimator = delta_betas.T.dot(delta_betas) / np.outer(self._norm_std, self._norm_std) return sandwich_estimator def _compute_standard_errors(self, df, T, E, weights): - - self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std) if self.robust: se = np.sqrt(self._compute_sandwich_estimator(df.values, T.values, E.values, weights.values).diagonal()) # / self._norm_std else: diff --git a/tests/test_estimation.py b/tests/test_estimation.py index f6c79a900..6148f2f22 100644 --- a/tests/test_estimation.py +++ b/tests/test_estimation.py @@ -1011,7 +1011,7 @@ def test_coef_output_against_R_using_non_trivial_but_integer_weights(self, rossi cf.fit(rossi_, duration_col='week', event_col='arrest', weights_col='weights') npt.assert_array_almost_equal(cf.hazards_.values, expected, decimal=4) - def test_robust_errors_with_trival_weights_is_the_same_than_R(self, regression_dataset): + def test_robust_errors_with_trivial_weights_is_the_same_than_R(self, regression_dataset): """ df <- data.frame( "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294), @@ -1117,6 +1117,36 @@ def test_robust_errors_with_non_trivial_weights_is_the_same_as_R(self, regressio assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False) + def test_robust_errors_with_non_trivial_weights_with_censorship_is_the_same_as_R(self, regression_dataset): + """ + df <- data.frame( + "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294), + "var2" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092), + "var3" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092), + "T" = c( 7.335846, 5.269797, 11.684092, 12.678458, 6.601666), + "E" = c(1, 1, 0, 1, 1) + ) + r = coxph(formula=Surv(T, E) ~ var1 + var2, data=df, weights=var3, robust=TRUE) + r$var + r$naive.var + """ + + df = pd.DataFrame({ + "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294], + "var2": [0.184677, 0.071893, 1.364646, 0.098375, 1.663092], + 'var3': [0.184677, 0.071893, 1.364646, 0.098375, 1.663092], + "T": [7.335846, 5.269797, 11.684092, 12.678458, 6.601666], + "E": [1, 1, 0, 1, 1], + }) + + cph = CoxPHFitter() + cph.fit(df, 'T', 'E', robust=True, weights_col='var3', show_progress=True) + expected = pd.Series({'var1': -8.360533, 'var2': 1.781126}) + assert_series_equal(cph.hazards_.T['coef'], expected, check_less_precise=3, check_names=False) + + expected = pd.Series({'var1': 12.303338, 'var2': 2.395670}) + assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=3, check_names=False) + def test_robust_errors_is_the_same_as_R(self, regression_dataset): """ From d4b64e7cecea3bfe8605c5416fd159cddf210bf5 Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Thu, 4 Oct 2018 19:02:02 -0400 Subject: [PATCH 12/59] need a test for this, but fix overflow in concordance index --- lifelines/utils/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py index d0658b939..06e746d77 100644 --- a/lifelines/utils/__init__.py +++ b/lifelines/utils/__init__.py @@ -894,8 +894,8 @@ def _concordance_index(event_times, predicted_event_times, event_observed): censored_ix = 0 died_ix = 0 times_to_compare = _BTree(np.unique(died_pred)) - num_pairs = 0 - num_correct = 0 + num_pairs = np.int64(0) + num_correct = np.int64(0) num_tied = np.int64(0) def handle_pairs(truth, pred, first_ix): @@ -912,8 +912,8 @@ def handle_pairs(truth, pred, first_ix): while next_ix < len(truth) and truth[next_ix] == truth[first_ix]: next_ix += 1 pairs = len(times_to_compare) * (next_ix - first_ix) - correct = 0 - tied = 0 + correct = np.int64(0) + tied = np.int64(0) for i in range(first_ix, next_ix): rank, count = times_to_compare.rank(pred[i]) correct += rank From b3fe81974914cd8cf0649245bd89edec47179e51 Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Thu, 11 Oct 2018 09:08:16 -0400 Subject: [PATCH 13/59] use new CovergenceError, and make qth_survival_times much cleaner --- CHANGELOG.md | 2 ++ docs/Examples.rst | 2 ++ lifelines/fitters/cox_time_varying_fitter.py | 18 ++++++++++++++---- lifelines/fitters/coxph_fitter.py | 14 ++++++++++++-- lifelines/utils/__init__.py | 20 +++++++++++++++----- requirements.txt | 2 +- tests/utils/test_utils.py | 3 +++ 7 files changed, 49 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6fac768ae..505be2ef7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ - `standard_errors_` is now a property on fitted Cox models. - `variance_matrix_` is now a property on fitted `CoxPHFitter` which describes the variance matrix of the coefficients. - new criteria for convergence of `CoxPHFitter` and `CoxTimeVaryingFitter` called the Newton-decrement. Tests show it is as accurate (w.r.t to previous coefficients) and typically shaves off a single step, resulting in generally faster convergence. See https://www.cs.cmu.edu/~pradeepr/convexopt/Lecture_Slides/Newton_methods.pdf. Details about the Newton-decrement are added to the `show_progress` statements. + - Minimum suppport for scipy is 1.0 + - Convergence errors in models that use Newton-Rhapson methods now throw a `ConvergenceError`, instead of a `ValueError` (the former is a subclass of the latter, however). #### 0.14.6 - fix for n > 2 groups in `multivariate_logrank_test` (again). diff --git a/docs/Examples.rst b/docs/Examples.rst index 1080a6646..935b3de2f 100644 --- a/docs/Examples.rst +++ b/docs/Examples.rst @@ -573,6 +573,8 @@ Problems with convergence in the Cox Proportional Hazard Model ################################################################ Since the estimation of the coefficients in the Cox proportional hazard model is done using the Newton-Raphson algorithm, there is sometimes a problem with convergence. Here are some common symptoms and possible resolutions: + 0. First diagnostic: look for ``ConvergenceWarning`` in the output. Most often problems in convergence are the result of problems in the dataset. Lifelines has diagnostic checks it runs against the dataset before fitting and warnings are outputted to the user. + 1. ``delta contains nan value(s). Convergence halted.``: First try adding ``show_progress=True`` in the ``fit`` function. If the values in ``delta`` grow unboundedly, it's possible the ``step_size`` is too large. Try setting it to a small value (0.1-0.5). 2. ``LinAlgError: Singular matrix``: This means that there is a linear combination in your dataset. That is, a column is equal to the linear combination of 1 or more other columns. Try to find the relationship by looking at the correlation matrix of your dataset. diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py index 9b2c9acbb..2276e2f79 100644 --- a/lifelines/fitters/cox_time_varying_fitter.py +++ b/lifelines/fitters/cox_time_varying_fitter.py @@ -19,7 +19,7 @@ pass_for_numeric_dtypes_or_raise, check_low_var,\ check_for_overlapping_intervals, check_complete_separation_low_variance,\ ConvergenceWarning, StepSizer, _get_index, check_for_immediate_deaths,\ - check_for_instantaneous_events + check_for_instantaneous_events, ConvergenceError class CoxTimeVaryingFitter(BaseFitter): @@ -243,12 +243,22 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size g -= self.penalizer * beta.T h.flat[::d + 1] -= self.penalizer - # reusing a piece to make g * inv(h) * g.T faster later - inv_h_dot_g_T = spsolve(-h, g.T, sym_pos=True) + try: + # reusing a piece to make g * inv(h) * g.T faster later + inv_h_dot_g_T = spsolve(-h, g.T, sym_pos=True) + except ValueError as e: + if 'infs or NaNs' in e.message: + raise ConvergenceError("""hessian or gradient contains nan or inf value(s). Convergence halted. Please see the following tips in the lifelines documentation: +https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-model +""") + else: + # something else? + raise e + delta = step_size * inv_h_dot_g_T if np.any(np.isnan(delta)): - raise ValueError("""delta contains nan value(s). Convergence halted. Please see the following tips in the lifelines documentation: + raise ConvergenceError("""delta contains nan value(s). Convergence halted. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-model """) # Save these as pending result diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py index c12782911..b2a937ea0 100644 --- a/lifelines/fitters/coxph_fitter.py +++ b/lifelines/fitters/coxph_fitter.py @@ -240,11 +240,21 @@ def _newton_rhaphson(self, X, T, E, weights=None, initial_beta=None, step_size=N h.flat[::d + 1] -= self.penalizer # reusing a piece to make g * inv(h) * g.T faster later - inv_h_dot_g_T = spsolve(-h, g.T, sym_pos=True) + try: + inv_h_dot_g_T = spsolve(-h, g.T, sym_pos=True) + except ValueError as e: + if 'infs or NaNs' in e.message: + raise ConvergenceError("""hessian or gradient contains nan or inf value(s). Convergence halted. Please see the following tips in the lifelines documentation: +https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-model +""") + else: + # something else? + raise e + delta = step_size * inv_h_dot_g_T if np.any(np.isnan(delta)): - raise ValueError("""delta contains nan value(s). Convergence halted. Please see the following tips in the lifelines documentation: + raise ConvergenceError("""delta contains nan value(s). Convergence halted. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-model """) diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py index 06e746d77..91cfbefc7 100644 --- a/lifelines/utils/__init__.py +++ b/lifelines/utils/__init__.py @@ -34,6 +34,16 @@ def __str__(self): return repr(self.msg) +class ConvergenceError(ValueError): + # inherits from ValueError for backwards compatilibity reasons + + def __init__(self, msg): + self.msg = msg + + def __str__(self): + return repr(self.msg) + + class ConvergenceWarning(RuntimeWarning): def __init__(self, msg): @@ -65,14 +75,14 @@ def qth_survival_times(q, survival_functions, cdf=False): if survival_functions.shape[1] == 1 and q.shape == (1,): return survival_functions.apply(lambda s: qth_survival_time(q[0], s, cdf=cdf)).iloc[0] else: + survival_times = pd.DataFrame({_q: survival_functions.apply(lambda s: qth_survival_time(_q, s)) for _q in q}).T + # Typically, one would expect that the output should equal the "height" of q. # An issue can arise if the Series q contains duplicate values. We handle this un-eligantly. if q.duplicated().any(): - return pd.DataFrame.from_dict(dict([ - (_q, survival_functions.apply(lambda s: qth_survival_time(_q, s))) for i, _q in enumerate(q) - ]), orient='index', columns=survival_functions.columns) - else: - return pd.DataFrame({_q: survival_functions.apply(lambda s: qth_survival_time(_q, s)) for _q in q}).T + survival_times = survival_times.loc[q] + + return survival_times def qth_survival_time(q, survival_function, cdf=False): diff --git a/requirements.txt b/requirements.txt index b57710b75..d8a2807d6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ numpy -scipy +scipy>=1.0 pandas>=0.18 matplotlib>=2.0 diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py index a6b17202e..5e7a3372a 100644 --- a/tests/utils/test_utils.py +++ b/tests/utils/test_utils.py @@ -165,6 +165,9 @@ def test_qth_survival_times_with_duplicate_q_returns_valid_index_and_shape(): q = pd.Series([0.5, 0.5, 0.2, 0.0, 0.0]) actual = utils.qth_survival_times(q, sf) assert actual.shape[0] == len(q) + assert actual.index[0] == actual.index[1] + assert_series_equal(actual.iloc[0], actual.iloc[1]) + npt.assert_almost_equal(actual.index.values, q.values) From 2468bc29cfc2018103942b89dab6b26eb2791f18 Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Fri, 12 Oct 2018 12:33:20 -0400 Subject: [PATCH 14/59] close.... --- CHANGELOG.md | 1 + lifelines/fitters/aalen_additive_fitter.py | 7 +- lifelines/fitters/cox_time_varying_fitter.py | 90 ++++++++++++-------- lifelines/utils/__init__.py | 5 +- tests/test_estimation.py | 67 ++++++++++++++- 5 files changed, 127 insertions(+), 43 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 505be2ef7..171efbc6b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ - new criteria for convergence of `CoxPHFitter` and `CoxTimeVaryingFitter` called the Newton-decrement. Tests show it is as accurate (w.r.t to previous coefficients) and typically shaves off a single step, resulting in generally faster convergence. See https://www.cs.cmu.edu/~pradeepr/convexopt/Lecture_Slides/Newton_methods.pdf. Details about the Newton-decrement are added to the `show_progress` statements. - Minimum suppport for scipy is 1.0 - Convergence errors in models that use Newton-Rhapson methods now throw a `ConvergenceError`, instead of a `ValueError` (the former is a subclass of the latter, however). + - `AalenAdditiveModel` raises `ConvergenceWarning` instead of printing a warning. #### 0.14.6 - fix for n > 2 groups in `multivariate_logrank_test` (again). diff --git a/lifelines/fitters/aalen_additive_fitter.py b/lifelines/fitters/aalen_additive_fitter.py index 5f67ba05a..69b4e0389 100644 --- a/lifelines/fitters/aalen_additive_fitter.py +++ b/lifelines/fitters/aalen_additive_fitter.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- from __future__ import print_function +import warnings import numpy as np import pandas as pd @@ -9,7 +10,7 @@ from lifelines.fitters import BaseFitter from lifelines.utils import _get_index, inv_normal_cdf, epanechnikov_kernel, \ ridge_regression as lr, qth_survival_times, pass_for_numeric_dtypes_or_raise,\ - concordance_index, check_nans + concordance_index, check_nans, ConvergenceWarning from lifelines.utils.progress_bar import progress_bar from lifelines.plotting import fill_between_steps @@ -185,7 +186,7 @@ def _fit_static(self, dataframe, duration_col, event_col=None, try: v, V = lr(df.values, relevant_individuals, c1=self.coef_penalizer, c2=self.smoothing_penalizer, offset=previous_hazard) except LinAlgError: - print("Linear regression error. Try increasing the penalizer term.") + warnings.warn("Linear regression error. Try increasing the penalizer term.", ConvergenceWarning) hazards_.loc[time, id_] = v.T variance_.loc[time, id_] = V[:, relevant_individuals][:, 0] ** 2 @@ -278,7 +279,7 @@ def _fit_varying(self, dataframe, duration_col="T", event_col="E", try: v, V = lr(wp[time].values, relevant_individuals, c1=self.coef_penalizer, c2=self.smoothing_penalizer, offset=previous_hazard) except LinAlgError: - print("Linear regression error. Try increasing the penalizer term.") + warnings.warn("Linear regression error. Try increasing the penalizer term.", ConvergenceWarning) hazards_.loc[id, time] = v.T variance_.loc[id, time] = V[:, relevant_individuals][:, 0] ** 2 diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py index 2276e2f79..ea2002bff 100644 --- a/lifelines/fitters/cox_time_varying_fitter.py +++ b/lifelines/fitters/cox_time_varying_fitter.py @@ -38,7 +38,7 @@ def __init__(self, alpha=0.95, penalizer=0.0): self.alpha = alpha self.penalizer = penalizer - def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_progress=False, step_size=None, robust=False): + def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', weights_col=None, show_progress=False, step_size=None, robust=False): """ Fit the Cox Propertional Hazard model to a time varying dataset. Tied survival times are handled using Efron's tie-method. @@ -54,6 +54,7 @@ def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_pr observation. If left as None, assume all individuals are non-censored. start_col: the column that contains the start of a subject's time period. stop_col: the column that contains the end of a subject's time period. + weights_col: the column that contains (possibly time-varying) weight of each subject-period row. show_progress: since the fitter is iterative, show convergence diagnostics. step_size: set an initial step size for the fitting algorithm. @@ -67,28 +68,36 @@ def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_pr """ + self.robust = robust + df = df.copy() if not (id_col in df and event_col in df and start_col in df and stop_col in df): raise KeyError("A column specified in the call to `fit` does not exist in the dataframe provided.") - df = df.rename(columns={id_col: 'id', event_col: 'event', start_col: 'start', stop_col: 'stop'}) + if weights_col is None: + assert '__weights' not in df.columns, '__weights is an internal lifelines column, please rename your column first.' + df['__weights'] = 1.0 + + df = df.rename(columns={id_col: 'id', event_col: 'event', start_col: 'start', stop_col: 'stop', weights_col: '__weights'}) df = df.set_index('id') stop_times_events = df[["event", "stop", "start"]].copy() - df = df.drop(["event", "stop", "start"], axis=1) + weights = df[['__weights']].copy().astype(float) + df = df.drop(["event", "stop", "start", "__weights"], axis=1) stop_times_events['event'] = stop_times_events['event'].astype(bool) + self._check_values(df, stop_times_events) df = df.astype(float) self._norm_mean = df.mean(0) self._norm_std = df.std(0) - hazards_ = self._newton_rhaphson(normalize(df, self._norm_mean, self._norm_std), stop_times_events, show_progress=show_progress, + hazards_ = self._newton_rhaphson(normalize(df, self._norm_mean, self._norm_std), stop_times_events, weights, show_progress=show_progress, step_size=step_size) self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns, index=['coef']) / self._norm_std - self.standard_errors_ = self._compute_standard_errors(normalize(df, self._norm_mean, self._norm_std), stop_times_events) + self.standard_errors_ = self._compute_standard_errors(normalize(df, self._norm_mean, self._norm_std), stop_times_events, weights) self.confidence_intervals_ = self._compute_confidence_intervals() self.baseline_cumulative_hazard_ = self._compute_cumulative_baseline_hazard(df, stop_times_events) self.baseline_survival_ = self._compute_baseline_survival() @@ -97,7 +106,6 @@ def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_pr self._n_examples = df.shape[0] self._n_unique = df.index.unique().shape[0] - return self @staticmethod @@ -121,8 +129,8 @@ def _compute_sandwich_estimator(self, df, stop_times_events): risk_phi_history = pd.DataFrame(np.zeros((n,)), index=df.index) risk_phi_x_history = pd.DataFrame(np.zeros((n, d)), index=df.index) - score_covariance = np.zeros((d, d)) - + E = E.astype(int) + score_residuals = np.zeros((n, d)) # we already unnormalized the betas in `fit`, so we need normalize them again since X is # normalized. beta = self.hazards_.values[0] * self._norm_std @@ -148,23 +156,26 @@ def _compute_sandwich_estimator(self, df, stop_times_events): xi = X[i:i + 1] phi_i = exp(dot(xi, beta)) - correction_term = sum(E[j] * phi_i / risk_phi_history[j] * (xi - risk_phi_x_history[j] / risk_phi_history[j]) for j in range(0, i+1)) - - score = E[i] * (xi - risk_phi_x_history[i] / risk_phi_history[i]) - correction_term - score_covariance += (score.T).dot(score) + score = -sum(E[j] * weights[j] * phi_i / risk_phi_history[j] * (xi - risk_phi_x_history[j] / risk_phi_history[j]) for j in range(0, i+1)) + score = score + E[i] * (xi - risk_phi_x_history[i] / risk_phi_history[i]) + score *= weights[i] + score_residuals[i, :] = score - # TODO: need a faster way to invert these matrices - sandwich_estimator = inv(self._hessian_).dot(score_covariance).dot(inv(self._hessian_)) + naive_var = inv(self._hessian_) + delta_betas = score_residuals.dot(naive_var) * weights[:, None] + sandwich_estimator = delta_betas.T.dot(delta_betas) / np.outer(self._norm_std, self._norm_std) return sandwich_estimator - def _compute_standard_errors(self, df, stop_times_events): - if self.robust: # TODO - se = np.sqrt(self._compute_sandwich_estimator(df, stop_times_events).diagonal()) / self._norm_std + + def _compute_standard_errors(self, df, stop_times_events, weights): + if self.robust: + se = np.sqrt(self._compute_sandwich_estimator(df.values, T.values, E.values, weights.values).diagonal()) # / self._norm_std else: se = np.sqrt(-inv(self._hessian_).diagonal()) / self._norm_std return pd.DataFrame(se[None, :], index=['se'], columns=self.hazards_.columns) + def _compute_z_values(self): return (self.hazards_.loc['coef'] / self.standard_errors_.loc['se']) @@ -201,7 +212,7 @@ def summary(self): df['upper %.2f' % self.alpha] = self.confidence_intervals_.loc['upper-bound'].values return df - def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size=None, precision=10e-6, + def _newton_rhaphson(self, df, stop_times_events, weights, show_progress=False, step_size=None, precision=10e-6, max_steps=50): """ Newton Rhaphson algorithm for fitting CPH model. @@ -236,7 +247,7 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size while converging: i += 1 - h, g, ll = self._get_gradients(df, stop_times_events, beta) + h, g, ll = self._get_gradients(df, stop_times_events, weights, beta) if self.penalizer > 0: # add the gradient and hessian of the l2 term @@ -272,7 +283,7 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size # convergence criteria if norm_delta < precision: converging, completed = False, True - elif abs(ll - previous_ll) / (-previous_ll) < 1e-09: + elif previous_ll > 0 and abs(ll - previous_ll) / (-previous_ll) < 1e-09: # this is what R uses by default converging, completed = False, True elif newton_decrement < 10e-8: @@ -303,7 +314,7 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size return beta - def _get_gradients(self, df, stops_events, beta): + def _get_gradients(self, df, stops_events, weights, beta): """ Calculates the first and second order vector differentials, with respect to beta. @@ -324,9 +335,10 @@ def _get_gradients(self, df, stops_events, beta): ix = (stops_events['start'] < t) & (t <= stops_events['stop']) df_at_t = df.loc[ix] + weights_at_t = weights.loc[ix] stops_events_at_t = stops_events.loc[ix] - phi_i = exp(dot(df_at_t, beta)) + phi_i = weights_at_t.values * exp(dot(df_at_t, beta)) phi_x_i = phi_i * df_at_t phi_x_x_i = dot(df_at_t.T, phi_x_i) @@ -337,43 +349,47 @@ def _get_gradients(self, df, stops_events, beta): # Calculate the sums of Tie set deaths = stops_events_at_t['event'] & (stops_events_at_t['stop'] == t) - death_counts = deaths.sum() # should always be atleast 1. + + ties_counts = deaths.sum() # should always at least 1 xi_deaths = df_at_t.loc[deaths] + weights_deaths = weights_at_t.loc[deaths].values - x_death_sum = xi_deaths.sum(0).values + x_death_sum = (weights_deaths * xi_deaths).sum(0).values - if death_counts > 1: + if ties_counts > 1: # it's faster if we can skip computing these when we don't need to. tie_phi = phi_i[deaths.values].sum() tie_phi_x = phi_x_i.loc[deaths].sum(0).values tie_phi_x_x = dot(xi_deaths.T, phi_i[deaths.values] * xi_deaths) partial_gradient = np.zeros(d) + weight_count = weights_deaths.sum() + weighted_average = weight_count / ties_counts + - for l in range(death_counts): + for l in range(ties_counts): - if death_counts > 1: - c = l / death_counts - denom = (risk_phi - c * tie_phi) - z = (risk_phi_x - c * tie_phi_x) + if ties_counts > 1: + denom = (risk_phi - l * tie_phi / ties_counts) + numer = (risk_phi_x - l * tie_phi_x / ties_counts) # Hessian - a1 = (risk_phi_x_x - c * tie_phi_x_x) / denom + a1 = (risk_phi_x_x - l * tie_phi_x_x / ties_counts) / denom else: denom = risk_phi - z = risk_phi_x + numer = risk_phi_x # Hessian a1 = risk_phi_x_x / denom # Gradient - partial_gradient += z / denom - # In case z and denom both are really small numbers, + partial_gradient += weighted_average * numer / denom + # In case numer and denom both are really small numbers, # make sure to do division before multiplications - a2 = np.outer(z / denom, z / denom) + a2 = np.outer(numer / denom, numer / denom) - hessian -= (a1 - a2) + hessian -= weighted_average * (a1 - a2) + log_lik -= weighted_average * np.log(denom) - log_lik -= np.log(denom) # Values outside tie sum gradient += x_death_sum - partial_gradient diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py index 91cfbefc7..07443e169 100644 --- a/lifelines/utils/__init__.py +++ b/lifelines/utils/__init__.py @@ -68,7 +68,7 @@ def qth_survival_times(q, survival_functions, cdf=False): """ q = pd.Series(q) - if not((q <= 1).all() and (0 <= q).all()): + if not ((q <= 1).all() and (0 <= q).all()): raise ValueError('q must be between 0 and 1') survival_functions = pd.DataFrame(survival_functions) @@ -78,7 +78,8 @@ def qth_survival_times(q, survival_functions, cdf=False): survival_times = pd.DataFrame({_q: survival_functions.apply(lambda s: qth_survival_time(_q, s)) for _q in q}).T # Typically, one would expect that the output should equal the "height" of q. - # An issue can arise if the Series q contains duplicate values. We handle this un-eligantly. + # An issue can arise if the Series q contains duplicate values. We solve + # this by duplicating the entire row. if q.duplicated().any(): survival_times = survival_times.loc[q] diff --git a/tests/test_estimation.py b/tests/test_estimation.py index 6148f2f22..ac73025e9 100644 --- a/tests/test_estimation.py +++ b/tests/test_estimation.py @@ -19,7 +19,7 @@ import numpy.testing as npt from numpy.linalg.linalg import LinAlgError -from lifelines.utils import k_fold_cross_validation, StatError, concordance_index, ConvergenceWarning +from lifelines.utils import k_fold_cross_validation, StatError, concordance_index, ConvergenceWarning, to_long_format from lifelines.estimation import CoxPHFitter, AalenAdditiveFitter, KaplanMeierFitter, \ NelsonAalenFitter, BreslowFlemingHarringtonFitter, ExponentialFitter, \ WeibullFitter, BaseFitter, CoxTimeVaryingFitter @@ -1884,6 +1884,71 @@ def test_fitter_will_error_if_degenerate_time(self, ctv): ctv.fit(df, id_col="id", start_col="start", stop_col="stop", event_col="event") assert True + def test_ctv_fitter_will_hande_trivial_weight_col(self, ctv, dfcv): + ctv.fit(dfcv, id_col="id", start_col="start", stop_col="stop", event_col="event") + coefs_no_weights = ctv.summary['coef'].values + + dfcv['weight'] = 1.0 + ctv.fit(dfcv, id_col="id", start_col="start", stop_col="stop", event_col="event", weights_col='weight') + coefs_trivial_weights = ctv.summary['coef'].values + + npt.assert_almost_equal(coefs_no_weights, coefs_trivial_weights, decimal=3) + + + def test_ctv_fitter_will_hande_integer_weight_col_on_tv_dataset(self, ctv, dfcv): + + # duplicate a few subjects + dfcv_unfolded = dfcv.copy() + for _id in [10, 9, 8, 7]: + to_append = dfcv[dfcv['id'].isin([_id])].copy() + to_append['id'] = (10 + _id) + dfcv_unfolded = dfcv_unfolded.append(to_append) + dfcv_unfolded = dfcv_unfolded.reset_index(drop=True) + print(dfcv_unfolded[(dfcv_unfolded['start'] < 5) & (5 <= dfcv_unfolded['stop'])]) + + ctv = CoxTimeVaryingFitter() + ctv.fit(dfcv_unfolded, id_col="id", start_col="start", stop_col="stop", event_col="event", show_progress=True) + coefs_unfolded_weights = ctv.hazards_ + + + dfcv_folded = dfcv.copy() + dfcv_folded['weights'] = 1.0 + dfcv_folded.loc[dfcv_folded['id'].isin([10,9,8,7]), 'weights'] = 2.0 + print(dfcv_folded[(dfcv_folded['start'] < 5) & (5 <= dfcv_folded['stop'])]) + + ctv = CoxTimeVaryingFitter() + ctv.fit(dfcv_folded, id_col="id", start_col="start", stop_col="stop", event_col="event", weights_col='weights', show_progress=True) + coefs_folded_weights = ctv.hazards_ + + print(coefs_unfolded_weights) + print(coefs_folded_weights) + assert_frame_equal(coefs_unfolded_weights, coefs_folded_weights) + + + def test_ctv_fitter_will_give_the_same_results_as_static_cox_model(self, ctv, rossi): + + rossi = rossi.reset_index() + rossi = to_long_format(rossi, 'week') + + expected = np.array([[-0.3794, -0.0574, 0.3139, -0.1498, -0.4337, -0.0849, 0.0915]]) + ctv.fit(rossi, start_col='start', stop_col='stop', event_col='arrest', id_col='index') + npt.assert_array_almost_equal(ctv.hazards_.values, expected, decimal=4) + + + def test_ctv_fitter_will_handle_integer_weight_as_static_model(self, ctv, rossi): + rossi_ = rossi.copy() + rossi_['weights'] = 1. + rossi_ = rossi_.groupby(rossi.columns.tolist())['weights'].sum()\ + .reset_index() + + # create the id column this way. + rossi_ = rossi_.reset_index() + rossi_ = to_long_format(rossi_, 'week') + + expected = np.array([[-0.3794, -0.0574, 0.3139, -0.1498, -0.4337, -0.0849, 0.0915]]) + ctv.fit(rossi_, start_col='start', stop_col='stop', event_col='arrest', id_col='index', weights_col='weights') + npt.assert_array_almost_equal(ctv.hazards_.values, expected, decimal=4) + def test_fitter_accept_boolean_columns(self, ctv): df = pd.DataFrame.from_records([ From 0eb027c3f1bcffd275c3b7383e0aed533927511f Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Fri, 12 Oct 2018 13:28:09 -0400 Subject: [PATCH 15/59] not sure why this is failing --- tests/test_estimation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_estimation.py b/tests/test_estimation.py index ac73025e9..58c152d9a 100644 --- a/tests/test_estimation.py +++ b/tests/test_estimation.py @@ -1896,7 +1896,7 @@ def test_ctv_fitter_will_hande_trivial_weight_col(self, ctv, dfcv): def test_ctv_fitter_will_hande_integer_weight_col_on_tv_dataset(self, ctv, dfcv): - + # not sure yet why this is failing. # duplicate a few subjects dfcv_unfolded = dfcv.copy() for _id in [10, 9, 8, 7]: @@ -1904,7 +1904,7 @@ def test_ctv_fitter_will_hande_integer_weight_col_on_tv_dataset(self, ctv, dfcv) to_append['id'] = (10 + _id) dfcv_unfolded = dfcv_unfolded.append(to_append) dfcv_unfolded = dfcv_unfolded.reset_index(drop=True) - print(dfcv_unfolded[(dfcv_unfolded['start'] < 5) & (5 <= dfcv_unfolded['stop'])]) + print(dfcv_unfolded[(dfcv_unfolded['start'] < 7) & (7 <= dfcv_unfolded['stop'])]) ctv = CoxTimeVaryingFitter() ctv.fit(dfcv_unfolded, id_col="id", start_col="start", stop_col="stop", event_col="event", show_progress=True) @@ -1914,7 +1914,7 @@ def test_ctv_fitter_will_hande_integer_weight_col_on_tv_dataset(self, ctv, dfcv) dfcv_folded = dfcv.copy() dfcv_folded['weights'] = 1.0 dfcv_folded.loc[dfcv_folded['id'].isin([10,9,8,7]), 'weights'] = 2.0 - print(dfcv_folded[(dfcv_folded['start'] < 5) & (5 <= dfcv_folded['stop'])]) + print(dfcv_folded[(dfcv_folded['start'] < 7) & (7 <= dfcv_folded['stop'])]) ctv = CoxTimeVaryingFitter() ctv.fit(dfcv_folded, id_col="id", start_col="start", stop_col="stop", event_col="event", weights_col='weights', show_progress=True) From 5a02162b3ea2dadd46423b5c69d7e2885a286d5f Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Fri, 12 Oct 2018 14:05:24 -0400 Subject: [PATCH 16/59] add kmf cumulative option --- CHANGELOG.md | 1 + docs/Examples.rst | 12 ++++++++++++ docs/images/invert_y_axis.png | Bin 0 -> 30719 bytes lifelines/plotting.py | 16 ++++++++++++++-- tests/test_plotting.py | 16 ++++++++++++++++ 5 files changed, 43 insertions(+), 2 deletions(-) create mode 100644 docs/images/invert_y_axis.png diff --git a/CHANGELOG.md b/CHANGELOG.md index 171efbc6b..654498f1d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ - Minimum suppport for scipy is 1.0 - Convergence errors in models that use Newton-Rhapson methods now throw a `ConvergenceError`, instead of a `ValueError` (the former is a subclass of the latter, however). - `AalenAdditiveModel` raises `ConvergenceWarning` instead of printing a warning. + - `KaplanMeierFitter` now has a cumulative plot option. Example `kmf.plot(invert_y_axis=True)` #### 0.14.6 - fix for n > 2 groups in `multivariate_logrank_test` (again). diff --git a/docs/Examples.rst b/docs/Examples.rst index 935b3de2f..a84055296 100644 --- a/docs/Examples.rst +++ b/docs/Examples.rst @@ -282,6 +282,18 @@ Hide confidence intervals :height: 300 +Invert axis +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: python + + kmf.fit(T, label="kmf.plot(invert_y_axis=True)") + kmf.plot(invert_y_axis=True) + +.. image:: /images/invert_y_axis.png + :height: 300 + + Set the index/timeline of a estimate ############################################## diff --git a/docs/images/invert_y_axis.png b/docs/images/invert_y_axis.png new file mode 100644 index 0000000000000000000000000000000000000000..86ff80593cccd4b238c98133cb507def10dc6f09 GIT binary patch literal 30719 zcmeFZc{r5s8~=SH6iG=@WNlHhB}>T?2_Z`MokI4qW|=`r2uWoN*|Uf2%aASEl4UG0 zw(L7&nK3iJ>(=-4Y59JS-|_tO{P7&eb6nsT;uk#6Vdd=Xcv(Q;*8%(v z4=n{!?J%z)hyzl(cJ+??+xeNT$LOy!y!-nm4tJe{W*)(OKaZ;OTvbohSvrd0ogbtXWP?-2bs z>=QB$xSh`gnW31dwpcM1$vtw3;16UT9EQ>|^zXs7BuybJGb(E85H`O=b(AeQ^F=QE zJL+x$eN?>Tp=a$+WI4e07OZsh>@q}j$X37Zkz<_nlvsQm+`V{cI zWZ0J8yfgbDmy;Nx?5Ja@40rzNm+R>jS{@!A`6A&n5bC4eIg^L&Og6XGFPW=r`WFIgK z(Gf`ppu!&wOvd&8Oh%<6`N0C!%mUS0V?lYb(b2W*>&~lZ7y_p+wTk?ZCXP!lZ+9Ts zq=_qGE8$^YC%3mkPe^?2?tXG6-koc^@37hx-^odfx8585$55!u%&m5}>c?>sZe|1= z)_rhr_^ATB_r+~|M#fpM)rf@pH+=dZ5Co6Pl@N#e4kXbDs%ejtAcHF0mac1R9^o&W zV1Se@C*kj2i%to3ZDN?`AuM8MbGH|1RZV9NfW$sjM-ecmI<8JoZiUQ-*+SbPBp4(>exM^G|_eD>9R@<9h`-%n8@19FlrAyWL@I})f|lg~=^e5I0p@|2nw|Okg3KyZL416C zJOTm=+S*K@FE2|-JedB<2{r^2CM+}a-1^1_c~nA3Tx@LJ#KcMHU18y2g;3V3=a0%g zdUVO!#^%A!)&r;phn+hVa$QJD>HySN?!xQg>3Jydn!1+O)2C0VSS39ep|4-RLYkUF zmoL*meFgVusi>$99tgN~`?eog6Rc8R%#gaaHg#;hCH27Q%}gv%=`$Uxs|!pmr`q;9mPH%Gs7 zXeb1%8<)A>+Nm|ZWlylsG&D5s=k7a)6&)kyWdHIy>-7&VBo%1D$5ba%bmT|#}qs#6ZG?Q8HQipe!jmAUsYX3 z+2srQ33q)M1S_YE;(>gF4HA!B-sV+bTr;&=uHTt*@)njYysAQ-2F23F7TY$gBnKp( zqxs@3wT0Fko2c2r8e&S>MweO1)AN=#`1Ct5p8HpH_NU(a@jf%RHjubIS5aJjC|`$B zBVrBV@ckp@87kbt;On<;>jlM>d+cA2mlAWq0$UO>4*qO~c`{mi^%|F{d!^CEi=v53 zTj27V8nj?r56lNwO3gW8&Rw^+!+*rp^Fwv&5pLg0DosA_2J%bILf6upe4hl#2tKKq za3Aoz=Hen!Y2?tmxi{w!>aOiRPJUpab{5@Gb-;?0&pTbdd>OWy-@Vq_nw#Q1X3th` zRxQc%FzPWtjpGmIF(`I0XFm*fsHAyZVROI223(PM<>el=HXBk852nkmJa2p}LoPaD zOs%6kTR#+O-?Fvsxw+T?)-KDn?#v{{ZG{Sj$Q1-%8o zx-S%=9SK-;a#E66&*e(RlYXkPqN61!0$;o2=J$#51mb$SkeFCN#bPhSns7X?qAYAjn;v#K@k)ACfNqA@y zwFelAl~&=75mg5dje(V6=Pp&|W5;RwC5Iee~6O=Sk<4e(3a63OPTdFW-yWY7ls#c+#;`o{4XtnbsIs{(JppOMwMFdW9|I`bs&v>Of|uSx3C& zR$r6UQxX9~YM^8?@Y$@Kk`!O6TzmK}dDKx{T~{}-y}fhjH--dq%*3#e;8mZB0J7uPns-SBL;S0*KO&VIh|QiZ(I zQAOS#y)QtrM0#@k#-5wn78rpBzd8#!mke4+kYQ5G(W4d;^f#<aAG!iPY_<5Fj zAad|Az0@;c&L~k%6+<8TZA+(t9v$Hnn-p6*3S-Lp0LLC1p~?2`WPU*h`?2hqeP3nXyr zJvF(eyUHw0fxFdBW&(DxUQcMv&dA6cs|no9MBw*`VZ4MvM6gmrK@UN>Ws}Fh1Lvo7 zCM|YZjkK=jPTZbaf#;~2)b`!l##gH6M8g%Cdugbm@`62fR+?6bn<(~3W5gu{Va`}! zA2|k_>vqb9CQ{jfzX_P%(G_>cvHE>)k;H9fM4slE?(XgpIdP=#P+?nz}qUt&1OY}@x|9Vl^H090x!^V9C)@TtgWp_Ht}o6% zoy5#!?6ndTc0#7Wv0oAXtj|1bl(2-9mfEVNA48*vSU-#eYv{ze8I`V)<6s{_8+q~( z7cfM+%*?Si2ZV51diu&YWdX1Uje>8gxd(@f?O3?N_r^b(Jm&1|3`GgF@}E6hTkVa9 zomuy);mXJSAsZX6SGGnT_mw!ntKTPp&0{ZeyT$luH&;<{@!0lMLepqn00dElH{5t#mMI5n?_aOGwKfO=iJ~TD`ge+a-WKBh-z3rCdeXx z0;(QAv83}LRju4mYIr5oZ6ps#$O=s^U<1JetzQlBW++h|XQpwVn5undXRW_Knu?oV zSdf>Or)-zP&)@y9=vuJ?-05Q!6co!IReeP^ObFo2Ls4pT!@!9w9S@ao#4nVXgBjuu zCYcjcjGva9JGMu=&Z2O%poPD#N830ALn3ZAPZ@wUvIU0dnOoJIA@Bv<=kuGEdrPX1 zs9jmvQ@x%lF#3fz@0uXiqF4|}sGX-Wv1!w7YOf)QrcE?=Yk`QF9 zFO)poEaM>5HM$)wfkON04@VEmKK;7<;FTyN;?R?0`mz7($AJqPG~q5X#vE7 zuqN}LF<10-hb5H&TDQzVH^a#OlI_hke)7*J8|EJLJ$Rt7c$mzh7j;w-MLoTbLgw({ z;D+;r$Yv0uXc=53j0FqimfF4T!A`X_N1VyGAG(|zV9N7NQb!fZX|eoCsHYq2f?PP! zK%3l>t{aka@a%BUGpfds^ST31m1({$k$G@!4!)kgDJMsb9iLoQH_UNam+48Iy+eC~qnO<0MB&eN|5>>&$kVyLO#^l~KuNF?l8ZEH(TzVNLR4 zwXbrKUS6!g6~9hjr{37qbPL_LM_x7iQB}+_b=j-pH*RQqh4x!6ubviSU!IG$5E0HF zVZ8GCA(QNi)O+?_*?_Xw(vxVPOG_276W1%+;}{ED1s0#Wq5TeXN!3V_(bl zT3-E#3jCt9oh}#nuo|?)yEH)Q;L&-3!Q;vsudp;=9EiNwlY?=)R)5&3KG!2!{ zzpyyRWL!u%{~|iE9<;U=Qn&nW z@77-7`~3Oy^L?IL$G>5fPr6>l&OPiKbs0ER(5$;`x@SA>p3~La=fu+9GslM1&V4Z~ zlvCw69-%>RIQFs5H1ZbF{NjE{RYmjl^9AIa$GCelVfC8MVSA=Ds?$q+AHJEa?r;`* zS3>mqR#5gr`5gKS+UF%y$uHq5iW_NuAkQRvLgw+9XJG6dX=n2pAd?tW?LnM%m49)+c=R#$=pU4mAmrA3M{MJ4^0UBPMJ zA;rEF)z_&-kD4Q8dzO_lTX-+>@~tYCy!Sm=SOa~nS2TvM5^&ST>Nb|E8|UKB_2cF9 z(ohFQyeu5@%_eO)BIMb&`K?F1)&HZz-Hf~JUr=xXEJEh)yf8#g&kgIGINe*a!j7fN zP_|tQL*WZ`%|5MZ&KMh;BfQ}rbWtHHM<1Ox>VCoZPn%R**VjLG$_X;U>fjMQzWTbt zua{3A?<}Zi%{EHcZaw?G+omRaqVTi1h&%hM9Ahr ztunH=O!R+T7WAYMGrpZi7O04#S46`yn`$fA#{ZQIzb0#q_Y=II@nk-8KBId>iSV@2 zt?O%WklLqpT^^A|mspI#;ScO?w0378f13h=HtZwfZn(X#VSI!~ER zUiY|l8)LC9xc{E(mN5Ovcq@oiKboT3YJ=g*3H?0ss!*6#UkN|si*4*$;f=XqO%-XV&@gX%~GPQ#Lz z{dJ9AXJGc;>2vrzWI##jfbqobmh{VQ{p1!oKgUpLv&7dChsW>s4mha@f#k2CD>}ND z=p&qCL~csy`+=NK>&2bT$Qu*g&)^lT*EON8(mbmW$JSHT-rl+_#RKcER4-m|on+9C z2$nQBP;tsn+T?y#zg(%I?iR6Yl8xL`NS> z4lv>APKYHs8Wb7KGDWG8tgX%(pJKRREuID|)0uYutG1pTX5IehAF(zk5{DMjg3< zEAnSQVl)+0iy7-FwvZW|N&Ylf(;u>yBW-&SK{uUktRI!HrTWH6x3v}zVGnzofM-l` zlTIhLuBQ+FFfS6&wJXXwrewIH|Lv>AcNrC*VVsj;R5)+hMb$}fA6%dmyq*INoM5P; z=fdXYN}~;>j^-7M&<4BVqF5H>l!mA%6z-n4NyW6PzInv{4$%y&E6O_{VI&w|0Z*PK@3s0dpWwQ`N9%Kx7xZdxOzrr<^?d*WC^@;YKAMZs(C2q1+CVg z77!YqDScW3q+p1hYOmf&CP(LyP5oUC$4jfeLo~`0?Fq^s)zLO-oz~s&k6}h@x-me*PHMP=QdWw z!gd!!p5C{fZr}f=(}`Dqq&W$NDOh@Kfevesu#bBE5?4I&i9V2)Eoedb1?{(gQYG&K zFFl;tTWs5J25eAr6qsfOb|MVvH$QcL)%ze6<&6$=ZVjO``J$g-uyt#mmhGs9N`kLJ z_~Jk52UV5x#TV_)1e$$Iux}~WG&a{>(;G?bA?jvKItXPl7=Kt!?qj-gJ}pX$qKs>Z zVs|;gTh{7B8>|)Y#fEpr%6Mx@d_jK!-^(dct6&>aw$7TRQzZF|qGj&aW?TCwf}MY* zY=Ew(jl6xx+=UR_Ro%Za@ANvaWohed5whqWzjF5u8Evd&mQlZctu8@WeQ$~eWIg#BA`nTvCs{*HOJ(-~2Yz6Icw1jm=XUC`2BG*y6BBprQ0fi;l!7 zKXnz~%OWRp7X^-FA*pD5|I>`o{T!T^UzBU=i{8h`n*8Wq8nft7m@=5QS`E&`b3b`e zF3QN2LMLlhpj*psF(~GHs6%JB!#TY+V`Zm^>5-1i1YggoHeto(eK9(IWn^`)#El>?>q7U>0u zHs3@J@}qWm28}y=KCMuSAhoKfX#D?3j8Os$+bPL#O+4+BX-&DmrRV9Fq~hBP?2c)~ zyuxY&=YdF&%_UFs`FFq5Q9@dyP-Ue-G_;qILryAGp8n9Byr4vt%BBGyX|Q+M|Xks|s%`--KRV>ZduKq6KAZl%((r z`#j?;0}2n$Khkqt;b5;2WXzg2u563#SorL@K@W9r&FXXT*>5N1>*2KLO41kdZH^w@ z+$sDb@{Z|O9|0ScmVF+TQTKXACN1ZTQnl*0pPfJWRZ40}9I{TfpOQ!S%|BVD3hkp6Qx^Y!9W3z+1Q4~rQK{1 z{O9r-+*Ip7EI>uYN<_lW9(u|3gVoFa0@*8{r!IM{zvpZcq1F1Dmx`p#pEeg7U8y~S zOy~&0G!QmggO22qNrwNM?qqa<;4~LY!dCrm2FDgteF7*p^f#21A1qaErg;++NIRCW zais0!4@_S@*}z;lvjj>DJlr9Oc_n%zD?uJ`j0dD?SeV?3C{_1 zyE^cO>s&|m;_hAUI3aCtG>Y)|%|xMA@FebJ#v~kirE1BWek#t5EtK_=-&`Jw&3OBZ z_Hu&&;z2(;&ZSou2!G9_rT#){SaLYd!V-a{{VDM`W6&wF zKj9_C4$>(Q1k`$v`e^L#X8b#i^Jh{3TxTEDSM4nUGRq)mHoCU04m8j4rZ9>q%(MhU zpoy7Tbv(#;*onK&GXq#aHu2z>mX-(eeIVBX)P}JYai9u9(1{Z#rk9o!#96O{HUb^S zapH4$3I+m6d~yg*TLAGb0#Q<2oRpqU4XNwv)24R?0$CQ4@${6qe*Lr!p-`)&|0#4=Kd2d&c68rueyu4E?&uN)13{dTadqNO?>N_dHmfI zi-YtXpDYTKQxFuu;|^CmIdcANK%M^6&l%e*9=$!>53Tp)kDTo@kyDamnY@k%@x~P= zLEro`VKMoV6VGm{aj?ZuL9eZ`GZ_NvG-$N9FF!dmnMdy?PuFe57&^ij+Ftgd z%8BLoOxDv0k-^N6}cmnawoSSiy@IpQ!Wf32S;x$0C|Ny*WM2 z<5xSvfZ`6{YrwBPVADI}oRpkQVQ6Rw`G64tYN$Y|1%)pJ2!eT6iDJ9Kr;n-WWi>VL z+`02EDTy4U$RVFMZ;tMHeD&>R7wcw`xlKHrP#+#Pf@Cx_4u2YT9Iz>_tYmd@aRFHx zZ*T8s*SL82_zbaZor8nHot>TcmJR`nr-TZ|vVnx)JDwYQsFp8Z)QycDzIa~C#Eg%R zM@2{Lg1in;OaTP};zkGpay(i3%{aZ0k`2iZKuXyn5Gm2~t*xyE2!U#gsmaNg2sc;P zvCrY$Kox?shIV!fex4669#^%mEq0mHl3O|?EG!KEV7$S0%g~T64;_n@TA*^2%q=si>-sb}FzNuEvy9E!erQ#bTZVbIrZo*MZm& zhST9d=rRy&p@vr6S@D}o3Z9-7{K7tb=ezLybs^ zBirM{n_Ef_@{Fa}UVR~uvj-@`*i3rD&0fU%=B7Z@O2v;i`X+__jxBs8CXS8=0VPrj zH=!ucyY0nFK%J>vsLj#V_r@KK9kihl76D;*F7z*OzwY)%b}f|IXtlc2WuN-(h6ey) zH(--zDEb2rwvF>!sIHlMyON=&gZ346bk~b7gXBAaP}E!BOQYBGiXV!-G+h>nn>xUr zkutc6J|_va{dGNhC`ZM~sStK3gki~r$C_)Zu|w~Mm_AMTq-x31(wL-MTTpZ^8TT&` zD2l`)BRH`7C?iT9RXn?y>MPS8lAxhce#A)Oy0l3Ayz7<@ogD^qyQGexmfa-YriSMi zm-CkF$x0P}Tg*VzTX3wjso?rjhO@RaaKHFaWC;pSyXVKrP zozfqp473PoS@@Q;w2r4c>*>8j-Lng^G}06W|IjIi=e)W1$_`#oICp(kO%?y9ob}Se zR(|PX-r{=hiD-tn=+;^HjFdElKKf@RfJ>Ztu4t#8eAL15n&2FPo92;xH%jBSHzd#?viOs6*@PP@dT{f$}9J8Ms zWb5I8(Cih_YZIrcO0bCCP(39o8Q(Q`G5am9XT(wlroCV;%j45Apm);v!$CfCB2Y1$ z>^JterXwlNW?)#{96moff8$~a-wQ^Od|bqN^4}j7^lan!3Nwfpb!`Y4yvQ{rlJ!JI zaKO;a7o3QYygUc1!o9#k9&6Ipu@^c2pkHwG%py7=<#}g+b`i1rHL99bi$JdP0qsy- z;qIP!ybuwhMMCQu+i$N6)N}xy<)5d*VGhcxsHug4`Sq%(J0S+=a%uk3(t6^pQJ0bV z#vRuc5Pf%S7v*wA5h%b7|8>5A=JD(drrV)f0x_Yiax3v<`%Wyt26gG+iBVkK3;Ohn z-b{xWHg+kLy=C;AaS3o@J&E@jHI}3iRCj;Fz*L`ioyD(%RvT!bGVF|ZjHs_1@Epb> zA(KKls-H^GTATVX7?eb$2i^PcEV|8#VUQ16Cn{)tMU^hJ@GX6R26t-(kseCYT_;-? zU*K_(IFH$SB&1bJwSMi_c~>xuPncym2dgJ!g|Iu^PYxEi!kq`Q)$p?Hl||0ZV%{1V z*Axo`uV{(oYwuyd>${?-LJ}IUZCD{XCdm!caQ0WflWQxaeN4UpkVkW>JSYeoW)eA<6%m5Z`Rr z*Sq1soo(M_jq9Z zZ)JP7CF!<4YP$KmqQgLwh9{}2y4m<67#?Ygf34o(c8DR|Y6fFq(b1cURpttp5pScp z$A_JJFc5rs3-oak(Ct-KTwF%yzGOc!!mX*2X8UpS+-dKlTwc{4QTQ+5s+>K)Et`lj5>>w7uR2 z{)Gnpex9Uuhf`a}tb{?6xPctt0Dh3Cg(m`VR=<6_4U51Y(;cbAr5g2v9UCefN$;C; zFa=WfH|E}`bd@B_JvO^t)$;eiM#smCf(zcZ0-3y^&ojm6`dhR4hBxRwR}s~+W?G(n zJzpX-_PbR9CQCf7KuwJkt1xzscv(E8tTe_2C+0FJhC-8t)x|#`Bxr3$Dyt@}uL0!r zUo#lU7p^#AbV%SWKzRuGO%S`Ba1?)-cZM5xIOv@FeGVvch$_zsTZIIs{BTqrzT!t5 zi?Gz>SUHLzy=vNRk>klhUI1JjU={Zns&EUYyRfZ#*9<^Muz6#=4Wb_O4;{%ZiuM$5 ziuA|Q9Qcn8)5JNwEwlgT&OeR3Np|lPDX4{PJ*dbV%zd@F$^{xv~ceO zcM+FS)vcjOab%wYm)YY%lrD0!IzJoMQd$UUp%OYUz`29|C7*>(EVq|R>8J@B&;hQ z?MMIDnP7UiB&WI?x(*hcHA)x^e=u$tV(W%iJV6g|T{8=~-Mzh9@P)Hxz>cODw5u*Q zJ&*0hok9;u`E27*GUB0M7%YbeOO>Qazcagnl(uH%zCN(|eZ8KZ=3S_|!XE&iRxD5x zjYGBLRx9y@zs~{o>YKpM?(xco2fE*s72I6mKV5`XHjL)Y!%1t$0}SNPURNQDWv(!E z^@_iVOzf(z`PQAcYYfEhV%fHBw~JZ-8YEaj1a3M!y61Zn$`b^9vNzWa5Ab2L!tG|o zhTb@A_mar2{P`dsMTMy8*)D?FT%dzuyKl@v0=t8LHDAD=GZAdas?Rfz^2b@pADvSh z>JY$=&vh-#mG(g zq}G$LfT*^S2PXdfb=>yve0BX5Ov>GTAl!hB>9-ky_4E3PJl6&z8x;Q>ln+!dqT&?| zRExdOp4?vCIi%6;SL_Xw*#MgW+goaC1K8ex%+TW3E8uU9PYPGVM|SrLcy#|)r-0G* zt9=KLu8CmjJFIfzK(*CG7x5GrEAZpr?SoDIC70P(@bQ4pZ)BNW%2oTwS@# z+Sfxp7aL*e|CV;@#66Y-n>dbT7gQz?I<9K0LkSU{7?%vY@YK% z08gR?e4D(tw`BX}x$|CtwFyX?=;aYN&dki*|MPj`?j0$&$G=g=x#i4~pt33HBQyUg z9N5*2L?Kpo_JAaw_j!57%kJPpfUxYUo&++V`_^bcA`cBr;d!kD8`qaqWBI(WS;mGd zIv_#?O36*)HLh?!D*$ltea7nqf|;GWqnZ9KA>B(cht%>kDv66#x^OK3I9)!iYbfS_ zZo8YJP`;J!Qz);g33I;ysO1C&744U%SOGdY2KcEO5ZV;a&%w|1L^g4PwD}+8IV2=R z++&@`XS+r?T+sw@tk8#hEl1yqKOhH`Gyq1d?%%H+ty>Fb1Jwp5C4)l3!jE%v`GGUO z55)0+!Srt$zpAg#0%pV>=zT_p@8CisARfpd7BTz!zCLEZS`Gt#CQ*ms=ZlN>&>B+d z^{ZEK>{jFD;R(jFtE&s3%`lz|h*F@L?1uMVQv3Jshaf$DeIZH75RelCK%+AV@V_R? zsWVa{Q0xGo1=EZIx+o^b%}4_$_iJ{}BaCUJyMd0V-YY05NWfJj1>cqxdF0ubn>%8n5_|VuVV1UPWu#FqBng?g#yrz&d<5!f9D2V3gEsYs}$}4 zmUs?p;T@)UvN!F`VC(+_Z3wtnbw3`C0)F3nH6JeSs0Si5+cmC3r9+F99brx5eJG^# zz~G>1NkX&=*8K`l6d&2_-9!HV{rk^&@9b+lnIoyb&<(=_;^v&<$`ADuD_DU*ejM=qaSoKjlDxWRSlF^N;hxRV+8tTgyHqsghL; z*#pG>5#C4thrJQ1Nb zHfKz_ys#NeerNct<>Ka*S;u>lTBoVr1aibBz|}%jH0NwZ~uI%b60zGjP17 zNG6NyJIK_qhJF&|rJB^}%^swc38`F!oqsFU{sT!|OZL~_ zV=8|qRZt(gSAmT!9t`NbZ~Uhta2R%TDl0$mskh#=?PRMspr&+nSZpu{)BCU_?DY#* zV%P7RdSGjPi9E;yfQZQE`Yeq}f@gL7H_iC|qcUKi-HR>FuD*-Bk(*=OYX+a}OpKKK zt?&rMjxo$iMfAbi5QU7pnP4*7nh&5Crxw(TbBv!`;|rLTrjVC@uh{~5Orhss592R> zRzLMP*T8!5wtnSS)r_{eC+kMG9*WtJqyVbhQft;MNn zAZT95)7`*k+&AWHnCtk^(DKl#2r0qh{WNDsH8rXOnf7n*{-@9C$NhCFZLWWOmDggg zoaW_iuds@NUZ_W+^HRMvnol-Vux4Z!hjrPzFHdrNx2M>Aqpn{m%LGG-%Z`LcRs?IK%CVk*Mi`qP~Z<9$=epLFnlD*nLp>N;^f z5Yb(W6xdGlCR;t}C6^xMp4F5TsY%NgQ^bOWZqF$EwQj*lPH?r=&H0KOAlqu(L2yBmRAO0!6={@(_=D z1ydHspHuO6<#r*Lj0cHZ`v!Z>ZZenu?f_av0{K6kVvrbwSr{Q$F1vY{O*(wXZB`29 ze*+&=L4K-E=%TYf@kVeA03SzpPxMs=WSoRt^qyw4eg@YFruJA|Y-8b&W zSgUg6oBVLFW2^8^k!$8UVYjEjgIIX=_qn~M>iObia&q!@`owp4HAvJ@ezZn25?^)T zd#R^f?Kjk}oF{f~A>n#{9LeCX!bjc1u-Q7+s83qD!_|*V{c43p#$kV?sgM|eSmE)> zLF#ZJaH^66)CCK2NEALK2@i`z#Y7v@$Mw9HltQG;18~hhx&;m|9*ot`sueKzr|5Hc zx`VpbY(k+dj)|xJuc-oRNdu!}V_~)9T@fNWu&v|Ukpe8ON!)tg-(EHd7N4W%3<*l| z&w6q4a%MQD7oJ3o-)~%B*~PwR7x+vxxm!ps4o9lK`Y4@K1X@P)cE=xnE*wnP1<IfTL#S zH41ympXG6qnXTy4sF)Ae2<+-trli?_pX`Z+r!Cyw`=&W3RC9qOb}%kpXSBZ`KqL#c z>!VmzL>HPw*6@J1CjZBzK+-ZON7xV&jB<>x0KwW%dbhQG)c>f_6CrPiZNSlTcK;WUAf)EKB-I)Sd$uBwOEscC5kje9b_#Xy zcPnNV=OE|X+TMO#@=4`qaPS7U)A4GPc~P%`mNdy<=J)migyMPAxRA$QN>230EFe2? zl?loF7qpt0!&gq#X-7)^eq~ri2La&`5?}LjQxCCGIQbZxZf(1Zkl= z*$5&n08xH6yaXJa6#Tiq_8%6&|E$SjCs2!*9PmQ8zn7G%Pj*FYAhI(b81}d5+u<8c z=RjbYzq&2J_`5j+8uYYmF%}874okl`8$Gux+0sv>+AG*ooCXVg4@?#Tr=3~y$=l{M zg7Ucm8WYIzb&fufz?ABp6E$W?$D2Ts{e2^YQLAxRrZoCty@(hhmQR@54KOyaaS}#$ z1%sfhX3~JTs-lYbxJixFQeMe@s@Z|${C%dUlvWD`KaV)02KxE!=aU4yZO5MLmgak( z9NF`U=Hm~p3<;lW6ywT9OfYLV3yg0*CDO;`uZuhp_PHjeb)(r@7%2)NmUhl3Hklgd z6V+s4#N|Ia87OpsZ=sVx2!4YD3?D#%N-Bk`ZaQE;i3&0RenSVl!^H6WLJ8U&EZhrH z?_f#m7V6=d_^`43KPrGieQsgA;q8WML(mlJ>mLIPXE63tg|Nx?W^0imV{kcqPB~Gl znp}yemiYGZ#*<`$4*IXcxZ8IR?^Pd(=>(RSQN&l#OR7dcN3>(fWu$S4MawI6=)vUb z^Rws3N5#)6zcrgBW;+wHvcFF}SgKf?Y`ad+$<6iP2=c_+#_uyD4yhXg{UaIoZxZ#h zNORc&0=VwHr<(1k_}`n#H6GiYse{ z{G5fLxBog9Bx+s?11O6>;KVR_M|)~9!gP=ctjH8WI4=OI5wFvm>ONG$voX_fy|8Es z`{DQo$Mg{q{$W+@h;Ra_=SO}an$g$Mz;+f* zGiB`Xy9ENyL32=p3o`m;4Dsgx# zEc97#J2t4l9d+oRUPQ)%{Z&sq$IR^XnKL2(Ib0Q|4YtnKwr8p&@<|EO^}n~mpX@TE z-sdV1oq{ZSjhZvNWoxqCI}k@5JO>ARtx{(C%U!Y(o2b`eCe@t0rcqVjSu(U=sBT;A`0qmj zziwco1Qnv@HFmhK(2^c5lm(?t)S#5RbaU7<(mN!HXX~?ql%Ih2jIx!3gK=Fae}jtt z$lb2RQ|x9zF}8$<^UIuuVfS2Horxj8PmzcuIw-l5XSc)}6qhwO0V0CGa#^+#AX6>X zi=b}$0?jt$+K_P66pWv zlLf990JGx>^4l#=N2Af>Aft2Nc&$mAcu0D?Sq*-zfrU#ps8Xj*ROi_yNqTOKfZ|9w zK)i&qNyc2U2X{+LD6#GLJACG*$gdEp+E^f zEkHp)-h}n?MQI5w4Gjvw695}X4vE@-e+I%R0Ctob@Y@boUfN~?MQcZH+_>>FL}c*) z3i^29aidYeoVWUYrfv7GlyLyk=f>=ypxC#7YqI`b`cw)T^x@5>kdLL|iC0$4D_^}< z77HaUiS3Sq4C>~G0O8U!m-}9xi%s5E6oS-wV@>JzCnD|lik<{f`TvWAkBA09LBg5} zYnH$m05PEEUlc|Or3}EdvHE>b4Jh??1oa;G2pf0;Ad9G>j~_qQSQI8^WzmYp(t%e! zz@^fR^1hfQ1<+ey3h%zt0f1M^xo_s?=j+3{6%GbBs2drD4h-nAN+W<+1B%W--R$!) zfHtY+{a<98mk2R2F+k(kiRyn_f@v))tG%G09#m+vh}!COE_HU^+Vi;f6heaaz#ekn zkLY6^vh`ip2$;r;X1~eS#U0K;NFlisPrQsHKX|9M$3rRM5Kv@B03l!mlDW6oW0F)A- z@T6A#Qe%Mn^=WjuVDlNMWdp@|*-MA^pC{8#QZ$62G=Ow1gF4k?6|M_6q!8eZ1EwS* z!F?t*^5_+$g$0#mg10DmBgoR|&=Nun8Zl?bGpD7cEq~??OOd>Og{_H5z;_N0AE+Rx zycl0`A6ao{0Fx3<0KUpAJi#;MRwFQbI>GG;^)~i$AOP=ifr6c5( zpw<&;xwc>!W-l2B}5#MnGb;3H~SsqO7cp_Xr70J zjP~j~G%PO*x-NbXOiw^u*4-JzU{LQ|J8XgBOs@3$VJMs}ds@N=aNO%telYj&i(vo& z>0Ssf^vtc=82E_t1f{-OUJKrvL)u$p_i6!M*qdkU@HwB!f%Jy9|09yD{i08=;BC}T z3gWlM07U0`n}rJB+hW2X@K^*`HAbB*Q76C4h+`a?K zZ#V7|*iGyRASU=@X8fC+eQ_Ax>tj%HA=P)=66)-gi7X#-iL$#0^sl%io9v>{K;w*! z(#hP(9vr2PB!NYv`|cO?_{Ze#1(}S&e2y)O=ba9x|LPK9pOZ?~_(%hN6vUg31W&5s@^q#xog5^QQNpGE$UDX=#YAHe6~!FRtzBpQ8(BR%H`ka5lMI zdT2u+zgvwDXu5AK1a8~)zeXbZ?OTw$)QxCcWY02{C;0CoBfk~~GxXB$CHNcM7$GV+ zmAo)m0=t{yKdVlk@tjN32D0Dj3kc(^X*4bD7ykk)M)!lEo|);y61X26Nqqlur@4sG z0tyS6<3czOYQ6J!YbiqRdoq43kJzJ+>fc42|D9j3Ko;eGC3ny>z1U0XROE($-T<1a zX33>y{6o4b$(q+kVy;Eo5F($>!qDtC_)njFZQI#%sr!upV7Y;A)z0hlAENt6*ghzC zzb9!XU4_04eOowgj{+D*CTB!W{y*+PRJ4{PRc6 zQ@Fdzr9%Ka;Zu#v*@caMEjmXkbq>HfEQXt0E%wyUS#%JJPD8?a)-EH`3aXx69vK7o z=F?nj=CFX_fLl>H%Im8#O7p*G?`iNxA{;TgckDj!7w@bNEtsEL?{#qfYf6cLsKU(5 z40|SrGvhvnCXd499>1X*UNDwevLOsP}bKd5Fk2cvNsZ zd!x;j`e2CcTnQ-1{6nS%k(Y*+me%f}ImNB7*WNhvze`1e#hI;rf=Lh6^Vg+exwg96 zC~_0fTQ&?kWMx3oC#>sJ{y7U_X^zGa|6*ywq3neXj>%*p;IAbEG0M(vIE0D}gzzP- z3&5SQ{uqcx_ib%K+31gQg1%8xD>o2fv`|O>>ITq)(`;c(JCf*!r}@FT@4^tgf0+ar zD(LIesU7Bx#i1zVu71x)(eI`uHp8;Rw0+hcII;SN{%SG13|UGq>E1Vd-oXmwL@e%6SYW3|7UleW<3B~MxHx3bS92=7oGhzhvED0a-5uJXu^q<*;y#{ z|6X`MYqko~UbH3TyhpwG?*p6wM($Usmd0oDaYLZ27(qr;fh$qRGFl|04qA5k5UQlHbMq3}95S5_nCzkE*f08;zbKu=RiRQL+QP*5sE`EWq-FL|GoA zMMy;d0es0pT+Z{qugoYbN*4Ve5@g4VZ=qs0B{UM&mH@BFwNDQB4+~$o@Um+T#N&zY z{nYP(x0W1rzhkKbN|kpBJDQ_AKQd7|e@(}B=;$%9#!&}ul1K|fk6v_R<47x(ZXGI{ z-&?(fn8fyS`6_b8aR zW&3M?`U=YegQK}A`o8g@Lx=Jcj@O8b-uv=ONczmfHsd7p;i^di$!4+4QzMP*QbOVj zo@cj*>eqMP9gm7Od}o|C;njUWkT%q8iV52nB|8_feSxcI*95lgkx>% z0kF8BS5bAWVF!MXE-wxG?_xAT+nw)y>+K&4CO5pJ^Yh0vG!ba7r z9eXqQFqIDaOyAYmILu9JXk3TmAafVdduoSeCI(flQGIt0|NPwMqky->Q@w8ZP!ZJN zZj<19yxXyuDv1D30*r%b8bLu~3Nx%>cagO1>~OuNNey-D()=8ioT=Co_88^1rK4$6)IBxunLMce z?S&L>AN~sibN{EcFAt}3|JvWBbQ;vDgk&fxoD`yij2$A0#0iy*p;8&MW!f4MaZ(8x zjwmTJDZ(b1=M)`e%dAY<8yWWAzxz4w`}@8}oqyizdb_TYJU#n)KEwK~b+3Efi!G8w zxxCh7u5oLFh=|B&0=25g)ZD!D$+Tci6m@OnB4HDrN4L&eD2+`CR+~<|x;-?rZ;!N! z+7r%dM|WSPsS*V9}+x!LFh0~CBfkgBCva==aWDW3yH=`e4MJkbZ)r9X% z$jo$ZWU77Z;<%7Mv_1F2$7*QZq4uB;5wq(#i72?>|GBwl$c7NPrNPZw4```>&oD6r z2EN(!c8MY-KdGTe``Ax&G`8w9<5cUGlw**;x_$R9H)Q*0tMb_>#;K=4TgUbNQ4<-P z@>Q&vICi9yLM`O+td%FEkUI}O!@ixy{_<*8+xKjDhEcIuL4W%C@+jK*g`ZXR{6#B+ z!N_oE-9;$yXur5rXdb@nB=Ou@ll(B7h{m(Bv;9A&LpvsY`ZuYO>AA0S%=^ENtA>Sz zX+5W%PGi6Fm<+wBTkY>#$d^8&bhoLWykp^HxSZ$s@a~+%pPy?l( zwh~%&;=6X)4ZVF-Ix6oqbq)!thoL5^$ID{KojOO3tc|0vE-`#|*4&Ksjoo|kmx@r~ zr0ncF>YLS3DgpY~lFe#r!s)HLC=USgb|_Xka(31enQlBFvi}B%t<~*}S8E@?;t}=W zry1pIuDU?W;%O)`9!Y z$1JofSKJAUsD2kMP%jt#rsuMy$zuEC#}z0nWuZ|2NLhQslPC4WxkW@puco9(f*7&5 z7!^-LgM(e550Pr4_5?*m@gvyM0VpFce;?5|>1I&S(CBD?ZKRa3Sfq>%r~evZXvO$r zVnMg~8#94u3IcMMS;iR>YG=w(DK+%@vxuT%WLVfb5L~!Z<;@g)q;0FYp_8;Wm5i0^ zc4MZXKzZG|bq~o8^rGYfAuWe7&^dU}`fZ|4Q^O&B{nftII09jT>XHsBWKa;bb?a8? z^KEjV6Ok(G>$h*)cAcnm0!C1u=Q=>pZ3gHvKC^BR{s^@_b|^v`S`e!M_pBfWzN)GU zungDnvEku!0G~+EUyIMqmOVa(5pe-554z2nGiQ4H`X--|k>Up*i*zVXH3qpAq~vw< z^d4noNY6&@l$6{muzGr~R!GTR^7t4uOQz;(J(Xim7eei=HrXf%M2zCGX(MxU)pLcL zHf=iN;IPSH4C4$vo`m~Wm|v%<;bSO_=X|HMV@FU+0$}3eD|m zBT5hxgHUkF&Mqu6@~`wx(6yqK$QO$3-7`GN7cX+nM(XM6imItmJ~RviCxL7s#=`c^ z*iTqhM~}Y$gSHk0ZB3qQCIxQ;Pcbw!q&5r>H7DU=L}g@dOBg-Qps-nk_P;7BUTXAQ zGZm?K`0$v?7|*66QBY7IGQ^kDnVg)=&CUICR!M|!mEqB&dj`pxpoRP~>0U#=W_t|C z1J{h;?K^igVAtvRm!Wc<(7w07L@Qsy{#pklvHs!ypoi_?otY!3E*FdQ-aSkg21zW| zqkFk3EKOR$MD@Pup^*_=IG=~2kh5%AEv=;!vD7|phVP!ZPN{oacF#=3@X;w z8E=kVo6ny<{bxpo(}w}o$#+J|Qy**!pscC!E5+wgQc@3Hu~oceaDct3=Cd#if46>G z;qt8#5`i=Hf$3M5MO`$sX`)wny}AR%OsT#q4}LvwU2%KnfBH`gHJUj+~oD#IbTRmtzuM6YneKPDdMPt{~A zDa8K1jZ5{XQPS!gudk)BLVE&}@3?Ax=UV)-sRB0Y*WR&wuTr<^kd6$li_6B#zA&}k zlzybHE$x%Pyzp2b;&u`UO0n?B9PN~ji#cu8kZhcP#mYNjuyt1v$ICH6Di&jc8vWqz z?rzSX`SwyJ7Cx7bHmM@Ma-I@szGf&nkl9b_@OpY{`fR;s==}3l&PY#qWgXY4k8zrdi z-_1R|rtq+^-TecaA@i5~f{S^zr{+~1sek)5_B@GYbwiycW(R%cWeA^z7gB^5mRGIeY*eUWmUn`s>YG~HpKFuL7asM=sg&uZc3 zA-qP=kNwVnEVP=E&*6$-JIWXLL`=)%iWa7d1&MdE!0DXqDzJWKALt6Ha zM#mKqP0_58<;R+;;*7$)oqd)`uZpE@UAW}9X@Y*A9v_ZV?H#+Z*Hix85UsL5y-gq*AB+b=`W6nm1DF8lo&)b zkL+6<$QxDIyF}`;bIbfb_WbtU3m5X&)(w1#II?F4;=8hd#5_w0duZru1HOCYM%N9*SGk`TY=Wwo%p~k$4q|9s%W< z@IqjIyxkvOukUnX%u-S#dCIgJ01^(gxw<@YEX+_NDEYdU}=tS4X z9>?DHo6~3C#5(YCTOu6-dhsg7XgVYQAl*+{GBfAU~u`%@r{9+4&>=Yntzq7S22|F~4x<{uHYy z)5UN+wSv#jw%Nt(5?%WZiu0p|t|Tm+@~V@9SKF_7`71LYY#f)7RV%wEPtin9=r-9! zX5omtShRMD7+bqgD|q8r7r)dAuZ5SWzoz8K!hPIZlvc9;byXx&$ijP8{&lK61)e0v zT>D1@-S3br?B-=xTybyzDBI-7`SYpG8cuhA*rQ2T;-yPewU_KiEU1OS5LlUHwetOfv7;29BVX$@Z@bp}{jP#;gVlM>a`x=rcvh%9j?PgE<(6u#Bo{U-e{uWre zX1em1-^d&muMBT!XnN;sB@w+fuJ!#h3gi99k0bx`7NeWX6?e#}A0FdP@>NkcKP$4b zX1Cd^sj)IcBcp$LAY#KSY+HGnv{|&PC#~#}+Y${e0VapIq@8F2Lx);U1B6E}p2UN&&aEtp`XI)30>#^K(8&SD%b(HEe=T5}?gv2`J zE@i`|Eb1rvYW@QIIaqtOJfHrT}$c%$zEo(NN9R*!$8Q1e&40|`@?G2o(a78%}aWA03>^`0my!mzzRZas&AMd!|*72#fS#K54@(QDtn zMaxzT@G%q-DqS75u&_`YWUbA%Z+M)LAUf*-_neRe<=(JizoV6uxOj0^%HXGA+}H(D zMbJuKzGU&;@NgH<_DK_TdP1v;N6OF5S?iP4oP!~VI`6BtVZ#QH;yM2KJN{ZnNP+*E zl@->Ubhx>tMa$Y+#OU!MA_Gf0Wp95cI=U8tBnixRA_PaC9P-8ysy=%mj!=;$x&2Ut zx~Jz}R5Nm*NFf*+67_L7HhZiPkK-dye_dHwxf#KvsCEcR+PPa3aD;-_lq3lg^6>HF z8&H^d+uU4LSNAA4m&dQW+ZypKmeVSdtX*x__@`3fD6kFdeO4L#{<|imfszi#aay+J zoPp5$UDT_D-MJ$nA@KCmX8aN03t|D|erxUsTB+4; zS=p-E+F_G1&_amE?~+wAEqs-q&&jMs{oL)iIJIH>DQIhev~>qwc{lkDq;t_PE#r{V!_ZA_DTkITlW@a^3}4_=Z3aQsYF^ZsT(am zKQDiV?g5DO`WVszHvi+Ox8G6YUXh8ixp2v2zx{T_A@n*|&3}S!{ue+Fw>a*Ql9Uu^ zX01hX@SAs$lzdlE!AoyW)Xe;DY;4}rDtH)6>_>B2^_=$}-bjYRLRsuz>pt?V_9sL+ zsG?VVb{@}pLSvC0$U6$TP+sXpD>U7nzQs#Gu>VAz)6SC?5m+ZG)?PbO=1u?=YFOht z&H)(I4x60C0 z;~3Eg*wA#ay9_NX8o&Z5!@_>Z%NA1c%%MH^YBh7^LZ=i?HRjOaJb(?6zybjs?t+nV z|K$qK&`R5Zwg>9gTU^#x!Z1tC?eQBgi>b}a%e&9KSg{iD{C|S9k*f3u>Sn$xJPPO@ zwWf5k`1cQlVCjRUQA30#;<bkxZuJuxlnxT?2vm@2+_K-&cTZ#XnbK@?F4g#37D&)|zjRAHIm(ka3;;QnNKH z;Gj5n{&2eBYJVLa9TeKNPt1PR|K~%`O=z1#?zl0ByW>8%GG%C_(#w#>$B`@dg2jiCYWIWz%o`|4)}~H^(`=_G^_E3hP7KGSQ9XNlMCA;GY=u zZ19#^^-aF2vFU1N6y_v1FE2PW9$*U2F*~y)g}lf20d^zq0z_Fr*`qPOsma)=ixN)6za1A$_(U6A`G zd!EjP`L`eK$Rp@}S6M9L2luJ-+fdhtswLGeAUoOCgg3o^zoF-LoBd0E*kFPHB;?Vx zi11OU%yufYhU#dLCf_S<=oQS^7~la`&9`qop%S}xg)~)~)rY-w0eSGmGn4cX)z#O3 zGj?{)d~Q|BxlFq+y1ZO#Wd<?{QK%w}FS}IAKZM+5rge zq|zJz+V)lyoCJeouL0y8hee{^`&JvQnmjdgSapQC#0SwTU^EQH5C6%Tgo zhtqigz%mF})|ls}^XBRIAL-uDvZ+|TOuIiCXA21#3qc{BbE)QqQAB?irs~S=XwmR( z4UeR0G11-tCrU%XCAtTuC|bz)&`ET!IrmrAb-?u`0`o)}aV2fOj;5fT_2F4+>Z78f zA{u*HWMrhI*~`WcW18Q(y_$NyN?W~NlH|W`{U@i06BqKX!#6`K9TNo=f!0PtC+EcE z(jl7SysxY=HlP(YnrK_xnZP*>MEaG(o)s9Ze7(;jjfbhmtd{|uh0 zu(0p9czk>$>LH!6p@?-YdS7gXf2pKth967^x8EgtKT*gBmgeb09N*Hhmkuc zSob{S1GjG3QZl*nHSdNEG7c>%zU?p{RlsEh`Hu{zo_G;LjFk6)63q%$rNgCW=RBnS zPLlwu~D(J9qd?&Szg%El{^Gdm_D4{@V2wF{^`+Y(XeGH*dy|t9#@IS6ax#>Gv=_J z-FD=7R)^O6!qMPJBSVI4o0wQdz?aR$ySdcuX&oO-2zBYXC67$1KA73I#Baiy5jB-+ z?3o6M9Xon%?nvv(ZcyzZ@|a$=7cHpnavtEH(EBv zp+m>}p?Jme#50zXHCZgW8oN14#VfaOdM7rg2N-Xy1F_=eX}L< zYgMyVhkdPao-3)yoq*!t#YXmkNt)T2V=cT!CCcN)9LD?hT_$dWCT=ZHapbu--v7&K z&y&?A^QYT?TnqwA;?w%yejBWWLzpN+ z4JnvLlZuN)tjTn3;Z}J6I3%$Y1vZ8TR6a6*H{t96kq71Z@zQ8DO6O2~K)}m`SDHC~ zUc0e+YNY*A?`>^D{VY=2>NhtpboMcK^q!x4UQiHSU$4Jz{rVW8n*o(i;0D}wt{9tE zQn@A`T7m49lP8~uh|M}-_BEchd6noRR|^RS5_SUmMS{a4ge$DRP>pY;6fuPU~aF)ajM}rOI=|*@TgShp}SCikHl( zcATmm89f5S{KY363-2MrL%0#}F?5j5!%Zpk6_l;qD^7+_Z&*9C+ii(~X7^h{=m1;I zgI*^;gQH|0D%wexE?v@49jI3t!#4W^uKYEe#}Vjn5{akEKv<7y_h2+>&p=_W0Vi%1 zHZq=YKO^r2%~vVCaEc$Ouo8+Mp|1%f?HlJQ!S4N(U z$dM^C8EC(u94S2CwJf(V9fN2pBI5iR3KKb`XSun#v?)Yy4t>@4nkvAXeY(^X|34dY zhHi660ZO7H#7E}}!KU+tg@rAMzf!m9hHJiHqS+u0&B<1av1wJL)A%!da6B_(PnGO_ajaz87g)N6C^ZUe8}a#liAKdD-TZ{us!*udtOuSI1{)#` zJRke%Go~4qi;$lVx!{HAL3q}i&ib4V67Ee_ZJb&hh8+9`Po17Tbnlo|VS!1(a|ffn zF;IU|P=Et$=%iP4WaM&WjEDS2e9|)sqlLU|@4&#C*?d@88zEh!Je~6Q?$}W?Mf^t~ zJa`z@ILCMGsjBjq5f%avxP-n<%Y0Z)t%4yJC4K(-S|lQp5YLI}G^RD@xko8q+D#Zj z!oPws+&(tcJ9jf+5Uy$93FKW#x#>#?GqrBTbz~cvezeJ{skSMWKHfbbgmm*j2S!4b zL{vEtT?aJ)CcuC(Z7FdWT4GIR!xMtFE=7kNZTt|7gI*Aj`_U(`2W(}hI(*g>Y=f7R z@6&znLFggdxkn3nz?ukqA!-&r-3HkOp!oyXjC+WZ^ySN!BX2#9!|i@q5O4cLtbS2? zKt!F*FV6E1fLNx0o>Fn3nO#999 G%KrmDj@qpN literal 0 HcmV?d00001 diff --git a/lifelines/plotting.py b/lifelines/plotting.py index fa93e0cc6..ec3c2c32e 100644 --- a/lifelines/plotting.py +++ b/lifelines/plotting.py @@ -308,16 +308,16 @@ def plot_estimate(cls, estimate): iloc: specify a location-based subsection of the curves to plot, ex: .plot(iloc=slice(0,10)) will plot the first 10 time points. + invert_y_axis: boolean to invert the y-axis, useful to show cumulative graphs instead of survival graphs. bandwidth: specify the bandwidth of the kernel smoother for the smoothed-hazard rate. Only used when called 'plot_hazard'. - Returns: ax: a pyplot axis object """ % estimate def plot(loc=None, iloc=None, show_censors=False, censor_styles=None, ci_legend=False, ci_force_lines=False, - ci_alpha=0.25, ci_show=True, at_risk_counts=False, + ci_alpha=0.25, ci_show=True, at_risk_counts=False, invert_y_axis=False, bandwidth=None, **kwargs): if censor_styles is None: @@ -379,6 +379,18 @@ def plot(loc=None, iloc=None, show_censors=False, if at_risk_counts: add_at_risk_counts(cls, ax=ax) + if invert_y_axis: + # need to check if it's already inverted + original_y_ticks = ax.get_yticks() + if not getattr(ax, '__lifelines_inverted', False): + # not inverted yet + + ax.invert_yaxis() + # don't ask. + y_ticks = np.round(1.000000000001 - original_y_ticks, decimals=8) + ax.set_yticklabels(y_ticks) + ax.__lifelines_inverted = True + return ax plot.__doc__ = doc_string diff --git a/tests/test_plotting.py b/tests/test_plotting.py index 7c0db7a3a..36a51b0bc 100644 --- a/tests/test_plotting.py +++ b/tests/test_plotting.py @@ -57,6 +57,22 @@ def test_kmf_with_risk_counts(self, block, kmf): self.plt.title("test_kmf_with_risk_counts") self.plt.show(block=block) + + def test_kmf_with_inverted_axis(self, block, kmf): + + T = np.random.exponential(size=100) + kmf = KaplanMeierFitter() + kmf.fit(T, label='t2') + ax = kmf.plot(invert_y_axis=True, at_risk_counts=True) + + T = np.random.exponential(3, size=100) + kmf = KaplanMeierFitter() + kmf.fit(T, label='t1') + kmf.plot(invert_y_axis=True, ax=ax, ci_force_lines=False) + + self.plt.title("test_kmf_with_inverted_axis") + self.plt.show(block=block) + def test_naf_plotting_with_custom_colours(self, block): data1 = np.random.exponential(5, size=(200, 1)) data2 = np.random.exponential(1, size=(500)) From 9cac77cface02a3a519277b00d2778807977a307 Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Wed, 17 Oct 2018 21:08:13 -0400 Subject: [PATCH 17/59] more tests to ctv weights --- lifelines/datasets/__init__.py | 2 +- lifelines/fitters/cox_time_varying_fitter.py | 3 +- lifelines/fitters/coxph_fitter.py | 9 ++- tests/test_estimation.py | 81 ++++++++++++-------- 4 files changed, 56 insertions(+), 39 deletions(-) diff --git a/lifelines/datasets/__init__.py b/lifelines/datasets/__init__.py index 35e01f656..95826655f 100644 --- a/lifelines/datasets/__init__.py +++ b/lifelines/datasets/__init__.py @@ -363,7 +363,7 @@ def load_dfcv(): 3 0 1.0 0 6.0 4 True """ from lifelines.datasets.dfcv_dataset import dfcv - return dfcv + return dfcv.copy() def load_lymphoma(**kwargs): diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py index ea2002bff..a39f5178b 100644 --- a/lifelines/fitters/cox_time_varying_fitter.py +++ b/lifelines/fitters/cox_time_varying_fitter.py @@ -258,7 +258,7 @@ def _newton_rhaphson(self, df, stop_times_events, weights, show_progress=False, # reusing a piece to make g * inv(h) * g.T faster later inv_h_dot_g_T = spsolve(-h, g.T, sym_pos=True) except ValueError as e: - if 'infs or NaNs' in e.message: + if 'infs or NaNs' in str(e): raise ConvergenceError("""hessian or gradient contains nan or inf value(s). Convergence halted. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-model """) @@ -367,7 +367,6 @@ def _get_gradients(self, df, stops_events, weights, beta): weight_count = weights_deaths.sum() weighted_average = weight_count / ties_counts - for l in range(ties_counts): if ties_counts > 1: diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py index b2a937ea0..248b87f42 100644 --- a/lifelines/fitters/coxph_fitter.py +++ b/lifelines/fitters/coxph_fitter.py @@ -18,7 +18,7 @@ significance_code, concordance_index, _get_index, qth_survival_times,\ pass_for_numeric_dtypes_or_raise, check_low_var, coalesce,\ check_complete_separation, check_nans, StatError, ConvergenceWarning,\ - StepSizer + StepSizer, ConvergenceError from lifelines.statistics import chisq_test @@ -243,7 +243,7 @@ def _newton_rhaphson(self, X, T, E, weights=None, initial_beta=None, step_size=N try: inv_h_dot_g_T = spsolve(-h, g.T, sym_pos=True) except ValueError as e: - if 'infs or NaNs' in e.message: + if 'infs or NaNs' in str(e): raise ConvergenceError("""hessian or gradient contains nan or inf value(s). Convergence halted. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-model """) @@ -313,6 +313,11 @@ def _get_efron_values(self, X, beta, T, E, weights): (φ1 + φ2 + φ3) is adjusted from sum_j^{5} φj after one fails. Similarly two-third of (φ1 + φ2 + φ3) is adjusted after first two individuals fail, etc. + From https://cran.r-project.org/web/packages/survival/survival.pdf: + + "Setting all weights to 2 for instance will give the same coefficient estimate but halve the variance. When + the Efron approximation for ties (default) is employed replication of the data will not give exactly the same coefficients as the + weights option, and in this case the weighted fit is arguably the correct one." Parameters: X: (n,d) numpy array of observations. diff --git a/tests/test_estimation.py b/tests/test_estimation.py index 58c152d9a..081d433d6 100644 --- a/tests/test_estimation.py +++ b/tests/test_estimation.py @@ -1252,6 +1252,7 @@ def test_non_trival_float_weights_with_no_ties_is_the_same_as_R(self, regression def test_summary_output_using_non_trivial_but_integer_weights(self, rossi): + rossi_weights = rossi.copy() rossi_weights['weights'] = 1. rossi_weights = rossi_weights.groupby(rossi.columns.tolist())['weights'].sum()\ @@ -1263,8 +1264,23 @@ def test_summary_output_using_non_trivial_but_integer_weights(self, rossi): cf2 = CoxPHFitter() cf2.fit(rossi, duration_col='week', event_col='arrest') + # strictly speaking, the variances, etc. don't need to be the same, only the coefs. assert_frame_equal(cf1.summary, cf2.summary, check_like=True) + def test_doubling_the_weights_halves_the_variance(self, rossi): + + w = 2.0 + rossi_weights = rossi.copy() + rossi_weights['weights'] = 2 + + cf1 = CoxPHFitter() + cf1.fit(rossi_weights, duration_col='week', event_col='arrest', weights_col='weights') + + cf2 = CoxPHFitter() + cf2.fit(rossi, duration_col='week', event_col='arrest') + + assert_frame_equal(cf2.standard_errors_ ** 2, w * cf1.standard_errors_ ** 2, check_like=True) + def test_adding_non_integer_weights_without_robust_flag_raises_a_warning(self, rossi): rossi['weights'] = np.random.exponential(1, rossi.shape[0]) @@ -1884,7 +1900,7 @@ def test_fitter_will_error_if_degenerate_time(self, ctv): ctv.fit(df, id_col="id", start_col="start", stop_col="stop", event_col="event") assert True - def test_ctv_fitter_will_hande_trivial_weight_col(self, ctv, dfcv): + def test_ctv_fitter_will_handle_trivial_weight_col(self, ctv, dfcv): ctv.fit(dfcv, id_col="id", start_col="start", stop_col="stop", event_col="event") coefs_no_weights = ctv.summary['coef'].values @@ -1895,59 +1911,56 @@ def test_ctv_fitter_will_hande_trivial_weight_col(self, ctv, dfcv): npt.assert_almost_equal(coefs_no_weights, coefs_trivial_weights, decimal=3) - def test_ctv_fitter_will_hande_integer_weight_col_on_tv_dataset(self, ctv, dfcv): - # not sure yet why this is failing. - # duplicate a few subjects - dfcv_unfolded = dfcv.copy() - for _id in [10, 9, 8, 7]: - to_append = dfcv[dfcv['id'].isin([_id])].copy() - to_append['id'] = (10 + _id) - dfcv_unfolded = dfcv_unfolded.append(to_append) - dfcv_unfolded = dfcv_unfolded.reset_index(drop=True) - print(dfcv_unfolded[(dfcv_unfolded['start'] < 7) & (7 <= dfcv_unfolded['stop'])]) - - ctv = CoxTimeVaryingFitter() - ctv.fit(dfcv_unfolded, id_col="id", start_col="start", stop_col="stop", event_col="event", show_progress=True) - coefs_unfolded_weights = ctv.hazards_ + def test_doubling_the_weights_halves_the_variance(self, ctv, dfcv): + ctv.fit(dfcv, id_col="id", start_col="start", stop_col="stop", event_col="event") + coefs_no_weights = ctv.summary['coef'].values + variance_no_weights = ctv.summary['se(coef)'].values**2 + dfcv['weight'] = 2.0 + ctv.fit(dfcv, id_col="id", start_col="start", stop_col="stop", event_col="event", weights_col='weight') + coefs_double_weights = ctv.summary['coef'].values + variance_double_weights = ctv.summary['se(coef)'].values**2 - dfcv_folded = dfcv.copy() - dfcv_folded['weights'] = 1.0 - dfcv_folded.loc[dfcv_folded['id'].isin([10,9,8,7]), 'weights'] = 2.0 - print(dfcv_folded[(dfcv_folded['start'] < 7) & (7 <= dfcv_folded['stop'])]) + npt.assert_almost_equal(coefs_no_weights, coefs_double_weights, decimal=3) + npt.assert_almost_equal(variance_no_weights, 2 * variance_double_weights, decimal=3) - ctv = CoxTimeVaryingFitter() - ctv.fit(dfcv_folded, id_col="id", start_col="start", stop_col="stop", event_col="event", weights_col='weights', show_progress=True) - coefs_folded_weights = ctv.hazards_ - print(coefs_unfolded_weights) - print(coefs_folded_weights) - assert_frame_equal(coefs_unfolded_weights, coefs_folded_weights) + def test_ctv_fitter_will_give_the_same_results_as_static_cox_model(self, ctv, rossi): + cph = CoxPHFitter() + cph.fit(rossi, 'week', 'arrest') + expected = cph.hazards_.values - def test_ctv_fitter_will_give_the_same_results_as_static_cox_model(self, ctv, rossi): + rossi_ctv = rossi.reset_index() + rossi_ctv = to_long_format(rossi_ctv, 'week') - rossi = rossi.reset_index() - rossi = to_long_format(rossi, 'week') - expected = np.array([[-0.3794, -0.0574, 0.3139, -0.1498, -0.4337, -0.0849, 0.0915]]) - ctv.fit(rossi, start_col='start', stop_col='stop', event_col='arrest', id_col='index') + ctv.fit(rossi_ctv, start_col='start', stop_col='stop', event_col='arrest', id_col='index') npt.assert_array_almost_equal(ctv.hazards_.values, expected, decimal=4) def test_ctv_fitter_will_handle_integer_weight_as_static_model(self, ctv, rossi): + # deleting some columns to create more duplicates + del rossi['age'] + del rossi['paro'] + del rossi['mar'] + del rossi['prio'] + rossi_ = rossi.copy() rossi_['weights'] = 1. rossi_ = rossi_.groupby(rossi.columns.tolist())['weights'].sum()\ .reset_index() + cph = CoxPHFitter() + cph.fit(rossi, 'week', 'arrest') + expected = cph.hazards_.values + # create the id column this way. rossi_ = rossi_.reset_index() rossi_ = to_long_format(rossi_, 'week') - expected = np.array([[-0.3794, -0.0574, 0.3139, -0.1498, -0.4337, -0.0849, 0.0915]]) ctv.fit(rossi_, start_col='start', stop_col='stop', event_col='arrest', id_col='index', weights_col='weights') - npt.assert_array_almost_equal(ctv.hazards_.values, expected, decimal=4) + npt.assert_array_almost_equal(ctv.hazards_.values, expected, decimal=3) def test_fitter_accept_boolean_columns(self, ctv): @@ -1985,9 +1998,9 @@ def test_warning_is_raised_if_df_has_a_near_constant_column_in_one_seperation(se ctv.fit(dfcv, id_col="id", start_col="start", stop_col="stop", event_col="event") except (LinAlgError, ValueError): pass - assert len(w) == 2 + assert len(w) == 1 assert issubclass(w[0].category, ConvergenceWarning) - assert "complete separation" in str(w[1].message) + assert "complete separation" in str(w[0].message) def test_summary_output_versus_Rs_against_standford_heart_transplant(self, ctv, heart): """ From 8184b2c32b5b3326c298d0da728b6e0f59d29482 Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Thu, 18 Oct 2018 14:15:35 -0400 Subject: [PATCH 18/59] error handling for nans --- CHANGELOG.md | 1 + lifelines/fitters/cox_time_varying_fitter.py | 7 ++- lifelines/fitters/coxph_fitter.py | 1 + lifelines/utils/__init__.py | 6 +-- tests/test_estimation.py | 56 ++++++++++++++++++-- 5 files changed, 64 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 654498f1d..51947b629 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ - Convergence errors in models that use Newton-Rhapson methods now throw a `ConvergenceError`, instead of a `ValueError` (the former is a subclass of the latter, however). - `AalenAdditiveModel` raises `ConvergenceWarning` instead of printing a warning. - `KaplanMeierFitter` now has a cumulative plot option. Example `kmf.plot(invert_y_axis=True)` + - a `weights_col` option has been added to `CoxTimeVaryingFitter` that allows for time-varying weights. #### 0.14.6 - fix for n > 2 groups in `multivariate_logrank_test` (again). diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py index a39f5178b..dc8de9f4e 100644 --- a/lifelines/fitters/cox_time_varying_fitter.py +++ b/lifelines/fitters/cox_time_varying_fitter.py @@ -19,7 +19,7 @@ pass_for_numeric_dtypes_or_raise, check_low_var,\ check_for_overlapping_intervals, check_complete_separation_low_variance,\ ConvergenceWarning, StepSizer, _get_index, check_for_immediate_deaths,\ - check_for_instantaneous_events, ConvergenceError + check_for_instantaneous_events, ConvergenceError, check_nans class CoxTimeVaryingFitter(BaseFitter): @@ -78,6 +78,10 @@ def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', weights if weights_col is None: assert '__weights' not in df.columns, '__weights is an internal lifelines column, please rename your column first.' df['__weights'] = 1.0 + else: + if (df[weights_col] <= 0).any(): + raise ValueError("values in weights_col must be positive.") + df = df.rename(columns={id_col: 'id', event_col: 'event', start_col: 'start', stop_col: 'stop', weights_col: '__weights'}) df = df.set_index('id') @@ -111,6 +115,7 @@ def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', weights @staticmethod def _check_values(df, stop_times_events): # check_for_overlapping_intervals(df) # this is currenty too slow for production. + check_nans(df) check_low_var(df) check_complete_separation_low_variance(df, stop_times_events['event']) pass_for_numeric_dtypes_or_raise(df) diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py index 248b87f42..93f411bd3 100644 --- a/lifelines/fitters/coxph_fitter.py +++ b/lifelines/fitters/coxph_fitter.py @@ -432,6 +432,7 @@ def _check_values(df, T, E): pass_for_numeric_dtypes_or_raise(df) check_nans(T) check_nans(E) + check_nans(df) check_low_var(df) check_complete_separation(df, E, T) diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py index 07443e169..b8d073613 100644 --- a/lifelines/utils/__init__.py +++ b/lifelines/utils/__init__.py @@ -1103,9 +1103,9 @@ def check_complete_separation(df, events, durations): check_complete_separation_close_to_perfect_correlation(df, durations) -def check_nans(array): - if pd.isnull(array).any(): - raise TypeError("NaNs were detected in the duration_col and/or the event_col") +def check_nans(df_or_array): + if pd.isnull(df_or_array).values.any(): + raise TypeError("NaNs were detected in the dataset. Try using pd.isnull to find the problematic values.") def to_long_format(df, duration_col): diff --git a/tests/test_estimation.py b/tests/test_estimation.py index 081d433d6..921209152 100644 --- a/tests/test_estimation.py +++ b/tests/test_estimation.py @@ -286,13 +286,13 @@ def test_valueerror_is_thrown_if_alpha_out_of_bounds(self, univariate_fitters): with pytest.raises(ValueError): fitter(alpha=95) - def test_error_is_thrown_if_there_is_nans_in_the_duration_col(self, univariate_fitters): + def test_typeerror_is_thrown_if_there_is_nans_in_the_duration_col(self, univariate_fitters): T = np.array([1.0, 2.0, 4.0, None, 8.0]) for fitter in univariate_fitters: with pytest.raises(TypeError): fitter().fit(T) - def test_error_is_thrown_if_there_is_nans_in_the_event_col(self, univariate_fitters): + def test_typeerror_is_thrown_if_there_is_nans_in_the_event_col(self, univariate_fitters): T = np.arange(5) E = [1, 0, None, 1, 1] for fitter in univariate_fitters: @@ -1679,6 +1679,13 @@ def test_robust_errors_with_strata_doesnt_break(self, rossi): cf.fit(rossi, duration_col='week', event_col='arrest', strata=['race', 'paro', 'mar', 'wexp'], robust=True) + def test_what_happens_to_nans(self, rossi): + rossi['var4'] = np.nan + cf = CoxPHFitter() + with pytest.raises(TypeError): + cf.fit(rossi, duration_col='week' event_col="arrest") + + class TestAalenAdditiveFitter(): @@ -1832,12 +1839,55 @@ def heart(self): return load_stanford_heart_transplants() def test_inference_against_known_R_output(self, ctv, dfcv): - # from http://www.math.ucsd.edu/~rxu/math284/slect7.pdf + """ + from http://www.math.ucsd.edu/~rxu/math284/slect7.pdf + + > coxph(formula = Surv(time = start, time2 = stop, event) ~ group + z, data = dfcv) + + """ ctv.fit(dfcv, id_col="id", start_col="start", stop_col="stop", event_col="event") npt.assert_almost_equal(ctv.summary['coef'].values, [1.826757, 0.705963], decimal=4) npt.assert_almost_equal(ctv.summary['se(coef)'].values, [1.229, 1.206], decimal=3) npt.assert_almost_equal(ctv.summary['p'].values, [0.14, 0.56], decimal=2) + def test_what_happens_to_nans(self, ctv, dfcv): + """ + from http://www.math.ucsd.edu/~rxu/math284/slect7.pdf + + > coxph(formula = Surv(time = start, time2 = stop, event) ~ group + z, data = dfcv) + + """ + dfcv['var4'] = np.nan + with pytest.raises(TypeError): + ctv.fit(dfcv, id_col="id", start_col="start", stop_col="stop", event_col="event") + + + def test_inference_against_known_R_output_with_weights(self, ctv, dfcv): + """ + > dfcv['weights'] = [0.46009262, 0.04643257, 0.38150793, 0.11903676, 0.51965860, 0.96173133, 0.32435527, 0.16708398, 0.85464418, 0.15146481, 0.24713429, 0.55198318, 0.16948366, 0.19246483] + > coxph(formula = Surv(time = start, time2 = stop, event) ~ group + z, data = dfcv) + + """ + dfcv['weights'] = [ + 0.4600926178338619, + 0.046432574620396294, + 0.38150793079960477, + 0.11903675541025949, + 0.5196585971574837, + 0.9617313298681641, + 0.3243552664091651, + 0.16708398114269085, + 0.8546441798716636, + 0.15146480991643507, + 0.24713429350878657, + 0.5519831777187729, + 0.16948366380884838, + 0.19246482703103884 + ] + ctv.fit(dfcv, id_col="id", start_col="start", stop_col="stop", event_col="event", weights_col='weights') + npt.assert_almost_equal(ctv.summary['coef'].values, [0.313, 0.423], decimal=3) + npt.assert_almost_equal(ctv.summary['se(coef)'].values, [1.542, 1.997], decimal=3) + @pytest.mark.xfail() def test_fitter_will_raise_an_error_if_overlapping_intervals(self, ctv): df = pd.DataFrame.from_records([ From 942051d808cdf632acb50e88c015e74f48e9cb42 Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Mon, 22 Oct 2018 22:49:35 -0400 Subject: [PATCH 19/59] better print_summary for regression models --- CHANGELOG.md | 2 + docs/Quickstart.rst | 15 ++++-- docs/Survival Regression.rst | 52 +++++++++++++------- lifelines/fitters/cox_time_varying_fitter.py | 32 ++++++++---- lifelines/fitters/coxph_fitter.py | 28 ++++++++--- lifelines/fitters/weibull_fitter.py | 23 +++++---- lifelines/utils/__init__.py | 7 ++- tests/test_estimation.py | 42 ++++++++++------ 8 files changed, 139 insertions(+), 62 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 51947b629..2a91824e4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,8 @@ - `AalenAdditiveModel` raises `ConvergenceWarning` instead of printing a warning. - `KaplanMeierFitter` now has a cumulative plot option. Example `kmf.plot(invert_y_axis=True)` - a `weights_col` option has been added to `CoxTimeVaryingFitter` that allows for time-varying weights. + - `WeibullFitter` has a new `show_progress` param. + - `CoxPHFitter` and `CoxTimeVaryFitter` method `print_summary` is updated with new fields. #### 0.14.6 - fix for n > 2 groups in `multivariate_logrank_test` (again). diff --git a/docs/Quickstart.rst b/docs/Quickstart.rst index cbf4fa793..a3b1e12f7 100644 --- a/docs/Quickstart.rst +++ b/docs/Quickstart.rst @@ -159,16 +159,23 @@ The input of the ``fit`` method's API in a regression is different. All the data cph.print_summary() """ - n=200, number of events=189 + duration col = T + event col = E + number of subjects = 200 + number of events = 189 + log-likelihood = -807.620 + time fit was run = 2018-10-23 02:44:18 UTC + --- coef exp(coef) se(coef) z p lower 0.95 upper 0.95 - var1 0.2213 1.2477 0.0743 2.9796 0.0029 0.0757 0.3669 ** - var2 0.0509 1.0522 0.0829 0.6139 0.5393 -0.1116 0.2134 - var3 0.2186 1.2443 0.0758 2.8836 0.0039 0.0700 0.3672 ** + var1 0.2222 1.2488 0.0743 2.9920 0.0028 0.0767 0.3678 ** + var2 0.0510 1.0523 0.0829 0.6148 0.5387 -0.1115 0.2134 + var3 0.2183 1.2440 0.0758 2.8805 0.0040 0.0698 0.3669 ** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Concordance = 0.580 + Likelihood ratio test = 15.540 on 3 df, p=0.00141 """ cph.plot() diff --git a/docs/Survival Regression.rst b/docs/Survival Regression.rst index 3676fb142..3d4e8bc12 100644 --- a/docs/Survival Regression.rst +++ b/docs/Survival Regression.rst @@ -60,16 +60,22 @@ This example data is from the paper `here , ]. Default: