From 84027bc0c77043d91324126414e1a9bd0b539fb3 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Fri, 6 Jul 2018 15:33:52 -0400
Subject: [PATCH 01/59] adding robust errors to coxph

---
 lifelines/datasets/__init__.py    |   3 +-
 lifelines/datasets/regression.csv | 400 +++++++++++++++---------------
 lifelines/fitters/coxph_fitter.py |  79 +++++-
 tests/test_estimation.py          |  29 ++-
 4 files changed, 299 insertions(+), 212 deletions(-)

diff --git a/lifelines/datasets/__init__.py b/lifelines/datasets/__init__.py
index 87686de35..35e01f656 100644
--- a/lifelines/datasets/__init__.py
+++ b/lifelines/datasets/__init__.py
@@ -270,7 +270,8 @@ def load_rossi(**kwargs):
 
 def load_regression_dataset(**kwargs):
     """
-    Artificial regression dataset
+    Artificial regression dataset. Useful since there are no ties in this dataset.
+    Slightly edited in v0.15.0 to achieve this, however.
 
     Size: (200,5)
     Example:
diff --git a/lifelines/datasets/regression.csv b/lifelines/datasets/regression.csv
index 01d39937a..2afc66848 100644
--- a/lifelines/datasets/regression.csv
+++ b/lifelines/datasets/regression.csv
@@ -1,201 +1,201 @@
 var1,var2,var3,T,E
-0.59517,1.143472,1.5710790000000001,14.785479,1
-0.209325,0.184677,0.35698,7.336734,1
-0.693919,0.071893,0.55796,5.271527,1
-0.443804,1.3646459999999998,0.374221,11.684168,1
-1.613324,0.125566,1.921325,7.637764,1
-0.065636,0.098375,0.237896,12.678268,1
-0.386294,1.6630919999999998,0.7903140000000001,6.601660000000001,1
-0.946688,1.345394,3.209113,11.369137,1
-0.11374000000000001,0.40986000000000006,0.064934,14.680468,1
-0.7777930000000001,0.33499,0.411055,10.585059,1
-0.04428,0.305158,0.17648,19.370936999999998,1
-1.03545,3.304733,0.997323,5.558555999999999,1
-0.22919499999999998,0.5813550000000001,0.48479399999999995,11.292129,1
-0.055970000000000006,2.6741349999999997,0.355279,9.919992,0
-1.236583,1.796598,0.179952,9.884988,1
-1.162835,0.46475900000000003,2.028854,6.265626999999999,1
-0.14943599999999999,2.949291,0.277801,13.812381,1
-0.399475,0.822413,0.673405,6.433643,1
-0.762121,0.050407,1.2851629999999998,6.979698,1
-1.239718,1.869215,0.020202,7.742774000000001,1
-0.019221000000000002,1.435543,0.255689,4.70447,1
-0.090253,0.211037,0.372809,11.236124,1
-0.20584899999999998,0.048722,0.00253,6.664666,1
-0.088185,1.319679,0.201675,10.718072,1
-4.629747,0.36352199999999996,1.08207,11.593159,1
-1.6028360000000001,1.217881,0.350837,8.463846,1
-0.014804,0.684737,0.493267,5.432543,1
-0.08402000000000001,1.432093,0.456541,10.277028,1
-2.260223,1.2389299999999999,5.541837999999999,7.217722,1
-0.6219680000000001,0.6844279999999999,0.135933,13.217322,1
-0.013219999999999999,3.280555,1.193551,8.029803,1
-0.070651,1.430517,0.0052049999999999996,9.807981,1
-0.20598000000000002,0.29064,0.096565,2.632263,1
-1.389882,0.14313299999999998,0.821257,6.160616,1
-0.104143,2.072924,0.449696,6.265626999999999,1
-0.42848100000000006,0.06573899999999999,3.00755,4.606461,0
-1.785151,1.572282,0.475059,11.124112,1
-0.22889299999999999,0.429025,0.60805,6.083608,1
-0.640837,0.311084,3.165658,9.219922,1
-1.450683,0.8470219999999999,2.5211770000000002,7.119712,0
-0.469459,0.318871,0.164498,13.056306,1
-0.36617,0.23499099999999998,0.678709,4.949495,1
-1.4325700000000001,2.668335,0.558046,4.858486,1
-2.696463,0.244077,1.3151110000000001,8.505851,1
-0.152624,0.37950100000000003,0.330164,7.035703999999999,1
-0.27739899999999995,0.871603,1.555185,10.837083999999999,1
-0.353633,0.294236,0.928573,9.919992,1
-0.6209560000000001,0.021884,3.2057599999999997,5.292529,1
-0.0007570000000000001,1.216615,0.8610690000000001,20.981098,1
-0.497674,2.744032,0.47358900000000004,5.810581,1
-1.213709,0.072756,0.09842000000000001,11.292129,1
-0.426255,2.550392,0.16762,8.19782,1
-0.008408,1.132205,1.234917,7.469747,1
-1.207833,0.13335,0.528231,10.669067,1
-0.036975,0.040631,0.2664,10.543054,1
-0.789439,0.669067,1.332697,6.2866290000000005,1
-1.482055,0.627205,0.738271,9.079908,1
-1.028671,0.21520999999999998,0.457692,14.183418,1
-0.521986,2.282683,0.31597600000000003,21.940194,1
-0.26262199999999997,0.345999,0.9210969999999999,8.883888,1
-0.360319,1.001364,0.237533,9.982998,1
-0.362587,0.110046,2.486691,9.555956,1
-1.793598,0.310001,0.26306599999999997,8.659866000000001,1
-0.419275,0.11430799999999999,1.124784,5.642564,1
-1.4702469999999999,0.289054,0.331833,10.9981,1
-0.27476,0.523508,2.139204,8.050805,1
-0.119805,0.7337739999999999,0.21205700000000002,11.250125,1
-0.369294,0.609847,0.89402,10.214021,0
-1.01825,2.119666,0.716002,12.335234,1
-0.607065,2.3501119999999998,0.031389999999999994,15.723572,1
-4.169830999999999,0.316285,0.16935,11.831183,0
-1.483383,2.242744,0.26543,7.364736,1
-0.32359699999999997,0.165159,0.97204,12.517252000000001,1
-1.82716,0.32779400000000003,0.9415389999999999,8.512851,1
-0.104104,0.9233020000000001,1.22007,12.79728,1
-0.392766,0.42279399999999995,3.4826080000000004,8.540854,1
-2.579629,0.109011,2.2800279999999997,3.5633559999999997,1
-0.775092,0.974519,2.2236990000000003,9.576958,1
-0.5075930000000001,0.917278,0.103131,9.646965,1
-0.13843699999999998,2.474084,1.6350049999999998,11.789178999999999,1
-1.120586,1.480593,0.6382439999999999,4.648465,0
-0.001842,0.6014520000000001,0.40551,14.162416,1
-0.9969790000000001,0.44859,0.782013,7.490749,1
-1.0672629999999999,0.304582,0.795276,10.739074,1
-1.123429,1.4093149999999999,0.090895,8.239824,1
-0.139158,3.203523,0.28734899999999997,7.630763000000001,1
-1.276529,1.039313,1.217827,7.819782000000001,1
-0.175824,1.371635,1.7854880000000002,12.419242,1
-0.24130100000000002,4.048806,0.423415,10.564055999999999,1
-1.8644439999999998,0.821839,0.426364,6.293629,1
-0.34029499999999996,0.727143,0.341437,12.405241,1
-5.130831,0.074513,0.8015260000000001,10.79508,1
-1.404635,0.039251,0.785162,17.09571,1
-0.07394400000000001,0.053314999999999994,0.18626199999999998,15.464545999999999,0
-1.271488,0.10678,0.291883,9.611961,1
-0.781452,1.229076,0.069747,14.407441,1
-0.3909,0.35690700000000003,0.23058,10.088009,1
-2.193825,0.6211840000000001,0.466925,5.817582,1
-2.942882,0.16383,1.040333,7.987799000000001,1
-0.705527,0.592699,0.923248,11.831183,1
-1.662925,2.1851700000000003,0.664273,11.873187,1
-0.407842,1.011611,0.485592,4.144414,1
-0.091321,0.281593,0.153947,8.288829,1
-3.5385089999999995,1.80715,1.336961,4.326433,1
-0.661027,1.171563,0.30091,14.246425,1
-0.106552,0.121843,0.257878,5.663566,1
-0.104327,1.513503,0.314581,7.9177919999999995,1
-0.811837,1.6833240000000003,0.061925,11.845185,1
-0.402495,0.43151999999999996,0.489576,11.075108,1
-1.322155,0.521161,1.859989,6.888689,1
-0.647954,3.243631,0.034075,8.344834,1
-0.851476,0.21736599999999998,0.29733000000000004,4.473447,1
-0.14999400000000002,3.027889,0.5427489999999999,9.247925,1
-0.381276,1.146927,0.22583000000000003,11.215122000000001,1
-0.019479,1.374707,1.5665950000000002,8.288829,1
-0.806793,0.60941,1.903648,10.074007,1
-0.9268280000000001,1.062158,0.048544,14.484448,1
-0.998282,0.385911,1.403305,11.026102999999999,1
-0.198755,1.668675,0.182337,6.251625,1
-1.668232,0.717113,0.39318000000000003,15.884588,1
-0.903388,0.34757,0.796215,11.341134,1
-3.094217,0.764497,3.063756,7.644764,1
-0.565765,0.8556440000000001,2.4122220000000003,8.365836999999999,1
-0.600544,0.019666,2.356107,11.90119,1
-0.453201,0.24214899999999998,0.7611140000000001,9.912991,1
-0.441605,0.271366,0.9775219999999999,8.323832000000001,1
-0.41135,0.029483999999999996,1.8434580000000003,11.971197,1
-0.5351199999999999,0.045629,0.16006700000000001,11.124112,1
-0.47211899999999996,2.239749,0.148828,6.153615,1
-0.485754,1.464013,0.380293,8.911891,1
-5.353937,0.855298,0.001006,4.879487999999999,1
-0.000974,0.35496500000000003,0.698741,20.666067,1
-0.36145700000000003,2.792862,1.503787,11.082108,1
-1.2026729999999999,1.825852,0.391339,8.008801,1
-0.8530770000000001,0.22137600000000002,1.6355389999999999,9.779978,1
-1.646959,3.3371690000000003,1.262672,5.663566,1
-0.050491,1.0423879999999999,0.040406,10.50105,1
-0.693033,0.067717,1.6319299999999999,9.968997,1
-3.8753610000000003,1.206579,0.6567850000000001,4.837484,1
-0.401754,1.526443,0.449621,7.952795,1
-2.112141,0.994604,0.12592799999999998,4.445444999999999,1
-2.358111,1.411174,4.747023,6.930692999999999,1
-0.406167,0.7479359999999999,1.240233,11.971197,1
-0.9833120000000001,1.330699,0.931057,12.769277,1
-2.8028560000000002,0.141768,0.96447,6.153615,1
-0.22598000000000001,0.156969,0.771678,9.093909,1
-1.0202120000000001,1.338747,1.485407,9.70297,1
-0.737183,0.21196700000000002,1.479703,10.417042,1
-0.694596,0.13306500000000002,1.612199,13.182317999999999,1
-1.614919,1.628414,3.3395629999999996,2.576258,1
-1.263567,0.041625999999999996,0.13448800000000002,7.091709,1
-1.81759,0.89371,0.256831,5.740574,1
-0.221442,1.00047,0.13556500000000002,12.923292,1
-0.388571,2.331312,0.048117,12.874286999999999,1
-1.365461,0.44473,0.26388,4.725473,1
-0.017446,1.50251,1.859648,9.835984,1
-0.803217,0.259678,0.305695,6.062606,1
-1.153738,2.357565,0.264925,8.092808999999999,1
-0.546425,0.516525,0.05980599999999999,8.043804,1
-0.061367,2.453071,0.234816,8.715872000000001,1
-0.42113599999999995,0.295455,1.117664,13.287329000000001,1
-1.5747790000000002,0.7411220000000001,0.533676,10.515052,1
-1.3943510000000001,0.877793,1.637652,6.426643,1
-0.923441,1.1076139999999999,0.78291,3.640364,1
-0.231346,0.620135,1.8213549999999998,4.746475,1
-0.7357060000000001,3.4050540000000002,3.457625,11.677168,1
-1.748839,1.132628,0.812584,11.558156,1
-0.280291,1.664837,0.051460000000000006,8.757876,1
-0.150857,2.545696,1.456119,12.825283,1
-1.5516809999999999,0.125114,0.148355,15.618561999999999,0
-0.746388,0.267458,0.42003599999999996,11.943194,1
-0.068177,0.19378800000000002,2.693533,7.952795,1
-0.305141,0.858988,3.883753,12.356236,1
-3.614956,0.659784,1.013164,3.5633559999999997,1
-1.9810330000000003,0.7379720000000001,0.272071,8.561856,1
-0.19708,1.164958,0.8204870000000001,4.207421,1
-0.027854000000000004,0.6533260000000001,0.08022,21.030103,1
-1.8066659999999999,3.535072,2.176759,5.810581,1
-0.16528800000000002,1.6233950000000001,1.9945509999999997,8.79988,1
-1.617063,0.49479799999999996,0.131597,7.798780000000001,0
-1.298794,1.778036,0.453693,12.657266,1
-0.707968,1.081388,0.477484,14.30243,1
-0.246455,0.11361800000000001,0.407209,13.329332999999998,1
-0.282453,0.731784,0.002421,6.1256129999999995,1
-0.133855,0.096552,0.152854,4.935494,0
-0.025306,0.07387,0.163927,6.314630999999999,1
-1.017839,0.737884,3.126409,6.573657000000001,0
-0.847491,1.142187,1.342932,8.610861,1
-0.9420930000000001,0.161735,1.388318,9.997,1
-0.38300100000000004,0.006451,0.901114,7.749775,1
-0.011165999999999999,0.220669,0.6917909999999999,7.3437339999999995,1
-1.5435020000000002,1.472249,0.830817,6.986699000000001,1
-0.168033,3.052163,0.035085000000000005,18.131813,1
-2.1599459999999997,0.001644,1.443158,4.382438,1
-0.249142,0.628992,2.3185130000000003,8.743874,1
-0.137399,0.107748,0.354812,11.446145,1
-0.6373409999999999,2.847188,1.4591370000000001,7.623761999999999,1
-1.109732,0.405561,0.018856,10.634063000000001,1
-0.031865,1.753759,0.25204,8.519852,1
-1.631269,1.5886209999999998,3.7098989999999996,4.480448,1
+0.59517,1.143472,1.571079,14.7856515748,1
+0.209325,0.184677,0.35698,7.33584583652,1
+0.693919,0.071893,0.55796,5.26979701571,1
+0.443804,1.364646,0.374221,11.6840920212,1
+1.613324,0.125566,1.921325,7.63949212526,1
+0.065636,0.098375,0.237896,12.6784581817,1
+0.386294,1.663092,0.790314,6.60166572026,1
+0.946688,1.345394,3.209113,11.3670916491,1
+0.11374,0.40986,0.064934,14.6805866317,1
+0.777793,0.33499,0.411055,10.5854086595,1
+0.04428,0.305158,0.17648,19.3721173864,1
+1.03545,3.304733,0.997323,5.55904466985,1
+0.229195,0.581355,0.484794,11.2924891948,1
+0.05597,2.674135,0.355279,9.92047433529,0
+1.236583,1.796598,0.179952,9.88652411916,1
+1.162835,0.464759,2.028854,6.26643301257,1
+0.149436,2.949291,0.277801,13.8127296,1
+0.399475,0.822413,0.673405,6.43309776107,1
+0.762121,0.050407,1.285163,6.97979741031,1
+1.239718,1.869215,0.020202,7.74300832502,1
+0.019221,1.435543,0.255689,4.70530329608,1
+0.090253,0.211037,0.372809,11.2335841641,1
+0.205849,0.048722,0.00253,6.66273101972,1
+0.088185,1.319679,0.201675,10.7174137318,1
+4.629747,0.363522,1.08207,11.5938047533,1
+1.602836,1.217881,0.350837,8.46420655566,1
+0.014804,0.684737,0.493267,5.43255855841,1
+0.08402,1.432093,0.456541,10.276593667,1
+2.260223,1.23893,5.541838,7.21736226987,1
+0.621968,0.684428,0.135933,13.2176584654,1
+0.01322,3.280555,1.193551,8.0299335416,1
+0.070651,1.430517,0.005205,9.80826804874,1
+0.20598,0.29064,0.096565,2.63226375911,1
+1.389882,0.143133,0.821257,6.16269524116,1
+0.104143,2.072924,0.449696,6.26182607469,1
+0.428481,0.065739,3.00755,4.6048190364,0
+1.785151,1.572282,0.475059,11.1239077928,1
+0.228893,0.429025,0.60805,6.08263253911,1
+0.640837,0.311084,3.165658,9.22065563383,1
+1.450683,0.847022,2.521177,7.12012891282,0
+0.469459,0.318871,0.164498,13.0551806715,1
+0.36617,0.234991,0.678709,4.95133583707,1
+1.43257,2.668335,0.558046,4.85889534209,1
+2.696463,0.244077,1.315111,8.50543244101,1
+0.152624,0.379501,0.330164,7.03794878748,1
+0.277399,0.871603,1.555185,10.8353931443,1
+0.353633,0.294236,0.928573,9.92109564592,1
+0.620956,0.021884,3.20576,5.29165212342,1
+0.000757,1.216615,0.861069,20.9813809356,1
+0.497674,2.744032,0.473589,5.80982311606,1
+1.213709,0.072756,0.09842,11.2908076197,1
+0.426255,2.550392,0.16762,8.19779319557,1
+0.008408,1.132205,1.234917,7.47213923892,1
+1.207833,0.13335,0.528231,10.670073183,1
+0.036975,0.040631,0.2664,10.543203936,1
+0.789439,0.669067,1.332697,6.28852320244,1
+1.482055,0.627205,0.738271,9.080334859,1
+1.028671,0.21521,0.457692,14.1822947115,1
+0.521986,2.282683,0.315976,21.9399783806,1
+0.262622,0.345999,0.921097,8.88229093093,1
+0.360319,1.001364,0.237533,9.98342539597,1
+0.362587,0.110046,2.486691,9.55638379129,1
+1.793598,0.310001,0.263066,8.65928769028,1
+0.419275,0.114308,1.124784,5.6434659532,1
+1.470247,0.289054,0.331833,10.9977552573,1
+0.27476,0.523508,2.139204,8.05076874352,1
+0.119805,0.733774,0.212057,11.2503684762,1
+0.369294,0.609847,0.89402,10.2129272163,0
+1.01825,2.119666,0.716002,12.3366190173,1
+0.607065,2.350112,0.03139,15.7231628796,1
+4.169831,0.316285,0.16935,11.8302258308,0
+1.483383,2.242744,0.26543,7.36252640465,1
+0.323597,0.165159,0.97204,12.5169357575,1
+1.82716,0.327794,0.941539,8.51322372304,1
+0.104104,0.923302,1.22007,12.7973318626,1
+0.392766,0.422794,3.482608,8.54052212332,1
+2.579629,0.109011,2.280028,3.56358063047,1
+0.775092,0.974519,2.223699,9.57756677618,1
+0.507593,0.917278,0.103131,9.64881133972,1
+0.138437,2.474084,1.635005,11.789052178,1
+1.120586,1.480593,0.638244,4.6478831396,0
+0.001842,0.601452,0.40551,14.1629885287,1
+0.996979,0.44859,0.782013,7.49170332816,1
+1.067263,0.304582,0.795276,10.7391507856,1
+1.123429,1.409315,0.090895,8.23925481073,1
+0.139158,3.203523,0.287349,7.63092764446,1
+1.276529,1.039313,1.217827,7.8203073558,1
+0.175824,1.371635,1.785488,12.4208671834,1
+0.241301,4.048806,0.423415,10.5633022553,1
+1.864444,0.821839,0.426364,6.29314423772,1
+0.340295,0.727143,0.341437,12.4052831947,1
+5.130831,0.074513,0.801526,10.7964133954,1
+1.404635,0.039251,0.785162,17.0955898105,1
+0.073944,0.053315,0.186262,15.4629258426,0
+1.271488,0.10678,0.291883,9.61273262267,1
+0.781452,1.229076,0.069747,14.4093766173,1
+0.3909,0.356907,0.23058,10.0869453587,1
+2.193825,0.621184,0.466925,5.81874836313,1
+2.942882,0.16383,1.040333,7.98833687105,1
+0.705527,0.592699,0.923248,11.8312524366,1
+1.662925,2.18517,0.664273,11.8731640313,1
+0.407842,1.011611,0.485592,4.14387121547,1
+0.091321,0.281593,0.153947,8.2907236782,1
+3.538509,1.80715,1.336961,4.32535868496,1
+0.661027,1.171563,0.30091,14.2454199869,1
+0.106552,0.121843,0.257878,5.66506177628,1
+0.104327,1.513503,0.314581,7.91846164924,1
+0.811837,1.683324,0.061925,11.8443153255,1
+0.402495,0.43152,0.489576,11.0761501869,1
+1.322155,0.521161,1.859989,6.88916431026,1
+0.647954,3.243631,0.034075,8.34491489657,1
+0.851476,0.217366,0.29733,4.47399978584,1
+0.149994,3.027889,0.542749,9.24722217844,1
+0.381276,1.146927,0.22583,11.2146024447,1
+0.019479,1.374707,1.566595,8.28623107033,1
+0.806793,0.60941,1.903648,10.0738349202,1
+0.926828,1.062158,0.048544,14.4854236558,1
+0.998282,0.385911,1.403305,11.0245383745,1
+0.198755,1.668675,0.182337,6.25197764519,1
+1.668232,0.717113,0.39318,15.8836989034,1
+0.903388,0.34757,0.796215,11.34167126,1
+3.094217,0.764497,3.063756,7.6440979558,1
+0.565765,0.855644,2.412222,8.36665621446,1
+0.600544,0.019666,2.356107,11.9012822052,1
+0.453201,0.242149,0.761114,9.91242335863,1
+0.441605,0.271366,0.977522,8.32289768512,1
+0.41135,0.029484,1.843458,11.9717347284,1
+0.53512,0.045629,0.160067,11.1247094819,1
+0.472119,2.239749,0.148828,6.15413791004,1
+0.485754,1.464013,0.380293,8.91144648631,1
+5.353937,0.855298,0.001006,4.88104290364,1
+0.000974,0.354965,0.698741,20.6652251814,1
+0.361457,2.792862,1.503787,11.0822662185,1
+1.202673,1.825852,0.391339,8.00771353063,1
+0.853077,0.221376,1.635539,9.78044985255,1
+1.646959,3.337169,1.262672,5.66532639351,1
+0.050491,1.042388,0.040406,10.5017493224,1
+0.693033,0.067717,1.63193,9.96821890209,1
+3.875361,1.206579,0.656785,4.83539084237,1
+0.401754,1.526443,0.449621,7.95252897093,1
+2.112141,0.994604,0.125928,4.44530592881,1
+2.358111,1.411174,4.747023,6.92942222099,1
+0.406167,0.747936,1.240233,11.9701980352,1
+0.983312,1.330699,0.931057,12.7687578904,1
+2.802856,0.141768,0.96447,6.1536973616,1
+0.22598,0.156969,0.771678,9.09300617059,1
+1.020212,1.338747,1.485407,9.70355645693,1
+0.737183,0.211967,1.479703,10.4173960087,1
+0.694596,0.133065,1.612199,13.1829276562,1
+1.614919,1.628414,3.339563,2.57553764605,1
+1.263567,0.041626,0.134488,7.09141708798,1
+1.81759,0.89371,0.256831,5.73937524269,1
+0.221442,1.00047,0.135565,12.9241740121,1
+0.388571,2.331312,0.048117,12.8735020088,1
+1.365461,0.44473,0.26388,4.72642629428,1
+0.017446,1.50251,1.859648,9.83594179776,1
+0.803217,0.259678,0.305695,6.06237621553,1
+1.153738,2.357565,0.264925,8.09338513449,1
+0.546425,0.516525,0.059806,8.04406734742,1
+0.061367,2.453071,0.234816,8.71594708122,1
+0.421136,0.295455,1.117664,13.2869538904,1
+1.574779,0.741122,0.533676,10.5135003978,1
+1.394351,0.877793,1.637652,6.42775744203,1
+0.923441,1.107614,0.78291,3.63878040929,1
+0.231346,0.620135,1.821355,4.74775344975,1
+0.735706,3.405054,3.457625,11.6770199376,1
+1.748839,1.132628,0.812584,11.5594329742,1
+0.280291,1.664837,0.05146,8.75824998324,1
+0.150857,2.545696,1.456119,12.8268461929,1
+1.551681,0.125114,0.148355,15.6204061094,0
+0.746388,0.267458,0.420036,11.9422735844,1
+0.068177,0.193788,2.693533,7.95449616061,1
+0.305141,0.858988,3.883753,12.3573764607,1
+3.614956,0.659784,1.013164,3.56383007199,1
+1.981033,0.737972,0.272071,8.5619748224,1
+0.19708,1.164958,0.820487,4.20656850475,1
+0.027854,0.653326,0.08022,21.0318230188,1
+1.806666,3.535072,2.176759,5.81052910695,1
+0.165288,1.623395,1.994551,8.79849009986,1
+1.617063,0.494798,0.131597,7.79923023169,0
+1.298794,1.778036,0.453693,12.6551650347,1
+0.707968,1.081388,0.477484,14.3014540711,1
+0.246455,0.113618,0.407209,13.3297030877,1
+0.282453,0.731784,0.002421,6.12506421389,1
+0.133855,0.096552,0.152854,4.93564074908,0
+0.025306,0.07387,0.163927,6.3156952171,1
+1.017839,0.737884,3.126409,6.57321280053,0
+0.847491,1.142187,1.342932,8.61060494656,1
+0.942093,0.161735,1.388318,9.9956084953,1
+0.383001,0.006451,0.901114,7.74825868839,1
+0.011166,0.220669,0.691791,7.34226786253,1
+1.543502,1.472249,0.830817,6.98633720723,1
+0.168033,3.052163,0.035085,18.1313105791,1
+2.159946,0.001644,1.443158,4.38165504789,1
+0.249142,0.628992,2.318513,8.74257448673,1
+0.137399,0.107748,0.354812,11.4454572735,1
+0.637341,2.847188,1.459137,7.62462675408,1
+1.109732,0.405561,0.018856,10.6346199544,1
+0.031865,1.753759,0.25204,8.51971771151,1
+1.631269,1.588621,3.709899,4.47895208711,1
diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index d93149bc7..2e65fa99a 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -8,7 +8,8 @@
 import pandas as pd
 
 from numpy import dot, exp
-from numpy.linalg import solve, norm, inv
+from numpy.linalg import norm, inv
+from scipy.linalg import solve as spsolve
 from scipy.integrate import trapz
 import scipy.stats as stats
 
@@ -57,7 +58,8 @@ def __init__(self, alpha=0.95, tie_method='Efron', penalizer=0.0, strata=None):
 
     def fit(self, df, duration_col, event_col=None,
             show_progress=False, initial_beta=None,
-            strata=None, step_size=None, weights_col=None):
+            strata=None, step_size=None, weights_col=None,
+            robust=False):
         """
         Fit the Cox Propertional Hazard model to a dataset. Tied survival times
         are handled using Efron's tie-method.
@@ -83,9 +85,12 @@ def fit(self, df, duration_col, event_col=None,
              is used similar to the `strata` expression in R.
              See http://courses.washington.edu/b515/l17.pdf.
           step_size: set an initial step size for the fitting algorithm.
+          robust: Compute the robust errors using the Huber sandwich estimator, aka Wei-Lin estimate. This does not handle
+            ties, so if there are a high number of ties, results may differ significantly. See
+            "The Robust Inference for the Cox Proportional Hazards Model", Journal of the American Statistical Association, Vol. 84, No. 408 (Dec., 1989), pp. 1074-1078
 
         Returns:
-            self, with additional properties: hazards_
+            self, with additional properties: hazards_, confidence_intervals_, baseline_survival_, etc.
 
         """
 
@@ -94,6 +99,7 @@ def fit(self, df, duration_col, event_col=None,
         # Sort on time
         df = df.sort_values(by=duration_col)
 
+        self.robust = robust
         self._n_examples = df.shape[0]
         self.strata = coalesce(strata, self.strata)
         if self.strata is not None:
@@ -143,14 +149,18 @@ def fit(self, df, duration_col, event_col=None,
                                          step_size=step_size)
 
         self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns, index=['coef']) / self._norm_std
+
+        self.standard_errors_ = self._compute_standard_errors(normalize(df, self._norm_mean, self._norm_std), T, E)
         self.confidence_intervals_ = self._compute_confidence_intervals()
 
+
         self.baseline_hazard_ = self._compute_baseline_hazards(df, T, E)
         self.baseline_cumulative_hazard_ = self._compute_baseline_cumulative_hazard()
         self.baseline_survival_ = self._compute_baseline_survival()
         self.score_ = concordance_index(self.durations,
                                         -self.predict_partial_hazard(df).values.ravel(),
                                         self.event_observed)
+
         self._train_log_partial_hazard = self.predict_log_partial_hazard(self._norm_mean.to_frame().T)
         return self
 
@@ -224,7 +234,7 @@ def _newton_rhaphson(self, X, T, E, weights=None, initial_beta=None, step_size=N
                 g -= self.penalizer * beta.T
                 h.flat[::d + 1] -= self.penalizer
 
-            delta = solve(-h, step_size * g.T)
+            delta = spsolve(-h, step_size * g.T, sym_pos=True)
             if np.any(np.isnan(delta)):
                 raise ValueError("""delta contains nan value(s). Convergence halted. Please see the following tips in the lifelines documentation:
 https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-model
@@ -380,21 +390,72 @@ def _check_values(df, T, E):
 
     def _compute_confidence_intervals(self):
         alpha2 = inv_normal_cdf((1. + self.alpha) / 2.)
-        se = self._compute_standard_errors()
+        se = self.standard_errors_
         hazards = self.hazards_.values
         return pd.DataFrame(np.r_[hazards - alpha2 * se,
                                   hazards + alpha2 * se],
                             index=['lower-bound', 'upper-bound'],
                             columns=self.hazards_.columns)
 
-    def _compute_standard_errors(self):
-        se = np.sqrt(inv(-self._hessian_).diagonal()) / self._norm_std
+    def _compute_sandwich_estimator(self, X, T, E):
+
+        n, d = X.shape
+
+        # Init risk and tie sums to zero
+        risk_phi = 0
+        risk_phi_x = np.zeros((1, d))
+
+        # need to store these histories, as we access them often
+        risk_phi_history = np.zeros((n,))
+        risk_phi_x_history = np.zeros((n, d))
+
+        score_covariance = np.zeros((d, d))
+
+        # we already unnormalized the betas in `fit`, so we need to normalize them again since X is
+        # normalized.
+        beta = self.hazards_.values[0] * self._norm_std
+
+        # Iterate backwards to utilize recursive relationship
+        for i in range(n - 1, -1, -1):
+            # Doing it like this to preserve shape
+            ei = E[i]
+            xi = X[i:i + 1]
+
+            phi_i = exp(dot(xi, beta))
+            phi_x_i = phi_i * xi
+
+            risk_phi += phi_i
+            risk_phi_x += phi_x_i
+
+            risk_phi_history[i] = risk_phi
+            risk_phi_x_history[i] = risk_phi_x
+
+        # Iterate forwards
+        for i in range(0, n):
+            # Doing it like this to preserve shape
+            xi = X[i:i + 1]
+            phi_i = exp(dot(xi, beta))
+
+            correction_term = sum(E[j] * phi_i / risk_phi_history[j] * (xi - risk_phi_x_history[j] / risk_phi_history[j]) for j in range(0, i+1))
+
+            score = E[i] * (xi - risk_phi_x_history[i] / risk_phi_history[i]) - correction_term
+            score_covariance += (score.T).dot(score)
+
+        # TODO: need a faster way to invert these matrices
+        sandwich_estimator = inv(self._hessian_).dot(score_covariance).dot(inv(self._hessian_))
+        return sandwich_estimator
+
+    def _compute_standard_errors(self, df, T, E):
+        if self.robust:
+            se = np.sqrt(self._compute_sandwich_estimator(df.values, T.values, E.values).diagonal()) / self._norm_std
+        else:
+            se = np.sqrt(-inv(self._hessian_).diagonal()) / self._norm_std
         return pd.DataFrame(se[None, :],
                             index=['se'], columns=self.hazards_.columns)
 
     def _compute_z_values(self):
         return (self.hazards_.loc['coef'] /
-                self._compute_standard_errors().loc['se'])
+                self.standard_errors_.loc['se'])
 
     def _compute_p_values(self):
         U = self._compute_z_values() ** 2
@@ -413,7 +474,7 @@ def summary(self):
         df = pd.DataFrame(index=self.hazards_.columns)
         df['coef'] = self.hazards_.loc['coef'].values
         df['exp(coef)'] = exp(self.hazards_.loc['coef'].values)
-        df['se(coef)'] = self._compute_standard_errors().loc['se'].values
+        df['se(coef)'] = self.standard_errors_.loc['se'].values
         df['z'] = self._compute_z_values()
         df['p'] = self._compute_p_values()
         df['lower %.2f' % self.alpha] = self.confidence_intervals_.loc['lower-bound'].values
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index 30a9f6bed..52df6b4c0 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -991,9 +991,9 @@ def test_coef_output_against_R_super_accurate(self, rossi):
 
         library(survival)
         rossi <- read.csv('.../lifelines/datasets/rossi.csv')
-        mod.allison <- coxph(Surv(week, arrest) ~ fin + age + race + wexp + mar + paro + prio,
+        r <- coxph(Surv(week, arrest) ~ fin + age + race + wexp + mar + paro + prio,
             data=rossi)
-        cat(round(mod.allison$coefficients, 4), sep=", ")
+        cat(round(r$coefficients, 4), sep=", ")
         """
         expected = np.array([[-0.3794, -0.0574, 0.3139, -0.1498, -0.4337, -0.0849,  0.0915]])
         cf = CoxPHFitter()
@@ -1379,6 +1379,30 @@ def test_all_okay_with_non_trivial_index_in_dataframe(self, rossi):
 
         assert_frame_equal(cp2.summary, cp1.summary)
 
+    def test_robust_errors_against_R_no_ties(self, regression_dataset):
+        df = regression_dataset.copy()
+        cph = CoxPHFitter()
+        cph.fit(df, 'T', 'E', robust=True)
+        expected = pd.Series({'var1': 0.0879, 'var2': 0.0847, 'var3': 0.0655})
+        assert_series_equal(cph.standard_errors_.loc['se'], expected, check_less_precise=2, check_names=False)
+
+
+    def test_robust_errors_with_strata_doesnt_break(self, rossi):
+        """
+        rossi <- read.csv('.../lifelines/datasets/rossi.csv')
+        r = coxph(formula = Surv(week, arrest) ~ fin + age + strata(race,
+                    paro, mar, wexp) + prio, data = rossi, robust=TRUE)
+        """
+        cf = CoxPHFitter()
+        cf.fit(rossi, duration_col='week', event_col='arrest', strata=['race', 'paro', 'mar', 'wexp'], robust=True)
+
+
+    def test_robust_errors_against_R_with_ties(self,):
+        pass
+
+
+
+
 
 class TestAalenAdditiveFitter():
 
@@ -1514,6 +1538,7 @@ def test_predict_cumulative_hazard_inputs(self, data_pred1):
         assert_frame_equal(y_df, y_np)
 
 
+
 class TestCoxTimeVaryingFitter():
 
     @pytest.fixture()

From f20dfba55c0025fdc581d32c9139fd2954c32ec4 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Mon, 30 Jul 2018 17:36:11 -0400
Subject: [PATCH 02/59] no idea what these changes are

---
 CHANGELOG.md                                 |  4 ++
 lifelines/fitters/cox_time_varying_fitter.py | 68 ++++++++++++++++++--
 lifelines/fitters/coxph_fitter.py            | 55 ++++++++++------
 lifelines/statistics.py                      | 11 +++-
 tests/test_estimation.py                     | 47 ++++++++++++--
 5 files changed, 152 insertions(+), 33 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3aec7afef..f430fabeb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,9 @@
 ### Changelogs
 
+#### 0.15.0
+ - adding `robust` params to Cox models' `fit`. This enables at least i) using non-integer weights in the model (these could be sampling weights like IPTW), and ii) misspecified models (e.g. non-proportional hazards). Under the hood it's a sandwich estimator. This does not handle ties, so if there are a high number of ties, results may differ significantly from other software.
+ - `standard_errors_` is now a property on fitted Cox models.
+
 #### 0.14.6
  - fix for n > 2 groups in `multivariate_logrank_test` (again).
  - fix bug for when `event_observed` column was not boolean. 
diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py
index 611959a93..1aa0cb0be 100644
--- a/lifelines/fitters/cox_time_varying_fitter.py
+++ b/lifelines/fitters/cox_time_varying_fitter.py
@@ -37,7 +37,7 @@ def __init__(self, alpha=0.95, penalizer=0.0):
         self.alpha = alpha
         self.penalizer = penalizer
 
-    def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_progress=False, step_size=None):
+    def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_progress=False, step_size=None, robust=False):
         """
         Fit the Cox Propertional Hazard model to a time varying dataset. Tied survival times
         are handled using Efron's tie-method.
@@ -56,6 +56,10 @@ def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_pr
           show_progress: since the fitter is iterative, show convergence
              diagnostics.
           step_size: set an initial step size for the fitting algorithm.
+          robust: Compute the robust errors using the Huber sandwich estimator, aka Wei-Lin estimate. This does not handle
+            ties, so if there are a high number of ties, results may differ significantly. See
+            "The Robust Inference for the Cox Proportional Hazards Model", Journal of the American Statistical Association, Vol. 84, No. 408 (Dec., 1989), pp. 1074-1078
+
 
         Returns:
             self, with additional properties: hazards_
@@ -83,6 +87,7 @@ def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_pr
                                          step_size=step_size)
 
         self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns, index=['coef']) / self._norm_std
+        self.standard_errors_ = self._compute_standard_errors(normalize(df, self._norm_mean, self._norm_std), stop_times_events)
         self.confidence_intervals_ = self._compute_confidence_intervals()
         self.baseline_cumulative_hazard_ = self._compute_cumulative_baseline_hazard(df, stop_times_events)
         self.baseline_survival_ = self._compute_baseline_survival()
@@ -103,14 +108,65 @@ def _check_values(df, stop_times_events):
         check_for_immediate_deaths(stop_times_events)
         check_for_instantaneous_events(stop_times_events)
 
-    def _compute_standard_errors(self):
-        se = np.sqrt(inv(-self._hessian_).diagonal()) / self._norm_std
+    def _compute_sandwich_estimator(self, df, stop_times_events):
+
+        n, d = df.shape
+
+        # Init risk and tie sums to zero
+        risk_phi = 0
+        risk_phi_x = np.zeros((1, d))
+
+        # need to store these histories, as we access them often
+        risk_phi_history = pd.DataFrame(np.zeros((n,)), index=df.index)
+        risk_phi_x_history = pd.DataFrame(np.zeros((n, d)), index=df.index)
+
+        score_covariance = np.zeros((d, d))
+
+        # we already unnormalized the betas in `fit`, so we need to normalize them again since X is
+        # normalized.
+        beta = self.hazards_.values[0] * self._norm_std
+
+        # Iterate backwards to utilize recursive relationship
+        for i in range(n - 1, -1, -1):
+            # Doing it like this to preserve shape
+            ei = E[i]
+            xi = X[i:i + 1]
+
+            phi_i = exp(dot(xi, beta))
+            phi_x_i = phi_i * xi
+
+            risk_phi += phi_i
+            risk_phi_x += phi_x_i
+
+            risk_phi_history[i] = risk_phi
+            risk_phi_x_history[i] = risk_phi_x
+
+        # Iterate forwards
+        for i in range(0, n):
+            # Doing it like this to preserve shape
+            xi = X[i:i + 1]
+            phi_i = exp(dot(xi, beta))
+
+            correction_term = sum(E[j] * phi_i / risk_phi_history[j] * (xi - risk_phi_x_history[j] / risk_phi_history[j]) for j in range(0, i+1))
+
+            score = E[i] * (xi - risk_phi_x_history[i] / risk_phi_history[i]) - correction_term
+            score_covariance += (score.T).dot(score)
+
+        # TODO: need a faster way to invert these matrices
+        sandwich_estimator = inv(self._hessian_).dot(score_covariance).dot(inv(self._hessian_))
+        return sandwich_estimator
+
+    def _compute_standard_errors(self, df, stop_times_events):
+        if self.robust:
+            se = np.sqrt(self._compute_sandwich_estimator(df, stop_times_events).diagonal()) / self._norm_std
+        else:
+            se = np.sqrt(-inv(self._hessian_).diagonal()) / self._norm_std
         return pd.DataFrame(se[None, :],
                             index=['se'], columns=self.hazards_.columns)
 
     def _compute_z_values(self):
         return (self.hazards_.loc['coef'] /
-                self._compute_standard_errors().loc['se'])
+                self.standard_errors_.loc['se'])
 
     def _compute_p_values(self):
         U = self._compute_z_values() ** 2
@@ -118,7 +174,7 @@ def _compute_p_values(self):
 
     def _compute_confidence_intervals(self):
         alpha2 = inv_normal_cdf((1. + self.alpha) / 2.)
-        se = self._compute_standard_errors()
+        se = self.standard_errors_
         hazards = self.hazards_.values
         return pd.DataFrame(np.r_[hazards - alpha2 * se,
                                   hazards + alpha2 * se],
@@ -137,7 +193,7 @@ def summary(self):
         df = pd.DataFrame(index=self.hazards_.columns)
         df['coef'] = self.hazards_.loc['coef'].values
         df['exp(coef)'] = exp(self.hazards_.loc['coef'].values)
-        df['se(coef)'] = self._compute_standard_errors().loc['se'].values
+        df['se(coef)'] = self.standard_errors_.loc['se'].values
         df['z'] = self._compute_z_values()
         df['p'] = self._compute_p_values()
         df['lower %.2f' % self.alpha] = self.confidence_intervals_.loc['lower-bound'].values
diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index 2e65fa99a..3f9b995fe 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -76,6 +76,9 @@ def fit(self, df, duration_col, event_col=None,
           weights_col: an optional column in the dataframe that denotes the weight per subject.
              This column is expelled and not used as a covariate, but as a weight in the
              final regression. Default weight is 1.
+             This can be used for case-weights. For example, a weight of 2 means there were two subjects with
+             identical observations.
+             This can be used for sampling weights. In that case, use `robust=True` to get more accurate standard errors.
           show_progress: since the fitter is iterative, show convergence
              diagnostics.
           initial_beta: initialize the starting point of the iterative
@@ -117,14 +120,15 @@ def fit(self, df, duration_col, event_col=None,
 
         if weights_col:
             weights = df.pop(weights_col)
-            if (weights.astype(int) != weights).any():
-                warnings.warn("""It looks like your weights are not integers, possibly propensity scores then?
-It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
+            if (weights.astype(int) != weights).any() and not self.robust:
+                warnings.warn("""It appears your weights are not integers, possibly propensity or sampling scores then?
+It's important to know that the naive variance estimates of the coefficients are biased. Instead a) set `robust=True` in the call to `fit`, or b) use Monte Carlo to
 estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
-                    """, RuntimeWarning)
-
+""", RuntimeWarning)
         else:
-            weights = pd.DataFrame(np.ones((self._n_examples, 1)), index=df.index)
+            weights = pd.Series(np.ones((self._n_examples,)), index=df.index)
+
+        self._replication_weights = (weights.astype(int) == weights).all()
 
         self._check_values(df, T, E)
         df = df.astype(float)
@@ -280,15 +284,22 @@ def _newton_rhaphson(self, X, T, E, weights=None, initial_beta=None, step_size=N
 
     def _get_efron_values(self, X, beta, T, E, weights):
         """
-        Calculates the first and second order vector differentials,
-        with respect to beta.
+        Calculates the first and second order vector differentials, with respect to beta.
         Note that X, T, E are assumed to be sorted on T!
+
+        A good explanation for Efron. Consider five subjects, three of whom fail at the same time.
+        As it is not known a priori who is the first to fail, one-third of
+        (φ1 + φ2 + φ3) is subtracted from sum_j^{5} φj after one fails. Similarly, two-thirds
+        of (φ1 + φ2 + φ3) is subtracted after the first two individuals fail, etc.
+
+
         Parameters:
             X: (n,d) numpy array of observations.
             beta: (1, d) numpy array of coefficients.
             T: (n) numpy array representing observed durations.
             E: (n) numpy array representing death events.
             weights: (n) an array representing weights per observation.
+
         Returns:
             hessian: (d, d) numpy array,
             gradient: (1, d) numpy array
@@ -335,7 +346,7 @@ def _get_efron_values(self, X, beta, T, E, weights):
                 tie_phi_x_x += phi_x_x_i
 
                 # Keep track of count
-                tie_count += int(w)
+                tie_count += 1
 
             if i > 0 and T[i - 1] == ti:
                 # There are more ties/members of the risk set
@@ -348,22 +359,27 @@ def _get_efron_values(self, X, beta, T, E, weights):
             partial_gradient = np.zeros((1, d))
 
             for l in range(tie_count):
-                c = l / tie_count
-
-                denom = (risk_phi - c * tie_phi)
-                z = (risk_phi_x - c * tie_phi_x)
+                """
+                A good explanation for Efron. Consider five subjects, three of whom fail at the same time.
+                As it is not known a priori who is the first to fail, one-third of
+                (φ1 + φ2 + φ3) is subtracted from sum_j^{5} φj after one fails. Similarly, two-thirds
+                of (φ1 + φ2 + φ3) is subtracted after the first two individuals fail, etc.
+                """
+                numer = (risk_phi_x - l * tie_phi_x / tie_count)
+                denom = (risk_phi - l * tie_phi / tie_count)
 
                 # Gradient
-                partial_gradient += z / denom
+                partial_gradient += w * numer / denom
                 # Hessian
-                a1 = (risk_phi_x_x - c * tie_phi_x_x) / denom
-                # In case z and denom both are really small numbers,
+                a1 = (risk_phi_x_x - l * tie_phi_x_x / tie_count) / denom
+                # In case numer and denom both are really small numbers,
                 # make sure to do division before multiplications
-                a2 = dot(z.T / denom, z / denom)
+                a2 = dot(numer.T / denom, numer / denom)
 
-                hessian -= (a1 - a2)
+                hessian -= w * (a1 - a2)
+
+                log_lik -= w * np.log(denom[0][0])
 
-                log_lik -= np.log(denom[0][0])
 
             # Values outside tie sum
             gradient += x_tie_sum - partial_gradient
@@ -418,7 +434,6 @@ def _compute_sandwich_estimator(self, X, T, E):
         # Iterate backwards to utilize recursive relationship
         for i in range(n - 1, -1, -1):
             # Doing it like this to preserve shape
-            ei = E[i]
             xi = X[i:i + 1]
 
             phi_i = exp(dot(xi, beta))
diff --git a/lifelines/statistics.py b/lifelines/statistics.py
index 1cac53495..84f2b2b19 100644
--- a/lifelines/statistics.py
+++ b/lifelines/statistics.py
@@ -79,11 +79,15 @@ def logrank_test(event_times_A, event_times_B, event_observed_A=None, event_obse
     H_0: both event series are from the same generating processes
     H_A: the event series are from different generating processes.
 
-    See Survival and Event Analysis, page 108. This implicitly uses the log-rank weights.
+
+    This implicitly uses the log-rank weights.
+
+    See also `multivariate_logrank_test` for a more general function.
+
 
     Parameters:
-      event_times_foo: a (nx1) array of event durations (birth to death,...) for the population.
-      censorship_bar: a (nx1) array of censorship flags, 1 if observed, 0 if not. Default assumes all observed.
+      event_times_foo: a (n,) list-like of event durations (birth to death,...) for the population.
+      censorship_bar: a (n,) list-like of censorship flags, 1 if observed, 0 if not. Default assumes all observed.
       t_0: the period under observation, -1 for all time.
       alpha: the level of signifiance
       kwargs: add keywords and meta-data to the experiment summary
@@ -91,6 +95,7 @@ def logrank_test(event_times_A, event_times_B, event_observed_A=None, event_obse
     Returns:
       results: a StatisticalResult object with properties 'p_value', 'summary', 'test_statistic', 'test_result'
 
+    See Survival and Event Analysis, page 108.
     """
 
     event_times_A, event_times_B = np.array(event_times_A), np.array(event_times_B)
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index 52df6b4c0..ad09dcdf3 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -1000,7 +1000,7 @@ def test_coef_output_against_R_super_accurate(self, rossi):
         cf.fit(rossi, duration_col='week', event_col='arrest')
         npt.assert_array_almost_equal(cf.hazards_.values, expected, decimal=4)
 
-    def test_coef_output_against_R_using_non_trivial_weights(self, rossi):
+    def test_coef_output_against_R_using_non_trivial_but_integer_weights(self, rossi):
         rossi_ = rossi.copy()
         rossi_['weights'] = 1.
         rossi_ = rossi_.groupby(rossi.columns.tolist())['weights'].sum()\
@@ -1011,7 +1011,36 @@ def test_coef_output_against_R_using_non_trivial_weights(self, rossi):
         cf.fit(rossi_, duration_col='week', event_col='arrest', weights_col='weights')
         npt.assert_array_almost_equal(cf.hazards_.values, expected, decimal=4)
 
-    def test_adding_non_integer_weights_raises_a_warning(self, rossi):
+    def test_robust_errors_with_weights_is_the_same_as_R(self, regression_dataset):
+        """
+        rossi <- read.csv('.../lifelines/datasets/rossi.csv')
+        r = coxph(formula = Surv(week, arrest) ~ fin + age + strata(race,
+                    paro, mar, wexp) + prio, data = rossi, robust=TRUE)
+        """
+        df = regression_dataset
+        df['var3'] = np.round(df['var3'] + 1)
+        cph = CoxPHFitter()
+        cph.fit(df.head(5), 'T', 'E', robust=True, weights_col='var3', show_progress=True)
+        expected = pd.Series({'var1': -2.23662, 'var2': -5.75105})
+        assert_series_equal(cph.hazards_.T['coef'], expected, check_less_precise=2, check_names=False)
+
+
+    def test_summary_output_using_non_trivial_but_integer_weights(self, rossi):
+        rossi_weights = rossi.copy()
+        rossi_weights['weights'] = 1.
+        rossi_weights = rossi_weights.groupby(rossi.columns.tolist())['weights'].sum()\
+                                     .reset_index()
+
+        cf1 = CoxPHFitter()
+        cf1.fit(rossi_weights, duration_col='week', event_col='arrest', weights_col='weights')
+
+        cf2 = CoxPHFitter()
+        cf2.fit(rossi, duration_col='week', event_col='arrest')
+
+        assert_frame_equal(cf1.summary, cf2.summary, check_like=True)
+
+
+    def test_adding_non_integer_weights_without_robust_flag_raises_a_warning(self, rossi):
         rossi['weights'] = np.random.exponential(1, rossi.shape[0])
 
         cox = CoxPHFitter()
@@ -1024,6 +1053,16 @@ def test_adding_non_integer_weights_raises_a_warning(self, rossi):
             assert "naive variance estimates" in str(w[0].message)
 
 
+    def test_adding_non_integer_weights_is_fine_if_robust_is_on(self, rossi):
+        rossi['weights'] = np.random.exponential(1, rossi.shape[0])
+
+        cox = CoxPHFitter()
+
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+            cox.fit(rossi, 'week', 'arrest', weights_col='weights', robust=True)
+            assert len(w) == 0
+
     def test_standard_error_coef_output_against_R(self, rossi):
         """
         from http://cran.r-project.org/doc/contrib/Fox-Companion/appendix-cox-regression.pdf
@@ -1115,7 +1154,7 @@ def test_se_against_Survival_Analysis_by_John_Klein_and_Melvin_Moeschberger(self
         cf.fit(df, duration_col='time', event_col='death')
 
         # standard errors
-        actual_se = cf._compute_standard_errors().values
+        actual_se = cf._compute_standard_errors(None, None, None).values
         expected_se = np.array([[0.0143,  0.4623,  0.3561,  0.4222]])
         npt.assert_array_almost_equal(actual_se, expected_se, decimal=3)
 
@@ -1380,7 +1419,7 @@ def test_all_okay_with_non_trivial_index_in_dataframe(self, rossi):
         assert_frame_equal(cp2.summary, cp1.summary)
 
     def test_robust_errors_against_R_no_ties(self, regression_dataset):
-        df = regression_dataset.copy()
+        df = regression_dataset
         cph = CoxPHFitter()
         cph.fit(df, 'T', 'E', robust=True)
         expected = pd.Series({'var1': 0.0879, 'var2': 0.0847, 'var3': 0.0655})

From 57ee62463d12eeb44ce304e8da296a67afd40c37 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Mon, 3 Sep 2018 14:04:44 -0400
Subject: [PATCH 03/59] I have made progress. I still can't get weights +
 robust to work though

---
 lifelines/fitters/coxph_fitter.py |  43 +++++---
 tests/test_estimation.py          | 168 ++++++++++++++++++++++++++++--
 2 files changed, 188 insertions(+), 23 deletions(-)

diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index 3f9b995fe..e23f40b94 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -125,11 +125,11 @@ def fit(self, df, duration_col, event_col=None,
 It's important to know that the naive variance estimates of the coefficients are biased. Instead a) set `robust=True` in the call to `fit`, or b) use Monte Carlo to
 estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
 """, RuntimeWarning)
+            if (weights <= 0).any():
+                raise ValueError("values in weights_col must be positive.")
         else:
             weights = pd.Series(np.ones((self._n_examples,)), index=df.index)
 
-        self._replication_weights = (weights.astype(int) == weights).all()
-
         self._check_values(df, T, E)
         df = df.astype(float)
 
@@ -154,7 +154,7 @@ def fit(self, df, duration_col, event_col=None,
 
         self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns, index=['coef']) / self._norm_std
 
-        self.standard_errors_ = self._compute_standard_errors(normalize(df, self._norm_mean, self._norm_std), T, E)
+        self.standard_errors_ = self._compute_standard_errors(normalize(df, self._norm_mean, self._norm_std), T, E, weights)
         self.confidence_intervals_ = self._compute_confidence_intervals()
 
 
@@ -317,7 +317,8 @@ def _get_efron_values(self, X, beta, T, E, weights):
         risk_phi_x, tie_phi_x = np.zeros((1, d)), np.zeros((1, d))
         risk_phi_x_x, tie_phi_x_x = np.zeros((d, d)), np.zeros((d, d))
 
-        # Init number of ties
+        # Init number of ties and weights
+        weight_count = 0.0
         tie_count = 0
 
         # Iterate backwards to utilize recursive relationship
@@ -347,6 +348,7 @@ def _get_efron_values(self, X, beta, T, E, weights):
 
                 # Keep track of count
                 tie_count += 1
+                weight_count += w
 
             if i > 0 and T[i - 1] == ti:
                 # There are more ties/members of the risk set
@@ -357,6 +359,7 @@ def _get_efron_values(self, X, beta, T, E, weights):
 
             # There was atleast one event and no more ties remain. Time to sum.
             partial_gradient = np.zeros((1, d))
+            weighted_average = weight_count / tie_count
 
             for l in range(tie_count):
                 """
@@ -369,16 +372,17 @@ def _get_efron_values(self, X, beta, T, E, weights):
                 denom = (risk_phi - l * tie_phi / tie_count)
 
                 # Gradient
-                partial_gradient += w * numer / denom
+                partial_gradient += weighted_average * numer / denom
                 # Hessian
                 a1 = (risk_phi_x_x - l * tie_phi_x_x / tie_count) / denom
+
                 # In case numer and denom both are really small numbers,
                 # make sure to do division before multiplications
                 a2 = dot(numer.T / denom, numer / denom)
 
-                hessian -= w * (a1 - a2)
+                hessian -= weighted_average * (a1 - a2)
 
-                log_lik -= w * np.log(denom[0][0])
+                log_lik -= weighted_average * np.log(denom[0][0])
 
 
             # Values outside tie sum
@@ -387,6 +391,7 @@ def _get_efron_values(self, X, beta, T, E, weights):
 
             # reset tie values
             tie_count = 0
+            weight_count = 0.0
             x_tie_sum = np.zeros((1, d))
             tie_phi = 0
             tie_phi_x = np.zeros((1, d))
@@ -413,7 +418,7 @@ def _compute_confidence_intervals(self):
                             index=['lower-bound', 'upper-bound'],
                             columns=self.hazards_.columns)
 
-    def _compute_sandwich_estimator(self, X, T, E):
+    def _compute_sandwich_estimator(self, X, T, E, weights):
 
         n, d = X.shape
 
@@ -430,39 +435,45 @@ def _compute_sandwich_estimator(self, X, T, E):
         # we already unnormalized the betas in `fit`, so we need normalize them again since X is
         # normalized.
         beta = self.hazards_.values[0] * self._norm_std
+        weight_count = 0.0
 
         # Iterate backwards to utilize recursive relationship
         for i in range(n - 1, -1, -1):
             # Doing it like this to preserve shape
             xi = X[i:i + 1]
+            w = weights[i]
 
-            phi_i = exp(dot(xi, beta))
+            phi_i = w * exp(dot(xi, beta))
             phi_x_i = phi_i * xi
 
             risk_phi += phi_i
             risk_phi_x += phi_x_i
 
-            risk_phi_history[i] = risk_phi
-            risk_phi_x_history[i] = risk_phi_x
+            risk_phi_history[i] = risk_phi # denom
+            risk_phi_x_history[i] = risk_phi_x # a[i]
 
         # Iterate forwards
         for i in range(0, n):
             # Doing it like this to preserve shape
+            # doesn't handle ties.
             xi = X[i:i + 1]
-            phi_i = exp(dot(xi, beta))
+            w = weights[i]
+            phi_i = w * exp(dot(xi, beta))
 
-            correction_term = sum(E[j] * phi_i / risk_phi_history[j] * (xi - risk_phi_x_history[j] / risk_phi_history[j]) for j in range(0, i+1))
+            score = -sum(E[j] * phi_i / risk_phi_history[j] * (xi - risk_phi_x_history[j] / risk_phi_history[j]) for j in range(0, i+1))
 
-            score = E[i] * (xi - risk_phi_x_history[i] / risk_phi_history[i]) - correction_term
+            score = score + E[i] * (xi - risk_phi_x_history[i] / risk_phi_history[i])
             score_covariance += (score.T).dot(score)
 
         # TODO: need a faster way to invert these matrices
+        import pdb
+        pdb.set_trace()
         sandwich_estimator = inv(self._hessian_).dot(score_covariance).dot(inv(self._hessian_))
         return sandwich_estimator
 
-    def _compute_standard_errors(self, df, T, E):
+    def _compute_standard_errors(self, df, T, E, weights):
         if self.robust:
-            se = np.sqrt(self._compute_sandwich_estimator(df.values, T.values, E.values).diagonal()) / self._norm_std
+            se = np.sqrt(self._compute_sandwich_estimator(df.values, T.values, E.values, weights).diagonal()) / self._norm_std
         else:
             se = np.sqrt(-inv(self._hessian_).diagonal()) / self._norm_std
         return pd.DataFrame(se[None, :],
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index ad09dcdf3..00f907f88 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -1011,19 +1011,173 @@ def test_coef_output_against_R_using_non_trivial_but_integer_weights(self, rossi
         cf.fit(rossi_, duration_col='week', event_col='arrest', weights_col='weights')
         npt.assert_array_almost_equal(cf.hazards_.values, expected, decimal=4)
 
-    def test_robust_errors_with_weights_is_the_same_as_R(self, regression_dataset):
+    def test_robust_errors_with_less_trival_weights_is_the_same_as_R(self, regression_dataset):
         """
-        rossi <- read.csv('.../lifelines/datasets/rossi.csv')
-        r = coxph(formula = Surv(week, arrest) ~ fin + age + strata(race,
-                    paro, mar, wexp) + prio, data = rossi, robust=TRUE)
+        df <- data.frame(
+            "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
+            "var2" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092),
+            "T" = c( 7.335846, 5.269797, 11.684092, 12.678458, 6.601666)
+        )
+        df['E'] = 1
+        df['var3'] = 0.75
+        df[1, 'var3'] = 1.75
+        coxph(formula=Surv(T, E) ~ var1 + var2, data=df, weights=var3, robust=TRUE)
+        """
+
+        df = pd.DataFrame({
+            "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294],
+            "var2": [0.184677, 0.071893, 1.364646, 0.098375, 1.663092],
+            "T": [7.335846, 5.269797, 11.684092, 12.678458, 6.601666],
+            'var3': [1.75, 0.75, 0.75, 0.75, 0.75]
+        })
+        df['E'] = 1
+
+        print(df)
+        cph = CoxPHFitter()
+        cph.fit(df, 'T', 'E', robust=True, weights_col='var3', show_progress=True)
+        cph.print_summary()
+        expected = pd.Series({'var1': 7.995, 'var2': -1.154})
+        assert_series_equal(cph.hazards_.T['coef'], expected, check_less_precise=2, check_names=False)
+
+        expected = pd.Series({'var1': 2.931, 'var2': 1.117})
+        assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False)
+
+
+    def test_robust_errors_with_trival_weights_is_the_same_as_R(self, regression_dataset):
+        """
+        df <- data.frame(
+            "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
+            "var2" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092),
+            "T" = c( 7.335846, 5.269797, 11.684092, 12.678458, 6.601666)
+        )
+        df['E'] = 1
+        df['var3'] = 0.75
+        coxph(formula=Surv(T, E) ~ var1 + var2, data=df, weights=var3, robust=TRUE)
+        """
+
+        df = pd.DataFrame({
+            "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294],
+            "var2": [0.184677, 0.071893, 1.364646, 0.098375, 1.663092],
+            "T": [7.335846, 5.269797, 11.684092, 12.678458, 6.601666],
+            'var3': [0.75, 0.75, 0.75, 0.75, 0.75]
+        })
+        df['E'] = 1
+
+        print(df)
+        cph = CoxPHFitter()
+        cph.fit(df, 'T', 'E', robust=True, weights_col='var3', show_progress=True)
+        cph.print_summary()
+        expected = pd.Series({'var1': 7.680, 'var2': -0.915})
+        assert_series_equal(cph.hazards_.T['coef'], expected, check_less_precise=2, check_names=False)
+
+        expected = pd.Series({'var1': 2.097, 'var2': 0.827})
+        assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False)
+
+
+
+
+    def test_robust_errors_is_the_same_as_R(self, regression_dataset):
+        """
+        df <- data.frame(
+            "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
+            "var2" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092),
+            "T" = c( 7.335846, 5.269797, 11.684092, 12.678458, 6.601666)
+        )
+        df['E'] = 1
+
+        coxph(formula=Surv(T, E) ~ var1 + var2, data=df, robust=TRUE)
+        """
+
+        df = pd.DataFrame({
+            "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294],
+            "var2": [0.184677, 0.071893, 1.364646, 0.098375, 1.663092],
+            "T": [7.335846, 5.269797, 11.684092, 12.678458, 6.601666]
+        })
+        df['E'] = 1
+
+        cph = CoxPHFitter()
+        cph.fit(df, 'T', 'E', robust=True, show_progress=True)
+        expected = pd.Series({'var1': 7.680, 'var2': -0.915})
+        assert_series_equal(cph.hazards_.T['coef'], expected, check_less_precise=2, check_names=False)
+
+        expected = pd.Series({'var1': 2.097, 'var2': 0.827})
+        assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False)
+
+
+    def test_trival_float_weights_with_no_ties_is_the_same_as_R(self, regression_dataset):
+        """
+        df <- data.frame(
+            "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
+            "var2" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092),
+            "T" = c( 7.335846, 5.269797, 11.684092, 12.678458, 6.601666)
+        )
+        df['E'] = 1
+        df['var3'] = 0.75
+
+        coxph(formula=Surv(T, E) ~ var1 + var2, data=df, weights=var3)
         """
         df = regression_dataset
-        df['var3'] = np.round(df['var3'] + 1)
+        ix = df['var3'] < 1.
+        df = df.loc[ix].head()
+        df['var3'] = [0.75] * 5
+
         cph = CoxPHFitter()
-        cph.fit(df.head(5), 'T', 'E', robust=True, weights_col='var3', show_progress=True)
-        expected = pd.Series({'var1': -2.23662, 'var2': -5.75105})
+
+        cph.fit(df, 'T', 'E', weights_col='var3', show_progress=True)
+
+        expected_coef = pd.Series({'var1': 7.680, 'var2': -0.915})
+        assert_series_equal(cph.hazards_.T['coef'], expected_coef, check_less_precise=2, check_names=False)
+
+        expected_std = pd.Series({'var1': 6.641, 'var2': 1.650})
+        assert_series_equal(cph.summary['se(coef)'], expected_std, check_less_precise=2, check_names=False)
+
+        expected_ll = -1.142397
+        assert abs(cph._log_likelihood - expected_ll) < 0.001
+
+    def test_less_trival_float_weights_with_no_ties_is_the_same_as_R(self, regression_dataset):
+        """
+        df <- data.frame(
+            "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
+            "var2" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092),
+            "T" = c( 7.335846, 5.269797, 11.684092, 12.678458, 6.601666)
+        )
+        df['E'] = 1
+        df['var3'] = 0.75
+        df[1, 'var3'] = 1.75
+
+        coxph(formula=Surv(T, E) ~ var1 + var2, data=df, weights=var3)
+        """
+        df = regression_dataset
+        ix = df['var3'] < 1.
+        df = df.loc[ix].head()
+        df['var3'] = [1.75] + [0.75] * 4
+
+        cph = CoxPHFitter()
+
+        cph.fit(df, 'T', 'E', weights_col='var3', show_progress=True)
+        expected = pd.Series({'var1': 7.995, 'var2': -1.154})
+        assert_series_equal(cph.hazards_.T['coef'], expected, check_less_precise=2, check_names=False)
+
+        expected = pd.Series({'var1': 6.690, 'var2': 1.614})
+        assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False)
+
+
+    def test_non_trival_float_weights_with_no_ties_is_the_same_as_R(self, regression_dataset):
+        """
+        df <- read.csv('.../lifelines/datasets/regression.csv')
+        coxph(formula=Surv(T, E) ~ var1 + var2, data=df, weights=var3)
+        """
+        df = regression_dataset
+
+        cph = CoxPHFitter()
+
+        cph.fit(df, 'T', 'E', weights_col='var3', show_progress=True)
+        expected = pd.Series({'var1': 0.3268, 'var2': 0.0775})
         assert_series_equal(cph.hazards_.T['coef'], expected, check_less_precise=2, check_names=False)
 
+        expected = pd.Series({'var1': 0.0697, 'var2': 0.0861})
+        assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False)
+
 
     def test_summary_output_using_non_trivial_but_integer_weights(self, rossi):
         rossi_weights = rossi.copy()

From 9952e68486797a4aae1a4d77859b5285915701b4 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Mon, 3 Sep 2018 17:33:18 -0400
Subject: [PATCH 04/59] can't get weights + robust to work. What is true is
 that R and my hessian matrices are the same, but R is doing something else I
 can't find.

---
 CHANGELOG.md                      |  1 +
 lifelines/fitters/coxph_fitter.py |  9 +++---
 tests/test_estimation.py          | 51 ++++++++++++++++---------------
 3 files changed, 33 insertions(+), 28 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f430fabeb..b8c33e0a3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,7 @@
 #### 0.15.0
  - adding `robust` params to Cox models' `fit`. This enables atleast i) using non-integer weights in the model (these could be sampling weights like IPTW), and ii) misspecified models (ex: non-propotional hazards). Under the hood it's a sandwich estimator. This does not handle ties, so if there are high number of ties, results may significantly differ from other software.
  - `standard_errors_` is now a property on fitted Cox models.
+ - `variance_matrix_` is now a property on fitted `CoxPHFitter` which describes the variance matrix of the coefficients.
 
 #### 0.14.6
  - fix for n > 2 groups in `multivariate_logrank_test` (again).
diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index e23f40b94..8327b371b 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -127,6 +127,7 @@ def fit(self, df, duration_col, event_col=None,
 """, RuntimeWarning)
             if (weights <= 0).any():
                 raise ValueError("values in weights_col must be positive.")
+
         else:
             weights = pd.Series(np.ones((self._n_examples,)), index=df.index)
 
@@ -157,7 +158,6 @@ def fit(self, df, duration_col, event_col=None,
         self.standard_errors_ = self._compute_standard_errors(normalize(df, self._norm_mean, self._norm_std), T, E, weights)
         self.confidence_intervals_ = self._compute_confidence_intervals()
 
-
         self.baseline_hazard_ = self._compute_baseline_hazards(df, T, E)
         self.baseline_cumulative_hazard_ = self._compute_baseline_cumulative_hazard()
         self.baseline_survival_ = self._compute_baseline_survival()
@@ -466,16 +466,17 @@ def _compute_sandwich_estimator(self, X, T, E, weights):
             score_covariance += (score.T).dot(score)
 
         # TODO: need a faster way to invert these matrices
-        import pdb
-        pdb.set_trace()
         sandwich_estimator = inv(self._hessian_).dot(score_covariance).dot(inv(self._hessian_))
         return sandwich_estimator
 
     def _compute_standard_errors(self, df, T, E, weights):
+
         if self.robust:
             se = np.sqrt(self._compute_sandwich_estimator(df.values, T.values, E.values, weights).diagonal()) / self._norm_std
+            #self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std)
         else:
-            se = np.sqrt(-inv(self._hessian_).diagonal()) / self._norm_std
+            self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std)
+            se = np.sqrt(self.variance_matrix_.diagonal())
         return pd.DataFrame(se[None, :],
                             index=['se'], columns=self.hazards_.columns)
 
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index 00f907f88..7827e385b 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -1011,7 +1011,7 @@ def test_coef_output_against_R_using_non_trivial_but_integer_weights(self, rossi
         cf.fit(rossi_, duration_col='week', event_col='arrest', weights_col='weights')
         npt.assert_array_almost_equal(cf.hazards_.values, expected, decimal=4)
 
-    def test_robust_errors_with_less_trival_weights_is_the_same_as_R(self, regression_dataset):
+    def test_robust_errors_with_trival_weights_is_the_different_than_R(self, regression_dataset):
         """
         df <- data.frame(
             "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
@@ -1020,30 +1020,31 @@ def test_robust_errors_with_less_trival_weights_is_the_same_as_R(self, regressio
         )
         df['E'] = 1
         df['var3'] = 0.75
-        df[1, 'var3'] = 1.75
-        coxph(formula=Surv(T, E) ~ var1 + var2, data=df, weights=var3, robust=TRUE)
+        r = coxph(formula=Surv(T, E) ~ var1 + var2, data=df, weights=var3, robust=TRUE)
+        r$var
+        r$naive.var
         """
 
+        w = 0.75
         df = pd.DataFrame({
             "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294],
             "var2": [0.184677, 0.071893, 1.364646, 0.098375, 1.663092],
             "T": [7.335846, 5.269797, 11.684092, 12.678458, 6.601666],
-            'var3': [1.75, 0.75, 0.75, 0.75, 0.75]
         })
         df['E'] = 1
+        df['var3'] = w
 
-        print(df)
         cph = CoxPHFitter()
         cph.fit(df, 'T', 'E', robust=True, weights_col='var3', show_progress=True)
-        cph.print_summary()
-        expected = pd.Series({'var1': 7.995, 'var2': -1.154})
+        expected = pd.Series({'var1': 7.680, 'var2': -0.915})
         assert_series_equal(cph.hazards_.T['coef'], expected, check_less_precise=2, check_names=False)
 
-        expected = pd.Series({'var1': 2.931, 'var2': 1.117})
-        assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False)
-
+        expected_cov = np.array([[33.079106, -5.964652], [-5.964652, 2.040642]])
+        npt.assert_array_almost_equal(
+            w * cph.variance_matrix_, expected_cov,
+        decimal=2)
 
-    def test_robust_errors_with_trival_weights_is_the_same_as_R(self, regression_dataset):
+    def test_robust_errors_with_less_trival_weights_is_the_same_as_R(self, regression_dataset):
         """
         df <- data.frame(
             "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
@@ -1052,28 +1053,34 @@ def test_robust_errors_with_trival_weights_is_the_same_as_R(self, regression_dat
         )
         df['E'] = 1
         df['var3'] = 0.75
-        coxph(formula=Surv(T, E) ~ var1 + var2, data=df, weights=var3, robust=TRUE)
+        df[1, 'var3'] = 1.75
+        r = coxph(formula=Surv(T, E) ~ var1 + var2, data=df, weights=var3, robust=TRUE)
+        r$var
+        r$naive.var
         """
 
         df = pd.DataFrame({
             "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294],
             "var2": [0.184677, 0.071893, 1.364646, 0.098375, 1.663092],
             "T": [7.335846, 5.269797, 11.684092, 12.678458, 6.601666],
-            'var3': [0.75, 0.75, 0.75, 0.75, 0.75]
+            'var3': [1.75, 0.75, 0.75, 0.75, 0.75]
         })
         df['E'] = 1
 
-        print(df)
         cph = CoxPHFitter()
         cph.fit(df, 'T', 'E', robust=True, weights_col='var3', show_progress=True)
-        cph.print_summary()
-        expected = pd.Series({'var1': 7.680, 'var2': -0.915})
+        expected = pd.Series({'var1': 7.995, 'var2': -1.154})
         assert_series_equal(cph.hazards_.T['coef'], expected, check_less_precise=2, check_names=False)
 
-        expected = pd.Series({'var1': 2.097, 'var2': 0.827})
-        assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False)
 
+        variance_matrix = -np.linalg.inv(cph._hessian_) / np.outer(cph._norm_std, cph._norm_std)
+        expected_cov = np.array([[44.758444, -8.781867], [-8.781867, 2.606589]])
+        npt.assert_array_almost_equal(
+            variance_matrix, expected_cov,
+        decimal=2)
 
+        expected = pd.Series({'var1': 2.931, 'var2': 1.117})
+        assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False)
 
 
     def test_robust_errors_is_the_same_as_R(self, regression_dataset):
@@ -1308,7 +1315,7 @@ def test_se_against_Survival_Analysis_by_John_Klein_and_Melvin_Moeschberger(self
         cf.fit(df, duration_col='time', event_col='death')
 
         # standard errors
-        actual_se = cf._compute_standard_errors(None, None, None).values
+        actual_se = cf._compute_standard_errors(None, None, None, None).values
         expected_se = np.array([[0.0143,  0.4623,  0.3561,  0.4222]])
         npt.assert_array_almost_equal(actual_se, expected_se, decimal=3)
 
@@ -1586,15 +1593,11 @@ def test_robust_errors_with_strata_doesnt_break(self, rossi):
         r = coxph(formula = Surv(week, arrest) ~ fin + age + strata(race,
                     paro, mar, wexp) + prio, data = rossi, robust=TRUE)
         """
+        assert False
         cf = CoxPHFitter()
         cf.fit(rossi, duration_col='week', event_col='arrest', strata=['race', 'paro', 'mar', 'wexp'], robust=True)
 
 
-    def test_robust_errors_against_R_with_ties(self,):
-        pass
-
-
-
 
 
 class TestAalenAdditiveFitter():

From 108f3f8af9e730243611a694ccc443f773bd947c Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Mon, 3 Sep 2018 21:00:55 -0400
Subject: [PATCH 05/59] adding a sample algo for the perfect correlation check

---
 lifelines/utils/__init__.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py
index 0aa7b3896..0477cfda7 100644
--- a/lifelines/utils/__init__.py
+++ b/lifelines/utils/__init__.py
@@ -1074,9 +1074,15 @@ def check_complete_separation_low_variance(df, events):
 def check_complete_separation_close_to_perfect_correlation(df, durations):
     # slow for many columns
     THRESHOLD = 0.99
+    n, _ = df.shape
+    if n > 1000:
+        # let's sample to speed this n**2 algo up.
+        df = df.sample(n=1000, random_state=15).copy()
+        durations = durations.sample(n=1000, random_state=15).copy()
+
     for col, series in df.iteritems():
         if abs(stats.spearmanr(series, durations).correlation) >= THRESHOLD:
-            warning_text = "Column %s has high correlation with the duration column. This may harm convergence. This could be a form of 'complete separation'. \
+            warning_text = "Column %s has high sample correlation with the duration column. This may harm convergence. This could be a form of 'complete separation'. \
 See https://stats.idre.ucla.edu/other/mult-pkg/faq/general/faqwhat-is-complete-or-quasi-complete-separation-in-logisticprobit-regression-and-how-do-we-deal-with-them/ " % (col)
             warnings.warn(warning_text, ConvergenceWarning)
 

From 75cdaa85dbe0903bc74c813cfeda5f9deb0b5bd3 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Mon, 3 Sep 2018 21:05:37 -0400
Subject: [PATCH 06/59] fix #507

---
 lifelines/utils/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py
index 0477cfda7..09be09a8a 100644
--- a/lifelines/utils/__init__.py
+++ b/lifelines/utils/__init__.py
@@ -896,7 +896,7 @@ def _concordance_index(event_times, predicted_event_times, event_observed):
     times_to_compare = _BTree(np.unique(died_pred))
     num_pairs = 0
     num_correct = 0
-    num_tied = 0
+    num_tied = np.int64(0)
 
     def handle_pairs(truth, pred, first_ix):
         """

From cb10e01126e72894d4f7d34ab1be9852b348be5c Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Mon, 3 Sep 2018 21:39:40 -0400
Subject: [PATCH 07/59] this new newton-decrement convergence is very
 promising, and I don't see a decrease in coefficient accuracy

---
 CHANGELOG.md                                 |  3 +++
 lifelines/fitters/cox_time_varying_fitter.py | 10 ++++++++--
 lifelines/fitters/coxph_fitter.py            | 13 +++++++++++--
 tests/test_estimation.py                     |  2 +-
 4 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3aec7afef..177cce062 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,8 @@
 ### Changelogs
 
+#### 0.15.0
+ - New criteria for convergence of `CoxPHFitter` and `CoxTimeVaryingFitter` called the Newton-decrement. Tests show it is as accurate (w.r.t to previous coefficients) and typically shaves off a single step, resulting in generally faster convergence.
+
 #### 0.14.6
  - fix for n > 2 groups in `multivariate_logrank_test` (again).
  - fix bug for when `event_observed` column was not boolean. 
diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py
index 611959a93..3beebdd5d 100644
--- a/lifelines/fitters/cox_time_varying_fitter.py
+++ b/lifelines/fitters/cox_time_varying_fitter.py
@@ -186,7 +186,10 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size
                 g -= self.penalizer * beta.T
                 h.flat[::d + 1] -= self.penalizer
 
-            delta = solve(-h, step_size * g.T)
+            # reusing a piece to make g * inv(h) * g.T faster later
+            inv_h_dot_g_T = spsolve(-h, g.T, sym_pos=True)
+            delta = step_size * inv_h_dot_g_T
+
             if np.any(np.isnan(delta)):
                 raise ValueError("""delta contains nan value(s). Convergence halted. Please see the following tips in the lifelines documentation:
 https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-model
@@ -194,15 +197,18 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size
             # Save these as pending result
             hessian, gradient = h, g
             norm_delta = norm(delta)
+            newton_decrement = g.dot(inv_h_dot_g_T)/2
 
             if show_progress:
-                print("Iteration %d: norm_delta = %.6f, step_size = %.3f, ll = %.6f, seconds_since_start = %.1f" % (i, norm_delta, step_size, ll, time.time() - start))
+                print("Iteration %d: norm_delta = %.5f, step_size = %.5f, ll = %.5f, newton_decrement = %.5f, seconds_since_start = %.1f" % (i, norm_delta, step_size, ll, newton_decrement, time.time() - start))
 
             # convergence criteria
             if norm_delta < precision:
                 converging, completed = False, True
             elif abs(ll - previous_ll) < precision:
                 converging, completed = False, True
+            if newton_decrement < precision:
+                converging, completed = False, True
             elif i >= max_steps:
                 # 50 iterations steps with N-R is a lot.
                 # Expected convergence is ~10 steps
diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index 2e65fa99a..17977acfa 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -234,7 +234,10 @@ def _newton_rhaphson(self, X, T, E, weights=None, initial_beta=None, step_size=N
                 g -= self.penalizer * beta.T
                 h.flat[::d + 1] -= self.penalizer
 
-            delta = spsolve(-h, step_size * g.T, sym_pos=True)
+            # reusing a piece to make g * inv(h) * g.T faster later
+            inv_h_dot_g_T = spsolve(-h, g.T, sym_pos=True)
+            delta = step_size * inv_h_dot_g_T
+
             if np.any(np.isnan(delta)):
                 raise ValueError("""delta contains nan value(s). Convergence halted. Please see the following tips in the lifelines documentation:
 https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-model
@@ -244,11 +247,17 @@ def _newton_rhaphson(self, X, T, E, weights=None, initial_beta=None, step_size=N
             hessian, gradient = h, g
             norm_delta = norm(delta)
 
+            # reusing an above piece to make g * inv(h) * g.T faster.
+            newton_decrement = g.dot(inv_h_dot_g_T)/2
+
             if show_progress:
-                print("Iteration %d: norm_delta = %.5f, step_size = %.5f, ll = %.5f, seconds_since_start = %.1f" % (i, norm_delta, step_size, ll, time.time() - start))
+                print("Iteration %d: norm_delta = %.5f, step_size = %.5f, ll = %.5f, newton_decrement = %.5f, seconds_since_start = %.1f" % (i, norm_delta, step_size, ll, newton_decrement, time.time() - start))
+
             # convergence criteria
             if norm_delta < precision:
                 converging, completed = False, True
+            if newton_decrement < precision:
+                converging, completed = False, True
             elif abs(ll - previous_ll) < precision:
                 converging, completed = False, True
             elif i >= max_steps:
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index 52df6b4c0..866409d27 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -997,7 +997,7 @@ def test_coef_output_against_R_super_accurate(self, rossi):
         """
         expected = np.array([[-0.3794, -0.0574, 0.3139, -0.1498, -0.4337, -0.0849,  0.0915]])
         cf = CoxPHFitter()
-        cf.fit(rossi, duration_col='week', event_col='arrest')
+        cf.fit(rossi, duration_col='week', event_col='arrest', show_progress=True)
         npt.assert_array_almost_equal(cf.hazards_.values, expected, decimal=4)
 
     def test_coef_output_against_R_using_non_trivial_weights(self, rossi):

From d3e407b635c4d84a7379612ec4c0e9c978668a38 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Mon, 3 Sep 2018 21:43:33 -0400
Subject: [PATCH 08/59] this new newton-decrement convergence is very
 promising, and I don't see a decrease in coefficient accuracy

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 177cce062..269b4296d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,7 @@
 ### Changelogs
 
 #### 0.15.0
- - New criteria for convergence of `CoxPHFitter` and `CoxTimeVaryingFitter` called the Newton-decrement. Tests show it is as accurate (w.r.t to previous coefficients) and typically shaves off a single step, resulting in generally faster convergence.
+ - New criteria for convergence of `CoxPHFitter` and `CoxTimeVaryingFitter` called the Newton-decrement. Tests show it is as accurate (w.r.t to previous coefficients) and typically shaves off a single step, resulting in generally faster convergence. See https://www.cs.cmu.edu/~pradeepr/convexopt/Lecture_Slides/Newton_methods.pdf. Details about the Newton-decrement are added to the `show_progress` statements.
 
 #### 0.14.6
  - fix for n > 2 groups in `multivariate_logrank_test` (again).

From be8ecf5fd54a4b4fe4f9aa6ef43155877f59f855 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Mon, 10 Sep 2018 15:06:07 -0400
Subject: [PATCH 09/59] lost again

---
 lifelines/fitters/cox_time_varying_fitter.py |  8 +++++---
 lifelines/fitters/coxph_fitter.py            |  9 ++++++---
 lifelines/utils/__init__.py                  |  4 ++--
 tests/test_estimation.py                     | 11 ++++++++---
 4 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py
index b291f0608..9b2c9acbb 100644
--- a/lifelines/fitters/cox_time_varying_fitter.py
+++ b/lifelines/fitters/cox_time_varying_fitter.py
@@ -10,6 +10,7 @@
 
 from numpy import dot, exp
 from numpy.linalg import solve, norm, inv
+from scipy.linalg import solve as spsolve
 from lifelines.fitters import BaseFitter
 from lifelines.fitters.coxph_fitter import CoxPHFitter
 from lifelines.statistics import chisq_test
@@ -157,7 +158,7 @@ def _compute_sandwich_estimator(self, df, stop_times_events):
         return sandwich_estimator
 
     def _compute_standard_errors(self, df, stop_times_events):
-        if self.robust:
+        if self.robust: # TODO
             se = np.sqrt(self._compute_sandwich_estimator(df, stop_times_events).diagonal()) / self._norm_std
         else:
             se = np.sqrt(-inv(self._hessian_).diagonal()) / self._norm_std
@@ -261,9 +262,10 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size
             # convergence criteria
             if norm_delta < precision:
                 converging, completed = False, True
-            elif abs(ll - previous_ll) < precision:
+            elif abs(ll - previous_ll) / (-previous_ll) < 1e-09:
+                # this is what R uses by default
                 converging, completed = False, True
-            if newton_decrement < precision:
+            elif newton_decrement < 10e-8:
                 converging, completed = False, True
             elif i >= max_steps:
                 # 50 iterations steps with N-R is a lot.
diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index cce7998bc..4a526ef1a 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -260,9 +260,10 @@ def _newton_rhaphson(self, X, T, E, weights=None, initial_beta=None, step_size=N
             # convergence criteria
             if norm_delta < precision:
                 converging, completed = False, True
-            if newton_decrement < precision:
+            elif previous_ll != 0 and abs(ll - previous_ll) / (-previous_ll) < 1e-09:
+                # this is what R uses by default
                 converging, completed = False, True
-            elif abs(ll - previous_ll) < precision:
+            elif newton_decrement < precision:
                 converging, completed = False, True
             elif i >= max_steps:
                 # 50 iterations steps with N-R is a lot.
@@ -428,6 +429,8 @@ def _compute_confidence_intervals(self):
                             columns=self.hazards_.columns)
 
     def _compute_sandwich_estimator(self, X, T, E, weights):
+        # https://www.stat.tamu.edu/~carroll/ftp/gk001.pdf
+        # lin1989
 
         n, d = X.shape
 
@@ -472,7 +475,7 @@ def _compute_sandwich_estimator(self, X, T, E, weights):
             score = -sum(E[j] * phi_i / risk_phi_history[j] * (xi - risk_phi_x_history[j] / risk_phi_history[j]) for j in range(0, i+1))
 
             score = score + E[i] * (xi - risk_phi_x_history[i] / risk_phi_history[i])
-            score_covariance += (score.T).dot(score)
+            score_covariance += w * (score.T).dot(score)
 
         # TODO: need a faster way to invert these matrices
         sandwich_estimator = inv(self._hessian_).dot(score_covariance).dot(inv(self._hessian_))
diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py
index 09be09a8a..d0658b939 100644
--- a/lifelines/utils/__init__.py
+++ b/lifelines/utils/__init__.py
@@ -68,9 +68,9 @@ def qth_survival_times(q, survival_functions, cdf=False):
         #  Typically, one would expect that the output should equal the "height" of q.
         #  An issue can arise if the Series q contains duplicate values. We handle this un-eligantly.
         if q.duplicated().any():
-            return pd.DataFrame.from_items([
+            return pd.DataFrame.from_dict(dict([
                 (_q, survival_functions.apply(lambda s: qth_survival_time(_q, s))) for i, _q in enumerate(q)
-            ], orient='index', columns=survival_functions.columns)
+            ]), orient='index', columns=survival_functions.columns)
         else:
             return pd.DataFrame({_q: survival_functions.apply(lambda s: qth_survival_time(_q, s)) for _q in q}).T
 
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index ab3267f5e..46b92c285 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -1039,10 +1039,15 @@ def test_robust_errors_with_trival_weights_is_the_different_than_R(self, regress
         expected = pd.Series({'var1': 7.680, 'var2': -0.915})
         assert_series_equal(cph.hazards_.T['coef'], expected, check_less_precise=2, check_names=False)
 
+        variance_matrix = -np.linalg.inv(cph._hessian_) / np.outer(cph._norm_std, cph._norm_std)
         expected_cov = np.array([[33.079106, -5.964652], [-5.964652, 2.040642]])
         npt.assert_array_almost_equal(
-            w * cph.variance_matrix_, expected_cov,
-        decimal=2)
+            w * variance_matrix_, expected_cov,
+        decimal=1)
+
+        expected = pd.Series({'var1': 2.931, 'var2': 1.117})
+        assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False)
+
 
     def test_robust_errors_with_less_trival_weights_is_the_same_as_R(self, regression_dataset):
         """
@@ -1077,7 +1082,7 @@ def test_robust_errors_with_less_trival_weights_is_the_same_as_R(self, regressio
         expected_cov = np.array([[44.758444, -8.781867], [-8.781867, 2.606589]])
         npt.assert_array_almost_equal(
             variance_matrix, expected_cov,
-        decimal=2)
+        decimal=1) # not as precise because matrix inversion will accumulate estimation errors.
 
         expected = pd.Series({'var1': 2.931, 'var2': 1.117})
         assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False)

From 1c905582448be4efede37dd7391de03250e0a57d Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Thu, 20 Sep 2018 21:20:58 -0400
Subject: [PATCH 10/59] great! I've gotten weights + robust errors to work -
 took way too long. Next is to test whether censorship can be included

---
 lifelines/fitters/coxph_fitter.py | 32 +++++++++--------
 tests/test_estimation.py          | 58 +++++++++++++++++++++++--------
 2 files changed, 62 insertions(+), 28 deletions(-)

diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index 4a526ef1a..f216aa9ad 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -431,34 +431,37 @@ def _compute_confidence_intervals(self):
     def _compute_sandwich_estimator(self, X, T, E, weights):
         # https://www.stat.tamu.edu/~carroll/ftp/gk001.pdf
         # lin1989
+        # https://www.ics.uci.edu/~dgillen/STAT255/Handouts/lecture10.pdf
 
         n, d = X.shape
 
         # Init risk and tie sums to zero
         risk_phi = 0
         risk_phi_x = np.zeros((1, d))
+        running_weight_sum = 0
 
         # need to store these histories, as we access them often
         risk_phi_history = np.zeros((n,))
         risk_phi_x_history = np.zeros((n, d))
 
-        score_covariance = np.zeros((d, d))
 
         # we already unnormalized the betas in `fit`, so we need normalize them again since X is
         # normalized.
         beta = self.hazards_.values[0] * self._norm_std
-        weight_count = 0.0
+
+        score_residuals = np.zeros((n, d))
 
         # Iterate backwards to utilize recursive relationship
         for i in range(n - 1, -1, -1):
             # Doing it like this to preserve shape
             xi = X[i:i + 1]
+
             w = weights[i]
 
             phi_i = w * exp(dot(xi, beta))
             phi_x_i = phi_i * xi
 
-            risk_phi += phi_i
+            risk_phi   += phi_i
             risk_phi_x += phi_x_i
 
             risk_phi_history[i] = risk_phi # denom
@@ -466,28 +469,29 @@ def _compute_sandwich_estimator(self, X, T, E, weights):
 
         # Iterate forwards
         for i in range(0, n):
-            # Doing it like this to preserve shape
             # doesn't handle ties.
             xi = X[i:i + 1]
-            w = weights[i]
-            phi_i = w * exp(dot(xi, beta))
-
-            score = -sum(E[j] * phi_i / risk_phi_history[j] * (xi - risk_phi_x_history[j] / risk_phi_history[j]) for j in range(0, i+1))
+            phi_i = exp(dot(xi, beta))
 
+            score = -sum(
+                E[j] * (phi_i * weights[j]) / risk_phi_history[j] * (xi - risk_phi_x_history[j] / risk_phi_history[j]) for j in range(0, i+1)
+            )
             score = score + E[i] * (xi - risk_phi_x_history[i] / risk_phi_history[i])
-            score_covariance += w * (score.T).dot(score)
+            score *= weights[i]
+            score_residuals[i, :] = score
+
 
-        # TODO: need a faster way to invert these matrices
-        sandwich_estimator = inv(self._hessian_).dot(score_covariance).dot(inv(self._hessian_))
+        naive_var = inv(self._hessian_) / self._norm_std.values
+        delta_betas = score_residuals.dot(naive_var)
+        sandwich_estimator = delta_betas.T.dot(delta_betas)
         return sandwich_estimator
 
     def _compute_standard_errors(self, df, T, E, weights):
 
+        self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std)
         if self.robust:
-            se = np.sqrt(self._compute_sandwich_estimator(df.values, T.values, E.values, weights).diagonal()) / self._norm_std
-            #self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std)
+            se = np.sqrt(self._compute_sandwich_estimator(df.values, T.values, E.values, weights.values).diagonal()) # / self._norm_std
         else:
-            self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std)
             se = np.sqrt(self.variance_matrix_.diagonal())
         return pd.DataFrame(se[None, :],
                             index=['se'], columns=self.hazards_.columns)
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index 46b92c285..f6c79a900 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -1011,7 +1011,7 @@ def test_coef_output_against_R_using_non_trivial_but_integer_weights(self, rossi
         cf.fit(rossi_, duration_col='week', event_col='arrest', weights_col='weights')
         npt.assert_array_almost_equal(cf.hazards_.values, expected, decimal=4)
 
-    def test_robust_errors_with_trival_weights_is_the_different_than_R(self, regression_dataset):
+    def test_robust_errors_with_trival_weights_is_the_same_than_R(self, regression_dataset):
         """
         df <- data.frame(
             "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
@@ -1039,13 +1039,12 @@ def test_robust_errors_with_trival_weights_is_the_different_than_R(self, regress
         expected = pd.Series({'var1': 7.680, 'var2': -0.915})
         assert_series_equal(cph.hazards_.T['coef'], expected, check_less_precise=2, check_names=False)
 
-        variance_matrix = -np.linalg.inv(cph._hessian_) / np.outer(cph._norm_std, cph._norm_std)
         expected_cov = np.array([[33.079106, -5.964652], [-5.964652, 2.040642]])
         npt.assert_array_almost_equal(
-            w * variance_matrix_, expected_cov,
+            w * cph.variance_matrix_, expected_cov,
         decimal=1)
 
-        expected = pd.Series({'var1': 2.931, 'var2': 1.117})
+        expected = pd.Series({'var1': 2.097, 'var2': 0.827})
         assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False)
 
 
@@ -1054,39 +1053,70 @@ def test_robust_errors_with_less_trival_weights_is_the_same_as_R(self, regressio
         df <- data.frame(
             "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
             "var2" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092),
-            "T" = c( 7.335846, 5.269797, 11.684092, 12.678458, 6.601666)
+            "T" = c(1, 2, 3, 4, 5)
         )
         df['E'] = 1
-        df['var3'] = 0.75
-        df[1, 'var3'] = 1.75
+        df['var3'] = 2
+        df[4, 'var3'] = 1
         r = coxph(formula=Surv(T, E) ~ var1 + var2, data=df, weights=var3, robust=TRUE)
         r$var
         r$naive.var
+        residuals(r, type='dfbeta')
         """
 
         df = pd.DataFrame({
             "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294],
             "var2": [0.184677, 0.071893, 1.364646, 0.098375, 1.663092],
-            "T": [7.335846, 5.269797, 11.684092, 12.678458, 6.601666],
-            'var3': [1.75, 0.75, 0.75, 0.75, 0.75]
+            "T":    [1, 2, 3, 4, 5],
+            'var3': [2, 2, 2, 1, 2]
         })
         df['E'] = 1
 
         cph = CoxPHFitter()
         cph.fit(df, 'T', 'E', robust=True, weights_col='var3', show_progress=True)
-        expected = pd.Series({'var1': 7.995, 'var2': -1.154})
+        expected = pd.Series({'var1': 1.431, 'var2': -1.277})
         assert_series_equal(cph.hazards_.T['coef'], expected, check_less_precise=2, check_names=False)
 
 
-        variance_matrix = -np.linalg.inv(cph._hessian_) / np.outer(cph._norm_std, cph._norm_std)
-        expected_cov = np.array([[44.758444, -8.781867], [-8.781867, 2.606589]])
+        expected_cov = np.array([[3.5439245, -0.3549099], [-0.3549099, 0.4499553]])
         npt.assert_array_almost_equal(
-            variance_matrix, expected_cov,
+            cph.variance_matrix_, expected_cov,
         decimal=1) # not as precise because matrix inversion will accumulate estimation errors.
 
-        expected = pd.Series({'var1': 2.931, 'var2': 1.117})
+        expected = pd.Series({'var1': 2.094, 'var2': 0.452})
         assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False)
 
+    def test_robust_errors_with_non_trivial_weights_is_the_same_as_R(self, regression_dataset):
+        """
+        df <- data.frame(
+            "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
+            "var2" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092),
+            "var3" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092),
+            "T" =    c( 7.335846, 5.269797, 11.684092, 12.678458, 6.601666)
+        )
+        df['E'] = 1
+        r = coxph(formula=Surv(T, E) ~ var1 + var2, data=df, weights=var3, robust=TRUE)
+        r$var
+        r$naive.var
+        """
+
+        df = pd.DataFrame({
+            "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294],
+            "var2": [0.184677, 0.071893, 1.364646, 0.098375, 1.663092],
+            'var3': [0.184677, 0.071893, 1.364646, 0.098375, 1.663092],
+            "T":    [7.335846, 5.269797, 11.684092, 12.678458, 6.601666],
+        })
+        df['E'] = 1
+
+        cph = CoxPHFitter()
+        cph.fit(df, 'T', 'E', robust=True, weights_col='var3', show_progress=True)
+        expected = pd.Series({'var1': -5.16231, 'var2': 1.71924})
+        assert_series_equal(cph.hazards_.T['coef'], expected, check_less_precise=1, check_names=False)
+
+        expected = pd.Series({'var1': 9.97730, 'var2': 2.45648})
+        assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False)
+
+
 
     def test_robust_errors_is_the_same_as_R(self, regression_dataset):
         """

From 25b9c7a00d1e34b3cb0241032f68f947128d00d8 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Fri, 21 Sep 2018 21:20:43 -0400
Subject: [PATCH 11/59] more performant robust

---
 CHANGELOG.md                      |  2 +-
 lifelines/fitters/coxph_fitter.py | 58 +++++++++++--------------------
 tests/test_estimation.py          | 32 ++++++++++++++++-
 3 files changed, 53 insertions(+), 39 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6f5b7627b..6fac768ae 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,7 @@
 ### Changelogs
 
 #### 0.15.0
- - adding `robust` params to Cox models' `fit`. This enables atleast i) using non-integer weights in the model (these could be sampling weights like IPTW), and ii) misspecified models (ex: non-propotional hazards). Under the hood it's a sandwich estimator. This does not handle ties, so if there are high number of ties, results may significantly differ from other software.
+ - adding `robust` params to Cox models' `fit`. This enables atleast i) using non-integer weights in the model (these could be sampling weights like IPTW), and ii) mis-specified models (ex: non-proportional hazards). Under the hood it's a sandwich estimator. This does not handle ties, so if there are high number of ties, results may significantly differ from other software.
  - `standard_errors_` is now a property on fitted Cox models.
  - `variance_matrix_` is now a property on fitted `CoxPHFitter` which describes the variance matrix of the coefficients.
  - new criteria for convergence of `CoxPHFitter` and `CoxTimeVaryingFitter` called the Newton-decrement. Tests show it is as accurate (w.r.t to previous coefficients) and typically shaves off a single step, resulting in generally faster convergence. See https://www.cs.cmu.edu/~pradeepr/convexopt/Lecture_Slides/Newton_methods.pdf. Details about the Newton-decrement are added to the `show_progress` statements.
diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index f216aa9ad..c12782911 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -155,6 +155,7 @@ def fit(self, df, duration_col, event_col=None,
 
         self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns, index=['coef']) / self._norm_std
 
+        self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std)
         self.standard_errors_ = self._compute_standard_errors(normalize(df, self._norm_mean, self._norm_std), T, E, weights)
         self.confidence_intervals_ = self._compute_confidence_intervals()
 
@@ -432,63 +433,46 @@ def _compute_sandwich_estimator(self, X, T, E, weights):
         # https://www.stat.tamu.edu/~carroll/ftp/gk001.pdf
         # lin1989
         # https://www.ics.uci.edu/~dgillen/STAT255/Handouts/lecture10.pdf
+        # doesn't handle ties.
 
         n, d = X.shape
 
-        # Init risk and tie sums to zero
-        risk_phi = 0
-        risk_phi_x = np.zeros((1, d))
-        running_weight_sum = 0
-
-        # need to store these histories, as we access them often
-        risk_phi_history = np.zeros((n,))
-        risk_phi_x_history = np.zeros((n, d))
-
-
         # we already unnormalized the betas in `fit`, so we need normalize them again since X is
         # normalized.
         beta = self.hazards_.values[0] * self._norm_std
 
+        E = E.astype(int)
         score_residuals = np.zeros((n, d))
 
-        # Iterate backwards to utilize recursive relationship
-        for i in range(n - 1, -1, -1):
-            # Doing it like this to preserve shape
-            xi = X[i:i + 1]
-
-            w = weights[i]
-
-            phi_i = w * exp(dot(xi, beta))
-            phi_x_i = phi_i * xi
-
-            risk_phi   += phi_i
-            risk_phi_x += phi_x_i
+        phi_s = exp(dot(X, beta))
 
-            risk_phi_history[i] = risk_phi # denom
-            risk_phi_x_history[i] = risk_phi_x # a[i]
+        # need to store these histories, as we access them often
+        # this is a reverse cumulative sum. See original code in https://github.com/CamDavidsonPilon/lifelines/pull/496/files#diff-81ee0759dbae0770e1a02cf17f4cfbb1R431
+        risk_phi_x_history = (X * (weights * phi_s)[:, None])[::-1].cumsum(0)[::-1]
+        risk_phi_history =        (weights * phi_s)          [::-1].cumsum() [::-1][:, None]
 
         # Iterate forwards
         for i in range(0, n):
-            # doesn't handle ties.
+
             xi = X[i:i + 1]
-            phi_i = exp(dot(xi, beta))
+            phi_i = phi_s[i]
 
-            score = -sum(
-                E[j] * (phi_i * weights[j]) / risk_phi_history[j] * (xi - risk_phi_x_history[j] / risk_phi_history[j]) for j in range(0, i+1)
-            )
-            score = score + E[i] * (xi - risk_phi_x_history[i] / risk_phi_history[i])
-            score *= weights[i]
-            score_residuals[i, :] = score
+            score = - phi_i * (
+                (E[:i+1] * weights[:i+1] / risk_phi_history[:i+1].T).T  # this is constant-ish, and could be cached
+              * (xi - risk_phi_x_history[:i+1] / risk_phi_history[:i+1])
+            ).sum(0)
+
+            if E[i]:
+                score = score + (xi - risk_phi_x_history[i] / risk_phi_history[i])
 
+            score_residuals[i, :] = score
 
-        naive_var = inv(self._hessian_) / self._norm_std.values
-        delta_betas = score_residuals.dot(naive_var)
-        sandwich_estimator = delta_betas.T.dot(delta_betas)
+        naive_var = inv(self._hessian_)
+        delta_betas = score_residuals.dot(naive_var) * weights[:, None]
+        sandwich_estimator = delta_betas.T.dot(delta_betas) / np.outer(self._norm_std, self._norm_std)
         return sandwich_estimator
 
     def _compute_standard_errors(self, df, T, E, weights):
-
-        self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std)
         if self.robust:
             se = np.sqrt(self._compute_sandwich_estimator(df.values, T.values, E.values, weights.values).diagonal()) # / self._norm_std
         else:
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index f6c79a900..6148f2f22 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -1011,7 +1011,7 @@ def test_coef_output_against_R_using_non_trivial_but_integer_weights(self, rossi
         cf.fit(rossi_, duration_col='week', event_col='arrest', weights_col='weights')
         npt.assert_array_almost_equal(cf.hazards_.values, expected, decimal=4)
 
-    def test_robust_errors_with_trival_weights_is_the_same_than_R(self, regression_dataset):
+    def test_robust_errors_with_trivial_weights_is_the_same_than_R(self, regression_dataset):
         """
         df <- data.frame(
             "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
@@ -1117,6 +1117,36 @@ def test_robust_errors_with_non_trivial_weights_is_the_same_as_R(self, regressio
         assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False)
 
 
+    def test_robust_errors_with_non_trivial_weights_with_censorship_is_the_same_as_R(self, regression_dataset):
+        """
+        df <- data.frame(
+            "var1" = c(0.209325, 0.693919, 0.443804, 0.065636, 0.386294),
+            "var2" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092),
+            "var3" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092),
+            "T" =    c( 7.335846, 5.269797, 11.684092, 12.678458, 6.601666),
+            "E" =    c(1, 1, 0, 1, 1)
+        )
+        r = coxph(formula=Surv(T, E) ~ var1 + var2, data=df, weights=var3, robust=TRUE)
+        r$var
+        r$naive.var
+        """
+
+        df = pd.DataFrame({
+            "var1": [0.209325, 0.693919, 0.443804, 0.065636, 0.386294],
+            "var2": [0.184677, 0.071893, 1.364646, 0.098375, 1.663092],
+            'var3': [0.184677, 0.071893, 1.364646, 0.098375, 1.663092],
+            "T":    [7.335846, 5.269797, 11.684092, 12.678458, 6.601666],
+            "E":    [1, 1, 0, 1, 1],
+        })
+
+        cph = CoxPHFitter()
+        cph.fit(df, 'T', 'E', robust=True, weights_col='var3', show_progress=True)
+        expected = pd.Series({'var1': -8.360533, 'var2': 1.781126})
+        assert_series_equal(cph.hazards_.T['coef'], expected, check_less_precise=3, check_names=False)
+
+        expected = pd.Series({'var1': 12.303338, 'var2': 2.395670})
+        assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=3, check_names=False)
+
 
     def test_robust_errors_is_the_same_as_R(self, regression_dataset):
         """

From d4b64e7cecea3bfe8605c5416fd159cddf210bf5 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Thu, 4 Oct 2018 19:02:02 -0400
Subject: [PATCH 12/59] need a test for this, but fix overflow in concordance
 index

---
 lifelines/utils/__init__.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py
index d0658b939..06e746d77 100644
--- a/lifelines/utils/__init__.py
+++ b/lifelines/utils/__init__.py
@@ -894,8 +894,8 @@ def _concordance_index(event_times, predicted_event_times, event_observed):
     censored_ix = 0
     died_ix = 0
     times_to_compare = _BTree(np.unique(died_pred))
-    num_pairs = 0
-    num_correct = 0
+    num_pairs = np.int64(0)
+    num_correct = np.int64(0)
     num_tied = np.int64(0)
 
     def handle_pairs(truth, pred, first_ix):
@@ -912,8 +912,8 @@ def handle_pairs(truth, pred, first_ix):
         while next_ix < len(truth) and truth[next_ix] == truth[first_ix]:
             next_ix += 1
         pairs = len(times_to_compare) * (next_ix - first_ix)
-        correct = 0
-        tied = 0
+        correct = np.int64(0)
+        tied = np.int64(0)
         for i in range(first_ix, next_ix):
             rank, count = times_to_compare.rank(pred[i])
             correct += rank

From b3fe81974914cd8cf0649245bd89edec47179e51 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Thu, 11 Oct 2018 09:08:16 -0400
Subject: [PATCH 13/59] use new ConvergenceError, and make qth_survival_times
 much cleaner

---
 CHANGELOG.md                                 |  2 ++
 docs/Examples.rst                            |  2 ++
 lifelines/fitters/cox_time_varying_fitter.py | 18 ++++++++++++++----
 lifelines/fitters/coxph_fitter.py            | 14 ++++++++++++--
 lifelines/utils/__init__.py                  | 20 +++++++++++++++-----
 requirements.txt                             |  2 +-
 tests/utils/test_utils.py                    |  3 +++
 7 files changed, 49 insertions(+), 12 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6fac768ae..505be2ef7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,8 @@
  - `standard_errors_` is now a property on fitted Cox models.
  - `variance_matrix_` is now a property on fitted `CoxPHFitter` which describes the variance matrix of the coefficients.
  - new criteria for convergence of `CoxPHFitter` and `CoxTimeVaryingFitter` called the Newton-decrement. Tests show it is as accurate (w.r.t to previous coefficients) and typically shaves off a single step, resulting in generally faster convergence. See https://www.cs.cmu.edu/~pradeepr/convexopt/Lecture_Slides/Newton_methods.pdf. Details about the Newton-decrement are added to the `show_progress` statements.
+ - Minimum suppport for scipy is 1.0
+ - Convergence errors in models that use Newton-Rhapson methods now throw a `ConvergenceError`, instead of a `ValueError` (the former is a subclass of the latter, however).
 
 #### 0.14.6
  - fix for n > 2 groups in `multivariate_logrank_test` (again).
diff --git a/docs/Examples.rst b/docs/Examples.rst
index 1080a6646..935b3de2f 100644
--- a/docs/Examples.rst
+++ b/docs/Examples.rst
@@ -573,6 +573,8 @@ Problems with convergence in the Cox Proportional Hazard Model
 ################################################################
 Since the estimation of the coefficients in the Cox proportional hazard model is done using the Newton-Raphson algorithm, there is sometimes a problem with convergence. Here are some common symptoms and possible resolutions:
 
+ 0. First diagnostic: look for ``ConvergenceWarning`` in the output. Most often problems in convergence are the result of problems in the dataset. Lifelines has diagnostic checks it runs against the dataset before fitting and warnings are outputted to the user. 
+
  1. ``delta contains nan value(s). Convergence halted.``: First try adding ``show_progress=True`` in the ``fit`` function. If the values in ``delta`` grow unboundedly, it's possible the ``step_size`` is too large. Try setting it to a small value (0.1-0.5). 
 
  2. ``LinAlgError: Singular matrix``: This means that there is a linear combination in your dataset. That is, a column is equal to the linear combination of 1 or more other columns. Try to find the relationship by looking at the correlation matrix of your dataset.
diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py
index 9b2c9acbb..2276e2f79 100644
--- a/lifelines/fitters/cox_time_varying_fitter.py
+++ b/lifelines/fitters/cox_time_varying_fitter.py
@@ -19,7 +19,7 @@
     pass_for_numeric_dtypes_or_raise, check_low_var,\
     check_for_overlapping_intervals, check_complete_separation_low_variance,\
     ConvergenceWarning, StepSizer, _get_index, check_for_immediate_deaths,\
-    check_for_instantaneous_events
+    check_for_instantaneous_events, ConvergenceError
 
 
 class CoxTimeVaryingFitter(BaseFitter):
@@ -243,12 +243,22 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size
                 g -= self.penalizer * beta.T
                 h.flat[::d + 1] -= self.penalizer
 
-            # reusing a piece to make g * inv(h) * g.T faster later
-            inv_h_dot_g_T = spsolve(-h, g.T, sym_pos=True)
+            try:
+                # reusing a piece to make g * inv(h) * g.T faster later
+                inv_h_dot_g_T = spsolve(-h, g.T, sym_pos=True)
+            except ValueError as e:
+                if 'infs or NaNs' in e.message:
+                    raise ConvergenceError("""hessian or gradient contains nan or inf value(s). Convergence halted. Please see the following tips in the lifelines documentation:
+https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-model
+""")
+                else:
+                    # something else?
+                    raise e
+
             delta = step_size * inv_h_dot_g_T
 
             if np.any(np.isnan(delta)):
-                raise ValueError("""delta contains nan value(s). Convergence halted. Please see the following tips in the lifelines documentation:
+                raise ConvergenceError("""delta contains nan value(s). Convergence halted. Please see the following tips in the lifelines documentation:
 https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-model
 """)
             # Save these as pending result
diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index c12782911..b2a937ea0 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -240,11 +240,21 @@ def _newton_rhaphson(self, X, T, E, weights=None, initial_beta=None, step_size=N
                 h.flat[::d + 1] -= self.penalizer
 
             # reusing a piece to make g * inv(h) * g.T faster later
-            inv_h_dot_g_T = spsolve(-h, g.T, sym_pos=True)
+            try:
+                inv_h_dot_g_T = spsolve(-h, g.T, sym_pos=True)
+            except ValueError as e:
+                if 'infs or NaNs' in e.message:
+                    raise ConvergenceError("""hessian or gradient contains nan or inf value(s). Convergence halted. Please see the following tips in the lifelines documentation:
+https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-model
+""")
+                else:
+                    # something else?
+                    raise e
+
             delta = step_size * inv_h_dot_g_T
 
             if np.any(np.isnan(delta)):
-                raise ValueError("""delta contains nan value(s). Convergence halted. Please see the following tips in the lifelines documentation:
+                raise ConvergenceError("""delta contains nan value(s). Convergence halted. Please see the following tips in the lifelines documentation:
 https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-model
 """)
 
diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py
index 06e746d77..91cfbefc7 100644
--- a/lifelines/utils/__init__.py
+++ b/lifelines/utils/__init__.py
@@ -34,6 +34,16 @@ def __str__(self):
         return repr(self.msg)
 
 
+class ConvergenceError(ValueError):
+    # inherits from ValueError for backwards compatilibity reasons
+
+    def __init__(self, msg):
+        self.msg = msg
+
+    def __str__(self):
+        return repr(self.msg)
+
+
 class ConvergenceWarning(RuntimeWarning):
 
     def __init__(self, msg):
@@ -65,14 +75,14 @@ def qth_survival_times(q, survival_functions, cdf=False):
     if survival_functions.shape[1] == 1 and q.shape == (1,):
         return survival_functions.apply(lambda s: qth_survival_time(q[0], s, cdf=cdf)).iloc[0]
     else:
+        survival_times = pd.DataFrame({_q: survival_functions.apply(lambda s: qth_survival_time(_q, s)) for _q in q}).T
+
         #  Typically, one would expect that the output should equal the "height" of q.
         #  An issue can arise if the Series q contains duplicate values. We handle this un-eligantly.
         if q.duplicated().any():
-            return pd.DataFrame.from_dict(dict([
-                (_q, survival_functions.apply(lambda s: qth_survival_time(_q, s))) for i, _q in enumerate(q)
-            ]), orient='index', columns=survival_functions.columns)
-        else:
-            return pd.DataFrame({_q: survival_functions.apply(lambda s: qth_survival_time(_q, s)) for _q in q}).T
+            survival_times = survival_times.loc[q]
+
+        return survival_times
 
 
 def qth_survival_time(q, survival_function, cdf=False):
diff --git a/requirements.txt b/requirements.txt
index b57710b75..d8a2807d6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
 numpy
-scipy
+scipy>=1.0
 pandas>=0.18
 matplotlib>=2.0
diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py
index a6b17202e..5e7a3372a 100644
--- a/tests/utils/test_utils.py
+++ b/tests/utils/test_utils.py
@@ -165,6 +165,9 @@ def test_qth_survival_times_with_duplicate_q_returns_valid_index_and_shape():
     q = pd.Series([0.5, 0.5, 0.2, 0.0, 0.0])
     actual = utils.qth_survival_times(q, sf)
     assert actual.shape[0] == len(q)
+    assert actual.index[0] == actual.index[1]
+    assert_series_equal(actual.iloc[0], actual.iloc[1])
+
     npt.assert_almost_equal(actual.index.values, q.values)
 
 

From 2468bc29cfc2018103942b89dab6b26eb2791f18 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Fri, 12 Oct 2018 12:33:20 -0400
Subject: [PATCH 14/59] close....

---
 CHANGELOG.md                                 |  1 +
 lifelines/fitters/aalen_additive_fitter.py   |  7 +-
 lifelines/fitters/cox_time_varying_fitter.py | 90 ++++++++++++--------
 lifelines/utils/__init__.py                  |  5 +-
 tests/test_estimation.py                     | 67 ++++++++++++++-
 5 files changed, 127 insertions(+), 43 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 505be2ef7..171efbc6b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@
  - new criteria for convergence of `CoxPHFitter` and `CoxTimeVaryingFitter` called the Newton-decrement. Tests show it is as accurate (w.r.t to previous coefficients) and typically shaves off a single step, resulting in generally faster convergence. See https://www.cs.cmu.edu/~pradeepr/convexopt/Lecture_Slides/Newton_methods.pdf. Details about the Newton-decrement are added to the `show_progress` statements.
  - Minimum suppport for scipy is 1.0
  - Convergence errors in models that use Newton-Rhapson methods now throw a `ConvergenceError`, instead of a `ValueError` (the former is a subclass of the latter, however).
+ - `AalenAdditiveModel` raises `ConvergenceWarning` instead of printing a warning.
 
 #### 0.14.6
  - fix for n > 2 groups in `multivariate_logrank_test` (again).
diff --git a/lifelines/fitters/aalen_additive_fitter.py b/lifelines/fitters/aalen_additive_fitter.py
index 5f67ba05a..69b4e0389 100644
--- a/lifelines/fitters/aalen_additive_fitter.py
+++ b/lifelines/fitters/aalen_additive_fitter.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 from __future__ import print_function
+import warnings
 
 import numpy as np
 import pandas as pd
@@ -9,7 +10,7 @@
 from lifelines.fitters import BaseFitter
 from lifelines.utils import _get_index, inv_normal_cdf, epanechnikov_kernel, \
     ridge_regression as lr, qth_survival_times, pass_for_numeric_dtypes_or_raise,\
-    concordance_index, check_nans
+    concordance_index, check_nans, ConvergenceWarning
 
 from lifelines.utils.progress_bar import progress_bar
 from lifelines.plotting import fill_between_steps
@@ -185,7 +186,7 @@ def _fit_static(self, dataframe, duration_col, event_col=None,
             try:
                 v, V = lr(df.values, relevant_individuals, c1=self.coef_penalizer, c2=self.smoothing_penalizer, offset=previous_hazard)
             except LinAlgError:
-                print("Linear regression error. Try increasing the penalizer term.")
+                warnings.warn("Linear regression error. Try increasing the penalizer term.", ConvergenceWarning)
 
             hazards_.loc[time, id_] = v.T
             variance_.loc[time, id_] = V[:, relevant_individuals][:, 0] ** 2
@@ -278,7 +279,7 @@ def _fit_varying(self, dataframe, duration_col="T", event_col="E",
             try:
                 v, V = lr(wp[time].values, relevant_individuals, c1=self.coef_penalizer, c2=self.smoothing_penalizer, offset=previous_hazard)
             except LinAlgError:
-                print("Linear regression error. Try increasing the penalizer term.")
+                warnings.warn("Linear regression error. Try increasing the penalizer term.", ConvergenceWarning)
 
             hazards_.loc[id, time] = v.T
             variance_.loc[id, time] = V[:, relevant_individuals][:, 0] ** 2
diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py
index 2276e2f79..ea2002bff 100644
--- a/lifelines/fitters/cox_time_varying_fitter.py
+++ b/lifelines/fitters/cox_time_varying_fitter.py
@@ -38,7 +38,7 @@ def __init__(self, alpha=0.95, penalizer=0.0):
         self.alpha = alpha
         self.penalizer = penalizer
 
-    def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_progress=False, step_size=None, robust=False):
+    def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', weights_col=None, show_progress=False, step_size=None, robust=False):
         """
         Fit the Cox Propertional Hazard model to a time varying dataset. Tied survival times
         are handled using Efron's tie-method.
@@ -54,6 +54,7 @@ def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_pr
              observation. If left as None, assume all individuals are non-censored.
           start_col: the column that contains the start of a subject's time period.
           stop_col: the column that contains the end of a subject's time period.
+          weights_col: the column that contains (possibly time-varying) weight of each subject-period row.
           show_progress: since the fitter is iterative, show convergence
              diagnostics.
           step_size: set an initial step size for the fitting algorithm.
@@ -67,28 +68,36 @@ def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_pr
 
         """
 
+        self.robust = robust
+
         df = df.copy()
 
         if not (id_col in df and event_col in df and start_col in df and stop_col in df):
             raise KeyError("A column specified in the call to `fit` does not exist in the dataframe provided.")
 
-        df = df.rename(columns={id_col: 'id', event_col: 'event', start_col: 'start', stop_col: 'stop'})
+        if weights_col is None:
+            assert '__weights' not in df.columns, '__weights is an internal lifelines column, please rename your column first.'
+            df['__weights'] = 1.0
+
+        df = df.rename(columns={id_col: 'id', event_col: 'event', start_col: 'start', stop_col: 'stop', weights_col: '__weights'})
         df = df.set_index('id')
         stop_times_events = df[["event", "stop", "start"]].copy()
-        df = df.drop(["event", "stop", "start"], axis=1)
+        weights = df[['__weights']].copy().astype(float)
+        df = df.drop(["event", "stop", "start", "__weights"], axis=1)
         stop_times_events['event'] = stop_times_events['event'].astype(bool)
 
+
         self._check_values(df, stop_times_events)
         df = df.astype(float)
 
         self._norm_mean = df.mean(0)
         self._norm_std = df.std(0)
 
-        hazards_ = self._newton_rhaphson(normalize(df, self._norm_mean, self._norm_std), stop_times_events, show_progress=show_progress,
+        hazards_ = self._newton_rhaphson(normalize(df, self._norm_mean, self._norm_std), stop_times_events, weights, show_progress=show_progress,
                                          step_size=step_size)
 
         self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns, index=['coef']) / self._norm_std
-        self.standard_errors_ = self._compute_standard_errors(normalize(df, self._norm_mean, self._norm_std), stop_times_events)
+        self.standard_errors_ = self._compute_standard_errors(normalize(df, self._norm_mean, self._norm_std), stop_times_events, weights)
         self.confidence_intervals_ = self._compute_confidence_intervals()
         self.baseline_cumulative_hazard_ = self._compute_cumulative_baseline_hazard(df, stop_times_events)
         self.baseline_survival_ = self._compute_baseline_survival()
@@ -97,7 +106,6 @@ def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_pr
 
         self._n_examples = df.shape[0]
         self._n_unique = df.index.unique().shape[0]
-
         return self
 
     @staticmethod
@@ -121,8 +129,8 @@ def _compute_sandwich_estimator(self, df, stop_times_events):
         risk_phi_history = pd.DataFrame(np.zeros((n,)), index=df.index)
         risk_phi_x_history = pd.DataFrame(np.zeros((n, d)), index=df.index)
 
-        score_covariance = np.zeros((d, d))
-
+        E = E.astype(int)
+        score_residuals = np.zeros((n, d))
         # we already unnormalized the betas in `fit`, so we need normalize them again since X is
         # normalized.
         beta = self.hazards_.values[0] * self._norm_std
@@ -148,23 +156,26 @@ def _compute_sandwich_estimator(self, df, stop_times_events):
             xi = X[i:i + 1]
             phi_i = exp(dot(xi, beta))
 
-            correction_term = sum(E[j] * phi_i / risk_phi_history[j] * (xi - risk_phi_x_history[j] / risk_phi_history[j]) for j in range(0, i+1))
-
-            score = E[i] * (xi - risk_phi_x_history[i] / risk_phi_history[i]) - correction_term
-            score_covariance += (score.T).dot(score)
+            score = -sum(E[j] * weights[j] * phi_i / risk_phi_history[j] * (xi - risk_phi_x_history[j] / risk_phi_history[j]) for j in range(0, i+1))
+            score = score + E[i] * (xi - risk_phi_x_history[i] / risk_phi_history[i])
+            score *= weights[i]
+            score_residuals[i, :] = score
 
-        # TODO: need a faster way to invert these matrices
-        sandwich_estimator = inv(self._hessian_).dot(score_covariance).dot(inv(self._hessian_))
+        naive_var = inv(self._hessian_)
+        delta_betas = score_residuals.dot(naive_var) * weights[:, None]
+        sandwich_estimator = delta_betas.T.dot(delta_betas) / np.outer(self._norm_std, self._norm_std)
         return sandwich_estimator
 
-    def _compute_standard_errors(self, df, stop_times_events):
-        if self.robust: # TODO
-            se = np.sqrt(self._compute_sandwich_estimator(df, stop_times_events).diagonal()) / self._norm_std
+
+    def _compute_standard_errors(self, df, stop_times_events, weights):
+        if self.robust:
+            se = np.sqrt(self._compute_sandwich_estimator(df.values, T.values, E.values, weights.values).diagonal()) # / self._norm_std
         else:
             se = np.sqrt(-inv(self._hessian_).diagonal()) / self._norm_std
         return pd.DataFrame(se[None, :],
                             index=['se'], columns=self.hazards_.columns)
 
+
     def _compute_z_values(self):
         return (self.hazards_.loc['coef'] /
                 self.standard_errors_.loc['se'])
@@ -201,7 +212,7 @@ def summary(self):
         df['upper %.2f' % self.alpha] = self.confidence_intervals_.loc['upper-bound'].values
         return df
 
-    def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size=None, precision=10e-6,
+    def _newton_rhaphson(self, df, stop_times_events, weights, show_progress=False, step_size=None, precision=10e-6,
                          max_steps=50):
         """
         Newton Rhaphson algorithm for fitting CPH model.
@@ -236,7 +247,7 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size
 
         while converging:
             i += 1
-            h, g, ll = self._get_gradients(df, stop_times_events, beta)
+            h, g, ll = self._get_gradients(df, stop_times_events, weights, beta)
 
             if self.penalizer > 0:
                 # add the gradient and hessian of the l2 term
@@ -272,7 +283,7 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size
             # convergence criteria
             if norm_delta < precision:
                 converging, completed = False, True
-            elif abs(ll - previous_ll) / (-previous_ll) < 1e-09:
+            elif previous_ll > 0 and abs(ll - previous_ll) / (-previous_ll) < 1e-09:
                 # this is what R uses by default
                 converging, completed = False, True
             elif newton_decrement < 10e-8:
@@ -303,7 +314,7 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size
 
         return beta
 
-    def _get_gradients(self, df, stops_events, beta):
+    def _get_gradients(self, df, stops_events, weights, beta):
         """
         Calculates the first and second order vector differentials, with respect to beta.
 
@@ -324,9 +335,10 @@ def _get_gradients(self, df, stops_events, beta):
 
             ix = (stops_events['start'] < t) & (t <= stops_events['stop'])
             df_at_t = df.loc[ix]
+            weights_at_t = weights.loc[ix]
             stops_events_at_t = stops_events.loc[ix]
 
-            phi_i = exp(dot(df_at_t, beta))
+            phi_i = weights_at_t.values * exp(dot(df_at_t, beta))
             phi_x_i = phi_i * df_at_t
             phi_x_x_i = dot(df_at_t.T, phi_x_i)
 
@@ -337,43 +349,47 @@ def _get_gradients(self, df, stops_events, beta):
 
             # Calculate the sums of Tie set
             deaths = stops_events_at_t['event'] & (stops_events_at_t['stop'] == t)
-            death_counts = deaths.sum()  # should always be atleast 1.
+
+            ties_counts = deaths.sum()  # should always at least 1
 
             xi_deaths = df_at_t.loc[deaths]
+            weights_deaths = weights_at_t.loc[deaths].values
 
-            x_death_sum = xi_deaths.sum(0).values
+            x_death_sum = (weights_deaths * xi_deaths).sum(0).values
 
-            if death_counts > 1:
+            if ties_counts > 1:
                 # it's faster if we can skip computing these when we don't need to.
                 tie_phi = phi_i[deaths.values].sum()
                 tie_phi_x = phi_x_i.loc[deaths].sum(0).values
                 tie_phi_x_x = dot(xi_deaths.T, phi_i[deaths.values] * xi_deaths)
 
             partial_gradient = np.zeros(d)
+            weight_count = weights_deaths.sum()
+            weighted_average = weight_count / ties_counts
+
 
-            for l in range(death_counts):
+            for l in range(ties_counts):
 
-                if death_counts > 1:
-                    c = l / death_counts
-                    denom = (risk_phi - c * tie_phi)
-                    z = (risk_phi_x - c * tie_phi_x)
+                if ties_counts > 1:
+                    denom = (risk_phi - l * tie_phi / ties_counts)
+                    numer = (risk_phi_x - l * tie_phi_x / ties_counts)
                     # Hessian
-                    a1 = (risk_phi_x_x - c * tie_phi_x_x) / denom
+                    a1 = (risk_phi_x_x - l * tie_phi_x_x / ties_counts) / denom
                 else:
                     denom = risk_phi
-                    z = risk_phi_x
+                    numer = risk_phi_x
                     # Hessian
                     a1 = risk_phi_x_x / denom
 
                 # Gradient
-                partial_gradient += z / denom
-                # In case z and denom both are really small numbers,
+                partial_gradient += weighted_average * numer / denom
+                # In case numer and denom both are really small numbers,
                 # make sure to do division before multiplications
-                a2 = np.outer(z / denom, z / denom)
+                a2 = np.outer(numer / denom, numer / denom)
 
-                hessian -= (a1 - a2)
+                hessian -= weighted_average * (a1 - a2)
+                log_lik -= weighted_average * np.log(denom)
 
-                log_lik -= np.log(denom)
 
             # Values outside tie sum
             gradient += x_death_sum - partial_gradient
diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py
index 91cfbefc7..07443e169 100644
--- a/lifelines/utils/__init__.py
+++ b/lifelines/utils/__init__.py
@@ -68,7 +68,7 @@ def qth_survival_times(q, survival_functions, cdf=False):
     """
     q = pd.Series(q)
 
-    if not((q <= 1).all() and (0 <= q).all()):
+    if not ((q <= 1).all() and (0 <= q).all()):
         raise ValueError('q must be between 0 and 1')
 
     survival_functions = pd.DataFrame(survival_functions)
@@ -78,7 +78,8 @@ def qth_survival_times(q, survival_functions, cdf=False):
         survival_times = pd.DataFrame({_q: survival_functions.apply(lambda s: qth_survival_time(_q, s)) for _q in q}).T
 
         #  Typically, one would expect that the output should equal the "height" of q.
-        #  An issue can arise if the Series q contains duplicate values. We handle this un-eligantly.
+        #  An issue can arise if the Series q contains duplicate values. We solve
+        #  this by duplicating the entire row.
         if q.duplicated().any():
             survival_times = survival_times.loc[q]
 
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index 6148f2f22..ac73025e9 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -19,7 +19,7 @@
 import numpy.testing as npt
 from numpy.linalg.linalg import LinAlgError
 
-from lifelines.utils import k_fold_cross_validation, StatError, concordance_index, ConvergenceWarning
+from lifelines.utils import k_fold_cross_validation, StatError, concordance_index, ConvergenceWarning, to_long_format
 from lifelines.estimation import CoxPHFitter, AalenAdditiveFitter, KaplanMeierFitter, \
     NelsonAalenFitter, BreslowFlemingHarringtonFitter, ExponentialFitter, \
     WeibullFitter, BaseFitter, CoxTimeVaryingFitter
@@ -1884,6 +1884,71 @@ def test_fitter_will_error_if_degenerate_time(self, ctv):
         ctv.fit(df, id_col="id", start_col="start", stop_col="stop", event_col="event")
         assert True
 
+    def test_ctv_fitter_will_hande_trivial_weight_col(self, ctv, dfcv):
+        ctv.fit(dfcv, id_col="id", start_col="start", stop_col="stop", event_col="event")
+        coefs_no_weights = ctv.summary['coef'].values
+
+        dfcv['weight'] = 1.0
+        ctv.fit(dfcv, id_col="id", start_col="start", stop_col="stop", event_col="event", weights_col='weight')
+        coefs_trivial_weights = ctv.summary['coef'].values
+
+        npt.assert_almost_equal(coefs_no_weights, coefs_trivial_weights, decimal=3)
+
+
+    def test_ctv_fitter_will_hande_integer_weight_col_on_tv_dataset(self, ctv, dfcv):
+
+        # duplicate a few subjects
+        dfcv_unfolded = dfcv.copy()
+        for _id in [10, 9, 8, 7]:
+            to_append = dfcv[dfcv['id'].isin([_id])].copy()
+            to_append['id'] = (10 + _id)
+            dfcv_unfolded = dfcv_unfolded.append(to_append)
+        dfcv_unfolded = dfcv_unfolded.reset_index(drop=True)
+        print(dfcv_unfolded[(dfcv_unfolded['start'] < 5) & (5 <= dfcv_unfolded['stop'])])
+
+        ctv = CoxTimeVaryingFitter()
+        ctv.fit(dfcv_unfolded, id_col="id", start_col="start", stop_col="stop", event_col="event", show_progress=True)
+        coefs_unfolded_weights = ctv.hazards_
+
+
+        dfcv_folded = dfcv.copy()
+        dfcv_folded['weights'] = 1.0
+        dfcv_folded.loc[dfcv_folded['id'].isin([10,9,8,7]), 'weights'] = 2.0
+        print(dfcv_folded[(dfcv_folded['start'] < 5) & (5 <= dfcv_folded['stop'])])
+
+        ctv = CoxTimeVaryingFitter()
+        ctv.fit(dfcv_folded, id_col="id", start_col="start", stop_col="stop", event_col="event", weights_col='weights', show_progress=True)
+        coefs_folded_weights = ctv.hazards_
+
+        print(coefs_unfolded_weights)
+        print(coefs_folded_weights)
+        assert_frame_equal(coefs_unfolded_weights, coefs_folded_weights)
+
+
+    def test_ctv_fitter_will_give_the_same_results_as_static_cox_model(self, ctv, rossi):
+
+        rossi = rossi.reset_index()
+        rossi = to_long_format(rossi, 'week')
+
+        expected = np.array([[-0.3794, -0.0574, 0.3139, -0.1498, -0.4337, -0.0849,  0.0915]])
+        ctv.fit(rossi, start_col='start', stop_col='stop', event_col='arrest', id_col='index')
+        npt.assert_array_almost_equal(ctv.hazards_.values, expected, decimal=4)
+
+
+    def test_ctv_fitter_will_handle_integer_weight_as_static_model(self, ctv, rossi):
+        rossi_ = rossi.copy()
+        rossi_['weights'] = 1.
+        rossi_ = rossi_.groupby(rossi.columns.tolist())['weights'].sum()\
+                       .reset_index()
+
+        # create the id column this way.
+        rossi_ = rossi_.reset_index()
+        rossi_ = to_long_format(rossi_, 'week')
+
+        expected = np.array([[-0.3794, -0.0574, 0.3139, -0.1498, -0.4337, -0.0849,  0.0915]])
+        ctv.fit(rossi_, start_col='start', stop_col='stop', event_col='arrest', id_col='index', weights_col='weights')
+        npt.assert_array_almost_equal(ctv.hazards_.values, expected, decimal=4)
+
 
     def test_fitter_accept_boolean_columns(self, ctv):
         df = pd.DataFrame.from_records([

From 0eb027c3f1bcffd275c3b7383e0aed533927511f Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Fri, 12 Oct 2018 13:28:09 -0400
Subject: [PATCH 15/59] not sure why this is failing

---
 tests/test_estimation.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index ac73025e9..58c152d9a 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -1896,7 +1896,7 @@ def test_ctv_fitter_will_hande_trivial_weight_col(self, ctv, dfcv):
 
 
     def test_ctv_fitter_will_hande_integer_weight_col_on_tv_dataset(self, ctv, dfcv):
-
+        # not sure yet why this is failing.
         # duplicate a few subjects
         dfcv_unfolded = dfcv.copy()
         for _id in [10, 9, 8, 7]:
@@ -1904,7 +1904,7 @@ def test_ctv_fitter_will_hande_integer_weight_col_on_tv_dataset(self, ctv, dfcv)
             to_append['id'] = (10 + _id)
             dfcv_unfolded = dfcv_unfolded.append(to_append)
         dfcv_unfolded = dfcv_unfolded.reset_index(drop=True)
-        print(dfcv_unfolded[(dfcv_unfolded['start'] < 5) & (5 <= dfcv_unfolded['stop'])])
+        print(dfcv_unfolded[(dfcv_unfolded['start'] < 7) & (7 <= dfcv_unfolded['stop'])])
 
         ctv = CoxTimeVaryingFitter()
         ctv.fit(dfcv_unfolded, id_col="id", start_col="start", stop_col="stop", event_col="event", show_progress=True)
@@ -1914,7 +1914,7 @@ def test_ctv_fitter_will_hande_integer_weight_col_on_tv_dataset(self, ctv, dfcv)
         dfcv_folded = dfcv.copy()
         dfcv_folded['weights'] = 1.0
         dfcv_folded.loc[dfcv_folded['id'].isin([10,9,8,7]), 'weights'] = 2.0
-        print(dfcv_folded[(dfcv_folded['start'] < 5) & (5 <= dfcv_folded['stop'])])
+        print(dfcv_folded[(dfcv_folded['start'] < 7) & (7 <= dfcv_folded['stop'])])
 
         ctv = CoxTimeVaryingFitter()
         ctv.fit(dfcv_folded, id_col="id", start_col="start", stop_col="stop", event_col="event", weights_col='weights', show_progress=True)

From 5a02162b3ea2dadd46423b5c69d7e2885a286d5f Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Fri, 12 Oct 2018 14:05:24 -0400
Subject: [PATCH 16/59] add kmf cumulative option

---
 CHANGELOG.md                  |   1 +
 docs/Examples.rst             |  12 ++++++++++++
 docs/images/invert_y_axis.png | Bin 0 -> 30719 bytes
 lifelines/plotting.py         |  16 ++++++++++++++--
 tests/test_plotting.py        |  16 ++++++++++++++++
 5 files changed, 43 insertions(+), 2 deletions(-)
 create mode 100644 docs/images/invert_y_axis.png

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 171efbc6b..654498f1d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@
  - Minimum suppport for scipy is 1.0
  - Convergence errors in models that use Newton-Rhapson methods now throw a `ConvergenceError`, instead of a `ValueError` (the former is a subclass of the latter, however).
  - `AalenAdditiveModel` raises `ConvergenceWarning` instead of printing a warning.
+ - `KaplanMeierFitter` now has a cumulative plot option. Example `kmf.plot(invert_y_axis=True)`
 
 #### 0.14.6
  - fix for n > 2 groups in `multivariate_logrank_test` (again).
diff --git a/docs/Examples.rst b/docs/Examples.rst
index 935b3de2f..a84055296 100644
--- a/docs/Examples.rst
+++ b/docs/Examples.rst
@@ -282,6 +282,18 @@ Hide confidence intervals
    :height: 300
 
 
+Invert axis
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: python
+
+    kmf.fit(T, label="kmf.plot(invert_y_axis=True)")
+    kmf.plot(invert_y_axis=True)
+
+.. image:: /images/invert_y_axis.png 
+   :height: 300
+
+
 Set the index/timeline of a estimate
 ##############################################
 
diff --git a/docs/images/invert_y_axis.png b/docs/images/invert_y_axis.png
new file mode 100644
index 0000000000000000000000000000000000000000..86ff80593cccd4b238c98133cb507def10dc6f09
GIT binary patch
literal 30719
zcmeFZc{r5s8~=SH6iG=@WNlHhB}>T?2_Z`MokI4qW|=`r2uWoN*|Uf2%aASEl4UG0
zw(L7&nK3iJ>(=-4Y59JS-|_tO{P7&e<ER7UUatFE&g(qi=j*%!)o&`$(Hy6NAc#&$
z@!D+&BKLtHvXQ-1;1iL~?-StL9w%9)JA1*G*WSC&!0-DU6?L5;h~5<bPxf9q{XY2c
zlC!*yvxb9(vzzI|dyt)}v!ktpv+aE|4%d4Ro$fo>b6nsT;uk#6Vdd=Xcv(Q;*8%(v
z4=n{!?J%z)hyzl(cJ+??+xe<Tjs}*miKO*mz4ChtFN<{E*!8bI={u+>NT$LOy<gB*
zo%1z&i&*VplTYHDazT4OJxS4#)6}^jM{cTo%2b&{W~13}E#xdmMC+chea6ji#Q5G6
zwOD8L%q|QQ37i;LWoxA>!y!-nm4tJe{W*)(OKaZ;OTvbohSvrd0<V!jIViiA^V)vb
zWNMkcoZy$<qf|b-pM5fc6yZ;}_W%DMz<u=p<^FIyE(!4_XP!&y<Y7HPZ7UkvSqG1`
zBGpNTeLO+rC4+PohLZ-&6g5|#T~FTXN+8}{7lg<DCB;+%QmWD@ud_~Y50%fO5l3+d
zvW#=lTE+O^efW?;Q&W=?X`CfPD8N#z%akL0*wj%{xFhnl7~+PQVRc0gIk@E4Z`B6^
z<RXJV+m{4ai)R=YgToZj^j|4v(b}kqRlevzhhV`A7tGcV?)S&to1>ogbtXWP?-2bs
z>=QB$xSh`gnW31dwpcM1$vtw3;16UT9EQ>|^zXs7BuybJGb(E85H`O=b(AeQ^F=QE
zJL+x$eN?>Tp=a$+WI4e<sGlizV8K;IMP*v(yt4W{?r`89=-tPUjH#)q$_QyOv65c7
z3n#w)cCH}Q9WKKG#kM`2ot=HFb^LW?WZn46wBQ9Bj;lzn@S4`v{p7yTxpU`^?%cF?
z$h?!{`CF4{bQH|;nRgJ#_LkNmdpBQ&yL>07OZsh>@q}j$X37Zkz<_nlvsQm+`V{cI
zWZ0J8yfgbDmy;Nx?5Ja@40rzNm+R>jS{@!A`<VmFUM&8P9tpY(?bNXoeU$mj_ROL4
z0@^I<xWld#%-R^ak5X2S(mD7vjn_`L#Xi7zZ_Y@6z2|uOUZmAzCUR-GYHM0ST5qH?
zTe7IA=sJC^<L^yBI}16wx*nBqov#^k!^Cy2g|SPAZdPwrm5%t)F3)D=jTw`$#w#mf
zNY;)kYtP%cRl{!Scc;+4WAk3Wbx~AwEVp9u({PpNElo{-7W-1uW*djfmFssjHBG^d
zABCBV5Wv^_%*;<Vc~!+DD}xRw$#JK6dJpQDZzKr%ZRk_pQ`ESj@Iza(+NdCUuMvu4
zlB9Ug#oERYmX#a_Z`!!HxYCUQo0VF6#6iq<yPGrHeUEQ^`vv`|+1w%0P6z3^17a-}
zK7<TXRaM<ua6nd=1R}&)8|&z$pHEIs`mu^>6A&n5bC4eIg^L&Og6XGFPW=r`WFIgK
z(Gf`ppu!&wOvd&8Oh%<6`N0C!%mUS0V?lYb(b2W*>&~lZ7y_p+wTk?ZCXP!lZ+9Ts
zq=_qGE8$^YC%3mkPe^?2?tXG6-koc^@37hx-^odfx8585$55!u%&m5}>c?>sZe|1=
z)_rhr_^ATB_r+~|M#fpM)rf@pH+=dZ5Co6Pl@N#e4kXbDs%ejtAcHF0mac1R9^o&W
zV1Se@C*kj2i%to3ZDN?`AuM8MbG<pGBqCl&LL#`zb2F)^=*ZMtrvsqN$|h_W<!fEg
zhLNo3vZ?rB@X48Nyo1*`%hnG8WMF!{2Mt)q@R+cZwiKY6z{6)gSaz^|EGkN@sye~G
z{aN8F29uC~25bf4Rbj04q|IRTatCUm@fy$5K=u6(L`>H|1RZV9NfW$s<v9+AqYUdo
zqs=`$jypLy@ti%YgfA~_zs%b|e+=4~?|(d(TX6s^y5?gSB3t~garwGmIRGuYUjfTP
zUGs=*#R>jM-ecmI<8JoZiUQ-*+SbPBp4(>exM^G|_eD>9R@<9<YX-nx$=`a%Lt`t-
zQfVXr#`0iTQ)1>h`-%n8@19FlrAyWL<y^Na+D<FRprn%`1`isJY|W@4gEmnlf}=RA
z`-m^K<Mxlp@zh8ozsyXfg;S?a?G30?Jjnq5=y*9!DaUQdg^3NZW%?a~U+61+z-{#C
z#}NakHj(9RFZUMxVZS`@1)J%`#YQzGPNTc0XS6ThJSr}(;rn-1(BGE~oIB;+musmz
z`}?EU^;wWt*wj2$T8xPdq*VcuAD9#5VbVl#DSUs&*h&+7lk`rV^a!xI7ZJ;k*+*h*
za%+*KO{Doyxl2@Rtem0Y3HbAO)zxfBTvvi)m#SD|S{hZPk;hpnFyE3ZDp(TL_rfax
zbjUeg-b29+j{Wx{N8dWN*CvNa(Uc5^>@I})f|lg~=^e5I0p@|2nw|Okg3KyZL416C
zJOTm=+S*K@FE2|-JedB<2{r^2CM+}a-1^1_c~nA3Tx@LJ#KcMHU18y2g;3V3=a0%g
zdUVO!#^%A!)&r;phn+hVa$QJD>HySN?!xQg>3Jydn!1+O)2C0VSS39ep|4-RLYkUF
zmoL*meFgVusi>$99tgN~`?eog6Rc8R%#gaaHg#;<oA0IGU1_S!$B)bMF!}-G^^}g~
zW!5&hd|2xNarNr}0$Gh?mELHiPxg@pxi_hk;B?f&mF9524pz01j=i-?GbijRNV
zHFHftfx5iB9D-iIehr?ksip?<+1@zhD+7UlV2TYE+tt-RrUo+xe5v2Mm0z=DBKS-r
z{U#Yy?0^Z4k7uhNU-^g#s2jZ-v1&J13=c2Z7r~{}jMAa6eSOc|uY~633qdmAMqt7p
z1kbCd7u49;n1aD#v0xt2?zom2OwZ2xq37h4m0QSB3$7AR0|Fut^z`)Rz`U}BQQO(s
zi91gpgx-O{1P^pZEq)JV`@Qs#xw-jXs>hCH27Q%}gv%=`$Uxs|!pmr`q;9mPH%Gs7
zXeb1%8<)A>+Nm|ZWlylsG&D5s=k<U2a$_BfC4*vOV-L~zzeL++Vx$u2MMOlJZHmE7
zxVgFS6Zm;~<-ENKL5R)G%`<uZX=$ffFFiatBjz(9Qt@cbV*h~yX7=_6jPjUra&mZ$
z3c)#GUK>7a)6&)kyWdHIy>-7&VBo%1D$5ba%bmT|#}qs#6ZG?Q8HQipe!jmAUsYX3
z+2srQ33q)M1S_YE;(>gF4HA!B-sV+bTr;&=uHTt*@)njYysAQ-2F23F7TY$gBnKp(
zqxs@3wT0Fko2c2r8e&S>MweO1)AN=#`1Ct5p8HpH_NU(a@jf%RHjubIS5aJjC|`$B
zBVrBV@ckp@87kbt;On<;>jlM>d+cA2mlAWq0$UO>4*qO~c`{mi^%|F{d!^CEi=v53
zTj27V8nj?r56lNwO3gW8&Rw^+!+*rp^Fwv&5pLg0DosA_2J%bILf6upe4hl#2tKKq
za3Aoz=Hen!Y2?tmxi{w!>aOiRPJUpab{5@Gb-;?0&pTbdd>OWy-@Vq_nw#Q1X3th`
zRxQc%FzPWtjpGmIF(`I0XFm*fsHAyZVROI223(PM<>el=HXBk852nkmJa2p}LoPaD
zOs%6kTR#+O-?Fvsxw+T?)-KDn<Xcx(KXbd|2Qk$Xn_v#R4L>?#v{{ZG{Sj$Q1-%8o
zx-S%=9SK-;a#E66&*e(RlYXkPqN61!0$;o2=J$#51mb$SkeFCN#bP<F=f?b1u#qg5
z&*h%VTUuIb0^3VIl7y&Sbjj1dP@O=Sw}d7wA>hSns7X?qAYAjn;v#K@k)ACfNqA@y
zwFelAl~&=75mg5dje(V6=Pp&|W5;R<o0x<RacDWqm_UQU^_DEdo<j5SMyrcKV1F7*
zmnNJY24*VFd&wDn!`PTz)VA+KQb0rKt`pR{boFGetggkIb7`l)N;1?N7_@B@7KD@4
zy<IM$hHrewP92U4<k8U{tKXCsB*lqb@*>wC5Iee~6O=Sk<4e(3<yq2qSuQ#=GBGic
za~u7x@$=`sI|x!iArcx{+XjB?h0Bc@>a63OPTd<h^%%ks#?;J=9JAFZw)|AAlI+Ki
zAIkAy89ff>FW-yWY7ls#c+#;`o{4XtnbsIs{(JppOMwMFdW9|I`bs&v>Of|uSx3C&
zR$r6UQxX9~YM^8?@Y$@Kk`!O6TzmK}dDKx{T~{}-y}f<H+{I;0Ovlb*KZMvBXCI%E
zCSAJuR@~Q7{4Fc`#tv{x>hjH--dq%*3#e;8mZB0J7uPns-SBL;S0*KO&VIh|QiZ(I
zQAOS#y)QtrM0#@k#-5wn78rpBzd8#!mke4+kYQ5G(W4d;^f#<<J!Khq5#EI62a2*x
zn%<vp`@SmM4p}Ihn{zK#EJ;-^c`wz0X%lKpnAHZ`nR5JQXBdJ}Y{>aAG!iPY_<5Fj
zAad|Az0@;c&L~k%6+<8TZA+(<f*nU{$gXUDcXA)~XIl29_ky&Iy+x{5U+3L*$!<W7
z_S=e<8st)vs3&b0$VS9a+jRC-SS_y~yvDV$!|SJCMMjQSU<mwRBYmwYve1`LA-yw7
zAGwnYEKrt^a^JptBD-EFB>t9v$Hnn-p6*3S-Lp0LLC1p~?2`WPU*h`?2hqeP3nXyr
zJvF(eyUHw0fxFdBW&(DxUQcMv&dA6cs|no9MBw*`VZ4MvM6gmrK@UN>Ws}Fh1Lvo7
zCM|YZjkK=jPTZbaf#;~2)b`!l##gH6M8g%Cdugbm@`62fR+?6bn<(~3W5gu{Va`}!
zA2|k_>vqb9CQ{jfzX_P%(G_>cvHE>)k;H9fM4slE?(XgpId<GZsr5v9Q*^FjbBzk^
z2;;pORc>P=#P+?nz}qUt&1OY}@x|9Vl^H090x!^V9C)@TtgWp_Ht}o6<vcw-jq(Jj
zQasgdaZb_AS-O~@2<}41emloYttIxIQ?BPFa!H=+jExqwFIEDN5ZNp(ZWvhZoG<S1
ziRPJwmXX9Mq+=~5(@9buHm71XGs_Y9{H9h`R<*OO@XBjGlTdFdwRUU=k0MEc-C<>%
zoy5#!?6ndTc0#7Wv0oAXtj|1bl(2-9mfEVNA48*vSU-#eYv{ze8I`V)<6s{_8+q~(
z7cfM+%*?Si2ZV51diu&YWdX1Uje>8gxd(@f?O3?N_r^b(Jm&1|3`GgF@}E6hTkVa9
zomuy);mXJSAsZX6SGGnT_mw!ntKTPp&0{ZeyT$luH&;<{@!0lMLepqn00<o@fdkSa
zfMSLHFezMT*w|#OLnClBgaieBVE+ki=yoC|CguVgypQL5<2U@K^Ur;g<T*_UFOBI^
zi8n7`H#HXay2r0U<->dElH{5t#mMI5n?_aOGwKfO=iJ~TD`ge+a-WKBh-z3rCdeXx
z0;(QAv83}LRju4mYIr5oZ6ps#$O=s^U<1JetzQlBW++h|XQpwVn5undXRW_Knu?oV
zSdf>Or)-zP&)@y9=vuJ?-05Q!6co!IReeP^ObFo2Ls4pT!@!9w9S@ao#4nVXgBjuu
zCYcjcjGva9JGMu=&Z2O%poPD#N830ALn3ZAPZ@wUvIU0dnOoJIA@Bv<=kuGEdrPX1
zs9jmvQ@x%lF#3fz@0uX<HL#?Ii5n&9s)>iqF4|}sGX-Wv1!w7YOf)QrcE?=Yk`QF9
zFO)poEaM>5HM$)wfkON04@VEmKK;7<;FTyN;?R?0`<gVLn9;P`<RjR%c{wbOd|<Q6
z@^PE*3wPb%9s1I43*2(r+3plhC1knHy}NhsD(Q3GSKO3wAe6O#`Ef%$&EiCBgT?95
zRm~ak%<pC9NcQW(iYvXZ%N)lSziVBH)kP*M(Lu^XYnTiVZ>mz7($AJqPG~q5X#vE7
zuqN}LF<10-hb5H&TDQzVH^a#OlI_hke)7*J8|EJLJ$Rt7c$mzh7j;w-MLoTbLgw({
z;D+;r$Yv0uXc=53j0FqimfF4T!A`X_N1VyGAG(|zV9N7NQb!fZX|eoCsHYq2f?PP!
zK%3l>t{aka@a%BUGpfds^ST31m1({$k$G@!4!)kgDJMs<v4la5E$wTjD%^P7MJun$
z!Dxt}v;r4-82wsbIbJ%$?21|Sh`^DsKmzyOOF)ib&QkN1h-<HWv#f83thW8mlf3MC
z>b9iLoQH_UNam+48Iy+eC~qnO<0MB&eN|5>>&$kVyLO#^l~KuNF?l8ZEH(TzVNLR4
zwXbrKUS6!g6~9hjr{37qbPL_LM_x7iQB}+_b=j-pH*RQqh4x!6ubviSU!IG$5E0HF
zVZ8GCA(QNi)O+?_*?_Xw<Dz&m!>(vxVPOG_276W1%+;}{ED1s0#Wq5TeXN!3V_(bl
zT3-E#3jCt9oh}#n<T;Q`mVaKN-l7or1bqu-Lp7;+ey@{^HG4klrSkjes!v%uosYuu
z;`(#t`{pN=A5DG%Q|efeitd377uN=7x(UB-{D_{6rAz(Pn(0qT=n|!`QJ!|dEnVt;
zI8DuliHW-d76SvJXI|>uo|?)yEH)Q;L&-3!Q;vsudp;=9EiNwlY?=)R)5&3KG!2!{
z<uTWa(M|k=+3t+D8(2E3$2<a&6+5UttPT1JW=TzduCJ%(ls)Z9dDZh#E5TvdPul3y
z7q%$YF_c{BkGA|10~hWMKmH!W6YCq5B-_j#e1d(>zpyyRWL!u%{~|iE9<;U=Qn&nW
z@77-7`~3Oy^L?IL$G>5fPr6>l&OPiKbs0ER(5$;`x@SA>p3~La=fu+9GslM1&V4Z~
zlvCw69-%>RIQFs5H1ZbF{NjE{RYmjl^9AIa$GCelVfC8MVSA=Ds?$q+AHJEa?r;`*
zS3>mqR#5gr`5gKS+UF%y$uHq5iW_NuAk<Nsen7<K;YOoNpB-)GOILOSWmx3;_U-kc
zG^0d!ULK-QM6-9G@{W64v#xYN#nzmRwpq@4Y+1t%Ap;7wbjh+3yW;G%$f=zHGa1~b
zPrM*>QRvLgw+9XJG6dX=n2pAd?tW?LnM%m49)+c=R#$=pU4mAmrA3M{MJ4^0UBPMJ
zA;rEF)z_&-kD4Q8dzO_lTX-+>@~tYCy!Sm=SOa~nS2TvM5^&ST>Nb|E8|UKB_2cF9
z(ohFQyeu5@%_eO)BIMb&`K?F1)&HZz-Hf~JUr=xXEJEh)yf8#g&kgIGINe*a!j7fN
zP_|tQL*WZ`%|5MZ&KMh;BfQ}rbWtHHM<1Ox>VCoZPn%R**VjLG$_X;U>fjMQzWTbt
zua{3A?<}Zi%{EHcZaw?G+omRaqVTi1h&%hM9Ah<jEe6Ofei87&rOr`9u?s3Dg}r>r
ztunH=O!R+T7WAYMGrpZi7O04#S46`yn`$fA#{ZQIzb0#q_Y=II@nk-8KBId>iSV@2
zt?O%Wk<J7sL5jl=wcD>lLqpT^^A|mspI#;ScO?w0378f13h=HtZwfZn(X#VSI!~ER
zUiY|l8)LC9xc{E(mN<Fp+}ePuoZ9+uncwEsT`BWjpwdZ|tusr<WeNVB3At04F@&eZ
zDUb3URb~89&vGZBZ_*SsmDJmdX^CgH`B!!5X3##)1dmT(9J2LG+NU|0X|Pjy(hS;%
zg3c*>5Ovcq@oiKboT3YJ=g*3H?0ss!*6#UkN|si*4*$;f=XqO%-XV&@gX%~GPQ#Lz
z{dJ9AXJGc;>2vrzWI##jfbqobmh{VQ{p1!oKgUpLv&7dChsW>s4mha@f#k2CD>}ND
z=p&qCL~csy`+=NK>&2bT$Qu*g&)^lT*EON8(mbmW$JSHT-rl+_#RKcER4-m|on+9C
z2$nQBP;tsn+T?y#<ddYDIoH5UJg?S;`!R1rlb!gTFK(;qk`$i3h9z7*Jw4bSsUz)i
zktqni+hQ&zJkM&9v;=O5?+GaB9tl4dU};zdZYkd1yIN#tb>zg(%I?iR6Yl8xL`NS>
z4lv>APKYHs8Wb7KGDWG8tgX%(pJKRREuID|)0<?okzlrDue5Qz0!rJ@qpOID2J)MV
z2S=l5^13c(7(=uhTRpd{S1$F7k&3>uYutG<jw4MBT#N>1pTX5IehAF(zk5{DMjg3<
zEAnSQVl)+0iy7-FwvZW|N&Ylf(;u>yBW-&SK{uUktRI!HrTWH6x3v}zVGnzofM-l`
zlTIhLuBQ+FFfS6&wJXXwrewIH|Lv>AcNrC*VVsj;R5)+hMb$}fA6%dmyq*INoM5P;
z=fdXYN}~;>j^-7M&<4BVqF5H>l!mA%6z-n4NyW6PzInv{4$%y&<?jM-F*fW@6mUin
z+L*t>E6O_{<cW1WW6`0zNz##%Uo!GUBr56Er8}vqd`@5Ypmv@$Hbl-l&!T^PsEakV
z<|`Ts>VI&w|0Z*PK@3s0dpWwQ`N9%Kx7xZdxOzrr<^?d*WC^@;YKAMZs(C2q1+CVg
z<U1Q}ND+en`_YzEeh0qM-S$UqYN&l39JJY$^*yzBywi9QOcA+5LArq|2aW+DzxfVI
zt1hx7Ec&rDJUUAv9((0!Eunq+92xpmKVqtt?Mlp!tY)|sa#?fvI(ltimgMbTJ|x=;
zy8&I@-LPQmkL#K%nbmSA8!^(3-PzRJ%oe67{$_p6oEL2qQ}tB9*d6pt5<R@;fi?%!
zp66^&O%>77!YqDScW3q+p1hYOmf&CP(LyP5oUC$<E!(Bzvaq1k(d()UUlPa)(aS%A
z*TLxhl;>4jfeLo~`0?Fq^s)zLO-oz~s&k6}h@x-me*PHMP=Qd<ze_0y)ys7`;+cBS
z#*|qaChnt3`#SJ(c-oXXXaWz(5WZpOzyrvtJWD$L4hTg!eCkWkjXzuof_NGekY>Ww
z!gd!!p5C{fZr}f=(}`Dqq&W$NDOh@Kfevesu#bBE5?4I&i9V2)Eoedb1?{(gQYG&K
zFFl;tTWs5J25eAr6qsfOb|MVvH$QcL)%ze6<&6$=ZVjO``J$g-uyt#mmhGs9N`kLJ
z_~Jk52UV5x#TV_)1e$$Iux}~WG&a{>(;G?bA?jvKItXPl7=Kt!?qj-gJ}pX$qKs>Z
zVs|;gTh{7B8>|)Y#fEpr%6Mx@d_jK!-^(dct6&>aw$7TRQzZF|qGj&aW?TCwf}MY*
zY=Ew(jl6xx+=UR_Ro%Za@ANvaWohed5whqWzjF5u8Evd&mQlZctu8@WeQ$~<oMM^_
z#HQ`p1FMek>eWIg#BA`nTvCs{*HOJ(-~2Yz6Icw1jm=XUC`2BG*y6BBprQ0fi;l!7
zKXnz~%OWRp7X^-FA*pD5|I>`o{T!T^UzBU=i{8h`n*8Wq8nft7m@=5QS`E&`b3b`e
z<O8O>F3QN2LMLlhpj*psF(~GHs6%JB!#TY+V`Zm^>5-1i1Yg<TJHzUtHJFzTnK%s%
z)V6KUn`cdH#{=HriWq|NC-tz$c^LmqFdQyJ>goHeto(eK9(IWn^`)#El>?>q7U>0u
zHs3@J@}qWm28}y=KCMuSAhoKfX#D?3j8Os$+bPL#O+4+BX-&DmrRV9Fq~hBP?2c)~
zyuxY&=YdF&%_UFs`FFq5Q9@dyP-Ue-G<m{#z&GH@Xp!<}QR?S!3n~IK$qn5Vn^YT_
zKT=YvDh>_;qILryAGp8n9Byr4vt%BBGyX|Q+M|Xks|s%`--KRV>ZduKq6KAZl%((r
z`#j?;0}2n$Khkqt;b5;2WXzg2u563#SorL@K@W9r&FXXT*>5N1>*2KLO41kdZH^w@
z+$sDb@{Z|O9|0ScmVF+TQ<EL4xt`9C<GG8H4j8PMb-Jxzy;ZeV7|Isl-Lt*~@1&+H
zMkuW^s&(0)X-KdG-d-mg0ZFV7iN#Wiy1F_b<4{-QhnB{^J9QXtY{Md2HHom4j6$&k
zk#uW>TKXACN1ZTQnl*0pPfJWRZ40}9I{TfpOQ!<ox9W`29bW1O8N3LB5w-oJsmAOu
z(+x@297nq8z<)=j{t6N|S0hI+`2?iC$V!G>S%|BVD3hkp6Qx^Y!9W3z+1Q4~rQK{1
z{O9r-+*Ip7EI>uYN<_lW9(u|3gVoFa0@*8{r!IM{zvpZcq1F1Dmx`p#pEeg7U8y~S
zOy~&0G!QmggO22qNrwNM?qqa<;4~LY!dCrm2FDgteF7*p^f#21A1qaErg;++NIRCW
za<K(c0ndwzMR$O7B?&7Ru##vr9)6hjMfp(dA<v%4@-Z<(lq9f61v~?g;PH^L*V;z4
zAQKkl)oNcd{)twg9wfw~HbfjAwdgN{c=RMiwRuzMXhwScC(q%vc#&gkW+`gkvAU?@
z)y<y7V?!RqGHoR?o4W;LVn=wfgTiYfACCS^0>is<P%)Jw2!Vo<K$-(lI}Sv2pzfOt
zpuM*0*c-RD#D{^L$Q7>0!4@_S@*}z;lvjj>DJlr9Oc_n%zD?uJ`j0dD?SeV?3C{_1
zyE^cO>s&|m;_hAUI3aCtG>Y)|%|xMA@FebJ#v~kirE1BWek#t5EtK_=-&`Jw&3OBZ
z_Hu&&;z2(;&ZSou2!G9_rT#){SaLYd!V-a{{VDM`<LdbIi%04M4}&bt<6WJf_5i!o
ze&~Hx*5e`jrp{yqFyKKzhBb9}XD%!(1W7I{8=JaBxc~s0h=@2=42eHlJqekNH-<2?
zu*eBAKWX{#2_$TRzE1~%Y@Do;(mpEcDVvsofdRowmjY*I?u)Y?upKHt0=!X>W6&wF
zKj9_C4$>(Q1k`$v`e^L#X8b#i^Jh{3TxTEDSM4nUGRq)mHoCU04m8j4rZ9>q%(MhU
zpoy7Tbv(#;*onK&GXq#aHu2z>mX-(eeIVBX)P}JYai9u9(1{Z#rk9o!#96O{HUb^S
zapH4$3I+m6d~yg*TLAGb0#Q<2oRpqU4XNwv)24R?0$CQ4@${6qe*L<jgaj=}OKogy
zOt91S^z?w9nPy>r!p-`)&|0#4=Kd2d&c68rueyu4E?&uN)13{dTadqNO?>N_dHmfI
zi-YtXpDYTKQxFuu;|^CmIdcANK%M^6&l%e*9=$!>53Tp)kDTo@kyDamnY@k%@x~P=
zLEro`VKMoV6VGm{aj?ZuL9e<?I4>Z`GZ_NvG-$N9FF!dmnMdy?PuFe57&^ij+Ftgd
z%8BLoOxDv0k-^<G7L!3CP<WJ3v9Yn>N6}cmnawoSSiy@IpQ!Wf32S;x$0C|Ny*WM2
z<5xSvfZ`6{YrwBPVADI}oRpkQVQ6Rw`G64tYN$Y|1%)pJ2!eT6iDJ9Kr;n-WWi>VL
z+`02EDTy4U$RVFMZ;tMHeD&>R7wcw`xlKHrP#+#Pf@Cx_4u2YT9Iz>_tYmd@aRFHx
zZ*T8s*SL82_zbaZor8nHot>TcmJR`nr-TZ|vVnx)JDwYQsFp8Z)QycDzIa~C#Eg%R
zM@2{Lg1in;OaTP};zkGpay(i3%{aZ0k`2iZKuXyn5Gm2~t*xyE2!U#gsmaNg2sc;P
zvCrY$Kox?shIV!fex4669#^%mEq0mHl3O|?EG!KEV7$S0%g~T64;_n@<O=_A{lv%k
zJ_h+&*^9^J@83TQ=T!|nW>TA*^2%q=si>-sb}FzNuEvy9E!erQ#bTZVbIrZo*MZm&
zhST9d=rRy&p@vr6S@D}o3Z9-7{K7t<Bf%{899}ZlY|S)QCdJ1DCu^mHEp0CyyTbjt
zZ?CJ2y@}!=UJ-?WJ7MtsR-7h{I4cO8&Wh)ye(L7+AzL$|u=&q+>b=ez<VQh$z+`iA
z2v8F3{zr1G!Sr-$zeJ7SSWu?&0a`z%+8ag*x~fkz((vtDWewdH`22e8ZbC<-ml0=w
z!9C?#OxKMlO;kKDQe?8L()C(-sKEA+^NqPk7B@?j;oH}Ob0QzKei^)0L$dfYV&5Gp
z5bMe8qeT3Yf}5L|6gET66l4cMf`$x&Rlw5i$=HYv)Wl2WWgyM6(@Sk}gaRC6D&F&e
z@y;02gXU9_(+ndyPv7W;2kByMLK3hBSF;wD5AEeuMtqc(FR&w+$+vpYWgq!>Lybs^
zBirM{n_Ef_@{Fa}UVR~uvj-@`*i3rD&0fU%=B7Z@O2v;i`X+__jxBs8CXS8=0VPrj
zH=!ucyY0nFK%J>vsLj#V_r@KK9kihl76D;*F7z*OzwY)%b}f|IXtlc2WuN-(h6ey)
zH(--zDEb2rwvF>!sIHlMyON=&gZ346bk~b7gXBAaP}E!BOQYBGiXV!-G+h>nn>xUr
zkutc6J|_va{dGNhC`ZM~sStK3gki~r$C_)Zu|w~Mm_AMTq-x31(wL-MTTpZ^8TT&`
zD2l`)BRH`7C?iT9RXn?y>MPS8lAxhce#A)Oy0l3Ayz7<@ogD^qyQGexmfa-YriSMi
zm-CkF<lQ5;W2GusUAMl6zSSmKM0|^Lpon<n8R=Zl*mGM12~%@au&0un3rFceoj9g+
zZ^Hu;mAb8OQ+o+N3Ty&ew4<>$x0P}Tg*VzTX3wjso?rjhO@RaaKHFaWC;pSyXVKrP
zozfqp473PoS@@Q;w2r4c>*>8j-Lng^G}06W|IjIi=e)W1$_`#oICp(kO%?y9ob}Se
zR(|PX-r{=hiD-tn=+;^HjFdEl<jsmxAiPQ-n*q$!t`EHrL|96Xd*?3=eiU##jn)6G
z#ow>KKf@RfJ>Ztu4t#8eAL15n&2FPo92;xH%jBSHzd#?viOs6*@PP@dT{f$}9J8Ms
zWb5I8(Cih_YZIrcO0bCCP(39o8Q(Q`G5am9XT(wlroCV;%j45Apm);v!$CfCB2Y1$
z>^JterXwlNW?)#{96moff8$~a-wQ^Od|bqN^4}j7^lan!3Nwfpb!`Y4yvQ{rlJ!JI
zaKO;a7o3QYygUc1!o9#k9&6Ipu@^c2pkHwG%py7=<#}g+b`i1rHL99bi$JdP0qsy-
z;qIP!ybuwhMMCQu+i$N6)N}xy<)5d*VGhcxsHug4`Sq%(J0S+=a%uk3(t6^pQJ0bV
z#vRuc5Pf%S7v*wA5h%b7|8>5A=JD(drrV)f0x_Yiax3v<`%Wyt26gG+iBVkK3;Ohn
z-b{xWHg+kLy=C;AaS3o@J&E@jHI}3iRCj;Fz*L`ioyD(%RvT!bGVF|ZjHs_1@Epb>
zA(KKls-H^GTATVX7?eb$2i^PcEV|8#VUQ16Cn{)tMU^hJ@GX6R26t-(kseCYT_;-?
zU*K_(IFH$SB&1bJwSMi_c~>xuPncym2dgJ!g|Iu^PYxEi!kq`Q)$p?Hl||0ZV%{1V
z*Axo`uV{(oY<cEP84LC7@I6f+gF(EXlM007!j~_@p^v-+VPF=c9bj=VG{f=kA-NOG
zG-IIz?`BPuzZJe9O2_eG1BOYz1LeFd%%T4;!{?G3xR2^Fe8ctLgpOHE11}P*NIDl@
z_B>wuyd>${?-LJ}IUZCD{XCdm!caQ0WflWQxaeN4UpkVkW>JSYeoW)eA<6%m5Z`Rr
z*Sq1soo(M_<y78N%v?=K1-3u)vqsn3jg{_UWP<^T8no-*;N@)lr$nyNk=mav>jq9Z
zZ)JP7CF!<4YP$KmqQgLwh9{}2y4m<67#?Ygf34o(c8DR|Y6fFq(b1cURpttp5pScp
z$A_JJFc5rs3-oak(Ct-KTwF%yzGOc!!mX*2X8UpS+-dKlTwc{4Q<m=1na?n$`+HuX
z=YU4?G%1oOn~KC*5t0&5LLgUr;;epU>TQ+5s+>K)Et`lj5>>w7<H!H$RENoR0jSUN
z%u(BwH#lz2t^b<4+;5sT?r*0c&4Xx8)yXMatpI~iFAm4|AWneeZI5YzJNQW3E>uR2
z{)Gnpex9Uuhf`a}tb{?6xPctt0Dh3Cg(m`VR=<6_4U51Y(;cbAr5g2v9UCefN$;C;
zFa=WfH|E}`bd@B_JvO^t)$;eiM#smCf(zcZ0-3y^&ojm6`dhR4hBxRwR}s~+W?G(n
zJzpX-_PbR9CQCf7KuwJkt1xzscv(E8tTe_2C+0FJhC-8t)x|#`Bxr3$Dyt@}uL0!r
zUo#lU7p^#AbV%SWKzRuGO%S`Ba1?)-cZM5xIOv@FeGVvch$_zsTZIIs{BTqrzT!t5
zi?Gz>SUHLzy=vNRk>klhUI1JjU={Zns&EUYyRfZ#*9<^Muz6#=4Wb_O4;{%ZiuM$5
ziuA|Q9Qcn8)<c*_d`EPtk>5JNwEwlgT&OeR3Np|lPDX4{PJ*dbV%zd@F$^{xv~ceO
zcM<M=|8EHkUohQ6Qaie7-oE}B7_Va62n6wO5cW$8w%?-l+}&hgU=z=Gja(XhjoMz^
z2#>+FS)vcjOab%wYm)YY%lrD0!IzJoMQd$UUp%OYUz`29|C7*>(EVq|R>8J@B&;hQ
z?MMIDnP7UiB&WI?x(*hcHA)x^e=u$tV(W%iJV6g|T{8=~-Mzh9@P)Hxz>cODw5u*Q
zJ&*0hok9;u`E27*GUB0M7%YbeOO>Qazcagnl(uH%zCN(|eZ8KZ=3S_|!XE&iRxD5x
zjYGBLRx9y@zs~{o>YKpM?(xco2fE*s72I6mKV5`XHjL)Y!%1t$0}SNPURNQDWv(!E
z^@_iVOzf(z`PQAcYYfEhV%fHBw~JZ-8YEaj1a3M!y61Zn$`b^9vNzWa5Ab2L!tG|o
zhTb@A_mar2{P`dsMTMy8*)D?FT%dzuyKl@v0=t8LHDAD=GZAdas?Rfz^2b@pADvSh
z>JY<xbh!Lbi9l{SP|fJkBKBWh9}^W6L<9G222Nvht<9~-fwL1FdTt>$=&vh-#mG(g
zq}G$LfT*^S2PXdfb=>yve0BX5Ov>GTAl!hB>9-ky_4E3PJl6&z8x;Q>ln+!dqT&?|
zRExdOp4?vCIi%6;SL_Xw*#MgW+goaC1K8ex%+TW3E8uU9PYPGVM|SrLcy#|)r-0G*
zt9=KLu8CmjJFIfzK(*CG7x5GrEAZpr?SoDIC70P(@bQ4p<Fg{%{yYNP6JL1=<l7;g
z_?0;gV2Tgz{N4A^$uxOYnUY=sV0ccF;#%6-J&F*!4JHQ-wg7%T{KNw#wmmr@^LKCM
zdaeX=x)XQm$e+C(kwgyxnA9Gyu(fm8vC6gfI)Eiqj;*DqrG0Q)G6KN!r}33{Ahc7H
z#C9JKapjbhlzhsH-hXZETqxV6{qL4@jBj_LfSak6fw{A8MR*xBv6ja<^vez@-O3|J
zOeY37*3pTLz<e{{3IZgSDh~rbL+HSX%TIt!In@z%JsD<j04Ny{z>Z)BNW%2oTwS@#
z+<s<B5XaM9%I&n`MJ5-TK3VWL+@_s~BvQ6_lV(D?*Ajw$XUqZSfB+O(;~A-yee*!n
zfE+s{H%DMTq-t|G7{JDB%JE1b6`6rVF8C#3^kEqU0aDTC(gfue0Z|ufce434D*l^a
z-VWueGRYc?e0}QX&>Sfxp7aL*e|CV;@#66Y-n>dbT7gQz?I<9K0LkSU{7?%vY@YK%
z08gR?e4D(tw`BX}x$|CtwFyX?=;aYN&dki*|MPj`?j0$&$G=g=x#i4~pt33HBQyUg
z9N5*2L?Kpo_JAaw_j!57%kJPpfUxYUo&++V`_^bcA`cBr;d!kD8`qaqWBI(WS;mGd
zIv_#?O36*)HLh?!D*$ltea7nqf|;GWqnZ9KA>B(cht%>kDv66#x^OK3I9)!iYbfS_
zZo8YJP`;J!Qz);g33I;ysO1C&744U%SOGdY2KcEO5ZV;a&%w|1L^g4PwD}+8IV2=R
z++&@`XS+r?T+sw@tk8#hEl1yqKOhH`Gyq1d?%%H+ty>Fb1Jwp5C4)l3!jE%v`GGUO
z55)0+!Srt$zpAg#0%pV>=zT_p@8CisARfpd7BTz!zCLEZS`Gt#CQ*ms=ZlN>&>B+d
z^{ZEK>{jFD;R(jFtE&s3%`lz|h*F@L?1uMVQv3Jshaf$DeIZH75RelCK%+AV@V_R?
zsWVa{Q0xGo1=EZIx+o^b%}4_$_iJ{}<?iCK54OCdVDf2eWgPOs&Y`(~YDfHeMc%Yr
zhaWFzJUBUa==}=Mr}4FM7kpN@`jL7+4anzZxz@GQ0f~*tj-^)|HM_a4AjqMS)izQO
zb8IyG<C3&v`8CY(i<h0^4fatcj*dIN&U@UpDC6ugw(b;rFgE@PL)_z`QNOAJvcfv=
z4}(z8TtL%8V}707he2By(+2DW=r{_XHid>BaCUJyMd0V-YY05NW<K!2lmaBDi?T5Y
zlyuqBJ!NIuPosihZGFQTYHGf&6cQB?F}1hnpK-IaJOfLCI_ICRu0HJU?d4yVI38jK
z#7XD_&^Q1R&BYM-E<1Za9UUFuY7-O5GBf#0EvSEknkOmI*`OK&0Em#ue*w)6-$6JH
z7;_LfkwYK<2ht32Fa&ZO+?!Hm!Gt#i2;u7LDka*M<Dd-3K_9S+z$WClJaYAYW)Ro}
zfLwHbS%P{0Y0uD9qr#b;0{|hkas4IV0qyP<Fd4wPA%lQ+c_^$&NK}*xnx3AP0sJL^
zwSdXiu2KN?F`%>fJj1>cqxdF0ubn>%8n5_|VuVV<fDHmXYevRCfPX9D=r|s=*$-D9
z1HNUm=R+HtxghjBcvuTd%SeO5pK>1UPWu#FqBng?g#yrz&d<5!f9D2V3gEsYs}$}4
zmUs?p;T@)UvN!F`VC(+_Z3wtnbw3`C0)F3nH6JeSs0Si5+cmC3r9+F99brx5eJG^#
zz~G>1NkX&=*8K`l6d&2_-9!HV{rk^&@9b+lnIoyb&<(=_;^v&<$`ADuD_DU*<o=Lq
znN(TLnat=Yp3B9k@u9Gjx?^S-=>ejM=qaSoKjlDxWRSlF^N;hxRV+8tTgyHqsghL;
z*#pG>5<uCESA{e+^Bpi!z^3so*tAQNyZ}_aC)2|sTt-ce9&~)<9%KesFhEe7P~-Fi
zDpf!o<s)|9L(00QvCkA}H>#<h`jrBBDs9b*H1=TN)~su`(lYhuE2QA?-%5W{tSDrl
z($S|Z%gOA7y_*=MG+4|l8EW1%KwWOutkpJpbjrSZdvan;8Wxhkazz^>C4thrJQ1Nb
zHfKz_ys<cB^LeZ2RRH7dITEWVhbRkApQrSouy1ES^T*{25gP)*bHv``_ot-)D1~Cg
zR!$w!Ht+?=`*)AI9A-AQMt}^jtaquMv5^L}08@&+zMkGPOau!jZ0`p=ob5#skx|@S
zUelD+=B4D5&tz>#NeerNct<>Ka*S;u>lTBoVr1aibBz|}%jH0NwZ~uI%b60zGjP17
zNG6NyJIK_qhJF&|rJB^}%^<VRkI*AcxeNfqWm0(L^z=7t(z(h5-&fm1vZOHY%Taey
z;z-VbO8XhpO5t=VbSBQ4WF_a=M%<WK4;uDrPQ?Zn4>swc38`F!oqsFU{sT!|OZL~_
zV=8|qRZt(gSAmT!9t`NbZ~Uhta2R%TDl0$mskh#=?PRMspr&+nSZpu{)BCU_?DY#*
zV%P7RdSGjPi9E;yfQZQE`Yeq}f@gL7H_iC|qcUKi-HR>FuD*-Bk(*=OYX+a}OpKKK
zt?&rMjxo$iMfAbi5QU7pnP4*7nh&5Crxw(TbBv!`;|rLTrjVC@uh{~5Orhss592R>
zRzLMP*T8!5wtnSS)r_{eC+kMG9*WtJqyVb<cKNxkzCNq<gU;Lx+AHYV`>hQft;MNn
zAZT95)7`*k+&AWHnCtk^(DKl#2r0qh{WNDsH8rXOnf7n*{-@9C$NhCFZLWWOmDggg
zoaW_iuds@NUZ_W+^HRMvnol-Vux4Z!hjrPzFH<vaZg(*LvxCAL_jA%4?v@Lpq86YX
zO@KO>drNx2M>Aqpn{m%LGG-%Z`LcRs?IK%CVk*Mi`qP~Z<9$=epLFnlD*nLp>N;^f
z5Yb(W6xdGlCR<BI>;t}C6^xMp4F5TsY%NgQ^bOWZqF$EwQj*lPH?<X8U*dXr8rKr|
z{#n&<fne0sVCeb8yu6NSPR6(01E_%u=$9WI&V@H8g0t*4U#i+LW!-*0C9VHwcfSh^
zyvB8%{;HTkXsQS@_|`Zr)1&kDn=h4Yn0Pf>r=&H0KOAlqu(L2yBmRAO0!6={@(_=D
z1ydHspHuO6<#r*Lj0cHZ`v!Z>ZZenu?f_av0{K6kVvrbwSr{Q$F1vY{O*(wXZB`29
ze*+&=L<qhv6i>4K-E=%TYf@kVeA03SzpPxMs=WSoRt^qyw4<x}qpOW|8v?+UPx<6<
z<*)|G5Ud}h2`VNp{WNFbLbeww_0|L~pGPiEL?6!-VkSW<<M_>eg@YFruJA|Y-8b&W
zSgUg6oBVLFW2^8^k!$8UVYjEjgIIX=_qn~M>iObia&q!@`owp4HAvJ@ezZn25?^)T
zd#R^f?Kjk}oF{f~A>n#{9LeCX!bjc1u-Q7+s83qD!_|*V{c43p#$kV?sgM|eSmE)>
zLF#ZJaH^66)CCK2NEALK2@i`z#Y7v@$Mw9HltQG;18~hhx&;m|9*ot`sueKzr|5Hc
zx`VpbY(k+dj)|xJuc-oRNdu!}V_~)9T@fNWu&v|Ukpe8ON!)tg-(EHd7N4W%3<*l|
z&w6q4a%MQD7oJ3o-)~%B*~PwR7x+vxxm!ps4o9lK`Y4@K1X@P)cE=xnE*wnP1<<od
zKJk0eMl(VoS3B{=IDwYZ4#=5j!0{d19n`-suT}z`VvwJ$76c4?_)dv_Q&!jf3@q<n
zgHgGt@^Zk)s;gg6Dzlz3hDzTLX@4DWVmy-nYEw+2!IkLr$8Z6w*Jv*g+7+>IfTL#S
zH41ympXG6qnXTy4sF)Ae2<+-trli?_pX`Z+r!Cyw`=&W3RC9qOb}%kpXSBZ`KqL#c
z>!VmzL>HPw*6@J1CjZBzK+-ZON7xV&jB<>x0KwW%dbhQG)c>f_6CrPiZN<c;u~2Jw
zoA3U|>SlTcK;WUAf)EKB-I)Sd$uBwOEsc<nRsyd5G$A&qcugT&=Bx>C5kje9b_#Xy
zcPnNV=OE|X+TMO#@=4`qaPS7U)A4GPc~P%`mNdy<=J)migyMPAxRA$QN>230EFe2?
zl?loF7qpt0!&gq#X-7)^eq~ri2La<T_`+#~ma!u>&`5?}LjQxCCGIQbZxZf(1Zkl=
z*$5&n08xH6yaXJa6#Tiq_8%6&|E$SjCs2!*9PmQ8zn7G%Pj*FYAhI(b81}d5+u<8c
z=RjbYzq&2J_`5j+8uYYmF%}874okl`8$Gux+0sv>+AG*ooCXVg4@?#Tr=3~y$=l{M
zg7Ucm8WYIzb&fufz?ABp6E$W?$D2Ts{e2^YQLAxRrZoCty@(hhmQR@54KOyaaS}#$
z1%sfhX3~JTs-lYbxJixFQeMe@s@Z|${C%dUlvWD`KaV)02KxE!=aU4yZO5MLmgak(
z9NF`U=Hm~p3<;lW6ywT9OfYLV3yg0*CDO;`uZuhp_PHjeb)(r@7%2)NmUhl3Hklgd
z6V+s4#N|Ia87OpsZ=sVx2!4YD3?D#%N-Bk`ZaQE;i3&0RenSVl!^H6WLJ8U&EZhrH
z?_f#m7V6=d_^`43KPrGieQsgA;q8WML(mlJ>mLIPXE63tg|Nx?W^0imV{kcqPB~Gl
znp}yemiYGZ#*<`$4*IXcxZ8IR?^Pd(=>(RSQN&l#OR7dcN3>(fWu$S4MawI6=)vUb
z^Rws3N5#)6zcrgBW;+wHvcFF}SgKf?Y`ad+$<6iP2=c_+#_uyD4yhXg{UaIoZxZ#h
zNORc&0=VwHr<(1k_}`n#H6GiY<m~1Ktj2Rte-9weRDNpUb{$(F<{x+Bt^x&Kq>se{
z{G5fLxBog9Bx+s?11O6>;KVR_M|)~9!gP=ctjH8WI4=OI5wFvm>ONG$voX_fy|8Es
z`{DQo$Mg{q{$W+@h;Ra_=<i2JOG&Ap<~*-#JpikKQ?A)D8{Wm1T-!^p2T)dhL-BXj
zu7-8-pWC8n^(LZeejlN`-pP8;YiqA22YfiEHy{=fBb`IJ-{;66>SO}an$g$Mz;+f*
zGiB`Xy9ENyL32=p3o`m;4Dsg<x1E`%Bs2U4Hi38fU<?xg70=l|k+0t0d})URc(>x#
zEc97#J2t4l9d+oRUPQ)%{Z&sq$IR^XnKL2(Ib0Q|4YtnKwr8p&@<|EO^}n~mpX@TE
z-sdV1oq{ZSjhZvNWoxq<tV)XcvBMJh&vJ)(LU~e~$L5e*GN`Qs<(4pk0?LJcJU7}L
zc}x%Z2DqrE=h2gvamNW%?DcA^`<O^7a(5RiDuAjs9asH)Deh)INL7ZeIb8hR0x)7r
z=Ke4IQTf;$pc+>CI}k@5JO>ARtx{(C%U!Y(o2b`eCe@t0rcqVjSu(U=sBT;A`0qmj
zziwco1Qnv@HFmhK(2^c5lm(?t)S#5RbaU7<(mN!HXX~?ql%Ih2jIx!3gK=Fae}jtt
z$lb2RQ|x9zF}8$<^UIuuVfS2Horxj8PmzcuIw-l5XSc)}6qhwO0V0CGa#^+#AX6>X
zi=b}$0?<SpL8*+TS2o~$L2~X4cmac=kvSSrFd&XfZnmI4m;*C!tg>jt$+K_P66pWv
zlLf990JGx>^4l#=N2Af>Aft2Nc&$mAcu0D?Sq*-zfrU#ps8Xj*ROi_yNqTOKfZ|9w
zK)i&qNyc2U2X{+LD6#GLJACG*$<O*&uz8DnZ3#l}a&r%WlHyTEW|&F%c>gdEp+E^f
zEkHp)-h}n?MQI5w4Gjvw695}X4vE@-e+I%R0Ctob@Y@boUfN~?MQcZH+_>>FL}c*)
z3i^29aidYeoVWUYrfv7GlyLyk=f>=ypxC#7YqI`b`cw)T^x@5>kdLL|iC0$4D_^}<
z77HaUiS3Sq4C>~G0O8U!m-}9xi%s5E6oS-wV@>JzCnD|lik<{f`TvWAkBA09LBg5}
zYnH$m05PEEUlc|Or3}EdvHE>b4Jh??1oa;G2pf0;Ad9G>j~_qQSQI8^WzmYp(t%e!
zz@^fR^1hfQ1<+ey3h%zt0f1M^xo_s?=j+3{6%GbBs2drD4h-nAN+W<+1B%W--R$!)
zfHtY+{a<98mk2R2F+k(kiRyn_f@v))tG%G09#m+vh}!COE_HU^+Vi;f6heaaz#ekn
zkLY<b6fZCDEpRK4I)P+tY<L00^S?PBGhYm-RtHE8IRvA4KpKA5G=84|RGO%$C<nWb
zc7vj<60ZBfTM9t=DF_$!S7H|C=HyN?T)lsPCUbMI{H?D&Jw@}9X729dGbOh+pWQ*F
zr>6^vh`ip2$;r;X1~eS#U0K;NFlisPrQsHKX|9M$3rRM5Kv@B03l!mlDW6oW0F)A-
z@T6A#Qe%Mn^=WjuVDlNMWdp@|*-MA^pC{8#QZ$62G=Ow1gF4k?6|M_6q!8eZ1EwS*
z!F?t*^5_+$g$0#mg10DmBgoR|&=Nun8Zl?bGpD7cEq~??OOd>Og{_H5z;_N0AE+Rx
zycl0`A6ao<?G*&F9Drpu@e^|YXfXh#AR@5a<ITrvNNc#K_q54u7L370=<XfC-dUzw
zhh+&o){g4@#~%NDuZ%AR^#{*Qd+|7V38rE$`B?H?poP|rDiVtlg)Ut1166vbI;4Tl
z3Z@85$AHQ9X~ucXmzu4{JWP@*y(@Yr_{<4V%?MP$ZH;Fm^fIdgHoOsS2RJfjK)7ne
zh+De~s|de9ogh)j8IXG5RT>{0Fx3<0KUpAJi#;MRwFQbI>GG;^)~i$AOP=ifr6c5(
zpw<&;xw<W*OmzK=5c%h!?Z>c>!W-l<y!MpEzD85q&|j>2B}5#MnGb;3H~Ss<v9h-Q
zv`~UMkcSQi*2t!I4xOV<4Jor3^8%HY?vgzxFm*2)7#|b8cEN*xQ+e>qO7cp_Xr70J
zjP~j~G%PO*x-NbXOiw^u*4-JzU{LQ|J8XgBOs@3$VJMs}ds@N=aNO%telYj&i(vo&
z>0Ssf^vtc=82E_t1f{-OUJKrvL)u$p_i6!M*qdkU@HwB!f%Jy9|09yD{i08=;BC}T
z3gWlM07U0`n}rJB+hW2X@K^*`HAbB*Q76<yu}D3N=`RdtSkrBwkZ%r0Vej|%ciX%R
zoB?Gv6Qvo0sIs59jVsigMs!%AF{u$JSo}3Mt@@C{%MgPQu7m|l!F^J}NYq(gpO9X#
zBmR0D(SN{`WS(erM|6k^9AWyu6kl6v(9uP5{@IDJ#?!d830$VPx_pGJ1>C4h+`a?K
zZ#V7|*iGyRASU=@X8fC+eQ_Ax>tj%HA=P)=66)-gi7X#-iL$#0^sl%io9v>{K;w*!
z(#hP(9vr2PB!NYv`|cO?_{Ze#1(}S&e2y)O=ba9x|LPK9pOZ?~_(%hN6vUg31W<Xf
zMm6x|_v*61_Xn>&5s@^q#xog5^QQNpGE$UDX=#YAHe6~!FRtzBpQ8(BR%H`ka5lMI
zdT2u+zgvwDXu5AK1a8~)zeXbZ?OTw$)QxCcWY02{C;0CoBfk~~GxXB$CHNcM7$GV+
zmAo)m0=t{yKdVlk@tjN32D0Dj3kc(^X*4bD7ykk)M)!lEo|);y61X26Nqqlur@4sG
z0tyS6<3czOYQ6J!YbiqRdoq43kJzJ+>fc42|D9j3Ko;eGC3ny>z1U0XROE($-T<1a
zX33>y{6o4b$(q+kVy;Eo5F($>!qDtC_)njFZQI#%sr!upV7Y;A)z0hlAENt6*ghzC
zzb9!X<A0q7UJj6q7cu};*Rj?{zSD+7m>U4_04eOowgj{+D*CTB!W{y*+PRJ4{PRc6
zQ@Fdzr9%Ka;Zu#v*@caMEjmXkbq>HfEQXt0E%wyUS#%JJPD8?a)-EH`3aXx69vK7o
z=F?nj=CFX_fLl>H%Im8#O7p*G?`iNxA{;TgckDj!7w@bNEtsEL?{#qfYf6cLsKU(5
z40|SrGvhvnCXd499>1X*UNDwev<s#E8e?$^MXeO|`xskS2((A`Sv4F0fb;^0m?)L-
zli$dx1OpJ<e?2)|Z~)$DKtqfNwb(sx9Q+k`kNm2Ddi__6;c>LOsP}bKd5Fk2cvNsZ
zd!x;j`e2CcTnQ-1{6nS%k(Y*+me%f}ImNB7*WNhvze`1e#hI;rf=Lh6^Vg+exwg96
zC~_0fTQ&?kWMx3oC#>sJ{y7U_X^zGa|6*ywq3neXj>%*p;IAbEG0M(vIE0D}gzzP-
z3&5SQ{uqcx_ib%K+31gQg1%8xD>o2fv`|O>>ITq)(`;c(JCf*!r}@FT@4^tgf0+ar
zD(LIesU7Bx#i1zVu71x)(eI`uHp8;Rw0+hcII;SN{%SG13|UGq>E1Vd-oXmwL@e<X
zJT3@$TmY0!`6$75j&$3NwCs6eS$w{Ki5PAPq^!4yEG`&n`elsX#s~?4qz<Jw%2p$P
zV38dZ%lH(sv<M>%6SYW3|7UleW<3B~MxHx3bS92=7oGhzhvED0a-5uJXu^q<*;y#{
z|6X`MYqko~UbH3TyhpwG?*p6wM($Usmd0oDaYLZ27(qr;fh$qR<pljbDCZv*^poEN
z#>GFl|04qA5k5UQlHbMq3}95S5_nCzkE*f08;zbKu=RiRQL+QP*5sE`EWq-FL|GoA
zMMy;d0es0pT+Z{qugoYbN*4Ve5@g4VZ=qs0B{UM&mH@BFwNDQB4+~$o@Um+T#N&zY
z{nYP(x0W1rzhkKbN|kpBJDQ_AKQd7|e@(}B=;$%9#!&}ul1K|fk6v_R<47x(ZXGI{
z-&?(fn8fyS`6_b8a<S<4=e*M2xt$w$Z%^RsZ|B{g7+w@w(PLN%y&iI{BxzgD5S`gG
zq<T*`oP&k3!RLcAU8_&4W5@nFq0P=NFrKF6(WdqjgP8cGDeU{0b2tgiBC&F~xGi>R
zW&3M?`U=YegQK}A`o8g@Lx=Jcj@O8b-uv=ONczmfHsd7p;i^di$!4+4QzMP*QbOVj
zo@cj*>eqMP9gm7<zeDjuOac|>Od}o|C;njUWkT%q8iV52nB|8_feSxcI*95lgkx>%
z0<H-g3v(?KZWLW>kF8BS5bAWVF!MXE-wxG?_xAT+nw)y>+K&4CO5pJ^Yh0vG!ba7r
z9eXqQFqIDaOyAYmILu9JXk3TmAafVdduoSeCI(flQGIt0|NPwMqky->Q@w8ZP!ZJN
zZj<19yxXyuDv1D30*r%b8bLu~3Nx%>cagO1>~OuNNey-D()=8ioT=<liktWWyX@_u
z#=oA3&V)59Bcm*3D#TfDqbf$YQ#$P9fWpK_2L-`vg0>Co_88^1rK4$6)IBxunLMce
z?S&L>AN~sibN{EcFAt}3|JvWBbQ;vDgk&fxoD`yij2$A0#0iy*p;8&MW!f4MaZ(8x
zjwmTJDZ(b1=M)`e%dAY<8yWWAzxz4w`}@8}oqyizdb_TYJU#n)KEwK~b+3Efi!G8w
zxxCh7u5oLFh=|B&0=25g)ZD!D$+Tci6m@OnB4HDrN4L&eD2+`CR+~<|x;-?rZ;!N!
z+7r%dM|WSP<VC?Y%a<ppsVjx=p;QEA_L4;x4w{R0QkNlu2Y$M1M-G3*qh9j*g%=od
z46aK6ElyKY^UWkz7wT~iK#k(Ey5IGcyjiXT^;@QW`tF&0YiljiGB)NpcI=qs$$YmD
z%#mNuyiGhZB2T=)GfJ>sS*V9}+x!LFh0~CBfkgBCva==aWDW3yH=`e4MJkbZ)r9X%
z$jo$ZWU77Z;<%7Mv_1F2$7*QZq4uB;5wq(#i72?>|GBwl$c7NPrNPZw4```>&oD6r
z2EN(!c8MY-KdGTe``Ax&G`8w9<5cUGlw**;x_$R9H)Q*0tMb_>#;K=4TgUbNQ4<-P
z@>Q&vICi9yLM`O+td%FEkUI}O!@ixy{_<*8+xKjDhEcIuL4W%C@+jK*g`ZXR{6#B+
z!N_oE-9;$yXur5rXdb@nB=Ou@ll(B7h{m(Bv;9A&LpvsY`ZuYO>AA0S%=^ENtA>Sz
zX+5W%PGi6Fm<+wBTkY>#$d^8&bhoLWykp^HxSZ$s@a~<aIQPRp|NN6Sb>+%pPy?l(
zwh~%&;=6X)4ZVF-Ix6oqbq)!thoL5^$ID{KojOO3tc|0vE-`#|*4&Ksjoo|kmx@r~
zr0ncF>YLS3DgpY~lFe#r!s)HLC=USgb|_Xka(31enQlBFvi}B%t<~*}S8E@?;t}=W
zry1pIuDU<v#)!T9c)9hypBh;thU?<Fr}<7FEgo)om{7z+abFuB!sGd>?W;%O)`9!Y
z$1JofSKJAUsD2kMP%jt#rsuMy$zuEC#}z0nWuZ|2NLhQslPC4WxkW@puco9(f*7&5
z7!^-LgM(e550Pr4_5?*m@gvyM0VpFce;?5|>1I&S(CBD?ZKRa3Sfq>%r~evZXvO$r
zVnMg~8#94u3IcMMS;iR>YG=w(DK+%@vxuT%WLVfb5L~!Z<;@g)q;0FYp_8;Wm5i0^
zc4MZXKzZG|bq~o8^rGYfAuWe7&^dU}`fZ|4Q^O&B{nftII09jT>XHsBWKa;bb?a8?
z^KEjV6Ok(G>$h*)cAcnm0!C1u=Q=>pZ3gHvKC^BR{s^@_b|^v`S`e!M_pBfWzN)GU
zungDnvEku!0G~+EUyIMqmOVa(5pe-554z2nGiQ4H`X--|k>Up*i*zVXH3qpAq~vw<
z^d4noNY6&@l$6{muzGr~R!GTR^7t4uOQz;(J(Xim7eei=HrXf%M2zCGX(MxU)pLcL
zHf=iN;IPSH4C4$vo`m~Wm|v%<;bSO_=X|HMV@FU<jvSUOr+4PH-(6j0>+0$}3eD|m
zBT5hxgHUkF&Mqu6@~`wx(6yqK$QO$3-7`GN7cX+nM(XM6imItmJ~RviCxL7s#=`c^
z*iTqhM~}Y$gSHk0ZB3qQCIxQ;Pcbw!q&5r>H7DU=L}g@dOBg-Qps-nk_P;7BUTXAQ
zGZm?K`0$v?7|*66QBY7IGQ^kDnVg)=&CUICR!M|!mEqB&dj`pxpoRP~>0U#=W_t|C
z1J{h;?K^igVAtvRm!Wc<(7w07L@Qsy{#pklvHs!ypoi_?otY!3E*FdQ-aSkg21zW|
zqkFk3EKOR$MD@Pup^*_=IG=~2<f_6&{jeLb{2V^`?DhQ3+a+ZqwL$l@^cOm7c+K-B
zQNI_pI|SSp52kfW3zq!6Qdia{tAETu>kh5%AEv=;!vD7|phVP!ZPN{oacF#=3@X;w
z8E=kVo6ny<{bxpo(}w}o$#+J|Qy**!pscC!E5+wgQc@3Hu~oceaDct3=Cd#if46>G
z;qt8#5`i=Hf$3M5MO`$sX`)wny}AR%OsT#q4}LvwU2<h<Cg0~jTwHEgB=MH}GHD*W
zqvkKzeP?l17XOstK62bR&8+Im(v^g0)FntoopE%ufB*C-DaADN3PuaENu=VQP_=qU
zbX9J0EZ9eErr9X0jkj;DDeulDHdp!#uyuv|e;@JMW527Q|0g5r5{>%KnfBH`g<Gnz
zk;GyCKPRU{C~2`b7=isL-0Ut{Cfxh<SsEd12I857=hrc8&AFYjbYRg<&COsacBmgY
za)dE}i(O95-wq1!-`zID&YzQW6V+q7<f|NzKBwqD;AkP*M^^gIJWj4v_uJ#pl3`aD
zwU085Phxlp<k_4p$TMqYUzhx8t7qO*@tC_?g;`5X*mLSQ(M)GIsfHXiHr_i?#9HEL
zCP&fl3*}vS4)@N}CUz%QDj#}mh%@D%4u>HJUj+~oD#IbTRmtzuM6YneKPDdMPt{~A
zDa8K1jZ5{XQPS!gudk)BLVE&}@3?Ax=UV)-sRB0Y*WR&wuTr<^kd6$li_6B#zA&}k
zlzybHE$x%Pyzp2b;&u`UO0n?B9PN~ji#cu8kZhcP#mYNjuyt1v$ICH6Di&jc8vWqz
z?rzSX`SwyJ7Cx7bHmM@Ma-I@szGf&nkl9b_@OpY{`fR;s==}3l&P<EJh8+Rl))$1N
zP4B?c%S_^w4zabjUo$tS8AmQ&(m3`$IzCfH!ID<g%3i8r74^!q>Y#qWgXY4k8zrdi
z-_1R|rtq+^-TecaA@i5~f{S^zr{+~1sek)5_B@GYbwiycW(R%cWeA^z7<u)Xx_ZZ?
zZ_IWJ1>gB^5mRGIeY*eUWmUn`s><KicQuU^##f=uwjBP!(|SkBuXHjh3cxSTC%OSP
zfIMy5C{H9T%AaNSlQ%6~F2wwSVkAE}W%+?WBXw+u7L?>YG~HpKFuL7asM=sg&uZc3
zA-qP=kNwVnEVP=E&*6$-JIW<dMDM6AD>XLL`=)%iWa7d1&MdE!0DXqDzJWKALt6Ha
zM#mKqP0_58<;R+;;*7$)oqd)`uZpE@UAW}9X@Y*A9v_ZV?H#+Z*Hix85UsL<QbmdV
zrr7yFjZc-jeBpFTuwId_j?k!-;#_zto2oq`=hD-Ee3Wu+F-vh2u?+h}V)S*XWpf=A
zpOeuvxrHB|h}qVQkdCMI=oYflF(@9?uL<cv^atNLweLvfnGiPG_hR~dx0Wok#zUF8
zQe!~=`ofiZEAB$Ex^}3LpmSVi3QHzdpZ$7p;#K6ri;@=!40Pnq`eW8U9EPvFW9Lr6
zsp*81Oq&A!e_c^6ZEbspQKmU7FIV|arl;h<%*XnLE;sRX^nnJfSq!&eI+;{j`NK6i
z+-3rFk6HP1SM?TiaU5n7E_};%wa^MBKPyY$=I{0>5y-gq*AB+b=`W6nm1DF8lo&)b
zkL+6<$QxDIyF}`;bIbfb_WbtU3m5X&)(w1#II?F4;<sNeGCfsKeRlKhG1I83*KZH?
z-#ZojDuL3dMh#pzqu8K5rs8r=KD66uJwYAgpAL_Q?XJE3`m1He{Pu-AMQo3sv*d0L
zPqJggXy;0K?_rhq<*>=8hd#5_w0duZru1HOCYM%N9*SGk`TY=Wwo%p~k$4q|9s%W<
z@IqjIyxkvOukUnX%u-<x8b1gHkMc$?wky~*^Xpw)kmTPbzHk%qvrk?L0v~gKI#ZY7
zS*feHwQ1m%KD&F-gwgs0qhd{ccF2ouv>S#dCIgJ01^(gxw<@YEX+_NDEYdU}=tS4X
z9>?DHo6~3C#5(YCTOu6-dhsg7X<C`*Z{g3daCm3*zB|oE2f59jJ}(sAJ00g3an;ku
zWR5SC&g6QOo!vaNMWK4(`*t+%eA$?nZC@cb9rqxFwJ2J}tL5LXGUy$4q&dt^#hy)=
zc{pUz;ZFa1Y3g!WMt>gVYQAl*+{GBfAU~u`%@r{9+4&>=Yntzq7S22|F~4x<{uHYy
z)5UN+wSv#jw%Nt(5?%WZiu0p|t|Tm+@~V@9SKF_7`71LYY#f)7RV%wEPtin9=r-9!
zX5omtShRMD7+bqgD|q8r7r)dAuZ5SWzoz8K!hPIZlvc9;byXx&$ijP8{&lK61)e0v
zT>D1@-S3br?B-=xTybyzDBI-7`SYpG8cuhA*rQ2T;-yPewU_KiEU1OS5LlUHw<iB%
zR+pRCAC@0n`WRAc%gV}re3H9e4ALpS_|7eq4F?!5N0024&`~w%BrR@iZthvQCWz2{
zbXdUbb^H5uOuFR`Kb%5o=UF7AivDH93O`=0VZ<!nUR9<S_qXYY(G*x|s~6158{?Ly
zC`!}>etOfv7;29BVX$@Z@bp}{jP#;gVlM>a`x=rcvh%9j?PgE<(6u#Bo{U-e{uWre
zX1em1-^d&muMBT!XnN;sB@w+fuJ!#h3gi99k0bx`7NeWX6?e#}A0FdP@>NkcKP$4b
zX1Cd^sj)IcBcp$LAY#KSY+HGnv{|&PC#~#}+Y${e0VapIq@<hYPa)r}tZDMv9@EL&
zcXHhPInR;5v{8BI=IIUpa#y@&dyBn<CEQl=bBgkOKxCY`ur5<7@|_aL+9Zz2ktg|`
zryZ{i`Lpy0aD<rizeDG$e~b04rYTUDB*HDk|B9~t<3w1kB|0*ekWbH<@^*eBRXS#K
z_QFjWGF$6f>8F2Lx);U1B6E}p2UN&&aEtp`XI)30>#^K(8&SD%b(HEe=T5}?gv2`J
zE@i`|Eb1rvYW@QIIaqtO<LLPSmSV}5EquAbd2{mCPuTj8|3*6g<28txkk_MMz$!xF
z^4RmU8?x={tRZ}nn3y>JfHrT}$c%$zEo(NN9R*!$8Q1<p$TT&W6isYEH-qbZp#poa
z^0MA%pL)9_?U#f0<~lcIwAlpgp_(?b9Rz1WZBzIDu3H!Ao`nE8?g}ESIvb5Y8STQB
zsLt0I6>e&40|`@?G2o(a78%}aWA03>^`0my!mzzRZas&AMd!|*72#fS#K54@(QDtn
zMaxzT@G%q-DqS75u&_`YWUbA%Z+M)LAUf*-_neRe<=(JizoV6uxOj0^%HXGA+}H(D
zMbJuKzGU&;@NgH<_DK_TdP1v;N6OF5S?iP4oP!~VI`6BtVZ#QH;yM2KJN{ZnNP+*E
zl@->Ubhx>tMa$Y+#OU!MA_Gf0Wp95cI=U8tBnixRA_PaC9P-8ysy=%mj!=;$x&2Ut
zx~Jz}R5Nm*NFf*+67_L7HhZiPkK-dye_dHwxf#KvsCEcR+PPa3aD;-_lq3lg^6>HF
z8&H^d+uU4LSNAA4m&dQW+ZypKmeVSdtX*x__@`3fD6kFdeO4L#{<|imfszi#aay+J
zoPp5$UDT_D-MJ$nA@K<nBu$<ia&q3zzxY(&l|Yq?$ni1g0z&bYR9RCax@F5{LTz38
z%v(ZF3v3ZFgiktmw(nO%?WiJX^1hQ7&YndJgL~JmErI)uz6VEhI&o(rweN?z48k`&
zRR|=OsHq`=gF+PJz?n;hK!v+oU?Kt~jC;altfK37Xs9794GRy~GB-D;dQRgF-o0Dm
zomp=OX?;kkBtCv@XllyGtmWb1S-0AM&c`lPP>CmX8aN03t|D|erxU<J|GQ)($l5}%
zS3*ikIJXnK0vtcCCu9YI$2mD}jSa(Pll9frB0G1c((H5d@**K_kKP2tutA>sTB+4;
zS=p-E+F_G1&_amE?~+wAEqs-q&&jMs{oL)iIJIH>DQIhev~>qwc{lkDq;t_P<f^YW
z^7Dh_=H9A1$?56jKJvASo-ImBUX7k8LNhcl==wYsfQAnUrFZ`$?$(6V3VfoC6G|dt
zV*cuzrTfQzs(G8jFI4mBD3Rg^S`SV_%>E#r{V!<fnQ*Q>_ZA_DT<nS)5h_A8dh#E+
zKTbg?`KGqerLD8^BhOyExDRa@8O&Ni=SuAXEL>kITlW@a^3}4_=Z3aQsYF^ZsT(am
zKQDiV?g5DO`WVszHvi+Ox8G6YUXh8ixp2v2zx{T_A@n*|&3}S!{ue+Fw>a*Ql9Uu^
zX01hX@SAs$lzdlE!AoyW)Xe;DY;4}rDtH)6>_>B2^_=$}-bjYRLRsuz>pt?V_9sL+
zsG?VVb{@}pLSvC0$U6$TP+sXpD>U7nzQs#Gu>VAz)6SC?5m+ZG)?PbO=1u?=YFOht
z&H<E(Fju4<#wlH(3pu;7UyT(#KtM=TKe(bKgiteqwz^uTT^+x6xp1EQ>)(I4x60C0
z;~3Eg*wA#ay9_NX8o&Z5!@_>Z%NA1c%%MH^YBh7^LZ=i?HRjOaJb(?6zybjs?t+nV
z|K$qK&`R5Zwg>9gTU^#x!Z1tC?eQBgi>b}a%e&9KSg{iD{C|S9k*f3u>Sn$xJPPO@
zwWf5k`1cQlVCjRUQA30#;<<mGEIM3&;)4RL2oD{_WlAuM62QB^N3Cz$`X71(+<gHg
z&MB>bkxZuJuxlnxT?2vm@2+_K-&cTZ#XnbK@?F4g#37D&)|zjRAHIm(ka3;;QnNKH
z;Gj5n{&2eBYJVLa9TeKNPt1PR|K~%`O=z1#?zl0ByW>8%GG%C_(#w#>$<Vt_(!3x-
zeRCI9U_9rzE&wu^E||(3vs)NT1*>B`@dg2jiCYWIWz%o`|4)}~H^(`=_G<DXZhVmm
z=2s`aEq$^mSOcwJL|~?uEMCO3%d8R`I7Y~@7K%BS(|sqOB_<`^iHmE%ftZ|@=I&`4
za*2>^_E3h<n2P?IZFayhOQ86D(#6H)fWE$6eYE2I&|!*;r>P7KGSQ9XNlMCA;GY=u
zZ19#^^-aF2vFU1N6y_v1FE2PW9$*U2F*~y)g}lf20d^zq0z_Fr*`qPOsm<mkKOk&E
z-5pk5<LudA;Oh}^5zN+Jh3*=M#0GT>a)<LWAi4x~=Z2^o;Q{xjzUZ_bX?w2BXw*Pq
z(*vNQS?B)RjdG<Jq2q@Ld+`352^|eeve6g<fF~h<1yKn7H9PxNnG7RB9o-fAg}o|w
zBNWM_0zte|+kqFdafUy9`t&BKCs9B$ll*yPiDoBKg^YDl1>=ixN)6za1A$_(U6A`G
zd!EjP`L`eK$Rp@}S6M9L2luJ-+fdhtswLGeAUoOCgg3o^zoF-LoBd0E*kFPHB;?Vx
zi11OU%yufYhU#dLCf_S<=oQS^7~la`&9`qop%S}xg)~)~)rY-w0eSGmGn4cX)z#O3
zGj?{)d~Q|BxlFq+y1ZO#Wd<<Eeujbd)926MNA11%^f_MFp(<=!3#8*l7=KuI7tM`k
zv$~~NuZOG@KEqpiJPFOPbXu;X&zBP|=)N=D7+}3DqmvK>?{QK%w}FS}IAKZM+5rge
zq|zJ<q2i;zs>z+V)lyoCJeouL0y8hee{^`&JvQnmjdgSapQC#0SwTU^EQH5C6%Tgo
zhtqigz%mF})|ls}^XBRIAL-uDvZ+|TOuIiCXA21#3qc{BbE)QqQAB?irs~S=XwmR(
z4UeR0G11-tCrU%XCAtTuC|bz)&`ET!IrmrAb-?u`0`o)}aV2fOj;5fT_2F4+>Z78f
zA{u*HWMrhI*~`WcW18Q(y_$NyN?W~NlH|W`{U@i06BqKX!#6`K9TNo=f!0PtC+EcE
z<Uj*)&3*g!QAd_8T`Hj9R2+XqQA}Vpmi=^w$5(5+hS(8?lXbrZtIq-*8sy`AXOo*X
z0`pMLKwyf5vpGs-`x46u-6(e9<T1^2dDL7Q6e!Fb)P`>(jl7SysxY=Hl<NkF-WtU6
z`~2n0B2K1$@O-@-NS7UCnokE>P(YnrK_xnZP*>MEaG(o)s9Ze7(;jjfbhmtd{|uh0
zu(0p9czk>$>LH!6p@?-YdS7gXf2pKth967^x8EgtKT*gBmgeb09N*Hhm<w35UkbjK
zZz6gK!Qb)+=hYu>k<Ho7S66ag?eOTiWg1?lD^=Kw_q2w%JJBKmyXnM^3n%UNE)~#r
z`5RJ}laph#cQ!VvTUo_xB)F6Vxc&z69MMl8Qo;FQb7^2CS~oO(_)tmsyvTDLIB>uc
zSob{S1GjG3QZl*nHSdNEG7c>%zU?p{RlsEh`Hu{zo_G;LjFk6)63q%$rNgCW=RBnS
zPL<x^A_hu?<~6Hd&^Rxs=yHwdp61FJ3#H79!gslQ@7{ju`cMJI*F^Ur^uYPV7%$%g
ztMK1IHtJ}?2Xp@`+h&?``u0`mXxC(!X4`S0zr|__WcgsS+&mA+TtwrM38G&S21jf*
z+p=UaN)YXcMo$d`uATw8#|u2Z`kkS`^Mf!L`Lt$cYg3bq<$1%n2FS;`pVq8TPDx43
z$jG3Ptx=O3=RFZ9Lko0@pHV`0pD`+175<?98VS1#d3`*rX$t!Vu5)&lL9eSGDtnF^
zBh>lwu~D(J9qd?&Szg%El{^Gdm_D4{@V2wF{^`+Y(XeGH*dy|t9#@IS6ax#>Gv=_J
z-FD=7R)^O6!qMPJBSVI4o0wQdz?aR$ySdcuX&oO-2zBYXC67$1KA73I#Baiy5jB-+
z?3o6M9Xon%?nvv(ZcyzZ@|a$<x!7in?ENUxhR1nRH7zyO7C<^&b`V#}(S)UxP&9k2
zA9K*wcH0`|aT1KQq(xCj$q@_l()WKpc@hB4$jc~nrK3+qh=Dja(;U?)+a)BblC3O$
z74_lVb-`o(`t=f$lBZ@FLzzNPUn%%(%38(1iF?A<*4Eh2Fzg~>=7cHpnavtEH(EBv
zp+m>}p?Jme#50zXHCZgW8oN14#VfaOdM<UNtevS8bv_GMCL>7rg2N-Xy1F_=eX}L<
zYgMyVhkdPao-3)yoq*!t#YXmkNt)T2V=cT!CCcN)9LD?hT_$dWCT=ZHapbu--v7&K
z&y&<t!3i&COpg8<W)|O&tdoo$hwCK|$(l#aH9>?A^QYT?TnqwA;?w%yejBWWLzpN+
z4JnvLlZuN)tjTn3;Z}J6I3%$Y1vZ8TR6a6*H{t96kq71Z@zQ8DO6O2~K)}m`SDHC~
zUc0e+YNY*A?`>^D{VY=2>NhtpboMcK^q!x4UQiHSU$4Jz{rVW8n*o(i;0D}wt{9tE
zQn@A`T7m49lP8~uh|M}-_BEchd6noRR|^RS5_SUmMS<oD3hyQ-t}YD9jf$R6_;D|c
zkcgC2=<@a3iRKTe!Y{>{a4ge$DRP>pY;6fuPU~aF)ajM}rOI=|*@TgShp}SCikHl(
zcATmm89f5S{KY363-2MrL%0#}F?5j5!%Zpk6_l;qD^7+_Z&*9C+ii(~X7^h{=m1;I
zgI*^;gQH|0D%wexE?v@49jI3t!#4W^uKYEe#}Vjn5{akEKv<7y_h2+>&p=_W0Vi%1
zHZq=YKO^r2%~vVCaEc$<d7`V|!`$3(sAgVnP@M}7Rh>Ouo8+Mp|1%f?HlJQ!S4N(U
z$dM^C8EC(u94S2CwJf(V9fN2pBI5iR3KKb`XSun#v?)Yy4t>@4nkvAXeY(^X|34dY
zhHi660ZO7H#7E}}!KU+tg@rAMzf!m9hHJiHqS+u0&B<1av1wJL)A%!da6B_(Pn<a6
zu&Cu(dJ+G}T$jFsD>GO_ajaz87g)N6C^ZUe8}a#liAKdD-TZ{us!*udtOuSI1{)#`
zJRke%Go~4qi;$lVx!{HAL3q}i&ib4V67Ee_ZJb&hh8+9`Po17Tbnlo|VS!1(a|ffn
zF;IU|P=Et$=%iP4WaM&WjEDS2e9|)sqlLU|@4&#C*?d@88zEh!Je~6Q?$}W?Mf^t~
zJa`z@ILCMGsjBjq5f%avxP-n<%Y0Z)t%4yJC4K(-S|lQp5YLI}G^RD@xko8q+D#Zj
z!oPws+&(tcJ9jf+5Uy$93FKW#x#>#?GqrBTbz~cvezeJ{skSMWKHfbbgmm*j2S!4b
zL{vEtT?aJ)CcuC(Z7FdWT4GIR!xMtFE=7kNZTt|7gI*Aj`_U(`2W(}hI(*g>Y=f7R
z@6&znLFggdxkn3nz?ukqA!-&r-3HkOp!oyXjC+WZ^ySN!BX2#9!|i@q5O4cLtbS2?
zKt!F*F<w1?DZ;l;d+b0WV9Heqctt=s(r5qAzooMmO?<g>V6E1fLNx0o>Fn3nO#999
G%KrmDj@qpN

literal 0
HcmV?d00001

diff --git a/lifelines/plotting.py b/lifelines/plotting.py
index fa93e0cc6..ec3c2c32e 100644
--- a/lifelines/plotting.py
+++ b/lifelines/plotting.py
@@ -308,16 +308,16 @@ def plot_estimate(cls, estimate):
           iloc: specify a location-based subsection of the curves to plot, ex:
                    .plot(iloc=slice(0,10))
                 will plot the first 10 time points.
+          invert_y_axis: boolean to invert the y-axis, useful to show cumulative graphs instead of survival graphs.
           bandwidth: specify the bandwidth of the kernel smoother for the
                      smoothed-hazard rate. Only used when called 'plot_hazard'.
-
         Returns:
           ax: a pyplot axis object
         """ % estimate
 
     def plot(loc=None, iloc=None, show_censors=False,
              censor_styles=None, ci_legend=False, ci_force_lines=False,
-             ci_alpha=0.25, ci_show=True, at_risk_counts=False,
+             ci_alpha=0.25, ci_show=True, at_risk_counts=False, invert_y_axis=False,
              bandwidth=None, **kwargs):
 
         if censor_styles is None:
@@ -379,6 +379,18 @@ def plot(loc=None, iloc=None, show_censors=False,
         if at_risk_counts:
             add_at_risk_counts(cls, ax=ax)
 
+        if invert_y_axis:
+            # need to check if it's already inverted
+            original_y_ticks =  ax.get_yticks()
+            if not getattr(ax, '__lifelines_inverted', False):
+                # not inverted yet
+
+                ax.invert_yaxis()
+                # don't ask.
+                y_ticks = np.round(1.000000000001 - original_y_ticks, decimals=8)
+                ax.set_yticklabels(y_ticks)
+                ax.__lifelines_inverted = True
+
         return ax
 
     plot.__doc__ = doc_string
diff --git a/tests/test_plotting.py b/tests/test_plotting.py
index 7c0db7a3a..36a51b0bc 100644
--- a/tests/test_plotting.py
+++ b/tests/test_plotting.py
@@ -57,6 +57,22 @@ def test_kmf_with_risk_counts(self, block, kmf):
         self.plt.title("test_kmf_with_risk_counts")
         self.plt.show(block=block)
 
+
+    def test_kmf_with_inverted_axis(self, block, kmf):
+
+        T = np.random.exponential(size=100)
+        kmf = KaplanMeierFitter()
+        kmf.fit(T, label='t2')
+        ax = kmf.plot(invert_y_axis=True, at_risk_counts=True)
+
+        T = np.random.exponential(3, size=100)
+        kmf = KaplanMeierFitter()
+        kmf.fit(T, label='t1')
+        kmf.plot(invert_y_axis=True, ax=ax, ci_force_lines=False)
+
+        self.plt.title("test_kmf_with_inverted_axis")
+        self.plt.show(block=block)
+
     def test_naf_plotting_with_custom_colours(self, block):
         data1 = np.random.exponential(5, size=(200, 1))
         data2 = np.random.exponential(1, size=(500))

From 9cac77cface02a3a519277b00d2778807977a307 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Wed, 17 Oct 2018 21:08:13 -0400
Subject: [PATCH 17/59] more tests to ctv weights

---
 lifelines/datasets/__init__.py               |  2 +-
 lifelines/fitters/cox_time_varying_fitter.py |  3 +-
 lifelines/fitters/coxph_fitter.py            |  9 ++-
 tests/test_estimation.py                     | 81 ++++++++++++--------
 4 files changed, 56 insertions(+), 39 deletions(-)

diff --git a/lifelines/datasets/__init__.py b/lifelines/datasets/__init__.py
index 35e01f656..95826655f 100644
--- a/lifelines/datasets/__init__.py
+++ b/lifelines/datasets/__init__.py
@@ -363,7 +363,7 @@ def load_dfcv():
     3       0    1.0  0   6.0   4   True
     """
     from lifelines.datasets.dfcv_dataset import dfcv
-    return dfcv
+    return dfcv.copy()
 
 
 def load_lymphoma(**kwargs):
diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py
index ea2002bff..a39f5178b 100644
--- a/lifelines/fitters/cox_time_varying_fitter.py
+++ b/lifelines/fitters/cox_time_varying_fitter.py
@@ -258,7 +258,7 @@ def _newton_rhaphson(self, df, stop_times_events, weights, show_progress=False,
                 # reusing a piece to make g * inv(h) * g.T faster later
                 inv_h_dot_g_T = spsolve(-h, g.T, sym_pos=True)
             except ValueError as e:
-                if 'infs or NaNs' in e.message:
+                if 'infs or NaNs' in str(e):
                     raise ConvergenceError("""hessian or gradient contains nan or inf value(s). Convergence halted. Please see the following tips in the lifelines documentation:
 https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-model
 """)
@@ -367,7 +367,6 @@ def _get_gradients(self, df, stops_events, weights, beta):
             weight_count = weights_deaths.sum()
             weighted_average = weight_count / ties_counts
 
-
             for l in range(ties_counts):
 
                 if ties_counts > 1:
diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index b2a937ea0..248b87f42 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -18,7 +18,7 @@
     significance_code, concordance_index, _get_index, qth_survival_times,\
     pass_for_numeric_dtypes_or_raise, check_low_var, coalesce,\
     check_complete_separation, check_nans, StatError, ConvergenceWarning,\
-    StepSizer
+    StepSizer, ConvergenceError
 from lifelines.statistics import chisq_test
 
 
@@ -243,7 +243,7 @@ def _newton_rhaphson(self, X, T, E, weights=None, initial_beta=None, step_size=N
             try:
                 inv_h_dot_g_T = spsolve(-h, g.T, sym_pos=True)
             except ValueError as e:
-                if 'infs or NaNs' in e.message:
+                if 'infs or NaNs' in str(e):
                     raise ConvergenceError("""hessian or gradient contains nan or inf value(s). Convergence halted. Please see the following tips in the lifelines documentation:
 https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-model
 """)
@@ -313,6 +313,11 @@ def _get_efron_values(self, X, beta, T, E, weights):
         (φ1 + φ2 + φ3) is adjusted from sum_j^{5} φj after one fails. Similarly two-third
         of (φ1 + φ2 + φ3) is adjusted after first two individuals fail, etc.
 
+        From https://cran.r-project.org/web/packages/survival/survival.pdf:
+
+        "Setting all weights to 2 for instance will give the same coefficient estimate but halve the variance. When
+        the Efron approximation for ties (default) is employed replication of the data will not give exactly the same coefficients as the
+        weights option, and in this case the weighted fit is arguably the correct one."
 
         Parameters:
             X: (n,d) numpy array of observations.
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index 58c152d9a..081d433d6 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -1252,6 +1252,7 @@ def test_non_trival_float_weights_with_no_ties_is_the_same_as_R(self, regression
 
 
     def test_summary_output_using_non_trivial_but_integer_weights(self, rossi):
+
         rossi_weights = rossi.copy()
         rossi_weights['weights'] = 1.
         rossi_weights = rossi_weights.groupby(rossi.columns.tolist())['weights'].sum()\
@@ -1263,8 +1264,23 @@ def test_summary_output_using_non_trivial_but_integer_weights(self, rossi):
         cf2 = CoxPHFitter()
         cf2.fit(rossi, duration_col='week', event_col='arrest')
 
+        # strictly speaking, the variances, etc. don't need to be the same, only the coefs.
         assert_frame_equal(cf1.summary, cf2.summary, check_like=True)
 
+    def test_doubling_the_weights_halves_the_variance(self, rossi):
+
+        w = 2.0
+        rossi_weights = rossi.copy()
+        rossi_weights['weights'] = 2
+
+        cf1 = CoxPHFitter()
+        cf1.fit(rossi_weights, duration_col='week', event_col='arrest', weights_col='weights')
+
+        cf2 = CoxPHFitter()
+        cf2.fit(rossi, duration_col='week', event_col='arrest')
+
+        assert_frame_equal(cf2.standard_errors_ ** 2, w * cf1.standard_errors_ ** 2, check_like=True)
+
 
     def test_adding_non_integer_weights_without_robust_flag_raises_a_warning(self, rossi):
         rossi['weights'] = np.random.exponential(1, rossi.shape[0])
@@ -1884,7 +1900,7 @@ def test_fitter_will_error_if_degenerate_time(self, ctv):
         ctv.fit(df, id_col="id", start_col="start", stop_col="stop", event_col="event")
         assert True
 
-    def test_ctv_fitter_will_hande_trivial_weight_col(self, ctv, dfcv):
+    def test_ctv_fitter_will_handle_trivial_weight_col(self, ctv, dfcv):
         ctv.fit(dfcv, id_col="id", start_col="start", stop_col="stop", event_col="event")
         coefs_no_weights = ctv.summary['coef'].values
 
@@ -1895,59 +1911,56 @@ def test_ctv_fitter_will_hande_trivial_weight_col(self, ctv, dfcv):
         npt.assert_almost_equal(coefs_no_weights, coefs_trivial_weights, decimal=3)
 
 
-    def test_ctv_fitter_will_hande_integer_weight_col_on_tv_dataset(self, ctv, dfcv):
-        # not sure yet why this is failing.
-        # duplicate a few subjects
-        dfcv_unfolded = dfcv.copy()
-        for _id in [10, 9, 8, 7]:
-            to_append = dfcv[dfcv['id'].isin([_id])].copy()
-            to_append['id'] = (10 + _id)
-            dfcv_unfolded = dfcv_unfolded.append(to_append)
-        dfcv_unfolded = dfcv_unfolded.reset_index(drop=True)
-        print(dfcv_unfolded[(dfcv_unfolded['start'] < 7) & (7 <= dfcv_unfolded['stop'])])
-
-        ctv = CoxTimeVaryingFitter()
-        ctv.fit(dfcv_unfolded, id_col="id", start_col="start", stop_col="stop", event_col="event", show_progress=True)
-        coefs_unfolded_weights = ctv.hazards_
+    def test_doubling_the_weights_halves_the_variance(self, ctv, dfcv):
+        ctv.fit(dfcv, id_col="id", start_col="start", stop_col="stop", event_col="event")
+        coefs_no_weights = ctv.summary['coef'].values
+        variance_no_weights = ctv.summary['se(coef)'].values**2
 
+        dfcv['weight'] = 2.0
+        ctv.fit(dfcv, id_col="id", start_col="start", stop_col="stop", event_col="event", weights_col='weight')
+        coefs_double_weights = ctv.summary['coef'].values
+        variance_double_weights = ctv.summary['se(coef)'].values**2
 
-        dfcv_folded = dfcv.copy()
-        dfcv_folded['weights'] = 1.0
-        dfcv_folded.loc[dfcv_folded['id'].isin([10,9,8,7]), 'weights'] = 2.0
-        print(dfcv_folded[(dfcv_folded['start'] < 7) & (7 <= dfcv_folded['stop'])])
+        npt.assert_almost_equal(coefs_no_weights, coefs_double_weights, decimal=3)
+        npt.assert_almost_equal(variance_no_weights, 2 * variance_double_weights, decimal=3)
 
-        ctv = CoxTimeVaryingFitter()
-        ctv.fit(dfcv_folded, id_col="id", start_col="start", stop_col="stop", event_col="event", weights_col='weights', show_progress=True)
-        coefs_folded_weights = ctv.hazards_
 
-        print(coefs_unfolded_weights)
-        print(coefs_folded_weights)
-        assert_frame_equal(coefs_unfolded_weights, coefs_folded_weights)
+    def test_ctv_fitter_will_give_the_same_results_as_static_cox_model(self, ctv, rossi):
 
+        cph = CoxPHFitter()
+        cph.fit(rossi, 'week', 'arrest')
+        expected = cph.hazards_.values
 
-    def test_ctv_fitter_will_give_the_same_results_as_static_cox_model(self, ctv, rossi):
+        rossi_ctv = rossi.reset_index()
+        rossi_ctv = to_long_format(rossi_ctv, 'week')
 
-        rossi = rossi.reset_index()
-        rossi = to_long_format(rossi, 'week')
 
-        expected = np.array([[-0.3794, -0.0574, 0.3139, -0.1498, -0.4337, -0.0849,  0.0915]])
-        ctv.fit(rossi, start_col='start', stop_col='stop', event_col='arrest', id_col='index')
+        ctv.fit(rossi_ctv, start_col='start', stop_col='stop', event_col='arrest', id_col='index')
         npt.assert_array_almost_equal(ctv.hazards_.values, expected, decimal=4)
 
 
     def test_ctv_fitter_will_handle_integer_weight_as_static_model(self, ctv, rossi):
+        # deleting some columns to create more duplicates
+        del rossi['age']
+        del rossi['paro']
+        del rossi['mar']
+        del rossi['prio']
+
         rossi_ = rossi.copy()
         rossi_['weights'] = 1.
         rossi_ = rossi_.groupby(rossi.columns.tolist())['weights'].sum()\
                        .reset_index()
 
+        cph = CoxPHFitter()
+        cph.fit(rossi, 'week', 'arrest')
+        expected = cph.hazards_.values
+
         # create the id column this way.
         rossi_ = rossi_.reset_index()
         rossi_ = to_long_format(rossi_, 'week')
 
-        expected = np.array([[-0.3794, -0.0574, 0.3139, -0.1498, -0.4337, -0.0849,  0.0915]])
         ctv.fit(rossi_, start_col='start', stop_col='stop', event_col='arrest', id_col='index', weights_col='weights')
-        npt.assert_array_almost_equal(ctv.hazards_.values, expected, decimal=4)
+        npt.assert_array_almost_equal(ctv.hazards_.values, expected, decimal=3)
 
 
     def test_fitter_accept_boolean_columns(self, ctv):
@@ -1985,9 +1998,9 @@ def test_warning_is_raised_if_df_has_a_near_constant_column_in_one_seperation(se
                 ctv.fit(dfcv, id_col="id", start_col="start", stop_col="stop", event_col="event")
             except (LinAlgError, ValueError):
                 pass
-            assert len(w) == 2
+            assert len(w) == 1
             assert issubclass(w[0].category, ConvergenceWarning)
-            assert "complete separation" in str(w[1].message)
+            assert "complete separation" in str(w[0].message)
 
     def test_summary_output_versus_Rs_against_standford_heart_transplant(self, ctv, heart):
         """

From 8184b2c32b5b3326c298d0da728b6e0f59d29482 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Thu, 18 Oct 2018 14:15:35 -0400
Subject: [PATCH 18/59] error handling for nans

---
 CHANGELOG.md                                 |  1 +
 lifelines/fitters/cox_time_varying_fitter.py |  7 ++-
 lifelines/fitters/coxph_fitter.py            |  1 +
 lifelines/utils/__init__.py                  |  6 +--
 tests/test_estimation.py                     | 56 ++++++++++++++++++--
 5 files changed, 64 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 654498f1d..51947b629 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@
  - Convergence errors in models that use Newton-Rhapson methods now throw a `ConvergenceError`, instead of a `ValueError` (the former is a subclass of the latter, however).
  - `AalenAdditiveModel` raises `ConvergenceWarning` instead of printing a warning.
  - `KaplanMeierFitter` now has a cumulative plot option. Example `kmf.plot(invert_y_axis=True)`
+ - a `weights_col` option has been added to `CoxTimeVaryingFitter` that allows for time-varying weights. 
 
 #### 0.14.6
  - fix for n > 2 groups in `multivariate_logrank_test` (again).
diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py
index a39f5178b..dc8de9f4e 100644
--- a/lifelines/fitters/cox_time_varying_fitter.py
+++ b/lifelines/fitters/cox_time_varying_fitter.py
@@ -19,7 +19,7 @@
     pass_for_numeric_dtypes_or_raise, check_low_var,\
     check_for_overlapping_intervals, check_complete_separation_low_variance,\
     ConvergenceWarning, StepSizer, _get_index, check_for_immediate_deaths,\
-    check_for_instantaneous_events, ConvergenceError
+    check_for_instantaneous_events, ConvergenceError, check_nans
 
 
 class CoxTimeVaryingFitter(BaseFitter):
@@ -78,6 +78,10 @@ def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', weights
         if weights_col is None:
             assert '__weights' not in df.columns, '__weights is an internal lifelines column, please rename your column first.'
             df['__weights'] = 1.0
+        else:
+            if (df[weights_col] <= 0).any():
+                raise ValueError("values in weights_col must be positive.")
+
 
         df = df.rename(columns={id_col: 'id', event_col: 'event', start_col: 'start', stop_col: 'stop', weights_col: '__weights'})
         df = df.set_index('id')
@@ -111,6 +115,7 @@ def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', weights
     @staticmethod
     def _check_values(df, stop_times_events):
         # check_for_overlapping_intervals(df) # this is currenty too slow for production.
+        check_nans(df)
         check_low_var(df)
         check_complete_separation_low_variance(df, stop_times_events['event'])
         pass_for_numeric_dtypes_or_raise(df)
diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index 248b87f42..93f411bd3 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -432,6 +432,7 @@ def _check_values(df, T, E):
         pass_for_numeric_dtypes_or_raise(df)
         check_nans(T)
         check_nans(E)
+        check_nans(df)
         check_low_var(df)
         check_complete_separation(df, E, T)
 
diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py
index 07443e169..b8d073613 100644
--- a/lifelines/utils/__init__.py
+++ b/lifelines/utils/__init__.py
@@ -1103,9 +1103,9 @@ def check_complete_separation(df, events, durations):
     check_complete_separation_close_to_perfect_correlation(df, durations)
 
 
-def check_nans(array):
-    if pd.isnull(array).any():
-        raise TypeError("NaNs were detected in the duration_col and/or the event_col")
+def check_nans(df_or_array):
+    if pd.isnull(df_or_array).values.any():
+        raise TypeError("NaNs were detected in the dataset. Try using pd.isnull to find the problematic values.")
 
 
 def to_long_format(df, duration_col):
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index 081d433d6..921209152 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -286,13 +286,13 @@ def test_valueerror_is_thrown_if_alpha_out_of_bounds(self, univariate_fitters):
             with pytest.raises(ValueError):
                 fitter(alpha=95)
 
-    def test_error_is_thrown_if_there_is_nans_in_the_duration_col(self, univariate_fitters):
+    def test_typeerror_is_thrown_if_there_is_nans_in_the_duration_col(self, univariate_fitters):
         T = np.array([1.0, 2.0, 4.0, None, 8.0])
         for fitter in univariate_fitters:
             with pytest.raises(TypeError):
                 fitter().fit(T)
 
-    def test_error_is_thrown_if_there_is_nans_in_the_event_col(self, univariate_fitters):
+    def test_typeerror_is_thrown_if_there_is_nans_in_the_event_col(self, univariate_fitters):
         T = np.arange(5)
         E = [1, 0, None, 1, 1]
         for fitter in univariate_fitters:
@@ -1679,6 +1679,13 @@ def test_robust_errors_with_strata_doesnt_break(self, rossi):
         cf.fit(rossi, duration_col='week', event_col='arrest', strata=['race', 'paro', 'mar', 'wexp'], robust=True)
 
 
+    def test_what_happens_to_nans(self, rossi):
+        rossi['var4'] = np.nan
+        cf = CoxPHFitter()
+        with pytest.raises(TypeError):
+            cf.fit(rossi, duration_col='week' event_col="arrest")
+
+
 
 
 class TestAalenAdditiveFitter():
@@ -1832,12 +1839,55 @@ def heart(self):
         return load_stanford_heart_transplants()
 
     def test_inference_against_known_R_output(self, ctv, dfcv):
-        # from http://www.math.ucsd.edu/~rxu/math284/slect7.pdf
+        """
+        from http://www.math.ucsd.edu/~rxu/math284/slect7.pdf
+
+        > coxph(formula = Surv(time = start, time2 = stop, event) ~ group + z, data = dfcv)
+
+        """
         ctv.fit(dfcv, id_col="id", start_col="start", stop_col="stop", event_col="event")
         npt.assert_almost_equal(ctv.summary['coef'].values, [1.826757, 0.705963], decimal=4)
         npt.assert_almost_equal(ctv.summary['se(coef)'].values, [1.229, 1.206], decimal=3)
         npt.assert_almost_equal(ctv.summary['p'].values, [0.14, 0.56], decimal=2)
 
+    def test_what_happens_to_nans(self, ctv, dfcv):
+        """
+        from http://www.math.ucsd.edu/~rxu/math284/slect7.pdf
+
+        > coxph(formula = Surv(time = start, time2 = stop, event) ~ group + z, data = dfcv)
+
+        """
+        dfcv['var4'] = np.nan
+        with pytest.raises(TypeError):
+            ctv.fit(dfcv, id_col="id", start_col="start", stop_col="stop", event_col="event")
+
+
+    def test_inference_against_known_R_output_with_weights(self, ctv, dfcv):
+        """
+        > dfcv['weights'] = [0.46009262, 0.04643257, 0.38150793, 0.11903676, 0.51965860, 0.96173133, 0.32435527, 0.16708398, 0.85464418, 0.15146481, 0.24713429, 0.55198318, 0.16948366, 0.19246483]
+        > coxph(formula = Surv(time = start, time2 = stop, event) ~ group + z, data = dfcv)
+
+        """
+        dfcv['weights'] = [
+            0.4600926178338619,
+            0.046432574620396294,
+            0.38150793079960477,
+            0.11903675541025949,
+            0.5196585971574837,
+            0.9617313298681641,
+            0.3243552664091651,
+            0.16708398114269085,
+            0.8546441798716636,
+            0.15146480991643507,
+            0.24713429350878657,
+            0.5519831777187729,
+            0.16948366380884838,
+            0.19246482703103884
+        ]
+        ctv.fit(dfcv, id_col="id", start_col="start", stop_col="stop", event_col="event", weights_col='weights')
+        npt.assert_almost_equal(ctv.summary['coef'].values, [0.313, 0.423], decimal=3)
+        npt.assert_almost_equal(ctv.summary['se(coef)'].values, [1.542, 1.997], decimal=3)
+
     @pytest.mark.xfail()
     def test_fitter_will_raise_an_error_if_overlapping_intervals(self, ctv):
         df = pd.DataFrame.from_records([

From 942051d808cdf632acb50e88c015e74f48e9cb42 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Mon, 22 Oct 2018 22:49:35 -0400
Subject: [PATCH 19/59] better print_summary for regression models

---
 CHANGELOG.md                                 |  2 +
 docs/Quickstart.rst                          | 15 ++++--
 docs/Survival Regression.rst                 | 52 +++++++++++++-------
 lifelines/fitters/cox_time_varying_fitter.py | 32 ++++++++----
 lifelines/fitters/coxph_fitter.py            | 28 ++++++++---
 lifelines/fitters/weibull_fitter.py          | 23 +++++----
 lifelines/utils/__init__.py                  |  7 ++-
 tests/test_estimation.py                     | 42 ++++++++++------
 8 files changed, 139 insertions(+), 62 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 51947b629..2a91824e4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,8 @@
  - `AalenAdditiveModel` raises `ConvergenceWarning` instead of printing a warning.
  - `KaplanMeierFitter` now has a cumulative plot option. Example `kmf.plot(invert_y_axis=True)`
  - a `weights_col` option has been added to `CoxTimeVaryingFitter` that allows for time-varying weights. 
+ - `WeibullFitter` has a new `show_progress` param.
+ - `CoxPHFitter` and `CoxTimeVaryingFitter` method `print_summary` is updated with new fields.
 
 #### 0.14.6
  - fix for n > 2 groups in `multivariate_logrank_test` (again).
diff --git a/docs/Quickstart.rst b/docs/Quickstart.rst
index cbf4fa793..a3b1e12f7 100644
--- a/docs/Quickstart.rst
+++ b/docs/Quickstart.rst
@@ -159,16 +159,23 @@ The input of the ``fit`` method's API in a regression is different. All the data
     cph.print_summary()
 
     """
-    n=200, number of events=189
+          duration col = T
+             event col = E
+    number of subjects = 200
+      number of events = 189
+        log-likelihood = -807.620
+      time fit was run = 2018-10-23 02:44:18 UTC
 
+    ---
            coef  exp(coef)  se(coef)      z      p  lower 0.95  upper 0.95
-    var1 0.2213     1.2477    0.0743 2.9796 0.0029      0.0757      0.3669  **
-    var2 0.0509     1.0522    0.0829 0.6139 0.5393     -0.1116      0.2134
-    var3 0.2186     1.2443    0.0758 2.8836 0.0039      0.0700      0.3672  **
+    var1 0.2222     1.2488    0.0743 2.9920 0.0028      0.0767      0.3678  **
+    var2 0.0510     1.0523    0.0829 0.6148 0.5387     -0.1115      0.2134
+    var3 0.2183     1.2440    0.0758 2.8805 0.0040      0.0698      0.3669  **
     ---
     Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
 
     Concordance = 0.580
+    Likelihood ratio test = 15.540 on 3 df, p=0.00141
     """
 
     cph.plot()
diff --git a/docs/Survival Regression.rst b/docs/Survival Regression.rst
index 3676fb142..3d4e8bc12 100644
--- a/docs/Survival Regression.rst	
+++ b/docs/Survival Regression.rst	
@@ -60,16 +60,22 @@ This example data is from the paper `here <http://socserv.socsci.mcmaster.ca/jfo
     cph.print_summary()  # access the results using cph.summary
 
     """
-    n=432, number of events=114
+          duration col = week
+             event col = arrest
+    number of subjects = 432
+      number of events = 114
+        log-likelihood = -658.748
+      time fit was run = 2018-10-22 20:47:44 UTC
 
+    ---
             coef  exp(coef)  se(coef)       z      p  lower 0.95  upper 0.95
-    fin  -0.3790     0.6845    0.1914 -1.9806 0.0476     -0.7542     -0.0039   *
-    age  -0.0572     0.9444    0.0220 -2.6042 0.0092     -0.1003     -0.0142  **
-    race  0.3141     1.3691    0.3080  1.0198 0.3078     -0.2897      0.9180
-    wexp -0.1511     0.8597    0.2121 -0.7124 0.4762     -0.5670      0.2647
-    mar  -0.4328     0.6487    0.3818 -1.1335 0.2570     -1.1813      0.3157
-    paro -0.0850     0.9185    0.1957 -0.4341 0.6642     -0.4687      0.2988
-    prio  0.0911     1.0954    0.0286  3.1824 0.0015      0.0350      0.1472  **
+    fin  -0.3794     0.6843    0.1914 -1.9826 0.0474     -0.7545     -0.0043   *
+    age  -0.0574     0.9442    0.0220 -2.6109 0.0090     -0.1006     -0.0143  **
+    race  0.3139     1.3688    0.3080  1.0192 0.3081     -0.2898      0.9176
+    wexp -0.1498     0.8609    0.2122 -0.7058 0.4803     -0.5657      0.2662
+    mar  -0.4337     0.6481    0.3819 -1.1358 0.2561     -1.1821      0.3147
+    paro -0.0849     0.9186    0.1958 -0.4336 0.6646     -0.4685      0.2988
+    prio  0.0915     1.0958    0.0286  3.1939 0.0014      0.0353      0.1476  **
     ---
     Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
 
@@ -83,9 +89,9 @@ To access the coefficients and the baseline hazard directly, you can use ``cph.h
 Convergence 
 ###########################################
 
-Fitting the Cox model to the data involves using gradient descent. Lifelines takes extra effort to help with convergence. If you wish to see the fitting, there is a ``show_progress`` parameter in ``CoxPHFitter.fit`` function. For further help, see :ref:`Problems with convergence in the Cox Proportional Hazard Model`.
+Fitting the Cox model to the data involves using gradient descent. Lifelines takes extra effort to help with convergence, so please be attentive to any warnings that appear. Fixing any warnings will generally help convergence. If you wish to see the fitting, there is a ``show_progress`` parameter in ``CoxPHFitter.fit`` function. For further help, see :ref:`Problems with convergence in the Cox Proportional Hazard Model`.
 
-After fitting, the value of the maximum log-likelihood this available using ``cph._log_likelihood``. Similarly, the score and Hessian matrix are available under ``_score_`` and ``_hessian_`` respectively. The ``_hessian_`` can be used the find the covariance matrix of the coefficients. 
+After fitting, the value of the maximum log-likelihood is available using ``cph._log_likelihood``. Similarly, the score and Hessian matrix are available under ``_score_`` and ``_hessian_`` respectively.
 
 
 Goodness of fit and prediction
@@ -196,21 +202,28 @@ Sometimes a covariate may not obey the proportional hazard assumption. In this c
     from lifelines import CoxPHFitter
 
     rossi_dataset = load_rossi()
-
+    cph = CoxPHFitter()
     cph.fit(rossi_dataset, 'week', event_col='arrest', strata=['race'], show_progress=True)
 
     cph.print_summary()  # access the results using cph.summary
 
     """
-    n=432, number of events=114
+          duration col = week
+             event col = arrest
+                strata = ['race']
+    number of subjects = 432
+      number of events = 114
+        log-likelihood = -620.564
+      time fit was run = 2018-10-23 02:45:52 UTC
 
+    ---
             coef  exp(coef)  se(coef)       z      p  lower 0.95  upper 0.95
-    fin  -0.3775     0.6856    0.1913 -1.9731 0.0485     -0.7525     -0.0024   *
-    age  -0.0573     0.9443    0.0220 -2.6081 0.0091     -0.1004     -0.0142  **
-    wexp -0.1435     0.8664    0.2127 -0.6746 0.4999     -0.5603      0.2734
-    mar  -0.4419     0.6428    0.3820 -1.1570 0.2473     -1.1907      0.3068
-    paro -0.0839     0.9196    0.1958 -0.4283 0.6684     -0.4677      0.3000
-    prio  0.0919     1.0962    0.0287  3.1985 0.0014      0.0356      0.1482  **
+    fin  -0.3788     0.6847    0.1913 -1.9799 0.0477     -0.7537     -0.0038   *
+    age  -0.0576     0.9440    0.0220 -2.6198 0.0088     -0.1008     -0.0145  **
+    wexp -0.1428     0.8670    0.2128 -0.6708 0.5023     -0.5598      0.2743
+    mar  -0.4388     0.6448    0.3821 -1.1484 0.2508     -1.1878      0.3101
+    paro -0.0858     0.9178    0.1958 -0.4380 0.6614     -0.4695      0.2980
+    prio  0.0922     1.0966    0.0287  3.2102 0.0013      0.0359      0.1485  **
     ---
     Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
 
@@ -888,6 +901,9 @@ Fitting the model
 
 Once your dataset is in the correct orientation, we can use ``CoxTimeVaryingFitter`` to fit the model to your data. The method is similar to ``CoxPHFitter``, expect we need to tell the ``fit`` about the additional time columns.
 
+Fitting the Cox model to the data involves using gradient descent. Lifelines takes extra effort to help with convergence, so please be attentive to any warnings that appear. Fixing any warnings will generally help convergence. For further help, see :ref:`Problems with convergence in the Cox Proportional Hazard Model`.
+
+
 .. code:: python
 
     from lifelines import CoxTimeVaryingFitter
diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py
index dc8de9f4e..f88d5d0ee 100644
--- a/lifelines/fitters/cox_time_varying_fitter.py
+++ b/lifelines/fitters/cox_time_varying_fitter.py
@@ -1,6 +1,8 @@
 # -*- coding: utf-8 -*-
 from __future__ import print_function
 from __future__ import division
+
+from datetime import datetime
 import warnings
 import time
 
@@ -19,7 +21,7 @@
     pass_for_numeric_dtypes_or_raise, check_low_var,\
     check_for_overlapping_intervals, check_complete_separation_low_variance,\
     ConvergenceWarning, StepSizer, _get_index, check_for_immediate_deaths,\
-    check_for_instantaneous_events, ConvergenceError, check_nans
+    check_for_instantaneous_events, ConvergenceError, check_nans, string_justify
 
 
 class CoxTimeVaryingFitter(BaseFitter):
@@ -69,6 +71,8 @@ def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', weights
         """
 
         self.robust = robust
+        self.event_col = event_col
+        self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
 
         df = df.copy()
 
@@ -101,6 +105,7 @@ def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', weights
                                          step_size=step_size)
 
         self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns, index=['coef']) / self._norm_std
+        self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std)
         self.standard_errors_ = self._compute_standard_errors(normalize(df, self._norm_mean, self._norm_std), stop_times_events, weights)
         self.confidence_intervals_ = self._compute_confidence_intervals()
         self.baseline_cumulative_hazard_ = self._compute_cumulative_baseline_hazard(df, stop_times_events)
@@ -122,7 +127,7 @@ def _check_values(df, stop_times_events):
         check_for_immediate_deaths(stop_times_events)
         check_for_instantaneous_events(stop_times_events)
 
-    def _compute_sandwich_estimator(self, df, stop_times_events):
+    def _compute_sandwich_estimator(self, df, stop_times_events, weights):
 
         n, d = df.shape
 
@@ -174,9 +179,9 @@ def _compute_sandwich_estimator(self, df, stop_times_events):
 
     def _compute_standard_errors(self, df, stop_times_events, weights):
         if self.robust:
-            se = np.sqrt(self._compute_sandwich_estimator(df.values, T.values, E.values, weights.values).diagonal()) # / self._norm_std
+            se = np.sqrt(self._compute_sandwich_estimator(df, stop_times_events, weights).diagonal()) # / self._norm_std
         else:
-            se = np.sqrt(-inv(self._hessian_).diagonal()) / self._norm_std
+            se = np.sqrt(self.variance_matrix_.diagonal())
         return pd.DataFrame(se[None, :],
                             index=['se'], columns=self.hazards_.columns)
 
@@ -444,14 +449,23 @@ def print_summary(self):
         """
         Print summary statistics describing the fit, the coefficients, and the error bounds.
         """
+
+        # Print information about data first
+        justify = string_justify(18)
+        print()
+        print("{} = {}".format(justify('event col'), self.event_col))
+        print('{} = {}'.format(justify('number of subjects'), self._n_unique))
+        print('{} = {}'.format(justify('number of periods'), self._n_examples))
+        print('{} = {}'.format(justify('number of events'), self.event_observed.sum()))
+        print('{} = {:.3f}'.format(justify('log-likelihood'), self._log_likelihood))
+        print('{} = {} UTC'.format(justify('time fit was run'), self._time_fit_was_called), end='\n\n')
+
+
+        print('---')
+
         df = self.summary
         # Significance codes last
         df[''] = [significance_code(p) for p in df['p']]
-
-        # Print information about data first
-        print('periods={}, uniques={}, number of events={}'.format(self._n_examples, self._n_unique,
-                                                                   self.event_observed.sum()),
-              end='\n\n')
         print(df.to_string(float_format=lambda f: '{:4.4f}'.format(f)))
         # Significance code explanation
         print('---')
diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index 93f411bd3..879d2463c 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -3,6 +3,7 @@
 from __future__ import division
 
 import time
+from datetime import datetime
 import warnings
 import numpy as np
 import pandas as pd
@@ -18,7 +19,7 @@
     significance_code, concordance_index, _get_index, qth_survival_times,\
     pass_for_numeric_dtypes_or_raise, check_low_var, coalesce,\
     check_complete_separation, check_nans, StatError, ConvergenceWarning,\
-    StepSizer, ConvergenceError
+    StepSizer, ConvergenceError, string_justify
 from lifelines.statistics import chisq_test
 
 
@@ -102,6 +103,9 @@ def fit(self, df, duration_col, event_col=None,
         # Sort on time
         df = df.sort_values(by=duration_col)
 
+        self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
+        self.duration_col = duration_col
+        self.event_col = event_col
         self.robust = robust
         self._n_examples = df.shape[0]
         self.strata = coalesce(strata, self.strata)
@@ -529,14 +533,26 @@ def print_summary(self):
         Print summary statistics describing the fit.
 
         """
+
+        # Print information about data first
+        justify = string_justify(18)
+        print()
+        print("{} = {}".format(justify('duration col'), self.duration_col))
+        print("{} = {}".format(justify('event col'), self.event_col))
+
+        if self.strata:
+            print('{} = {}'.format(justify('strata'), self.strata))
+
+        print('{} = {}'.format(justify('number of subjects'), self._n_examples))
+        print('{} = {}'.format(justify('number of events'), self.event_observed.sum()))
+        print('{} = {:.3f}'.format(justify('log-likelihood'), self._log_likelihood))
+        print('{} = {} UTC'.format(justify("time fit was run"), self._time_fit_was_called), end='\n\n')
+        print('---')
+
+
         df = self.summary
         # Significance codes last
         df[''] = [significance_code(p) for p in df['p']]
-
-        # Print information about data first
-        print('n={}, number of events={}'.format(self._n_examples,
-                                                 self.event_observed.sum()),
-              end='\n\n')
         print(df.to_string(float_format=lambda f: '{:4.4f}'.format(f)))
         # Significance code explanation
         print('---')
diff --git a/lifelines/fitters/weibull_fitter.py b/lifelines/fitters/weibull_fitter.py
index 5b5f0993c..caa495465 100644
--- a/lifelines/fitters/weibull_fitter.py
+++ b/lifelines/fitters/weibull_fitter.py
@@ -1,11 +1,12 @@
 # -*- coding: utf-8 -*-
 from __future__ import print_function, division
+import time
 import numpy as np
 import pandas as pd
 
 from numpy.linalg import solve, norm, inv
 from lifelines.fitters import UnivariateFitter
-from lifelines.utils import inv_normal_cdf, check_nans
+from lifelines.utils import inv_normal_cdf, check_nans, ConvergenceError
 
 
 def _negative_log_likelihood(lambda_rho, T, E):
@@ -62,7 +63,7 @@ class WeibullFitter(UnivariateFitter):
     """
 
     def fit(self, durations, event_observed=None, timeline=None, entry=None,
-            label='Weibull_estimate', alpha=None, ci_labels=None):
+            label='Weibull_estimate', alpha=None, ci_labels=None, show_progress=False):
         """
         Parameters:
           duration: an array, or pd.Series, of length n -- duration subject was observed for
@@ -77,7 +78,7 @@ def fit(self, durations, event_observed=None, timeline=None, entry=None,
              alpha for this call to fit only.
           ci_labels: add custom column names to the generated confidence intervals
                 as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>
-
+          show_progress: since this is an iterative fitting algorithm, switching this to True will display some iteration details.
         Returns:
           self, with new properties like `cumulative_hazard_', 'survival_function_', 'lambda_' and 'rho_'.
 
@@ -98,7 +99,7 @@ def fit(self, durations, event_observed=None, timeline=None, entry=None,
         alpha = alpha if alpha is not None else self.alpha
 
         # estimation
-        self.lambda_, self.rho_ = self._newton_rhaphson(self.durations, self.event_observed)
+        self.lambda_, self.rho_ = self._newton_rhaphson(self.durations, self.event_observed, show_progress=show_progress)
         self.survival_function_ = pd.DataFrame(self.survival_function_at_times(self.timeline), columns=[self._label], index=self.timeline)
         self.hazard_ = pd.DataFrame(self.hazard_at_times(self.timeline), columns=[self._label], index=self.timeline)
         self.cumulative_hazard_ = pd.DataFrame(self.cumulative_hazard_at_times(self.timeline), columns=[self._label], index=self.timeline)
@@ -125,7 +126,7 @@ def survival_function_at_times(self, times):
     def cumulative_hazard_at_times(self, times):
         return (self.lambda_ * times) ** self.rho_
 
-    def _newton_rhaphson(self, T, E, precision=1e-5):
+    def _newton_rhaphson(self, T, E, precision=1e-5, show_progress=False):
         from lifelines.utils import _smart_search
 
         def jacobian_function(parameters, T, E):
@@ -140,26 +141,30 @@ def gradient_function(parameters, T, E):
         # initialize the parameters. This shows dramatic improvements.
         parameters = _smart_search(_negative_log_likelihood, 2, T, E)
 
-        iter = 1
+        i = 1
         step_size = 0.9
         converging = True
+        start = time.time()
 
-        while converging and iter < 50:
+        while converging and i < 50:
             # Do not override hessian and gradient in case of garbage
             j, g = jacobian_function(parameters, T, E), gradient_function(parameters, T, E)
 
             delta = solve(j, - step_size * g.T)
             if np.any(np.isnan(delta)):
-                raise ValueError("delta contains nan value(s). Convergence halted.")
+                raise ConvergenceError("delta contains nan value(s). Convergence halted.")
 
             parameters += delta
 
             # Save these as pending result
             jacobian = j
 
+            if show_progress:
+                print("Iteration %d: norm_delta = %.5f, seconds_since_start = %.1f" % (i, norm(delta), time.time() - start))
+
             if norm(delta) < precision:
                 converging = False
-            iter += 1
+            i += 1
 
         self._jacobian = jacobian
         return parameters
diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py
index b8d073613..1d04bc142 100644
--- a/lifelines/utils/__init__.py
+++ b/lifelines/utils/__init__.py
@@ -1086,10 +1086,11 @@ def check_complete_separation_close_to_perfect_correlation(df, durations):
     # slow for many columns
     THRESHOLD = 0.99
     n, _ = df.shape
+
     if n > 1000:
         # let's sample to speed this n**2 algo up.
-        df = df.sample(n=1000, random_state=15).copy()
-        durations = durations.sample(n=1000, random_state=15).copy()
+        df = df.sample(n=800, random_state=15).copy()
+        durations = durations.sample(n=800, random_state=15).copy()
 
     for col, series in df.iteritems():
         if abs(stats.spearmanr(series, durations).correlation) >= THRESHOLD:
@@ -1331,3 +1332,5 @@ def _is_monotonically_decreasing(array):
     def next(self):
         return self.step_size
 
+
+string_justify = lambda width: lambda s: s.rjust(width, ' ')
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index 921209152..6816730a8 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -775,18 +775,26 @@ def test_print_summary(self, rossi):
 
             cp = CoxPHFitter()
             cp.fit(rossi, duration_col='week', event_col='arrest')
+            cp._time_fit_was_called = '2018-10-23 02:40:45 UTC'
             cp.print_summary()
             output = out.getvalue().strip().split()
-            expected = """n=432, number of events=114
-
-           coef  exp(coef)  se(coef)          z         p  lower 0.95  upper 0.95
-fin  -1.897e-01  8.272e-01 9.579e-02 -1.981e+00 4.763e-02  -3.775e-01  -1.938e-03   *
-age  -3.500e-01  7.047e-01 1.344e-01 -2.604e+00 9.210e-03  -6.134e-01  -8.651e-02  **
-race  1.032e-01  1.109e+00 1.012e-01  1.020e+00 3.078e-01  -9.516e-02   3.015e-01
-wexp -7.486e-02  9.279e-01 1.051e-01 -7.124e-01 4.762e-01  -2.809e-01   1.311e-01
-mar  -1.421e-01  8.675e-01 1.254e-01 -1.134e+00 2.570e-01  -3.880e-01   1.037e-01
-paro -4.134e-02  9.595e-01 9.522e-02 -4.341e-01 6.642e-01  -2.280e-01   1.453e-01
-prio  2.639e-01  1.302e+00 8.291e-02  3.182e+00 1.460e-03   1.013e-01   4.264e-01  **
+            expected = """
+      duration col = week
+         event col = arrest
+number of subjects = 432
+  number of events = 114
+    log-likelihood = -658.748
+  time fit was run = 2018-10-23 02:40:45 UTC
+
+---
+        coef  exp(coef)  se(coef)       z      p  lower 0.95  upper 0.95
+fin  -0.3794     0.6843    0.1914 -1.9826 0.0474     -0.7545     -0.0043   *
+age  -0.0574     0.9442    0.0220 -2.6109 0.0090     -0.1006     -0.0143  **
+race  0.3139     1.3688    0.3080  1.0192 0.3081     -0.2898      0.9176
+wexp -0.1498     0.8609    0.2122 -0.7058 0.4803     -0.5657      0.2662
+mar  -0.4337     0.6481    0.3819 -1.1358 0.2561     -1.1821      0.3147
+paro -0.0849     0.9186    0.1958 -0.4336 0.6646     -0.4685      0.2988
+prio  0.0915     1.0958    0.0286  3.1939 0.0014      0.0353      0.1476  **
 ---
 Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
 
@@ -1683,9 +1691,7 @@ def test_what_happens_to_nans(self, rossi):
         rossi['var4'] = np.nan
         cf = CoxPHFitter()
         with pytest.raises(TypeError):
-            cf.fit(rossi, duration_col='week' event_col="arrest")
-
-
+            cf.fit(rossi, duration_col='week', event_col="arrest")
 
 
 class TestAalenAdditiveFitter():
@@ -2160,10 +2166,18 @@ def test_print_summary(self, ctv, heart):
             sys.stdout = out
 
             ctv.fit(heart, id_col='id', event_col='event')
+            ctv._time_fit_was_called = '2018-10-23 02:41:45 UTC'
             ctv.print_summary()
             output = out.getvalue().strip().split()
-            expected = """periods=172, uniques=103, number of events=75
+            expected = """
+         event col = event
+number of subjects = 103
+ number of periods = 172
+  number of events = 75
+    log-likelihood = -290.566
+  time fit was run = 2018-10-23 02:41:45 UTC
 
+---
               coef  exp(coef)  se(coef)       z      p  lower 0.95  upper 0.95
 age         0.0272     1.0275    0.0137  1.9809 0.0476      0.0003      0.0540  *
 year       -0.1463     0.8639    0.0705 -2.0768 0.0378     -0.2845     -0.0082  *

From eaef8757362ac449625dec8ac783527a3e24d7ef Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Mon, 22 Oct 2018 23:11:07 -0400
Subject: [PATCH 20/59] adding more to docs about model selection

---
 docs/Survival Regression.rst      | 25 ++++++++++++++++++++++++-
 lifelines/fitters/coxph_fitter.py |  2 +-
 lifelines/utils/__init__.py       | 10 +++++++---
 3 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/docs/Survival Regression.rst b/docs/Survival Regression.rst
index 3d4e8bc12..4c9dc98d2 100644
--- a/docs/Survival Regression.rst	
+++ b/docs/Survival Regression.rst	
@@ -933,7 +933,30 @@ of AUC, another common loss function, and is interpreted similarly:
 * 1.0 is perfect concordance and,
 * 0.0 is perfect anti-concordance (multiply predictions with -1 to get 1.0)
 
-The measure is implemented in lifelines under `lifelines.utils.concordance_index` and accepts the actual times (along with any censorships) and the predicted times.
+A fitted model's concordance-index is displayed by `print_summary()`, and is also available under the `score_` property. More generally, the measure is implemented in lifelines under `lifelines.utils.concordance_index` and accepts the actual times (along with any censorships) and the predicted times.
+
+.. code:: python
+
+    from lifelines import CoxPHFitter
+    from lifelines.datasets import load_rossi
+
+    rossi = load_rossi()
+
+    cph = CoxPHFitter()
+    cph.fit(rossi, duration_col="week", event_col="arrest")
+
+    # method one
+    cph.print_summary()
+
+    # method two
+    print(cph.score_)
+
+    # method three
+    from lifelines.utils import concordance_index
+    print(concordance_index(rossi['week'], -cph.predict_partial_hazard(rossi).values, rossi['arrest']))
+
+
+However, there are other, arguably better, methods to measure the fit of a model. Included in `print_summary` is the log-likelihood, which can be used in an `AIC calculation <https://en.wikipedia.org/wiki/Akaike_information_criterion>`_, and the `log-likelihood ratio statistic <https://en.wikipedia.org/wiki/Likelihood-ratio_test>`_. On this topic, I personally recommend this article by Frank Harrell: `"Statistically Efficient Ways to Quantify Added Predictive Value of New Measurements" <http://www.fharrell.com/post/addvalue/>`_.
 
 
 Cross Validation
diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index 879d2463c..254c378b5 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -167,7 +167,7 @@ def fit(self, df, duration_col, event_col=None,
         self.baseline_cumulative_hazard_ = self._compute_baseline_cumulative_hazard()
         self.baseline_survival_ = self._compute_baseline_survival()
         self.score_ = concordance_index(self.durations,
-                                        -self.predict_partial_hazard(df).values.ravel(),
+                                        -self.predict_partial_hazard(df).values,
                                         self.event_observed)
 
         self._train_log_partial_hazard = self.predict_log_partial_hazard(self._norm_mean.to_frame().T)
diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py
index 1d04bc142..17201a731 100644
--- a/lifelines/utils/__init__.py
+++ b/lifelines/utils/__init__.py
@@ -1105,9 +1105,13 @@ def check_complete_separation(df, events, durations):
 
 
 def check_nans(df_or_array):
-    if pd.isnull(df_or_array).values.any():
-        raise TypeError("NaNs were detected in the dataset. Try using pd.isnull to find the problematic values.")
-
+    nulls = pd.isnull(df_or_array)
+    if hasattr(nulls, 'values'):
+        if nulls.values.any():
+            raise TypeError("NaNs were detected in the dataset. Try using pd.isnull to find the problematic values.")
+    else:
+        if nulls.any():
+            raise TypeError("NaNs were detected in the dataset. Try using pd.isnull to find the problematic values.")
 
 def to_long_format(df, duration_col):
     """

From f71eab6de934f4a2e4a046b0ce38bb719bb9ca43 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Tue, 23 Oct 2018 09:30:18 -0400
Subject: [PATCH 21/59] test for super accurate strata

---
 tests/test_estimation.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index 6816730a8..17f2c3408 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -1008,6 +1008,23 @@ def test_coef_output_against_R_super_accurate(self, rossi):
         cf.fit(rossi, duration_col='week', event_col='arrest', show_progress=True)
         npt.assert_array_almost_equal(cf.hazards_.values, expected, decimal=4)
 
+    def test_coef_output_against_R_with_strata_super_accurate(self, rossi):
+        """
+        from http://cran.r-project.org/doc/contrib/Fox-Companion/appendix-cox-regression.pdf
+        Link is now broken, but this is the code:
+
+        library(survival)
+        rossi <- read.csv('.../lifelines/datasets/rossi.csv')
+        r <- coxph(Surv(week, arrest) ~ fin + age + strata(race) + wexp + mar + paro + prio,
+            data=rossi)
+        cat(round(r$coefficients, 4), sep=", ")
+        """
+        expected = np.array([[-0.3788, -0.0576, -0.1427, -0.4388, -0.0858, 0.0922]])
+        cf = CoxPHFitter()
+        cf.fit(rossi, duration_col='week', event_col='arrest', strata=['race'], show_progress=True)
+        npt.assert_array_almost_equal(cf.hazards_.values, expected, decimal=4)
+
+
     def test_coef_output_against_R_using_non_trivial_but_integer_weights(self, rossi):
         rossi_ = rossi.copy()
         rossi_['weights'] = 1.

From 87ab9482b5310fd38edb84a87f44481880096b92 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Wed, 24 Oct 2018 09:15:09 -0400
Subject: [PATCH 22/59] more changes to print_summary

---
 CHANGELOG.md                                 |  5 +-
 lifelines/fitters/cox_time_varying_fitter.py |  4 +-
 lifelines/fitters/coxph_fitter.py            |  6 +-
 lifelines/fitters/exponential_fitter.py      | 33 +++++++++--
 lifelines/fitters/weibull_fitter.py          | 62 ++++++++++++++------
 tests/test_estimation.py                     | 16 ++---
 6 files changed, 88 insertions(+), 38 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2a91824e4..038d6862e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,7 +11,10 @@
  - `KaplanMeierFitter` now has a cumulative plot option. Example `kmf.plot(invert_y_axis=True)`
  - a `weights_col` option has been added to `CoxTimeVaryingFitter` that allows for time-varying weights. 
  - `WeibullFitter` has a new `show_progress` param.
- - `CoxPHFitter` and `CoxTimeVaryFitter` method `print_summary` is updated with new fields. 
+ - `CoxPHFitter`, `ExponentialFitter`, `WeibullFitter` and `CoxTimeVaryingFitter` method `print_summary` is updated with new fields.
+ - `WeibullFitter` has renamed the incorrect `_jacobian` to `_hessian_`. 
+ - `variance_matrix_` is now a property on fitted `WeibullFitter` which describes the variance matrix of the parameters.
+ - The default `WeibullFitter().timeline` has changed from integers between the min and max duration to _n_ floats between the max and min durations, where _n_ is the number of observations. 
 
 #### 0.14.6
  - fix for n > 2 groups in `multivariate_logrank_test` (again).
diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py
index f88d5d0ee..ef479f44d 100644
--- a/lifelines/fitters/cox_time_varying_fitter.py
+++ b/lifelines/fitters/cox_time_varying_fitter.py
@@ -452,7 +452,7 @@ def print_summary(self):
 
         # Print information about data first
         justify = string_justify(18)
-        print()
+        print(self)
         print("{} = {}".format(justify('event col'), self.event_col))
         print('{} = {}'.format(justify('number of subjects'), self._n_unique))
         print('{} = {}'.format(justify('number of periods'), self._n_examples))
@@ -573,7 +573,7 @@ def _compute_baseline_survival(self):
     def __repr__(self):
         classname = self.__class__.__name__
         try:
-            s = """<lifelines.%s: fitted with %d periods, %d uniques, %d events>""" % (
+            s = """<lifelines.%s: fitted with %d periods, %d subjects, %d events>""" % (
                 classname, self._n_examples, self._n_unique, self.event_observed.sum())
         except AttributeError:
             s = """<lifelines.%s>""" % classname
diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index 254c378b5..2958d82a4 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -103,7 +103,7 @@ def fit(self, df, duration_col, event_col=None,
         # Sort on time
         df = df.sort_values(by=duration_col)
 
-        self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
+        self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") + ' UTC'
         self.duration_col = duration_col
         self.event_col = event_col
         self.robust = robust
@@ -536,7 +536,7 @@ def print_summary(self):
 
         # Print information about data first
         justify = string_justify(18)
-        print()
+        print(self)
         print("{} = {}".format(justify('duration col'), self.duration_col))
         print("{} = {}".format(justify('event col'), self.event_col))
 
@@ -546,7 +546,7 @@ def print_summary(self):
         print('{} = {}'.format(justify('number of subjects'), self._n_examples))
         print('{} = {}'.format(justify('number of events'), self.event_observed.sum()))
         print('{} = {:.3f}'.format(justify('log-likelihood'), self._log_likelihood))
-        print('{} = {} UTC'.format(justify("time fit was run"), self._time_fit_was_called), end='\n\n')
+        print('{} = {}'.format(justify("time fit was run"), self._time_fit_was_called), end='\n\n')
         print('---')
 
 
diff --git a/lifelines/fitters/exponential_fitter.py b/lifelines/fitters/exponential_fitter.py
index 7ebfa565b..44d7a6b11 100644
--- a/lifelines/fitters/exponential_fitter.py
+++ b/lifelines/fitters/exponential_fitter.py
@@ -2,9 +2,10 @@
 from __future__ import print_function
 import numpy as np
 import pandas as pd
+from scipy import stats
 
 from lifelines.fitters import UnivariateFitter
-from lifelines.utils import inv_normal_cdf, check_nans
+from lifelines.utils import inv_normal_cdf, check_nans, significance_code, string_justify
 
 
 class ExponentialFitter(UnivariateFitter):
@@ -26,6 +27,11 @@ class ExponentialFitter(UnivariateFitter):
     After calling the `.fit` method, you have access to properties like:
      'survival_function_', 'lambda_'
 
+    A summary of the fit is available with the method 'print_summary()'
+
+
+    Reference: https://www4.stat.ncsu.edu/~dzhang2/st745/chap3.pdf
+
     """
 
     def fit(self, durations, event_observed=None, timeline=None, entry=None,
@@ -62,8 +68,10 @@ def fit(self, durations, event_observed=None, timeline=None, entry=None,
         # estimation
         D = self.event_observed.sum()
         T = self.durations.sum()
+
         self.lambda_ = D / T
         self._lambda_variance_ = self.lambda_ / T
+        self._log_likelihood = np.log(self.lambda_) * D - self.lambda_ * T
         self.survival_function_ = pd.DataFrame(np.exp(-self.lambda_ * self.timeline), columns=[self._label], index=self.timeline)
         self.confidence_interval_ = self._bounds(alpha if alpha else self.alpha, ci_labels)
         self.median_ = 1. / self.lambda_ * (np.log(2))
@@ -123,18 +131,31 @@ def summary(self):
         df['se(coef)'] = self._compute_standard_errors().loc['se']
         df['lower %.2f' % self.alpha] = lower_upper_bounds.loc['lower-bound']
         df['upper %.2f' % self.alpha] = lower_upper_bounds.loc['upper-bound']
+        df['p'] = self._compute_p_values()
         return df
 
+    def _compute_z_values(self):
+        return self.lambda_ / self._compute_standard_errors().loc['se']
+
+    def _compute_p_values(self):
+        U = self._compute_z_values() ** 2
+        return stats.chi2.sf(U, 1)
+
     def print_summary(self):
         """
         Print summary statistics describing the fit.
 
         """
-        df = self.summary
+        justify = string_justify(18)
+        print(self)
+        print('{} = {}'.format(justify('number of subjects'), self.durations.shape[0]))
+        print('{} = {}'.format(justify('number of events'), np.where(self.event_observed)[0].shape[0]))
+        print('{} = {:.3f}'.format(justify('log-likelihood'), self._log_likelihood), end='\n\n')
 
-        # Print information about data first
-        print('n={}, number of events={}'.format(self.durations.shape[0],
-                                                 np.where(self.event_observed)[0].shape[0]),
-              end='\n\n')
+        df = self.summary
+        df[''] = [significance_code(p) for p in df['p']]
         print(df.to_string(float_format=lambda f: '{:4.4f}'.format(f)))
+        print('---')
+        print("Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ",
+              end='\n\n')
         return
diff --git a/lifelines/fitters/weibull_fitter.py b/lifelines/fitters/weibull_fitter.py
index caa495465..1f52ba45d 100644
--- a/lifelines/fitters/weibull_fitter.py
+++ b/lifelines/fitters/weibull_fitter.py
@@ -4,13 +4,14 @@
 import numpy as np
 import pandas as pd
 
+from scipy import stats as stats
 from numpy.linalg import solve, norm, inv
 from lifelines.fitters import UnivariateFitter
-from lifelines.utils import inv_normal_cdf, check_nans, ConvergenceError
+from lifelines.utils import inv_normal_cdf, check_nans, ConvergenceError, string_justify, significance_code
 
 
 def _negative_log_likelihood(lambda_rho, T, E):
-    if np.any(lambda_rho < 0):
+    if np.any(np.asarray(lambda_rho) < 0):
         return 10e9
     lambda_, rho = lambda_rho
     return - np.log(rho * lambda_) * E.sum() - (rho - 1) * (E * np.log(lambda_ * T)).sum() + ((lambda_ * T) ** rho).sum()
@@ -60,6 +61,8 @@ class WeibullFitter(UnivariateFitter):
     After calling the `.fit` method, you have access to properties like:
     `cumulative_hazard_', 'survival_function_', 'lambda_' and 'rho_'.
 
+    A summary of the fit is available with the method 'print_summary()'
+
     """
 
     def fit(self, durations, event_observed=None, timeline=None, entry=None,
@@ -94,12 +97,19 @@ def fit(self, durations, event_observed=None, timeline=None, entry=None,
             raise ValueError('This model does not allow for non-positive durations. Suggestion: add a small positive value to zero elements.')
 
         self.event_observed = np.asarray(event_observed, dtype=int) if event_observed is not None else np.ones_like(self.durations)
-        self.timeline = np.sort(np.asarray(timeline)) if timeline is not None else np.arange(int(self.durations.min()), int(self.durations.max()) + 1)
+
+        if timeline is not None:
+            self.timeline = np.sort(np.asarray(timeline))
+        else:
+            self.timeline = np.linspace(self.durations.min(), self.durations.max(), self.durations.shape[0])
+
         self._label = label
         alpha = alpha if alpha is not None else self.alpha
 
         # estimation
-        self.lambda_, self.rho_ = self._newton_rhaphson(self.durations, self.event_observed, show_progress=show_progress)
+        (self.lambda_, self.rho_), self._hessian_ = self._newton_rhaphson(self.durations, self.event_observed, show_progress=show_progress)
+        self._log_likelihood = -_negative_log_likelihood((self.lambda_, self.rho_), self.durations, self.event_observed)
+        self.variance_matrix_ = -inv(self._hessian_)
         self.survival_function_ = pd.DataFrame(self.survival_function_at_times(self.timeline), columns=[self._label], index=self.timeline)
         self.hazard_ = pd.DataFrame(self.hazard_at_times(self.timeline), columns=[self._label], index=self.timeline)
         self.cumulative_hazard_ = pd.DataFrame(self.cumulative_hazard_at_times(self.timeline), columns=[self._label], index=self.timeline)
@@ -129,7 +139,7 @@ def cumulative_hazard_at_times(self, times):
     def _newton_rhaphson(self, T, E, precision=1e-5, show_progress=False):
         from lifelines.utils import _smart_search
 
-        def jacobian_function(parameters, T, E):
+        def hessian_function(parameters, T, E):
             return np.array([
                 [_d_lambda_d_lambda_(parameters, T, E), _d_rho_d_lambda_(parameters, T, E)],
                 [_d_rho_d_lambda_(parameters, T, E), _d_rho_d_rho(parameters, T, E)]
@@ -148,16 +158,16 @@ def gradient_function(parameters, T, E):
 
         while converging and i < 50:
             # Do not override hessian and gradient in case of garbage
-            j, g = jacobian_function(parameters, T, E), gradient_function(parameters, T, E)
+            h, g = hessian_function(parameters, T, E), gradient_function(parameters, T, E)
 
-            delta = solve(j, - step_size * g.T)
+            delta = solve(h, - step_size * g.T)
             if np.any(np.isnan(delta)):
                 raise ConvergenceError("delta contains nan value(s). Convergence halted.")
 
             parameters += delta
 
             # Save these as pending result
-            jacobian = j
+            hessian = h
 
             if show_progress:
                 print("Iteration %d: norm_delta = %.5f, seconds_since_start = %.1f" % (i, norm(delta), time.time() - start))
@@ -166,13 +176,12 @@ def gradient_function(parameters, T, E):
                 converging = False
             i += 1
 
-        self._jacobian = jacobian
-        return parameters
+        return parameters, hessian
 
     def _bounds(self, alpha, ci_labels):
         alpha2 = inv_normal_cdf((1. + alpha) / 2.)
         df = pd.DataFrame(index=self.timeline)
-        var_lambda_, var_rho_ = inv(self._jacobian).diagonal()
+        var_lambda_, var_rho_ = inv(self._hessian_).diagonal()
 
         def _dH_d_lambda(lambda_, rho, T):
             return rho / lambda_ * (lambda_ * T) ** rho
@@ -194,7 +203,7 @@ def sensitivity_analysis(lambda_, rho, var_lambda_, var_rho_, T):
         return df
 
     def _compute_standard_errors(self):
-        var_lambda_, var_rho_ = inv(self._jacobian).diagonal()
+        var_lambda_, var_rho_ = inv(self._hessian_).diagonal()
         return pd.DataFrame([[np.sqrt(var_lambda_), np.sqrt(var_rho_)]],
                             index=['se'], columns=['lambda_', 'rho_'])
 
@@ -206,6 +215,16 @@ def _compute_confidence_bounds_of_parameters(self):
             np.array([self.lambda_, self.rho_]) - alpha2 * se,
         ], columns=['lambda_', 'rho_'], index=['upper-bound', 'lower-bound'])
 
+
+    def _compute_z_values(self):
+        return (np.asarray([self.lambda_, self.rho_]) /
+                self._compute_standard_errors().loc['se'])
+
+
+    def _compute_p_values(self):
+        U = self._compute_z_values() ** 2
+        return stats.chi2.sf(U, 1)
+
     @property
     def summary(self):
         """Summary statistics describing the fit.
@@ -214,13 +233,15 @@ def summary(self):
         Returns
         -------
         df : pd.DataFrame
-            Contains columns coef, exp(coef), se(coef), z, p, lower, upper"""
+            Contains columns coef, exp(coef), se(coef), z, p, lower, upper
+        """
         lower_upper_bounds = self._compute_confidence_bounds_of_parameters()
         df = pd.DataFrame(index=['lambda_', 'rho_'])
         df['coef'] = [self.lambda_, self.rho_]
         df['se(coef)'] = self._compute_standard_errors().loc['se']
         df['lower %.2f' % self.alpha] = lower_upper_bounds.loc['lower-bound']
         df['upper %.2f' % self.alpha] = lower_upper_bounds.loc['upper-bound']
+        df['p'] = self._compute_p_values()
         return df
 
     def print_summary(self):
@@ -228,11 +249,16 @@ def print_summary(self):
         Print summary statistics describing the fit.
 
         """
-        df = self.summary
+        justify = string_justify(18)
+        print(self)
+        print('{} = {}'.format(justify('number of subjects'), self.durations.shape[0]))
+        print('{} = {}'.format(justify('number of events'), np.where(self.event_observed)[0].shape[0]))
+        print('{} = {:.3f}'.format(justify('log-likelihood'), self._log_likelihood), end='\n\n')
 
-        # Print information about data first
-        print('n={}, number of events={}'.format(self.durations.shape[0],
-                                                 np.where(self.event_observed)[0].shape[0]),
-              end='\n\n')
+        df = self.summary
+        df[''] = [significance_code(p) for p in df['p']]
         print(df.to_string(float_format=lambda f: '{:4.4f}'.format(f)))
+        print('---')
+        print("Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ",
+              end='\n\n')
         return
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index 17f2c3408..f1ca63287 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -302,12 +302,12 @@ def test_typeerror_is_thrown_if_there_is_nans_in_the_event_col(self, univariate_
 
 class TestWeibullFitter():
 
-    def test_weibull_fit_returns_integer_timelines(self):
+    def test_weibull_fit_returns_float_timelines(self):
         wf = WeibullFitter()
         T = np.linspace(0.1, 10)
         wf.fit(T)
-        npt.assert_array_equal(wf.timeline, np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
-        npt.assert_array_equal(wf.survival_function_.index.values, np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
+        npt.assert_array_equal(wf.timeline, T)
+        npt.assert_array_equal(wf.survival_function_.index.values, T)
 
     def test_weibull_model_does_not_except_negative_or_zero_values(self):
         wf = WeibullFitter()
@@ -778,7 +778,7 @@ def test_print_summary(self, rossi):
             cp._time_fit_was_called = '2018-10-23 02:40:45 UTC'
             cp.print_summary()
             output = out.getvalue().strip().split()
-            expected = """
+            expected = (repr(cp) + "\n" + """
       duration col = week
          event col = arrest
 number of subjects = 432
@@ -800,7 +800,7 @@ def test_print_summary(self, rossi):
 
 Concordance = 0.640
 Likelihood ratio test = 33.266 on 7 df, p=0.00002
-""".strip().split()
+""").strip().split()
             for i in [0, 1, 2, 3, -2, -1, -3, -4, -5]:
                 assert output[i] == expected[i]
         finally:
@@ -2143,7 +2143,7 @@ def test_ctv_baseline_cumulative_hazard_against_R(self, ctv, heart):
     def test_repr_with_fitter(self, ctv, heart):
         ctv.fit(heart, id_col='id', event_col='event')
         uniques = heart['id'].unique().shape[0]
-        assert ctv.__repr__() == '<lifelines.CoxTimeVaryingFitter: fitted with %d periods, %d uniques, %d events>' % (heart.shape[0], uniques, heart['event'].sum())
+        assert ctv.__repr__() == '<lifelines.CoxTimeVaryingFitter: fitted with %d periods, %d subjects, %d events>' % (heart.shape[0], uniques, heart['event'].sum())
 
 
     def test_all_okay_with_non_trivial_index_in_dataframe(self, ctv, heart):
@@ -2186,7 +2186,7 @@ def test_print_summary(self, ctv, heart):
             ctv._time_fit_was_called = '2018-10-23 02:41:45 UTC'
             ctv.print_summary()
             output = out.getvalue().strip().split()
-            expected = """
+            expected = (repr(ctv) + "\n" + """
          event col = event
 number of subjects = 103
  number of periods = 172
@@ -2204,7 +2204,7 @@ def test_print_summary(self, ctv, heart):
 Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
 
 Likelihood ratio test = 15.111 on 4 df, p=0.00448
-""".strip().split()
+""").strip().split()
             for i in [0, 1, 2, 3, -2, -1, -3, -4, -5]:
                 assert output[i] == expected[i]
         finally:

From 5f4d3fc0d4926a75b9129a4f1a7cc5be2346c14e Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Wed, 24 Oct 2018 10:47:21 -0400
Subject: [PATCH 23/59] performance improvements

---
 lifelines/fitters/coxph_fitter.py |  5 ++++-
 lifelines/utils/__init__.py       | 11 ++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index 2958d82a4..4e15d23b2 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -57,6 +57,7 @@ def __init__(self, alpha=0.95, tie_method='Efron', penalizer=0.0, strata=None):
         self.penalizer = penalizer
         self.strata = strata
 
+
     def fit(self, df, duration_col, event_col=None,
             show_progress=False, initial_beta=None,
             strata=None, step_size=None, weights_col=None,
@@ -350,6 +351,7 @@ def _get_efron_values(self, X, beta, T, E, weights):
         # Init number of ties and weights
         weight_count = 0.0
         tie_count = 0
+        scores = weights[:,None] * exp(dot(X, beta))
 
         # Iterate backwards to utilize recursive relationship
         for i in range(n - 1, -1, -1):
@@ -357,10 +359,11 @@ def _get_efron_values(self, X, beta, T, E, weights):
             ti = T[i]
             ei = E[i]
             xi = X[i:i + 1]
+            score = scores[i:i+1]
             w = weights[i]
 
             # Calculate phi values
-            phi_i = w * exp(dot(xi, beta))
+            phi_i = score
             phi_x_i = phi_i * xi
             phi_x_x_i = dot(xi.T, phi_x_i)
 
diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py
index 17201a731..1bbacd005 100644
--- a/lifelines/utils/__init__.py
+++ b/lifelines/utils/__init__.py
@@ -1014,9 +1014,8 @@ def concordance_value(time_a, time_b, pred_a, pred_b):
         raise ZeroDivisionError("No admissable pairs in the dataset.")
     return csum / paircount
 
-
 def pass_for_numeric_dtypes_or_raise(df):
-    nonnumeric_cols = df.select_dtypes(exclude=[np.number, bool]).columns.tolist()
+    nonnumeric_cols = [col for col in df.columns if not np.issubdtype(df[col].dtype, np.number)]
     if len(nonnumeric_cols) > 0:
         raise TypeError("DataFrame contains nonnumeric columns: %s. Try using pandas.get_dummies to convert the non-numeric column(s) to numerical data, or dropping the column(s)." % nonnumeric_cols)
 
@@ -1081,16 +1080,15 @@ def check_complete_separation_low_variance(df, events):
 See https://stats.idre.ucla.edu/other/mult-pkg/faq/general/faqwhat-is-complete-or-quasi-complete-separation-in-logisticprobit-regression-and-how-do-we-deal-with-them/ " % (inter)
         warnings.warn(warning_text, ConvergenceWarning)
 
-
 def check_complete_separation_close_to_perfect_correlation(df, durations):
     # slow for many columns
     THRESHOLD = 0.99
     n, _ = df.shape
 
-    if n > 1000:
+    if n > 500:
         # let's sample to speed this n**2 algo up.
-        df = df.sample(n=800, random_state=15).copy()
-        durations = durations.sample(n=800, random_state=15).copy()
+        df = df.sample(n=500, random_state=0).copy()
+        durations = durations.sample(n=500, random_state=0).copy()
 
     for col, series in df.iteritems():
         if abs(stats.spearmanr(series, durations).correlation) >= THRESHOLD:
@@ -1098,7 +1096,6 @@ def check_complete_separation_close_to_perfect_correlation(df, durations):
 See https://stats.idre.ucla.edu/other/mult-pkg/faq/general/faqwhat-is-complete-or-quasi-complete-separation-in-logisticprobit-regression-and-how-do-we-deal-with-them/ " % (col)
             warnings.warn(warning_text, ConvergenceWarning)
 
-
 def check_complete_separation(df, events, durations):
     check_complete_separation_low_variance(df, events)
     check_complete_separation_close_to_perfect_correlation(df, durations)

From 464b9cf93fecee88fc02bbbaf114a52dd979cc6f Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Wed, 24 Oct 2018 10:50:36 -0400
Subject: [PATCH 24/59] performance improvements for CoxPHFitter

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 038d6862e..a903b21b3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,7 @@
  - `WeibullFitter` has renamed the incorrect `_jacobian` to `_hessian_`. 
  - `variance_matrix_` is now a property on fitted `WeibullFitter` which describes the variance matrix of the parameters.
  - The default `WeibullFitter().timeline` has changed from integers between the min and max duration to _n_ floats between the max and min durations, where _n_ is the number of observations. 
+ - Performance improvements for `CoxPHFitter` (~15% faster)
 
 #### 0.14.6
  - fix for n > 2 groups in `multivariate_logrank_test` (again).

From ba51b68b1e2f6432bd0126ed51f71659f5ad12f5 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Wed, 24 Oct 2018 11:57:29 -0400
Subject: [PATCH 25/59] wowow this is much faster

---
 lifelines/fitters/cox_time_varying_fitter.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py
index ef479f44d..0bf61c1f4 100644
--- a/lifelines/fitters/cox_time_varying_fitter.py
+++ b/lifelines/fitters/cox_time_varying_fitter.py
@@ -324,6 +324,7 @@ def _newton_rhaphson(self, df, stop_times_events, weights, show_progress=False,
 
         return beta
 
+    @profile
     def _get_gradients(self, df, stops_events, weights, beta):
         """
         Calculates the first and second order vector differentials, with respect to beta.
@@ -343,7 +344,9 @@ def _get_gradients(self, df, stops_events, weights, beta):
 
         for t in unique_death_times:
 
-            ix = (stops_events['start'] < t) & (t <= stops_events['stop'])
+            # I feel like this can be made into some tree-like structure
+            ix = (stops_events['start'].values < t) & (t <= stops_events['stop'].values)
+
             df_at_t = df.loc[ix]
             weights_at_t = weights.loc[ix]
             stops_events_at_t = stops_events.loc[ix]
@@ -358,7 +361,7 @@ def _get_gradients(self, df, stops_events, weights, beta):
             risk_phi_x_x = phi_x_x_i
 
             # Calculate the sums of Tie set
-            deaths = stops_events_at_t['event'] & (stops_events_at_t['stop'] == t)
+            deaths = stops_events_at_t['event'].values & (stops_events_at_t['stop'].values == t)
 
             ties_counts = deaths.sum()  # should always at least 1
 
@@ -369,9 +372,9 @@ def _get_gradients(self, df, stops_events, weights, beta):
 
             if ties_counts > 1:
                 # it's faster if we can skip computing these when we don't need to.
-                tie_phi = phi_i[deaths.values].sum()
+                tie_phi = phi_i[deaths].sum()
                 tie_phi_x = phi_x_i.loc[deaths].sum(0).values
-                tie_phi_x_x = dot(xi_deaths.T, phi_i[deaths.values] * xi_deaths)
+                tie_phi_x_x = dot(xi_deaths.T, phi_i[deaths] * xi_deaths)
 
             partial_gradient = np.zeros(d)
             weight_count = weights_deaths.sum()

From 769cc4c577afab432cadb409488e1759a6f55afd Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Wed, 24 Oct 2018 11:58:25 -0400
Subject: [PATCH 26/59] remove profile

---
 lifelines/fitters/cox_time_varying_fitter.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py
index 0bf61c1f4..6910a5d82 100644
--- a/lifelines/fitters/cox_time_varying_fitter.py
+++ b/lifelines/fitters/cox_time_varying_fitter.py
@@ -324,7 +324,6 @@ def _newton_rhaphson(self, df, stop_times_events, weights, show_progress=False,
 
         return beta
 
-    @profile
     def _get_gradients(self, df, stops_events, weights, beta):
         """
         Calculates the first and second order vector differentials, with respect to beta.

From 4f530a851cd6f69d41583a742b12797d3565c896 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Wed, 24 Oct 2018 12:22:00 -0400
Subject: [PATCH 27/59] wowow this is much faster

---
 CHANGELOG.md                                 |  1 +
 lifelines/fitters/cox_time_varying_fitter.py | 18 +++++++++++++-----
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a903b21b3..072a66186 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,7 @@
  - `variance_matrix_` is now a property on fitted `WeibullFitter` which describes the variance matrix of the parameters.
  - The default `WeibullFitter().timeline` has changed from integers between the min and max duration to _n_ floats between the max and min durations, where _n_ is the number of observations. 
  - Performance improvements for `CoxPHFitter` (~15% faster)
+ - Performance improvements for `CoxTimeVaryingFitter` (~15% faster)
 
 #### 0.14.6
  - fix for n > 2 groups in `multivariate_logrank_test` (again).
diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py
index 6910a5d82..8211d16ba 100644
--- a/lifelines/fitters/cox_time_varying_fitter.py
+++ b/lifelines/fitters/cox_time_varying_fitter.py
@@ -382,10 +382,18 @@ def _get_gradients(self, df, stops_events, weights, beta):
             for l in range(ties_counts):
 
                 if ties_counts > 1:
-                    denom = (risk_phi - l * tie_phi / ties_counts)
-                    numer = (risk_phi_x - l * tie_phi_x / ties_counts)
+                    """
+                    A good explanation of how Efron handles ties. Consider five subjects at risk, three of whom fail at the same time.
+                    Since it is not known a priori which of them fails first, one-third of
+                    (φ1 + φ2 + φ3) is subtracted from sum_j^{5} φj after one fails. Similarly, two-thirds
+                    of (φ1 + φ2 + φ3) is subtracted after the first two individuals fail, etc.
+
+                    """
+                    increasing_proportion = l / ties_counts
+                    denom = (risk_phi - increasing_proportion * tie_phi)
+                    numer = (risk_phi_x - increasing_proportion * tie_phi_x)
                     # Hessian
-                    a1 = (risk_phi_x_x - l * tie_phi_x_x / ties_counts) / denom
+                    a1 = (risk_phi_x_x - increasing_proportion * tie_phi_x_x) / denom
                 else:
                     denom = risk_phi
                     numer = risk_phi_x
@@ -557,10 +565,10 @@ def _compute_cumulative_baseline_hazard(self, tv_data, stop_times_events):
                                         columns=['baseline hazard'])
 
         for t in unique_death_times:
-            ix = (events['start'] < t) & (t <= events['stop'])
+            ix = (events['start'].values < t) & (t <= events['stop'].values)
             events_at_t = events.loc[ix]
 
-            deaths = events_at_t['event'] & (events_at_t['stop'] == t)
+            deaths = events_at_t['event'].values & (events_at_t['stop'] == t).values
             death_counts = deaths.sum()  # should always be atleast 1.
             baseline_hazard_.loc[t] = death_counts / events_at_t['hazard'].sum()
 

From 0af75fee3a07208137ab418cc2ea60298ba2538b Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Wed, 24 Oct 2018 12:26:43 -0400
Subject: [PATCH 28/59] fix boolean columns

---
 lifelines/utils/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py
index 1bbacd005..60da902f9 100644
--- a/lifelines/utils/__init__.py
+++ b/lifelines/utils/__init__.py
@@ -1015,7 +1015,7 @@ def concordance_value(time_a, time_b, pred_a, pred_b):
     return csum / paircount
 
 def pass_for_numeric_dtypes_or_raise(df):
-    nonnumeric_cols = [col for col in df.columns if not np.issubdtype(df[col].dtype, np.number)]
+    nonnumeric_cols = [col for col in df.columns if not (np.issubdtype(df[col].dtype, np.number) or np.issubdtype(df[col].dtype, np.bool_))]
     if len(nonnumeric_cols) > 0:
         raise TypeError("DataFrame contains nonnumeric columns: %s. Try using pandas.get_dummies to convert the non-numeric column(s) to numerical data, or dropping the column(s)." % nonnumeric_cols)
 

From a08dee8a7312a99baa985e77c8e8de6e27e36c49 Mon Sep 17 00:00:00 2001
From: Daniel Wilson <harenil@gmail.com>
Date: Wed, 24 Oct 2018 16:02:06 -0600
Subject: [PATCH 29/59] added serialization support for UnivariateFitters and
 unittest

---
 lifelines/fitters/__init__.py                 | 158 +++++++++---------
 .../breslow_fleming_harrington_fitter.py      |  10 +-
 lifelines/fitters/exponential_fitter.py       |  16 +-
 lifelines/fitters/kaplan_meier_fitter.py      |  14 +-
 lifelines/fitters/nelson_aalen_fitter.py      |  18 +-
 lifelines/fitters/weibull_fitter.py           |  14 +-
 6 files changed, 124 insertions(+), 106 deletions(-)

diff --git a/lifelines/fitters/__init__.py b/lifelines/fitters/__init__.py
index f3c830a7a..dcc853da3 100644
--- a/lifelines/fitters/__init__.py
+++ b/lifelines/fitters/__init__.py
@@ -6,7 +6,7 @@
 import pandas as pd
 
 from lifelines.plotting import plot_estimate
-from lifelines.utils import qth_survival_times
+from lifelines.utils import qth_survival_times, _to_array
 
 
 class BaseFitter(object):
@@ -28,86 +28,92 @@ def __repr__(self):
 
 class UnivariateFitter(BaseFitter):
 
-    def _plot_estimate(self, *args):
-        return plot_estimate(self, *args)
+    def _update_docstrings(self):
+        # Update their docstrings
+        self.__class__.subtract.__doc__ = self.subtract.__doc__.format(self._estimate_name,self.__class__.__name__)
+        self.__class__.divide.__doc__ = self.divide.__doc__.format(self._estimate_name,self.__class__.__name__)
+        self.__class__.predict.__doc__ = self.predict.__doc__.format(self.__class__.__name__)
+        self.__class__.plot.__doc__ = plot_estimate.__doc__.format(self.__class__.__name__)
 
-    def _subtract(self, estimate):
-        class_name = self.__class__.__name__
-        doc_string = """
-            Subtract the %s of two %s objects.
+    def plot(self, *args, **kwargs):
+        try:
+            estimate = self._estimate_name
+        except AttributeError:
+            raise RuntimeError("Must call `fit` first!")
+            
+        return plot_estimate(self, *args, **kwargs)
 
-            Parameters:
-              other: an %s fitted instance.
-
-            """ % (estimate, class_name, class_name)
-
-        def subtract(other):
-            self_estimate = getattr(self, estimate)
-            other_estimate = getattr(other, estimate)
-            new_index = np.concatenate((other_estimate.index, self_estimate.index))
-            new_index = np.unique(new_index)
-            return pd.DataFrame(
-                self_estimate.reindex(new_index, method='ffill').values -
-                other_estimate.reindex(new_index, method='ffill').values,
-                index=new_index,
-                columns=['diff']
-            )
-        subtract.__doc__ = doc_string
-        return subtract
-
-    def _divide(self, estimate):
-        class_name = self.__class__.__name__
-        doc_string = """
-            Divide the %s of two %s objects.
+    def subtract(self,other):
+        """
+        Subtract the {0} of two {1} objects.
 
             Parameters:
-              other: an %s fitted instance.
-
-            """ % (estimate, class_name, class_name)
-
-        def divide(other):
-            self_estimate = getattr(self, estimate)
-            other_estimate = getattr(other, estimate)
-            new_index = np.concatenate((other_estimate.index, self_estimate.index))
-            new_index = np.unique(new_index)
-            return pd.DataFrame(
-                self_estimate.reindex(new_index, method='ffill').values /
-                other_estimate.reindex(new_index, method='ffill').values,
-                index=new_index,
-                columns=['ratio']
-            )
-        divide.__doc__ = doc_string
-        return divide
-
-    def _predict(self, estimate_name_or_function, label):
-        class_name = self.__class__.__name__
-        doc_string = """
-          Predict the %s at certain point in time. Uses a linear interpolation if
-             points in time are not in the index.
-
-          Parameters:
-            time: a scalar or an array of times to predict the value of %s at.
-
-          Returns:
-            predictions: a scalar if time is a scalar, a numpy array if time in an array.
-          """ % (class_name, class_name)
-
-        def predict(times):
-            def _to_array(x):
-                if not isinstance(x, collections.Iterable):
-                    return np.array([x])
-                return np.asarray(x)
-
-            if callable(estimate_name_or_function):
-                return pd.DataFrame(estimate_name_or_function(_to_array(times)), index=_to_array(times)).loc[times].squeeze()
-            else:
-                estimate = getattr(self, estimate_name_or_function)
-                # non-linear interpolations can push the survival curves above 1 and below 0.
-                return estimate.reindex(estimate.index.union(_to_array(times))).interpolate("index").loc[times].squeeze()
-
-        predict.__doc__ = doc_string
-        return predict
+              other: an {1} fitted instance.
+        """
+
+        try:
+            estimate = self._estimate_name
+        except AttributeError:
+            raise RuntimeError("Must call `fit` first!")
+        
+        self_estimate = getattr(self, self._estimate_name)
+        other_estimate = getattr(other, other._estimate_name)
+        new_index = np.concatenate((other_estimate.index, self_estimate.index))
+        new_index = np.unique(new_index)
+        return pd.DataFrame(
+            self_estimate.reindex(new_index, method='ffill').values -
+            other_estimate.reindex(new_index, method='ffill').values,
+            index=new_index,
+            columns=['diff']
+        )
+
+    def divide(self, other):
+        """
+        Divide the {0} of two {1} objects.
+
+        Parameters:
+          other: an {1} fitted instance.
 
+        """
+        try:
+            estimate = self._estimate_name
+        except AttributeError:
+            raise RuntimeError("Must call `fit` first!")
+    
+        self_estimate = getattr(self, self._estimate_name)
+        other_estimate = getattr(other, other._estimate_name)
+        new_index = np.concatenate((other_estimate.index, self_estimate.index))
+        new_index = np.unique(new_index)
+        return pd.DataFrame(
+            self_estimate.reindex(new_index, method='ffill').values /
+            other_estimate.reindex(new_index, method='ffill').values,
+            index=new_index,
+            columns=['ratio']
+        )
+
+    def predict(self, times):
+        """
+        Predict the {0} at certain point in time. Uses a linear interpolation if
+        points in time are not in the index.
+
+        Parameters:
+          times: a scalar or an array of times to predict the value of {0} at.
+
+        Returns:
+          predictions: a scalar if times is a scalar, a numpy array if times is an array.
+        """ 
+        try:
+            estimate = self._estimate_name
+        except AttributeError:
+            raise RuntimeError("Must call `fit` first!")
+
+        if callable(self._estimation_method):
+            return pd.DataFrame(self._estimation_method(_to_array(times)), index=_to_array(times)).loc[times].squeeze()
+        else:
+            estimate = getattr(self, self._estimation_method)
+            # non-linear interpolations can push the survival curves above 1 and below 0.
+            return estimate.reindex(estimate.index.union(_to_array(times))).interpolate("index").loc[times].squeeze()
+        
     @property
     def conditional_time_to_event_(self):
         return self._conditional_time_to_event_()
diff --git a/lifelines/fitters/breslow_fleming_harrington_fitter.py b/lifelines/fitters/breslow_fleming_harrington_fitter.py
index 9561d50e7..14df53d66 100644
--- a/lifelines/fitters/breslow_fleming_harrington_fitter.py
+++ b/lifelines/fitters/breslow_fleming_harrington_fitter.py
@@ -59,11 +59,13 @@ def fit(self, durations, event_observed=None, timeline=None, entry=None,
         self.median_ = median_survival_times(self.survival_function_)
 
         # estimation methods
-        self.predict = self._predict("survival_function_", label)
-        self.subtract = self._subtract("survival_function_")
-        self.divide = self._divide("survival_function_")
+        self._estimation_method = "survival_function_"
+        self._estimate_name = "survival_function_"
+        self._predict_label = label
+        self._update_docstrings()
 
         # plotting functions
-        self.plot = self._plot_estimate("survival_function_")
         self.plot_survival_function = self.plot
         return self
+
+    
\ No newline at end of file
diff --git a/lifelines/fitters/exponential_fitter.py b/lifelines/fitters/exponential_fitter.py
index 7ebfa565b..c36e0330b 100644
--- a/lifelines/fitters/exponential_fitter.py
+++ b/lifelines/fitters/exponential_fitter.py
@@ -67,18 +67,20 @@ def fit(self, durations, event_observed=None, timeline=None, entry=None,
         self.survival_function_ = pd.DataFrame(np.exp(-self.lambda_ * self.timeline), columns=[self._label], index=self.timeline)
         self.confidence_interval_ = self._bounds(alpha if alpha else self.alpha, ci_labels)
         self.median_ = 1. / self.lambda_ * (np.log(2))
-
-        # estimation functions
-        self.predict = self._predict(lambda t: np.exp(-self.lambda_ * t), self._label)
-        self.subtract = self._subtract("survival_function_")
-        self.divide = self._divide("survival_function_")
+       
+        # estimation methods
+        self._estimate_name = "survival_function_"
+        self._predict_label = label
+        self._update_docstrings()
 
         # plotting
-        self.plot = self._plot_estimate("survival_function_")
         self.plot_survival_function_ = self.plot
 
         return self
-
+    
+    def _estimation_method(self,t):
+        return np.exp(-self.lambda_ * t)
+    
     def _bounds(self, alpha, ci_labels):
         alpha2 = inv_normal_cdf((1. + alpha) / 2.)
         df = pd.DataFrame(index=self.timeline)
diff --git a/lifelines/fitters/kaplan_meier_fitter.py b/lifelines/fitters/kaplan_meier_fitter.py
index 98c611f14..b65c9dcc1 100644
--- a/lifelines/fitters/kaplan_meier_fitter.py
+++ b/lifelines/fitters/kaplan_meier_fitter.py
@@ -79,16 +79,18 @@ def fit(self, durations, event_observed=None, timeline=None, entry=None, label='
         self.median_ = median_survival_times(self.__estimate, left_censorship=left_censorship)
 
         # estimation methods
-        self.predict = self._predict(estimate_name, label)
-        self.subtract = self._subtract(estimate_name)
-        self.divide = self._divide(estimate_name)
-
+        self._estimation_method = estimate_name
+        self._estimate_name = estimate_name
+        self._predict_label = label
+        self._update_docstrings()
+        
         # plotting functions
-        self.plot = self._plot_estimate(estimate_name)
         setattr(self, "plot_" + estimate_name, self.plot)
-        self.plot_loglogs = plot_loglogs(self)
         return self
 
+    def plot_loglogs(self,*args,**kwargs):
+        return plot_loglogs(self,*args,**kwargs)
+    
     def _bounds(self, cumulative_sq_, alpha, ci_labels):
         # This method calculates confidence intervals using the exponential Greenwood formula.
         # See https://www.math.wustl.edu/%7Esawyer/handouts/greenwood.pdf
diff --git a/lifelines/fitters/nelson_aalen_fitter.py b/lifelines/fitters/nelson_aalen_fitter.py
index 6b8e81802..b4e611aeb 100644
--- a/lifelines/fitters/nelson_aalen_fitter.py
+++ b/lifelines/fitters/nelson_aalen_fitter.py
@@ -36,6 +36,7 @@ def __init__(self, alpha=0.95, nelson_aalen_smoothing=True):
             self._variance_f = self._variance_f_discrete
             self._additive_f = self._additive_f_discrete
 
+            
     def fit(self, durations, event_observed=None, timeline=None, entry=None,
             label='NA_estimate', alpha=None, ci_labels=None, weights=None):
         """
@@ -77,18 +78,21 @@ def fit(self, durations, event_observed=None, timeline=None, entry=None,
         self.confidence_interval_ = self._bounds(cumulative_sq_[:, None], alpha if alpha else self.alpha, ci_labels)
         self._cumulative_sq = cumulative_sq_
 
-        # estimation functions
-        self.predict = self._predict("cumulative_hazard_", self._label)
-        self.subtract = self._subtract("cumulative_hazard_")
-        self.divide = self._divide("cumulative_hazard_")
-
+        # estimation methods
+        self._estimation_method = "cumulative_hazard_"
+        self._estimate_name = "cumulative_hazard_"
+        self._predict_label = label
+        self._update_docstrings()
+        
         # plotting
-        self.plot = self._plot_estimate("cumulative_hazard_")
         self.plot_cumulative_hazard = self.plot
-        self.plot_hazard = self._plot_estimate('hazard_')
 
         return self
 
+    def plot_hazard(self,*args,**kwargs):
+        kwargs['estimate'] = 'hazard_'
+        return self.plot(*args,**kwargs)
+    
     def _bounds(self, cumulative_sq_, alpha, ci_labels):
         alpha2 = inv_normal_cdf(1 - (1 - alpha) / 2)
         df = pd.DataFrame(index=self.timeline)
diff --git a/lifelines/fitters/weibull_fitter.py b/lifelines/fitters/weibull_fitter.py
index 5b5f0993c..6871ed489 100644
--- a/lifelines/fitters/weibull_fitter.py
+++ b/lifelines/fitters/weibull_fitter.py
@@ -105,17 +105,19 @@ def fit(self, durations, event_observed=None, timeline=None, entry=None,
         self.confidence_interval_ = self._bounds(alpha, ci_labels)
         self.median_ = 1. / self.lambda_ * (np.log(2)) ** (1. / self.rho_)
 
-        # estimation functions - Cumulative hazard takes priority.
-        self.predict = self._predict(lambda t: np.exp(-(self.lambda_ * t) ** self.rho_), self._label)
-        self.subtract = self._subtract("cumulative_hazard_")
-        self.divide = self._divide("cumulative_hazard_")
-
+        # estimation methods
+        self._estimate_name = "cumulative_hazard_"
+        self._predict_label = label
+        self._update_docstrings()
+        
         # plotting - Cumulative hazard takes priority.
-        self.plot = self._plot_estimate("cumulative_hazard_")
         self.plot_cumulative_hazard = self.plot
 
         return self
 
+    def _estimation_method(self,t):
+        return np.exp(-(self.lambda_ * t) ** self.rho_)
+    
     def hazard_at_times(self, times):
         return self.lambda_ * self.rho_ * (self.lambda_ * times) ** (self.rho_ - 1)
 

From ce16b9938c87a1789e17726b33d48b7a592af933 Mon Sep 17 00:00:00 2001
From: Daniel Wilson <harenil@gmail.com>
Date: Wed, 24 Oct 2018 16:05:09 -0600
Subject: [PATCH 30/59] added serialization support for UnivariateFitters and
 unittest

---
 lifelines/plotting.py       | 282 ++++++++++++++++++------------------
 lifelines/utils/__init__.py |   7 +
 tests/test_estimation.py    |  12 ++
 3 files changed, 159 insertions(+), 142 deletions(-)

diff --git a/lifelines/plotting.py b/lifelines/plotting.py
index fa93e0cc6..6fabf0a67 100644
--- a/lifelines/plotting.py
+++ b/lifelines/plotting.py
@@ -234,155 +234,153 @@ def create_dataframe_slicer(iloc, loc):
     return lambda df: getattr(df, get_method)[user_submitted_slice]
 
 
-def plot_loglogs(cls):
-    doc_string = """
+def plot_loglogs(cls,loc=None, iloc=None, show_censors=False, censor_styles=None, **kwargs):
+    """
     Specifies a plot of the log(-log(SV)) versus log(time) where SV is the estimated survival function.
     """
 
-    def _plot_loglogs(loc=None, iloc=None, show_censors=False, censor_styles=None, **kwargs):
-
-        def loglog(s): return np.log(-np.log(s))
-
-        if (loc is not None) and (iloc is not None):
-            raise ValueError('Cannot set both loc and iloc in call to .plot().')
-
-        if censor_styles is None:
-            censor_styles = {}
-
-        set_kwargs_ax(kwargs)
-        set_kwargs_color(kwargs)
-        set_kwargs_drawstyle(kwargs)
-        kwargs['logx'] = True
-
-        dataframe_slicer = create_dataframe_slicer(iloc, loc)
-
-        # plot censors
-        ax = kwargs['ax']
-        colour = kwargs['c']
-
-        if show_censors and cls.event_table['censored'].sum() > 0:
-            cs = {
-                'marker': '+',
-                'ms': 12,
-                'mew': 1
-            }
-            cs.update(censor_styles)
-            times = dataframe_slicer(cls.event_table.loc[(cls.event_table['censored'] > 0)]).index.values.astype(float)
-            v = cls.predict(times)
-            # don't log times, as Pandas will take care of all log-scaling later.
-            ax.plot(times, loglog(v), linestyle='None',
-                    color=colour, **cs)
-
-        # plot estimate
-        dataframe_slicer(loglog(cls.survival_function_)).plot(**kwargs)
-        ax.set_xlabel('log(timeline)')
-        ax.set_ylabel('log(-log(survival_function_))')
-        return ax
-
-    _plot_loglogs.__doc__ = doc_string
-    return _plot_loglogs
-
-
-def plot_estimate(cls, estimate):
-    doc_string = """"
-        Plots a pretty version of the fitted %s.
-
-        Matplotlib plot arguments can be passed in inside the kwargs, plus
-
-        Parameters:
-          show_censors: place markers at censorship events. Default: False
-          censor_styles: If show_censors, this dictionary will be passed into
-                         the plot call.
-          ci_alpha: the transparency level of the confidence interval.
-                    Default: 0.3
-          ci_force_lines: force the confidence intervals to be line plots
-                          (versus default shaded areas). Default: False
-          ci_show: show confidence intervals. Default: True
-          ci_legend: if ci_force_lines is True, this is a boolean flag to add
-                     the lines' labels to the legend. Default: False
-          at_risk_counts: show group sizes at time points. See function
-                          'add_at_risk_counts' for details. Default: False
-          loc: specify a time-based subsection of the curves to plot, ex:
-                   .plot(loc=slice(0.,10.))
-              will plot the time values between t=0. and t=10.
-          iloc: specify a location-based subsection of the curves to plot, ex:
-                   .plot(iloc=slice(0,10))
-                will plot the first 10 time points.
-          bandwidth: specify the bandwidth of the kernel smoother for the
-                     smoothed-hazard rate. Only used when called 'plot_hazard'.
-
-        Returns:
-          ax: a pyplot axis object
-        """ % estimate
-
-    def plot(loc=None, iloc=None, show_censors=False,
-             censor_styles=None, ci_legend=False, ci_force_lines=False,
-             ci_alpha=0.25, ci_show=True, at_risk_counts=False,
-             bandwidth=None, **kwargs):
-
-        if censor_styles is None:
-            censor_styles = {}
-
-        if (loc is not None) and (iloc is not None):
-            raise ValueError('Cannot set both loc and iloc in call to .plot().')
-
-        set_kwargs_ax(kwargs)
-        set_kwargs_color(kwargs)
-        set_kwargs_drawstyle(kwargs)
-
-        if estimate == "hazard_":
-            if bandwidth is None:
-                raise ValueError('Must specify a bandwidth parameter in the call to plot_hazard.')
-            estimate_ = cls.smoothed_hazard_(bandwidth)
-            confidence_interval_ = \
-                cls.smoothed_hazard_confidence_intervals_(bandwidth, hazard_=estimate_.values[:, 0])
+
+    def loglog(s): return np.log(-np.log(s))
+
+    if (loc is not None) and (iloc is not None):
+        raise ValueError('Cannot set both loc and iloc in call to .plot().')
+
+    if censor_styles is None:
+        censor_styles = {}
+
+    set_kwargs_ax(kwargs)
+    set_kwargs_color(kwargs)
+    set_kwargs_drawstyle(kwargs)
+    kwargs['logx'] = True
+
+    dataframe_slicer = create_dataframe_slicer(iloc, loc)
+
+    # plot censors
+    ax = kwargs['ax']
+    colour = kwargs['c']
+
+    if show_censors and cls.event_table['censored'].sum() > 0:
+        cs = {
+            'marker': '+',
+            'ms': 12,
+            'mew': 1
+        }
+        cs.update(censor_styles)
+        times = dataframe_slicer(cls.event_table.loc[(cls.event_table['censored'] > 0)]).index.values.astype(float)
+        v = cls.predict(times)
+        # don't log times, as Pandas will take care of all log-scaling later.
+        ax.plot(times, loglog(v), linestyle='None',
+                color=colour, **cs)
+
+    # plot estimate
+    dataframe_slicer(loglog(cls.survival_function_)).plot(**kwargs)
+    ax.set_xlabel('log(timeline)')
+    ax.set_ylabel('log(-log(survival_function_))')
+    return ax
+
+
+
+def plot_estimate(cls,estimate=None,loc=None, iloc=None, show_censors=False,
+         censor_styles=None, ci_legend=False, ci_force_lines=False,
+         ci_alpha=0.25, ci_show=True, at_risk_counts=False,
+         bandwidth=None, **kwargs):
+    
+    """
+    Plots a pretty version of the fitted estimate.
+
+    Matplotlib plot arguments can be passed in inside the kwargs, plus
+
+    Parameters:
+      show_censors: place markers at censorship events. Default: False
+      censor_styles: If show_censors, this dictionary will be passed into
+                     the plot call.
+      ci_alpha: the transparency level of the confidence interval.
+                Default: 0.25
+      ci_force_lines: force the confidence intervals to be line plots
+                      (versus default shaded areas). Default: False
+      ci_show: show confidence intervals. Default: True
+      ci_legend: if ci_force_lines is True, this is a boolean flag to add
+                 the lines' labels to the legend. Default: False
+      at_risk_counts: show group sizes at time points. See function
+                      'add_at_risk_counts' for details. Default: False
+      loc: specify a time-based subsection of the curves to plot, ex:
+               .plot(loc=slice(0.,10.))
+          will plot the time values between t=0. and t=10.
+      iloc: specify a location-based subsection of the curves to plot, ex:
+               .plot(iloc=slice(0,10))
+            will plot the first 10 time points.
+      bandwidth: specify the bandwidth of the kernel smoother for the
+                 smoothed-hazard rate. Only used when called 'plot_hazard'.
+
+    Returns:
+      ax: a pyplot axis object
+    """
+
+
+    if censor_styles is None:
+        censor_styles = {}
+
+    if (loc is not None) and (iloc is not None):
+        raise ValueError('Cannot set both loc and iloc in call to .plot().')
+
+    set_kwargs_ax(kwargs)
+    set_kwargs_color(kwargs)
+    set_kwargs_drawstyle(kwargs)
+    
+    if estimate is None:
+        estimate = cls._estimate_name
+
+    if estimate == "hazard_":
+        if bandwidth is None:
+            raise ValueError('Must specify a bandwidth parameter in the call to plot_hazard.')
+        estimate_ = cls.smoothed_hazard_(bandwidth)
+        confidence_interval_ = \
+            cls.smoothed_hazard_confidence_intervals_(bandwidth, hazard_=estimate_.values[:, 0])
+    else:
+        estimate_ = getattr(cls, estimate)
+        confidence_interval_ = getattr(cls, 'confidence_interval_')
+
+    dataframe_slicer = create_dataframe_slicer(iloc, loc)
+
+    # plot censors
+    ax = kwargs['ax']
+    colour = kwargs['c']
+
+    if show_censors and cls.event_table['censored'].sum() > 0:
+        cs = {
+            'marker': '+',
+            'ms': 12,
+            'mew': 1
+        }
+        cs.update(censor_styles)
+        times = dataframe_slicer(cls.event_table.loc[(cls.event_table['censored'] > 0)]).index.values.astype(float)
+        v = cls.predict(times)
+        ax.plot(times, v, linestyle='None',
+                color=colour, **cs)
+
+    # plot estimate
+    dataframe_slicer(estimate_).plot(**kwargs)
+
+    # plot confidence intervals
+    if ci_show:
+        if ci_force_lines:
+            dataframe_slicer(confidence_interval_).plot(linestyle="-", linewidth=1,
+                                                        color=[colour], legend=ci_legend,
+                                                        drawstyle=kwargs.get('drawstyle', 'default'),
+                                                        ax=ax, alpha=0.6)
         else:
-            estimate_ = getattr(cls, estimate)
-            confidence_interval_ = getattr(cls, 'confidence_interval_')
-
-        dataframe_slicer = create_dataframe_slicer(iloc, loc)
-
-        # plot censors
-        ax = kwargs['ax']
-        colour = kwargs['c']
-
-        if show_censors and cls.event_table['censored'].sum() > 0:
-            cs = {
-                'marker': '+',
-                'ms': 12,
-                'mew': 1
-            }
-            cs.update(censor_styles)
-            times = dataframe_slicer(cls.event_table.loc[(cls.event_table['censored'] > 0)]).index.values.astype(float)
-            v = cls.predict(times)
-            ax.plot(times, v, linestyle='None',
-                    color=colour, **cs)
-
-        # plot estimate
-        dataframe_slicer(estimate_).plot(**kwargs)
-
-        # plot confidence intervals
-        if ci_show:
-            if ci_force_lines:
-                dataframe_slicer(confidence_interval_).plot(linestyle="-", linewidth=1,
-                                                            color=[colour], legend=ci_legend,
-                                                            drawstyle=kwargs.get('drawstyle', 'default'),
-                                                            ax=ax, alpha=0.6)
-            else:
-                x = dataframe_slicer(confidence_interval_).index.values.astype(float)
-                lower = dataframe_slicer(confidence_interval_.filter(like='lower')).values[:, 0]
-                upper = dataframe_slicer(confidence_interval_.filter(like='upper')).values[:, 0]
-                fill_between_steps(x, lower, y2=upper, ax=ax,
-                                   alpha=ci_alpha, color=colour,
-                                   linewidth=1.0)
+            x = dataframe_slicer(confidence_interval_).index.values.astype(float)
+            lower = dataframe_slicer(confidence_interval_.filter(like='lower')).values[:, 0]
+            upper = dataframe_slicer(confidence_interval_.filter(like='upper')).values[:, 0]
+            fill_between_steps(x, lower, y2=upper, ax=ax,
+                               alpha=ci_alpha, color=colour,
+                               linewidth=1.0)
 
-        if at_risk_counts:
-            add_at_risk_counts(cls, ax=ax)
+    if at_risk_counts:
+        add_at_risk_counts(cls, ax=ax)
 
-        return ax
+    return ax
 
-    plot.__doc__ = doc_string
-    return plot
 
 
 def fill_between_steps(x, y1, y2=0, h_align='left', ax=None, **kwargs):
diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py
index 0aa7b3896..6237200a3 100644
--- a/lifelines/utils/__init__.py
+++ b/lifelines/utils/__init__.py
@@ -1,8 +1,10 @@
 # -*- coding: utf-8 -*-
 from __future__ import print_function, division
 import warnings
+import collections
 from datetime import datetime
 
+
 import numpy as np
 from scipy.linalg import solve
 from scipy import stats
@@ -43,6 +45,7 @@ def __str__(self):
         return repr(self.msg)
 
 
+
 def qth_survival_times(q, survival_functions, cdf=False):
     """
     Parameters:
@@ -1314,3 +1317,7 @@ def _is_monotonically_decreasing(array):
     def next(self):
         return self.step_size
 
+def _to_array(x):
+    if not isinstance(x, collections.Iterable):
+        return np.array([x])
+    return np.asarray(x)
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index 30a9f6bed..289d53742 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -4,6 +4,7 @@
 from collections import Counter, Iterable
 import os
 import warnings
+import pickle
 from itertools import combinations
 
 try:
@@ -299,6 +300,17 @@ def test_error_is_thrown_if_there_is_nans_in_the_event_col(self, univariate_fitt
             with pytest.raises(TypeError):
                 fitter().fit(T, E)
 
+    def test_pickle_serialization(self, positive_sample_lifetimes, univariate_fitters):
+         T = positive_sample_lifetimes[0]
+         for f in univariate_fitters:
+            fitter = f()
+            fitter.fit(T)
+
+            unpickled = pickle.loads(pickle.dumps(fitter)) 
+            dif = (fitter.durations - unpickled.durations).sum()
+            assert(dif==0)
+
+
 
 class TestWeibullFitter():
 

From 945ad1cd4c34e6d4da79ae2e23b0c7e948f8d5ac Mon Sep 17 00:00:00 2001
From: Daniel Wilson <harenil@gmail.com>
Date: Thu, 25 Oct 2018 11:31:27 -0600
Subject: [PATCH 31/59] added spaces between args and return

---
 lifelines/fitters/kaplan_meier_fitter.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lifelines/fitters/kaplan_meier_fitter.py b/lifelines/fitters/kaplan_meier_fitter.py
index b65c9dcc1..4331a76d8 100644
--- a/lifelines/fitters/kaplan_meier_fitter.py
+++ b/lifelines/fitters/kaplan_meier_fitter.py
@@ -88,7 +88,8 @@ def fit(self, durations, event_observed=None, timeline=None, entry=None, label='
         setattr(self, "plot_" + estimate_name, self.plot)
         return self
 
-    def plot_loglogs(self,*args,**kwargs):
+    def plot_loglogs(self, *args, **kwargs):
+
         return plot_loglogs(self,*args,**kwargs)
     
     def _bounds(self, cumulative_sq_, alpha, ci_labels):

From 945ff5e935fdf4b98fcb8e431af40aead2a670d0 Mon Sep 17 00:00:00 2001
From: Daniel Wilson <harenil@gmail.com>
Date: Thu, 25 Oct 2018 11:34:11 -0600
Subject: [PATCH 32/59] added spaces between args and return

---
 lifelines/fitters/kaplan_meier_fitter.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lifelines/fitters/kaplan_meier_fitter.py b/lifelines/fitters/kaplan_meier_fitter.py
index 4331a76d8..e5dc2e92d 100644
--- a/lifelines/fitters/kaplan_meier_fitter.py
+++ b/lifelines/fitters/kaplan_meier_fitter.py
@@ -89,8 +89,7 @@ def fit(self, durations, event_observed=None, timeline=None, entry=None, label='
         return self
 
     def plot_loglogs(self, *args, **kwargs):
-
-        return plot_loglogs(self,*args,**kwargs)
+        return plot_loglogs(self, *args, **kwargs)
     
     def _bounds(self, cumulative_sq_, alpha, ci_labels):
         # This method calculates confidence intervals using the exponential Greenwood formula.

From 83347b7f34fd1fec374d84ff3cd218d6ab9d59f4 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Sun, 28 Oct 2018 09:52:40 -0400
Subject: [PATCH 33/59] weights_col is used in coxph for
 baseline_cumulative_hazard_

---
 CHANGELOG.md                                 |  2 ++
 lifelines/fitters/cox_time_varying_fitter.py |  8 +++----
 lifelines/fitters/coxph_fitter.py            | 15 ++++++------
 tests/test_estimation.py                     | 24 ++++++++++++++++++--
 4 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 072a66186..1b91ee935 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,8 @@
  - The default `WeibullFitter().timeline` has changed from integers between the min and max duration to _n_ floats between the max and min durations, where _n_ is the number of observations. 
  - Performance improvements for `CoxPHFitter` (~15% faster)
  - Performance improvements for `CoxTimeVaryingFitter` (~15% faster)
+ - Univariate models are now serialisable with `pickle`. Thanks @dwilson1988 for the contribution. 
+ - `baseline_cumulative_hazard_` (and derivatives of that) on `CoxPHFitter` now correctly incorporate the `weights_col`. 
 
 #### 0.14.6
  - fix for n > 2 groups in `multivariate_logrank_test` (again).
diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py
index 8211d16ba..402e9baf4 100644
--- a/lifelines/fitters/cox_time_varying_fitter.py
+++ b/lifelines/fitters/cox_time_varying_fitter.py
@@ -108,7 +108,7 @@ def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', weights
         self.variance_matrix_ = -inv(self._hessian_) / np.outer(self._norm_std, self._norm_std)
         self.standard_errors_ = self._compute_standard_errors(normalize(df, self._norm_mean, self._norm_std), stop_times_events, weights)
         self.confidence_intervals_ = self._compute_confidence_intervals()
-        self.baseline_cumulative_hazard_ = self._compute_cumulative_baseline_hazard(df, stop_times_events)
+        self.baseline_cumulative_hazard_ = self._compute_cumulative_baseline_hazard(df, stop_times_events, weights)
         self.baseline_survival_ = self._compute_baseline_survival()
         self.event_observed = stop_times_events['event']
         self.start_stop_and_events = stop_times_events
@@ -555,7 +555,7 @@ def plot(self, standardized=False, columns=None, **kwargs):
         return ax
 
 
-    def _compute_cumulative_baseline_hazard(self, tv_data, stop_times_events):
+    def _compute_cumulative_baseline_hazard(self, tv_data, stop_times_events, weights):
         events = stop_times_events.copy()
         events['hazard'] = self.predict_partial_hazard(tv_data).values
 
@@ -567,9 +567,9 @@ def _compute_cumulative_baseline_hazard(self, tv_data, stop_times_events):
         for t in unique_death_times:
             ix = (events['start'].values < t) & (t <= events['stop'].values)
             events_at_t = events.loc[ix]
-
+            weights_at_t = weights.loc[ix].values
             deaths = events_at_t['event'].values & (events_at_t['stop'] == t).values
-            death_counts = deaths.sum()  # should always be atleast 1.
+            death_counts = (weights_at_t * deaths).sum()  # should always be atleast 1.
             baseline_hazard_.loc[t] = death_counts / events_at_t['hazard'].sum()
 
         return baseline_hazard_.cumsum()
diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index 4e15d23b2..b071880e8 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -164,7 +164,7 @@ def fit(self, df, duration_col, event_col=None,
         self.standard_errors_ = self._compute_standard_errors(normalize(df, self._norm_mean, self._norm_std), T, E, weights)
         self.confidence_intervals_ = self._compute_confidence_intervals()
 
-        self.baseline_hazard_ = self._compute_baseline_hazards(df, T, E)
+        self.baseline_hazard_ = self._compute_baseline_hazards(df, T, E, weights)
         self.baseline_cumulative_hazard_ = self._compute_baseline_cumulative_hazard()
         self.baseline_survival_ = self._compute_baseline_survival()
         self.score_ = concordance_index(self.durations,
@@ -720,33 +720,34 @@ def predict_expectation(self, X):
         v = self.predict_survival_function(X)[subjects]
         return pd.DataFrame(trapz(v.values.T, v.index), index=subjects)
 
-    def _compute_baseline_hazard(self, data, durations, event_observed, name):
+    def _compute_baseline_hazard(self, data, durations, event_observed, weights, name):
         # https://stats.stackexchange.com/questions/46532/cox-baseline-hazard
-        ind_hazards = self.predict_partial_hazard(data)
+        ind_hazards = self.predict_partial_hazard(data).mul(weights, axis='index')
         ind_hazards['event_at'] = durations.values
         ind_hazards_summed_over_durations = ind_hazards.groupby('event_at')[0].sum().sort_index(ascending=False).cumsum()
         ind_hazards_summed_over_durations.name = 'hazards'
 
-        event_table = survival_table_from_events(durations, event_observed)
+        event_table = survival_table_from_events(durations, event_observed, weights=weights)
         event_table = event_table.join(ind_hazards_summed_over_durations)
         baseline_hazard = pd.DataFrame(event_table['observed'] / event_table['hazards'], columns=[name]).fillna(0)
+
         return baseline_hazard
 
 
-    def _compute_baseline_hazards(self, df, T, E):
+    def _compute_baseline_hazards(self, df, T, E, weights):
         if self.strata:
             index = self.durations.unique()
             baseline_hazards_ = pd.DataFrame(index=index)
             for stratum in df.index.unique():
                 baseline_hazards_ = baseline_hazards_.merge(
-                    self._compute_baseline_hazard(data=df.loc[[stratum]], durations=T.loc[[stratum]], event_observed=E.loc[[stratum]], name=stratum),
+                    self._compute_baseline_hazard(data=df.loc[[stratum]], durations=T.loc[[stratum]], event_observed=E.loc[[stratum]], weights=weights.loc[[stratum]], name=stratum),
                     left_index=True,
                     right_index=True,
                     how='left')
             return baseline_hazards_.fillna(0)
 
         else:
-            return self._compute_baseline_hazard(data=df, durations=T, event_observed=E, name='baseline hazard')
+            return self._compute_baseline_hazard(data=df, durations=T, event_observed=E, weights=weights, name='baseline hazard')
 
     def _compute_baseline_survival(self):
         survival_df = exp(-self.baseline_cumulative_hazard_)
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index 195fca085..ca3b96289 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -306,7 +306,7 @@ def test_pickle_serialization(self, positive_sample_lifetimes, univariate_fitter
             fitter = f()
             fitter.fit(T)
 
-            unpickled = pickle.loads(pickle.dumps(fitter)) 
+            unpickled = pickle.loads(pickle.dumps(fitter))
             dif = (fitter.durations - unpickled.durations).sum()
             assert(dif==0)
 
@@ -1506,7 +1506,7 @@ def test_strata_against_R_output(self, rossi):
         npt.assert_almost_equal(cp.summary['coef'].values, [-0.335, -0.059, 0.100], decimal=3)
         assert abs(cp._log_likelihood - -436.9339) / 436.9339 < 0.01
 
-    def test_hazard_works_as_intended_with_strata_against_R_output(self, rossi):
+    def test_baseline_hazard_works_with_strata_against_R_output(self, rossi):
         """
         > library(survival)
         > rossi = read.csv('.../lifelines/datasets/rossi.csv')
@@ -1519,6 +1519,26 @@ def test_hazard_works_as_intended_with_strata_against_R_output(self, rossi):
         npt.assert_almost_equal(cp.baseline_cumulative_hazard_[(0, 0, 0, 0)].loc[[14, 35, 37, 43, 52]].values, [0.076600555, 0.169748261, 0.272088807, 0.396562717, 0.396562717], decimal=4)
         npt.assert_almost_equal(cp.baseline_cumulative_hazard_[(0, 0, 0, 1)].loc[[27, 43, 48, 52]].values, [0.095499001, 0.204196905, 0.338393113, 0.338393113], decimal=4)
 
+
+    def test_baseline_hazard_works_with_weights_against_R_output(self, rossi):
+        """
+        library(survival)
+
+        fit<-coxph(Surv(week, arrest)~fin, data=rossi, weight=age)
+        H0 <- basehaz(fit, centered=TRUE)
+        """
+
+        rossi = rossi[['week', 'arrest', 'fin', 'age']]
+        cp = CoxPHFitter()
+        cp.fit(rossi, 'week', 'arrest', weights_col='age')
+
+        npt.assert_almost_equal(cp.baseline_cumulative_hazard_['baseline hazard'].loc[0.0], 0.0, decimal=4)
+        npt.assert_almost_equal(cp.baseline_cumulative_hazard_['baseline hazard'].loc[1.0], 0.00183466, decimal=4)
+        npt.assert_almost_equal(cp.baseline_cumulative_hazard_['baseline hazard'].loc[2.0], 0.005880265, decimal=4)
+        npt.assert_almost_equal(cp.baseline_cumulative_hazard_['baseline hazard'].loc[10.0], 0.035425868, decimal=4)
+        npt.assert_almost_equal(cp.baseline_cumulative_hazard_['baseline hazard'].loc[52.0], 0.274341397, decimal=3)
+
+
     def test_strata_from_init_is_used_in_fit_later(self, rossi):
         strata = ['race', 'paro', 'mar']
         cp_with_strata_in_init = CoxPHFitter(strata=strata)

From 050149ede97c734124463b233f7c21425b246208 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Mon, 29 Oct 2018 13:20:39 -0400
Subject: [PATCH 34/59] fix ctv baseline

---
 docs/Survival Regression.rst                 | 19 +++++++++++++++-
 docs/Survival analysis with lifelines.rst    | 24 +++++++++++++++++++-
 lifelines/fitters/cox_time_varying_fitter.py |  3 ++-
 lifelines/fitters/coxph_fitter.py            | 14 ++++++++----
 lifelines/fitters/weibull_fitter.py          |  4 ++--
 5 files changed, 55 insertions(+), 9 deletions(-)

diff --git a/docs/Survival Regression.rst b/docs/Survival Regression.rst
index 36ad7f269..a70e96f54 100644
--- a/docs/Survival Regression.rst	
+++ b/docs/Survival Regression.rst	
@@ -111,6 +111,7 @@ After fitting, you can use use the suite of prediction methods (similar to Aalen
     X = rossi_dataset.drop(["week", "arrest"], axis=1)
     cph.predict_partial_hazard(X)
     cph.predict_survival_function(X)
+    cph.predict_survival_function(X, times=[5., 25., 50.])
 
 
 Plotting the coefficients
@@ -191,10 +192,15 @@ The second variable is the regime type, and this variable does not follow the pr
 .. image:: images/lls_regime_type.png
 
 
+Non-proportional hazards is a case of *model misspecification*. Two suggestions are to look for ways to *stratify* a column (see below), or to go ahead with the current model but use ``robust`` errors (in this case, the sandwich error). In the latter case, you can specify this with ``CoxPHFitter.fit(..., robust=True)``.
+
+
 Stratification
 ################
 
-Sometimes a covariate may not obey the proportional hazard assumption. In this case, we can allow a factor without estimating its effect to be adjusted. To specify categorical variables to be used in stratification, we define them in the call to ``fit``:
+Sometimes one or more covariates may not obey the proportional hazard assumption. In this case, we can allow the covariate(s) to still be included in the model without estimating its effect. This is called stratification. At a high level, think of it as splitting the dataset into *N* datasets, defined by the covariate(s). Each dataset has its own baseline hazard (the non-parametric part of the model), but they all share the regression parameters (the parametric part of the model). Since covariates are the same within each dataset, there is no regression parameter for the covariates stratified on, hence they will not show up in the output. However there will be *N* baseline hazards under ``baseline_cumulative_hazard_``.
+
+To specify categorical variables to be used in stratification, we define them in the call to ``fit``:
 
 .. code:: python
 
@@ -231,6 +237,17 @@ Sometimes a covariate may not obey the proportional hazard assumption. In this c
     Likelihood ratio test = 109.634 on 6 df, p=0.00000
     """
 
+    cph.baseline_cumulative_hazard_.shape
+    # (49, 2)
+
+Weights & Robust Errors
+########################
+
+Observations can come with weights, as well. These weights may be integer values representing some commonly occurring observation, or they may be float values representing some sampling weights or inverse probability weights. In the ``CoxPHFitter.fit`` method, an option is present for specifying which column in the dataframe should be used as weights. See example below.
+
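+Generally, unless your weights are integers (that is, they count repeated observations), you should also fit with ``robust=True`` so that the standard errors are computed with the robust (sandwich) estimator.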
+
+
 Aalen's Additive model
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/docs/Survival analysis with lifelines.rst b/docs/Survival analysis with lifelines.rst
index 8ebed4dae..59f148336 100644
--- a/docs/Survival analysis with lifelines.rst	
+++ b/docs/Survival analysis with lifelines.rst	
@@ -457,7 +457,7 @@ Another very popular model for survival data is the Weibull model. In contrast t
 
  ..math::  S(t) = \exp\left(-(\lambda t)^\rho\right),   \lambda >0, \rho > 0,
 
- Apriori, we do not know what :math:`\lambda` and :math:`\rho` are, but we use the data on hand to estimate these parameters. In lifelines, this is implemented in the ``WeibullFitter``:
+*A priori*, we do not know what :math:`\lambda` and :math:`\rho` are, but we use the data on hand to estimate these parameters. In lifelines, this is implemented in the ``WeibullFitter``:
 
 .. code:: python
 
@@ -471,6 +471,28 @@ Another very popular model for survival data is the Weibull model. In contrast t
     print(wf.lambda_, wf.rho_)
     wf.print_summary()
 
+Other parametric models: Exponential and LogNormal
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Similarly, there are other parametric models in lifelines. Generally, which parametric model to choose is determined by either knowledge of the distribution of durations, or some sort of model goodness-of-fit. Below are three parametric models of the same data. 
+
+.. code:: python
+
+    from lifelines import WeibullFitter
+    from lifelines import ExponentialFitter
+    from lifelines import LogNormalFitter
+  
+    T = data['duration']
+    E = data['observed']
+
+    wf = WeibullFitter().fit(T, E, label='WeibullFitter')
+    exf = ExponentialFitter().fit(T, E, label='ExponentialFitter')
+    lnf = LogNormalFitter().fit(T, E, label='LogNormalFitter')
+
+    ax = wf.plot()
+    ax = exf.plot(ax=ax)
+    ax = lnf.plot(ax=ax)
+
 
 Estimating hazard rates using Nelson-Aalen
 ''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py
index 402e9baf4..bcc1924d0 100644
--- a/lifelines/fitters/cox_time_varying_fitter.py
+++ b/lifelines/fitters/cox_time_varying_fitter.py
@@ -569,7 +569,8 @@ def _compute_cumulative_baseline_hazard(self, tv_data, stop_times_events, weight
             events_at_t = events.loc[ix]
             weights_at_t = weights.loc[ix].values
             deaths = events_at_t['event'].values & (events_at_t['stop'] == t).values
-            death_counts = (weights_at_t * deaths).sum()  # should always be atleast 1.
+
+            death_counts = (weights_at_t.squeeze() * deaths).sum()  # should always be atleast 1.
             baseline_hazard_.loc[t] = death_counts / events_at_t['hazard'].sum()
 
         return baseline_hazard_.cumsum()
diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index b071880e8..5f2aac944 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -692,7 +692,8 @@ def predict_percentile(self, X, p=0.5):
             can be in any order. If a numpy array, columns must be in the
             same order as the training data.
 
-        By default, returns the median lifetimes for the individuals.
+        Returns the median lifetimes for the individuals, by default. If the survival curve of an
+        individual does not cross 0.5, then the result is infinity.
         http://stats.stackexchange.com/questions/102986/percentile-loss-functions
         """
         subjects = _get_index(X)
@@ -704,7 +705,8 @@ def predict_median(self, X):
             can be in any order. If a numpy array, columns must be in the
             same order as the training data.
 
-        Returns the median lifetimes for the individuals
+        Returns the median lifetimes for the individuals. If the survival curve of an
+        individual does not cross 0.5, then the result is infinity.
         """
         return self.predict_percentile(X, 0.5)
 
@@ -714,7 +716,11 @@ def predict_expectation(self, X):
             can be in any order. If a numpy array, columns must be in the
             same order as the training data.
 
-        Compute the expected lifetime, E[T], using covarites X.
+        Compute the expected lifetime, E[T], using covariates X. The algorithm to compute the expectation
+        uses the fact that E[T] = int_0^inf P(T > t) dt = int_0^inf S(t) dt
+
+        To compute the integral, we use the trapezoidal rule to approximate it. However, if the
+        survival function, S(t), doesn't converge to 0, then the expectation is really infinity.
         """
         subjects = _get_index(X)
         v = self.predict_survival_function(X)[subjects]
@@ -722,7 +728,7 @@ def predict_expectation(self, X):
 
     def _compute_baseline_hazard(self, data, durations, event_observed, weights, name):
         # https://stats.stackexchange.com/questions/46532/cox-baseline-hazard
-        ind_hazards = self.predict_partial_hazard(data).mul(weights, axis='index')
+        ind_hazards = self.predict_partial_hazard(data) * weights[:, None]
         ind_hazards['event_at'] = durations.values
         ind_hazards_summed_over_durations = ind_hazards.groupby('event_at')[0].sum().sort_index(ascending=False).cumsum()
         ind_hazards_summed_over_durations.name = 'hazards'
diff --git a/lifelines/fitters/weibull_fitter.py b/lifelines/fitters/weibull_fitter.py
index 4424f5736..f505e292f 100644
--- a/lifelines/fitters/weibull_fitter.py
+++ b/lifelines/fitters/weibull_fitter.py
@@ -120,7 +120,7 @@ def fit(self, durations, event_observed=None, timeline=None, entry=None,
         self._estimate_name = "cumulative_hazard_"
         self._predict_label = label
         self._update_docstrings()
-        
+
         # plotting - Cumulative hazard takes priority.
         self.plot_cumulative_hazard = self.plot
 
@@ -128,7 +128,7 @@ def fit(self, durations, event_observed=None, timeline=None, entry=None,
 
     def _estimation_method(self,t):
         return np.exp(-(self.lambda_ * t) ** self.rho_)
-    
+
     def hazard_at_times(self, times):
         return self.lambda_ * self.rho_ * (self.lambda_ * times) ** (self.rho_ - 1)
 

From 0770781d3ae876e93f6399d16bebd5715c02acfc Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Mon, 29 Oct 2018 13:32:14 -0400
Subject: [PATCH 35/59] docs and move warning around for non-int weights

---
 docs/Survival Regression.rst             |  2 +-
 lifelines/fitters/kaplan_meier_fitter.py | 13 +++++++++++--
 lifelines/fitters/nelson_aalen_fitter.py | 17 ++++++++++++++---
 lifelines/utils/__init__.py              |  7 -------
 4 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/docs/Survival Regression.rst b/docs/Survival Regression.rst
index a70e96f54..4b911efb0 100644
--- a/docs/Survival Regression.rst	
+++ b/docs/Survival Regression.rst	
@@ -117,7 +117,7 @@ After fitting, you can use use the suite of prediction methods (similar to Aalen
 Plotting the coefficients
 ###########################################
 
-With a fitted model, an altervative way to view the coefficients and their ranges is to use the ``plot`` method.
+With a fitted model, an alternative way to view the coefficients and their ranges is to use the ``plot`` method.
 
 .. code:: python
 
diff --git a/lifelines/fitters/kaplan_meier_fitter.py b/lifelines/fitters/kaplan_meier_fitter.py
index e5dc2e92d..cd08cb23b 100644
--- a/lifelines/fitters/kaplan_meier_fitter.py
+++ b/lifelines/fitters/kaplan_meier_fitter.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import print_function
 from __future__ import division
+import warnings
 import numpy as np
 import pandas as pd
 
@@ -51,6 +52,14 @@ def fit(self, durations, event_observed=None, timeline=None, entry=None, label='
         if event_observed is not None:
             check_nans(event_observed)
 
+        if weights is not None:
+          if (weights.astype(int) != weights).any():
+              warnings.warn("""It looks like your weights are not integers, possibly prospenity scores then?
+  It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
+  estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
+  or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data."
+                  """, RuntimeWarning)
+
         # if the user is interested in left-censorship, we return the cumulative_density_, no survival_function_,
         estimate_name = 'survival_function_' if not left_censorship else 'cumulative_density_'
         v = _preprocess_inputs(durations, event_observed, timeline, entry, weights)
@@ -83,14 +92,14 @@ def fit(self, durations, event_observed=None, timeline=None, entry=None, label='
         self._estimate_name = estimate_name
         self._predict_label = label
         self._update_docstrings()
-        
+
         # plotting functions
         setattr(self, "plot_" + estimate_name, self.plot)
         return self
 
     def plot_loglogs(self, *args, **kwargs):
         return plot_loglogs(self, *args, **kwargs)
-    
+
     def _bounds(self, cumulative_sq_, alpha, ci_labels):
         # This method calculates confidence intervals using the exponential Greenwood formula.
         # See https://www.math.wustl.edu/%7Esawyer/handouts/greenwood.pdf
diff --git a/lifelines/fitters/nelson_aalen_fitter.py b/lifelines/fitters/nelson_aalen_fitter.py
index b4e611aeb..5ae5ef7b4 100644
--- a/lifelines/fitters/nelson_aalen_fitter.py
+++ b/lifelines/fitters/nelson_aalen_fitter.py
@@ -1,5 +1,8 @@
 # -*- coding: utf-8 -*-
 from __future__ import print_function
+from __future__ import division
+
+import warnings
 import numpy as np
 import pandas as pd
 
@@ -36,7 +39,7 @@ def __init__(self, alpha=0.95, nelson_aalen_smoothing=True):
             self._variance_f = self._variance_f_discrete
             self._additive_f = self._additive_f_discrete
 
-            
+
     def fit(self, durations, event_observed=None, timeline=None, entry=None,
             label='NA_estimate', alpha=None, ci_labels=None, weights=None):
         """
@@ -66,6 +69,14 @@ def fit(self, durations, event_observed=None, timeline=None, entry=None,
         if event_observed is not None:
             check_nans(event_observed)
 
+        if weights is not None:
+          if (weights.astype(int) != weights).any():
+              warnings.warn("""It looks like your weights are not integers, possibly prospenity scores then?
+  It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
+  estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
+  or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data."
+                  """, RuntimeWarning)
+
         v = _preprocess_inputs(durations, event_observed, timeline, entry, weights)
         self.durations, self.event_observed, self.timeline, self.entry, self.event_table = v
 
@@ -83,7 +94,7 @@ def fit(self, durations, event_observed=None, timeline=None, entry=None,
         self._estimate_name = "cumulative_hazard_"
         self._predict_label = label
         self._update_docstrings()
-        
+
         # plotting
         self.plot_cumulative_hazard = self.plot
 
@@ -92,7 +103,7 @@ def fit(self, durations, event_observed=None, timeline=None, entry=None,
     def plot_hazard(self,*args,**kwargs):
         kwargs['estimate'] = 'hazard_'
         return self.plot(*args,**kwargs)
-    
+
     def _bounds(self, cumulative_sq_, alpha, ci_labels):
         alpha2 = inv_normal_cdf(1 - (1 - alpha) / 2)
         df = pd.DataFrame(index=self.timeline)
diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py
index 21b3587d4..9070bc155 100644
--- a/lifelines/utils/__init__.py
+++ b/lifelines/utils/__init__.py
@@ -255,13 +255,6 @@ def survival_table_from_events(death_times, event_observed, birth_times=None,
 
     if weights is None:
         weights = 1
-    else:
-        if (weights.astype(int) != weights).any():
-            warnings.warn("""It looks like your weights are not integers, possibly prospenity scores then?
-It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
-estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
-or "Adjusted Kaplan-Meier estimator and log-rank test with inverse probability of treatment weighting for survival data."
-                """, RuntimeWarning)
 
     # deal with deaths and censorships
     df = pd.DataFrame(death_times, columns=["event_at"])

From b2d73ef0aaa47cca1f7fbb389d89d0a123f3a52a Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Fri, 2 Nov 2018 12:54:20 -0400
Subject: [PATCH 36/59] adding concordance overflow test

---
 lifelines/fitters/coxph_fitter.py | 6 +++---
 tests/utils/test_utils.py         | 7 +++++++
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index 5f2aac944..fb82467cc 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -600,7 +600,7 @@ def predict_partial_hazard(self, X):
         same as the training dataset.
 
         Returns the partial hazard for the individuals, partial since the
-        baseline hazard is not included. Equal to \exp{\beta X}
+        baseline hazard is not included. Equal to \exp{\beta (X - mean{X_train})}
         """
         return exp(self.predict_log_partial_hazard(X))
 
@@ -612,7 +612,7 @@ def predict_log_partial_hazard(self, X):
 
         This is equivalent to R's linear.predictors.
         Returns the log of the partial hazard for the individuals, partial since the
-        baseline hazard is not included. Equal to \beta (X - \bar{X})
+        baseline hazard is not included. Equal to \beta (X - mean{X_train})
 
         If X is a dataframe, the order of the columns do not matter. But
         if X is an array, then the column ordering is assumed to be the
@@ -633,7 +633,7 @@ def predict_log_hazard_relative_to_mean(self, X):
             same order as the training data.
 
         Returns the log hazard relative to the hazard of the mean covariates. This is the behaviour
-        of R's predict.coxph. Equal to \beta X - \beta \bar{X_{train}}
+        of R's predict.coxph. Equal to \beta X - \beta mean{X_train}
         """
 
         return self.predict_log_partial_hazard(X) - self._train_log_partial_hazard.squeeze()
diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py
index 5e7a3372a..27b06ef10 100644
--- a/tests/utils/test_utils.py
+++ b/tests/utils/test_utils.py
@@ -374,6 +374,13 @@ def test_concordance_index_function_exits():
     obs = np.ones(N)
     assert fast_cindex(actual_times, predicted_times, obs)
 
+def test_concordance_index_will_not_overflow():
+    a = np.arange(65536)
+    assert utils.concordance_index(a, a) == 1.0
+    b = np.arange(65537)
+    assert utils.concordance_index(b, b) == 1.0
+    assert utils.concordance_index(b, b[::-1]) == 0.0
+
 
 def test_survival_table_from_events_with_non_negative_T_and_no_lagged_births():
     n = 10

From cb6da37ac44a20ae7392250dd098a1e82a32e574 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Sun, 11 Nov 2018 12:52:29 -0500
Subject: [PATCH 37/59] close some issues

---
 docs/Survival Regression.rst                 |  2 +-
 docs/Survival analysis with lifelines.rst    |  2 +-
 lifelines/fitters/cox_time_varying_fitter.py |  2 ++
 lifelines/fitters/coxph_fitter.py            |  2 ++
 tests/test_estimation.py                     | 30 ++++++++++++++++++--
 5 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/docs/Survival Regression.rst b/docs/Survival Regression.rst
index 4b911efb0..84bf0a73c 100644
--- a/docs/Survival Regression.rst	
+++ b/docs/Survival Regression.rst	
@@ -846,7 +846,7 @@ where ``time`` is the duration from the entry event. Here we see subject 1 had a
       <p>4 rows × 6 columns</p>
     </div>
 
-From the above output, we can see that subject 1 changed state twice over the observation period, finally expiring at the end of time 10. Subject 2 was a censored case, and we lost track of them after time 2.
+From the above output, we can see that subject 1 changed state twice over the observation period, finally expiring at the end of time 10. Subject 2 was a censored case, and we lost track of them after time 12.
 
 You may have multiple covariates you wish to add, so the above could be streamlined like so:
 
diff --git a/docs/Survival analysis with lifelines.rst b/docs/Survival analysis with lifelines.rst
index 59f148336..c461a27e0 100644
--- a/docs/Survival analysis with lifelines.rst	
+++ b/docs/Survival analysis with lifelines.rst	
@@ -319,7 +319,7 @@ probabilities of survival at those points:
 
 .. code:: python
 
-    ax = subplot(111)
+    ax = plt.subplot(111)
     
     t = np.linspace(0, 50, 51)
     kmf.fit(T[dem], event_observed=E[dem], timeline=t, label="Democratic Regimes")
diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py
index bcc1924d0..8f7c1815b 100644
--- a/lifelines/fitters/cox_time_varying_fitter.py
+++ b/lifelines/fitters/cox_time_varying_fitter.py
@@ -433,7 +433,9 @@ def predict_log_partial_hazard(self, X):
         if isinstance(X, pd.DataFrame):
             order = self.hazards_.columns
             X = X[order]
+            pass_for_numeric_dtypes_or_raise(X)
 
+        X = X.astype(float)
         index = _get_index(X)
         X = normalize(X, self._norm_mean.values, 1)
         return pd.DataFrame(np.dot(X, self.hazards_.T), index=index)
diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index fb82467cc..92f66de49 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -621,7 +621,9 @@ def predict_log_partial_hazard(self, X):
         if isinstance(X, pd.DataFrame):
             order = self.hazards_.columns
             X = X[order]
+            pass_for_numeric_dtypes_or_raise(X)
 
+        X = X.astype(float)
         index = _get_index(X)
         X = normalize(X, self._norm_mean.values, 1)
         return pd.DataFrame(np.dot(X, self.hazards_.T), index=index)
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index ca3b96289..78c2f2fdc 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -713,7 +713,7 @@ def test_prediction_methods_respect_index(self, regression_models, rossi):
             except AttributeError:
                 pass
 
-    def test_error_is_raised_if_using_non_numeric_data(self, regression_models):
+    def test_error_is_raised_if_using_non_numeric_data_in_fit(self, regression_models):
         df = pd.DataFrame.from_dict({
             't': [1., 2., 3.],
             'bool_': [True, True, False],
@@ -764,10 +764,34 @@ def test_error_is_thrown_if_there_is_nans_in_the_event_col(self, regression_mode
 
 class TestCoxPHFitter():
 
+    def test_error_is_raised_if_using_non_numeric_data_in_prediction(self):
+        df = pd.DataFrame.from_dict({
+            't': [1., 2., 3., 4.],
+            'int_': [1, -1, 0, 0],
+            'float_': [1.2, -0.5, 0.0, 0.1],
+        })
+
+        cp = CoxPHFitter()
+        cp.fit(df, duration_col='t')
+
+        df_predict_on = pd.DataFrame.from_dict({
+            'int_': ['1', '-1', '0'],
+            'float_': [1.2, -0.5, 0.0],
+        })
+
+        with pytest.raises(TypeError):
+            cp.predict_partial_hazard(df_predict_on)
+
+    def test_strata_will_work_with_matched_pairs(self, rossi):
+        rossi['matched_pairs'] = np.floor(rossi.index / 2.).astype(int)
+        cp = CoxPHFitter()
+        cp.fit(rossi, duration_col='week', event_col='arrest', strata=['matched_pairs'], show_progress=True)
+        assert cp.baseline_cumulative_hazard_.shape[1] == 216
+
     def test_summary(self, rossi):
         cp = CoxPHFitter()
         cp.fit(rossi, duration_col='week', event_col='arrest')
-        summDf = cp.summary
+        summary = cp.summary
         expectedColumns = ['coef',
                            'exp(coef)',
                            'se(coef)',
@@ -775,7 +799,7 @@ def test_summary(self, rossi):
                            'p',
                            'lower 0.95',
                            'upper 0.95']
-        assert all([col in summDf.columns for col in expectedColumns])
+        assert all([col in summary.columns for col in expectedColumns])
 
     def test_print_summary(self, rossi):
 

From 263f5d2f80d7fd75e92723ff0250f3ed60b46b71 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Sun, 11 Nov 2018 19:28:25 -0500
Subject: [PATCH 38/59] fixing #497

---
 CHANGELOG.md                |  1 +
 lifelines/utils/__init__.py | 14 ++++--
 tests/test_estimation.py    | 88 ++++++++++++++++++++++++++++++++-----
 3 files changed, 88 insertions(+), 15 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1b91ee935..7aa6e956d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,6 +19,7 @@
  - Performance improvements for `CoxTimeVaryingFitter` (~15% faster)
  - Univariate models are now serialisable with `pickle`. Thanks @dwilson1988 for the contribution. 
  - `baseline_cumulative_hazard_` (and derivatives of that) on `CoxPHFitter` now correctly incorporate the `weights_col`. 
+ - Fixed a bug in `KaplanMeierFitter` when late entry times lined up with death events. Thanks @pzivich
 
 #### 0.14.6
  - fix for n > 2 groups in `multivariate_logrank_test` (again).
diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py
index 9070bc155..896387d3a 100644
--- a/lifelines/utils/__init__.py
+++ b/lifelines/utils/__init__.py
@@ -682,9 +682,17 @@ def _additive_estimate(events, timeline, _additive_f, _additive_var, reverse):
         var_ = np.cumsum(_additive_var(at_risk, deaths)).sort_index().shift(-1).fillna(0)
     else:
         deaths = events['observed']
-        at_risk = events['at_risk']
-        estimate_ = np.cumsum(_additive_f(at_risk, deaths))
-        var_ = np.cumsum(_additive_var(at_risk, deaths))
+
+        # Why subtract entrants like this? see https://github.com/CamDavidsonPilon/lifelines/issues/497
+        # specifically, we kill people, compute the ratio, and then "add" the entrants. This means that
+        # the population should not have the late entrants. The only exception to this rule
+        # is the first period, where entrants happen _prior_ to deaths.
+        entrances = events['entrance'].copy()
+        entrances.iloc[0] = 0
+        population = events['at_risk'] - entrances
+
+        estimate_ = np.cumsum(_additive_f(population, deaths))
+        var_ = np.cumsum(_additive_var(population, deaths))
 
     timeline = sorted(timeline)
     estimate_ = estimate_.reindex(timeline, method='pad').fillna(0)
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index 78c2f2fdc..c3fcad321 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -401,6 +401,8 @@ def test_kaplan_meier_no_censorship(self, sample_lifetimes):
         T, _ = sample_lifetimes
         kmf = KaplanMeierFitter()
         kmf.fit(T)
+        print(kmf.survival_function_)
+        print(kmf.event_table)
         npt.assert_almost_equal(kmf.survival_function_.values, self.kaplan_meier(T))
 
     def test_kaplan_meier_with_censorship(self, sample_lifetimes):
@@ -438,7 +440,7 @@ def test_kmf_left_censorship_stats(self):
         kmf.fit(T, C, left_censorship=True)
 
         actual = kmf.cumulative_density_[kmf._label].values
-        npt.assert_almost_equal(actual, np.array([0, 0.437500, 0.5833333, 0.875, 0.875, 1]))
+        npt.assert_allclose(actual, np.array([0, 0.437500, 0.5833333, 0.875, 0.875, 1]))
 
     def test_shifting_durations_doesnt_affect_survival_function_values(self):
         T = np.random.exponential(10, size=100)
@@ -446,13 +448,13 @@ def test_shifting_durations_doesnt_affect_survival_function_values(self):
         expected = kmf.fit(T).survival_function_.values
 
         T_shifted = T + 100
-        npt.assert_almost_equal(expected, kmf.fit(T_shifted).survival_function_.values)
+        npt.assert_allclose(expected, kmf.fit(T_shifted).survival_function_.values)
 
         T_shifted = T - 50
-        npt.assert_almost_equal(expected[1:], kmf.fit(T_shifted).survival_function_.values)
+        npt.assert_allclose(expected[1:], kmf.fit(T_shifted).survival_function_.values)
 
         T_shifted = T - 200
-        npt.assert_almost_equal(expected[1:], kmf.fit(T_shifted).survival_function_.values)
+        npt.assert_allclose(expected[1:], kmf.fit(T_shifted).survival_function_.values)
 
     def test_kmf_survival_curve_output_against_R(self):
         df = load_g3()
@@ -461,11 +463,11 @@ def test_kmf_survival_curve_output_against_R(self):
 
         expected = np.array([[0.909, 0.779]]).T
         kmf.fit(df.loc[ix]['time'], df.loc[ix]['event'], timeline=[25, 53])
-        npt.assert_array_almost_equal(kmf.survival_function_.values, expected, decimal=3)
+        npt.assert_allclose(kmf.survival_function_.values, expected, rtol=10e-3)
 
         expected = np.array([[0.833, 0.667, 0.5, 0.333]]).T
         kmf.fit(df.loc[~ix]['time'], df.loc[~ix]['event'], timeline=[9, 19, 32, 34])
-        npt.assert_array_almost_equal(kmf.survival_function_.values, expected, decimal=3)
+        npt.assert_allclose(kmf.survival_function_.values, expected, rtol=10e-3)
 
     @pytest.mark.xfail()
     def test_kmf_survival_curve_output_against_R_super_accurate(self):
@@ -475,11 +477,11 @@ def test_kmf_survival_curve_output_against_R_super_accurate(self):
 
         expected = np.array([[0.909, 0.779]]).T
         kmf.fit(df.loc[ix]['time'], df.loc[ix]['event'], timeline=[25, 53])
-        npt.assert_array_almost_equal(kmf.survival_function_.values, expected, decimal=4)
+        npt.assert_allclose(kmf.survival_function_.values, expected, rtol=10e-4)
 
         expected = np.array([[0.833, 0.667, 0.5, 0.333]]).T
         kmf.fit(df.loc[~ix]['time'], df.loc[~ix]['event'], timeline=[9, 19, 32, 34])
-        npt.assert_array_almost_equal(kmf.survival_function_.values, expected, decimal=4)
+        npt.assert_allclose(kmf.survival_function_.values, expected, rtol=10e-4)
 
     def test_kmf_confidence_intervals_output_against_R(self):
         # this uses conf.type = 'log-log'
@@ -489,12 +491,12 @@ def test_kmf_confidence_intervals_output_against_R(self):
         kmf.fit(df.loc[ix]['time'], df.loc[ix]['event'], timeline=[9, 19, 32, 34])
 
         expected_lower_bound = np.array([0.2731, 0.1946, 0.1109, 0.0461])
-        npt.assert_array_almost_equal(kmf.confidence_interval_['KM_estimate_lower_0.95'].values,
-                                      expected_lower_bound, decimal=3)
+        npt.assert_allclose(kmf.confidence_interval_['KM_estimate_lower_0.95'].values,
+                                      expected_lower_bound, rtol=10e-4)
 
         expected_upper_bound = np.array([0.975, 0.904, 0.804, 0.676])
-        npt.assert_array_almost_equal(kmf.confidence_interval_['KM_estimate_upper_0.95'].values,
-                                      expected_upper_bound, decimal=3)
+        npt.assert_allclose(kmf.confidence_interval_['KM_estimate_upper_0.95'].values,
+                                      expected_upper_bound, rtol=10e-4)
 
     def test_kmf_does_not_drop_to_zero_if_last_point_is_censored(self):
         T = np.arange(0, 50, 0.5)
@@ -537,6 +539,68 @@ def test_weights_with_unaligned_index(self):
             a = list(kmf.survival_function_.KM_estimate)
             assert a == [1.0,0.6153846153846154,0.6153846153846154,0.32579185520362,0.32579185520362]
 
+    def test_late_entry_with_almost_tied_entry_and_death_against_R(self):
+        entry = [1.9, 0, 0, 0, 0]
+        T = [2, 10, 5, 4, 3]
+        kmf = KaplanMeierFitter()
+        kmf.fit(T, entry=entry)
+
+        expected = [1.0, 1.0, 0.8, 0.6, 0.4, 0.2, 0.0]
+        npt.assert_allclose(kmf.survival_function_.values.reshape(7,), expected)
+
+    def test_late_entry_with_against_R(self):
+        entry = [1, 2, 4, 0, 0]
+        T = [2, 10, 5, 4, 3]
+        kmf = KaplanMeierFitter()
+        kmf.fit(T, entry=entry)
+
+        expected = [1.0, 1.0, 0.667, 0.444, 0.222, 0.111, 0.0]
+        npt.assert_allclose(kmf.survival_function_.values.reshape(7,), expected, rtol=1e-2)
+
+
+    def test_late_entry_with_tied_entry_and_death(self):
+        np.random.seed(101)
+
+        Ct = 10.
+
+        n = 10000
+        df = pd.DataFrame()
+        df['id'] = [i for i in range(n)]
+        df['t'] = np.ceil(np.random.weibull(1,size=n)*5)
+        df['t_cens'] = np.ceil(np.random.weibull(1,size=n)*3)
+        df['t_enter'] = np.floor(np.random.weibull(1.5,size=n)*2)
+        df['ft'] = 10
+        df['t_out'] = np.min(df[['t','t_cens','ft']],axis=1).astype(int)
+        df['d'] = (np.where(df['t']<=Ct,1,0)) * (np.where(df['t']<=df['t_cens'],1,0))
+        df['c'] = (np.where(df['t_cens']<=Ct,1,0)) * (np.where(df['t_cens']<df['t'],1,0))
+        df['y'] = (np.where(df['t']>df['t_enter'],1,0)) * (np.where(df['t_cens']>df['t_enter'],1,0)) * (np.where(Ct > df['t_enter'],1,0))
+        dfo = df.loc[df['y']==1].copy() #"observed data"
+
+        #Fitting KM to full data
+        km1 = KaplanMeierFitter()
+        km1.fit(df['t_out'],event_observed=df['d'])
+        rf = pd.DataFrame(index=km1.survival_function_.index)
+        rf['KM_true'] = km1.survival_function_
+
+        print(dfo[['t_out', 't_enter', 'd']])
+
+
+        #Fitting KM to "observed" data
+        km2 = KaplanMeierFitter()
+        km2.fit(dfo['t_out'],entry=dfo['t_enter'],event_observed=dfo['d'])
+        rf['KM_lifelines_latest'] = km2.survival_function_
+        print(km2.event_table)
+
+        #Version of KM where late entries occur after
+        rf['KM_lateenterafter'] = np.cumprod(1 - (km2.event_table.observed/(km2.event_table.at_risk - km2.event_table.entrance)))
+
+        # drop the first NA from comparison
+        rf = rf.dropna()
+        print(rf)
+
+        npt.assert_allclose(rf['KM_true'].values, rf['KM_lateenterafter'].values, rtol=10e-2)
+        npt.assert_allclose(rf['KM_lifelines_latest'].values, rf['KM_lateenterafter'].values, rtol=10e-2)
+        npt.assert_allclose(rf['KM_lifelines_latest'].values, rf['KM_true'].values, rtol=10e-2)
 
 class TestNelsonAalenFitter():
 

From 1dde117e15c0ae5aea242fd2785adf442622663a Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Sun, 11 Nov 2018 20:19:14 -0500
Subject: [PATCH 39/59] adding note about weights

---
 docs/Examples.rst | 33 +++++++++++++++++++++++++++++++++
 docs/conf.py      |  4 ++--
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/docs/Examples.rst b/docs/Examples.rst
index a84055296..eb658a34c 100644
--- a/docs/Examples.rst
+++ b/docs/Examples.rst
@@ -602,3 +602,36 @@ Since the estimation of the coefficients in the Cox proportional hazard model is
 
  5. If using the ``strata`` arugment, make sure your stratification group sizes are not too small. Try ``df.groupby(strata).size()``.
 
+Adding weights to observations in a Cox model
+##############################################
+
+There are two common uses for weights in a model. The first is as a data size reduction technique (known as case weights). If the dataset has more than one subject with identical attributes, including duration and event, then their likelihood contribution is the same as well. Thus, instead of computing the log-likelihood for each individual, we can compute it once and multiply it by the count of subjects with identical attributes. In practice, this involves first grouping subjects by covariates and counting. For example, using the Rossi dataset, we will use Pandas to group by the attributes (but other data processing tools, like Spark, could do this as well):
+
+.. code-block:: python
+    
+    from lifelines.datasets import load_rossi
+
+    rossi = load_rossi()
+
+    rossi_weights = rossi.copy()
+    rossi_weights['weights'] = 1.
+    rossi_weights = rossi_weights.groupby(rossi.columns.tolist())['weights'].sum()\
+                                 .reset_index()
+
+
+The original dataset has 432 rows, while the grouped dataset has 387 rows plus an additional ``weights`` column. ``CoxPHFitter`` has an additional parameter to specify which column is the weight column.
+
+.. code-block:: python
+
+    from lifelines import CoxPHFitter
+
+    cp = CoxPHFitter()
+    cp.fit(rossi_weights, 'week', 'arrest', weights_col='weights')
+
+
+The fitting should be faster, and the results identical to the unweighted dataset. This option is also available in the ``CoxTimeVaryingFitter``.
+
+
+The second use of weights is sampling weights. These are typically positive, non-integer weights that represent some artificial under/over sampling of observations (ex: inverse probability of treatment weights). It is recommended to set ``robust=True`` in the call to ``fit``, as the usual standard error is incorrect for sampling weights. The ``robust`` flag will use the sandwich estimator for the standard error.
+
+.. warning:: The implementation of the sandwich estimator does not handle ties correctly (under the Efron handling of ties), and will give slightly or significantly different results from other software depending on the frequency of ties.
diff --git a/docs/conf.py b/docs/conf.py
index 52cf02d79..13885f8f8 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -55,9 +55,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '0.14.6'
+version = '0.15.0'
 # The full version, including alpha/beta/rc tags.
-release = '0.14.6'
+release = '0.15.0'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

From cc9f190dae6afe5e395cb64d794d301dab2c33aa Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Tue, 13 Nov 2018 11:00:07 -0500
Subject: [PATCH 40/59] coxphfitter allows for robust + strata args

---
 lifelines/fitters/coxph_fitter.py | 31 +++++++++++++++----
 tests/test_estimation.py          | 50 +++++++++++++++++++++++++++----
 2 files changed, 70 insertions(+), 11 deletions(-)

diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index 92f66de49..5e376ffde 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -452,7 +452,28 @@ def _compute_confidence_intervals(self):
                             index=['lower-bound', 'upper-bound'],
                             columns=self.hazards_.columns)
 
+
     def _compute_sandwich_estimator(self, X, T, E, weights):
+
+        if self.strata is None:
+            score_residuals = self._compute_residuals_within_strata(X.values, T.values, E.values, weights.values)
+
+        else:
+            score_residuals = np.empty((0,1))
+            for strata in np.unique(X.index):
+                stratified_X, stratified_T, stratified_E, stratified_W = X.loc[[strata]], T.loc[[strata]], E.loc[[strata]], weights.loc[[strata]]
+
+                score_residuals = np.append(score_residuals,
+                                            self._compute_residuals_within_strata(stratified_X.values, stratified_T.values, stratified_E.values, stratified_W.values),
+                                            axis=0)
+
+
+        naive_var = inv(self._hessian_)
+        delta_betas = score_residuals.dot(naive_var) * weights[:, None]
+        sandwich_estimator = delta_betas.T.dot(delta_betas) / np.outer(self._norm_std, self._norm_std)
+        return sandwich_estimator
+
+    def _compute_residuals_within_strata(self, X, T, E, weights):
         # https://www.stat.tamu.edu/~carroll/ftp/gk001.pdf
         # lin1989
         # https://www.ics.uci.edu/~dgillen/STAT255/Handouts/lecture10.pdf
@@ -469,6 +490,8 @@ def _compute_sandwich_estimator(self, X, T, E, weights):
 
         phi_s = exp(dot(X, beta))
 
+        # compute these within strata
+
         # need to store these histories, as we access them often
         # this is a reverse cumulative sum. See original code in https://github.com/CamDavidsonPilon/lifelines/pull/496/files#diff-81ee0759dbae0770e1a02cf17f4cfbb1R431
         risk_phi_x_history = (X * (weights * phi_s)[:, None])[::-1].cumsum(0)[::-1]
@@ -490,14 +513,12 @@ def _compute_sandwich_estimator(self, X, T, E, weights):
 
             score_residuals[i, :] = score
 
-        naive_var = inv(self._hessian_)
-        delta_betas = score_residuals.dot(naive_var) * weights[:, None]
-        sandwich_estimator = delta_betas.T.dot(delta_betas) / np.outer(self._norm_std, self._norm_std)
-        return sandwich_estimator
+        return score_residuals
+
 
     def _compute_standard_errors(self, df, T, E, weights):
         if self.robust:
-            se = np.sqrt(self._compute_sandwich_estimator(df.values, T.values, E.values, weights.values).diagonal()) # / self._norm_std
+            se = np.sqrt(self._compute_sandwich_estimator(df, T, E, weights).diagonal()) # / self._norm_std
         else:
             se = np.sqrt(self.variance_matrix_.diagonal())
         return pd.DataFrame(se[None, :],
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index c3fcad321..0d144f613 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -1813,15 +1813,53 @@ def test_robust_errors_against_R_no_ties(self, regression_dataset):
         assert_series_equal(cph.standard_errors_.loc['se'], expected, check_less_precise=2, check_names=False)
 
 
-    def test_robust_errors_with_strata_doesnt_break(self, rossi):
+    def test_robust_errors_with_strata_against_R(self, rossi):
         """
-        rossi <- read.csv('.../lifelines/datasets/rossi.csv')
-        r = coxph(formula = Surv(week, arrest) ~ fin + age + strata(race,
-                    paro, mar, wexp) + prio, data = rossi, robust=TRUE)
+        df <- data.frame(
+            "var1" = c(1, 1, 2, 2, 2),
+            "var2" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092),
+            "T" = c( 7.335846, 5.269797, 11.684092, 12.678458, 6.601666)
+        )
+        df['E'] = 1
+
+        coxph(formula=Surv(T, E) ~ strata(var1) + var2, data=df, robust=TRUE)
+        """
+
+        df = pd.DataFrame({
+            "var1": [1, 1, 2, 2, 2],
+            "var2": [0.184677, 0.071893, 1.364646, 0.098375, 1.663092],
+            "T": [7.335846, 5.269797, 11.684092, 12.678458, 6.601666]
+        })
+        df['E'] = 1
+
+        cf = CoxPHFitter()
+        cf.fit(df, duration_col='T', event_col='E', strata=['var1'], robust=True)
+        npt.assert_allclose(cf.summary['se(coef)'].values, 2.79, rtol=1e-2)
+
+
+    @pytest.mark.xfail
+    def test_robust_errors_with_strata_against_R_super_accurate(self, rossi):
         """
-        assert False
+        df <- data.frame(
+            "var1" = c(1, 1, 2, 2, 2),
+            "var2" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092),
+            "T" = c( 7.335846, 5.269797, 11.684092, 12.678458, 6.601666)
+        )
+        df['E'] = 1
+
+        coxph(formula=Surv(T, E) ~ strata(var1) + var2, data=df, robust=TRUE)
+        """
+
+        df = pd.DataFrame({
+            "var1": [1, 1, 2, 2, 2],
+            "var2": [0.184677, 0.071893, 1.364646, 0.098375, 1.663092],
+            "T": [7.335846, 5.269797, 11.684092, 12.678458, 6.601666]
+        })
+        df['E'] = 1
+
         cf = CoxPHFitter()
-        cf.fit(rossi, duration_col='week', event_col='arrest', strata=['race', 'paro', 'mar', 'wexp'], robust=True)
+        cf.fit(df, duration_col='T', event_col='E', strata=['var1'], robust=True)
+        npt.assert_allclose(cf.summary['se(coef)'].values, 2.79, rtol=1e-4)
 
 
     def test_what_happens_to_nans(self, rossi):

From ca23df2874b56bb9e833b7f6f0c4c655d4e0e01f Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Tue, 13 Nov 2018 13:39:02 -0500
Subject: [PATCH 41/59] adding a warning if weibull fitter fails to converge

---
 lifelines/fitters/weibull_fitter.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/lifelines/fitters/weibull_fitter.py b/lifelines/fitters/weibull_fitter.py
index f505e292f..7060d9d7f 100644
--- a/lifelines/fitters/weibull_fitter.py
+++ b/lifelines/fitters/weibull_fitter.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 from __future__ import print_function, division
+import warnings
 import time
 import numpy as np
 import pandas as pd
@@ -7,7 +8,8 @@
 from scipy import stats as stats
 from numpy.linalg import solve, norm, inv
 from lifelines.fitters import UnivariateFitter
-from lifelines.utils import inv_normal_cdf, check_nans, ConvergenceError, string_justify, significance_code
+from lifelines.utils import inv_normal_cdf, check_nans, ConvergenceError, string_justify, significance_code,\
+                            ConvergenceWarning
 
 
 def _negative_log_likelihood(lambda_rho, T, E):
@@ -152,13 +154,13 @@ def gradient_function(parameters, T, E):
 
         # initialize the parameters. This shows dramatic improvements.
         parameters = _smart_search(_negative_log_likelihood, 2, T, E)
-
         i = 1
         step_size = 0.9
-        converging = True
+        max_steps = 50
+        converging, completed = True, False
         start = time.time()
 
-        while converging and i < 50:
+        while converging and i < max_steps:
             # Do not override hessian and gradient in case of garbage
             h, g = hessian_function(parameters, T, E), gradient_function(parameters, T, E)
 
@@ -176,8 +178,14 @@ def gradient_function(parameters, T, E):
 
             if norm(delta) < precision:
                 converging = False
+                completed = True
             i += 1
 
+        if show_progress and completed:
+            print("Convergence completed after %d iterations." % (i))
+        if not completed:
+            warnings.warn("Newton-Rhapson failed to converge sufficiently in %d steps." % max_steps, ConvergenceWarning)
+
         return parameters, hessian
 
     def _bounds(self, alpha, ci_labels):

From 0a8530888eb1858ccfceb11745c4c5b486d17b2a Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Tue, 13 Nov 2018 16:12:51 -0500
Subject: [PATCH 42/59] I like this decorator to check if the fit has been
 called...

---
 docs/Survival analysis with lifelines.rst |  6 ++-
 lifelines/fitters/__init__.py             | 54 +++++++++++------------
 lifelines/plotting.py                     |  6 +--
 tests/test_estimation.py                  |  4 +-
 4 files changed, 36 insertions(+), 34 deletions(-)

diff --git a/docs/Survival analysis with lifelines.rst b/docs/Survival analysis with lifelines.rst
index c461a27e0..7380975a1 100644
--- a/docs/Survival analysis with lifelines.rst	
+++ b/docs/Survival analysis with lifelines.rst	
@@ -452,7 +452,7 @@ keywords to tinker with.
 Fitting to a Weibull model
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Another very popular model for survival data is the Weibull model. In contrast the the Kaplan Meier estimator, this model is a *parametric model*, meaning it has a functional form with parameters that we are fitting the data to. (The Kaplan Meier estimator has no parameters to fit too). Mathematically, the survival function looks like:
+Another very popular model for survival data is the Weibull model. In contrast to the Kaplan Meier estimator, this model is a *parametric model*, meaning it has a functional form with parameters that we are fitting the data to. (The Kaplan Meier estimator has no parameters to fit to). Mathematically, the survival function looks like:
 
 
  ..math::  S(t) = \exp\left(-(\lambda t)^\rho\right),   \lambda >0, \rho > 0,
@@ -468,9 +468,13 @@ Another very popular model for survival data is the Weibull model. In contrast t
 
     wf = WeibullFitter()
     wf.fit(T, E)
+    
     print(wf.lambda_, wf.rho_)
     wf.print_summary()
 
+    wf.
+
+
 Other parametric models: Exponential and LogNormal
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/lifelines/fitters/__init__.py b/lifelines/fitters/__init__.py
index dcc853da3..7e9021066 100644
--- a/lifelines/fitters/__init__.py
+++ b/lifelines/fitters/__init__.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import print_function
 import collections
+from functools import wraps
 
 import numpy as np
 import pandas as pd
@@ -9,6 +10,18 @@
 from lifelines.utils import qth_survival_times, _to_array
 
 
+def must_call_fit_first(func):
+    @wraps(func)
+    def error_wrapper(*args, **kwargs):
+        self = args[0]
+        try:
+            estimate = self._estimate_name
+        except AttributeError:
+            raise RuntimeError("Must call `fit` first!")
+        return func(*args, **kwargs)
+    return error_wrapper
+
+
 class BaseFitter(object):
 
     def __init__(self, alpha=0.95):
@@ -25,37 +38,28 @@ def __repr__(self):
             s = """<lifelines.%s>""" % classname
         return s
 
-
 class UnivariateFitter(BaseFitter):
 
+    @must_call_fit_first
     def _update_docstrings(self):
         # Update their docstrings
-        self.__class__.subtract.__doc__ = self.subtract.__doc__.format(self._estimate_name,self.__class__.__name__)
-        self.__class__.divide.__doc__ = self.divide.__doc__.format(self._estimate_name,self.__class__.__name__)
+        self.__class__.subtract.__doc__ = self.subtract.__doc__.format(self._estimate_name, self.__class__.__name__)
+        self.__class__.divide.__doc__ = self.divide.__doc__.format(self._estimate_name, self.__class__.__name__)
         self.__class__.predict.__doc__ = self.predict.__doc__.format(self.__class__.__name__)
-        self.__class__.plot.__doc__ = plot_estimate.__doc__.format(self.__class__.__name__)
+        self.__class__.plot.__doc__ = plot_estimate.__doc__.format(self.__class__.__name__, self._estimate_name)
 
+    @must_call_fit_first
     def plot(self, *args, **kwargs):
-        try:
-            estimate = self._estimate_name
-        except AttributeError:
-            raise RuntimeError("Must call `fit` first!")
-            
         return plot_estimate(self, *args, **kwargs)
 
-    def subtract(self,other):
+    @must_call_fit_first
+    def subtract(self, other):
         """
         Subtract the {0} of two {1} objects.
 
             Parameters:
               other: an {1} fitted instance.
         """
-
-        try:
-            estimate = self._estimate_name
-        except AttributeError:
-            raise RuntimeError("Must call `fit` first!")
-        
         self_estimate = getattr(self, self._estimate_name)
         other_estimate = getattr(other, other._estimate_name)
         new_index = np.concatenate((other_estimate.index, self_estimate.index))
@@ -67,6 +71,7 @@ def subtract(self,other):
             columns=['diff']
         )
 
+    @must_call_fit_first
     def divide(self, other):
         """
         Divide the {0} of two {1} objects.
@@ -75,11 +80,6 @@ def divide(self, other):
           other: an {1} fitted instance.
 
         """
-        try:
-            estimate = self._estimate_name
-        except AttributeError:
-            raise RuntimeError("Must call `fit` first!")
-    
         self_estimate = getattr(self, self._estimate_name)
         other_estimate = getattr(other, other._estimate_name)
         new_index = np.concatenate((other_estimate.index, self_estimate.index))
@@ -91,6 +91,7 @@ def divide(self, other):
             columns=['ratio']
         )
 
+    @must_call_fit_first
     def predict(self, times):
         """
         Predict the {0} at certain point in time. Uses a linear interpolation if
@@ -101,23 +102,20 @@ def predict(self, times):
 
         Returns:
           predictions: a scalar if time is a scalar, a numpy array if time in an array.
-        """ 
-        try:
-            estimate = self._estimate_name
-        except AttributeError:
-            raise RuntimeError("Must call `fit` first!")
-
+        """
         if callable(self._estimation_method):
             return pd.DataFrame(self._estimation_method(_to_array(times)), index=_to_array(times)).loc[times].squeeze()
         else:
             estimate = getattr(self, self._estimation_method)
             # non-linear interpolations can push the survival curves above 1 and below 0.
             return estimate.reindex(estimate.index.union(_to_array(times))).interpolate("index").loc[times].squeeze()
-        
+
     @property
+    @must_call_fit_first
     def conditional_time_to_event_(self):
         return self._conditional_time_to_event_()
 
+    @must_call_fit_first
     def _conditional_time_to_event_(self):
         """
         Return a DataFrame, with index equal to survival_function_, that estimates the median
diff --git a/lifelines/plotting.py b/lifelines/plotting.py
index 46eb69e3f..b2218ff0a 100644
--- a/lifelines/plotting.py
+++ b/lifelines/plotting.py
@@ -283,9 +283,9 @@ def plot_estimate(cls,estimate=None,loc=None, iloc=None, show_censors=False,
          censor_styles=None, ci_legend=False, ci_force_lines=False,
          ci_alpha=0.25, ci_show=True, at_risk_counts=False, invert_y_axis=False,
          bandwidth=None, **kwargs):
-    
+
     """"
-    Plots a pretty version of the fitted .
+    Plots a pretty figure of {0}.{1}
 
     Matplotlib plot arguments can be passed in inside the kwargs, plus
 
@@ -326,7 +326,7 @@ def plot_estimate(cls,estimate=None,loc=None, iloc=None, show_censors=False,
     set_kwargs_ax(kwargs)
     set_kwargs_color(kwargs)
     set_kwargs_drawstyle(kwargs)
-    
+
     if estimate is None:
         estimate = cls._estimate_name
 
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index 0d144f613..098afac26 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -1834,7 +1834,7 @@ def test_robust_errors_with_strata_against_R(self, rossi):
 
         cf = CoxPHFitter()
         cf.fit(df, duration_col='T', event_col='E', strata=['var1'], robust=True)
-        npt.assert_allclose(cf.summary['se(coef)'].values, 2.79, rtol=1e-2)
+        npt.assert_allclose(cf.summary['se(coef)'].values, 2.78649, rtol=1e-2)
 
 
     @pytest.mark.xfail
@@ -1859,7 +1859,7 @@ def test_robust_errors_with_strata_against_R_super_accurate(self, rossi):
 
         cf = CoxPHFitter()
         cf.fit(df, duration_col='T', event_col='E', strata=['var1'], robust=True)
-        npt.assert_allclose(cf.summary['se(coef)'].values, 2.79, rtol=1e-4)
+        npt.assert_allclose(cf.summary['se(coef)'].values, 2.78649, rtol=1e-4)
 
 
     def test_what_happens_to_nans(self, rossi):

From 1f3bca0efbdeaa5a22c3d213e5b05d1a05aeaa56 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Tue, 13 Nov 2018 17:49:01 -0500
Subject: [PATCH 43/59] caching the predicted values to score_ computation can
 be delayed until needed

---
 docs/Survival analysis with lifelines.rst    | 10 ++++++--
 lifelines/fitters/cox_time_varying_fitter.py | 15 ++++++------
 lifelines/fitters/coxph_fitter.py            | 24 +++++++++++++-------
 3 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/docs/Survival analysis with lifelines.rst b/docs/Survival analysis with lifelines.rst
index 7380975a1..78ea39c11 100644
--- a/docs/Survival analysis with lifelines.rst	
+++ b/docs/Survival analysis with lifelines.rst	
@@ -457,7 +457,12 @@ Another very popular model for survival data is the Weibull model. In contrast t
 
  ..math::  S(t) = \exp\left(-(\lambda t)^\rho\right),   \lambda >0, \rho > 0,
 
-* A priori*, we do not know what :math:`\lambda` and :math:`\rho` are, but we use the data on hand to estimate these parameters. In lifelines, this is implemented in the ``WeibullFitter``:
+*A priori*, we do not know what :math:`\lambda` and :math:`\rho` are, but we use the data on hand to estimate these parameters. In fact, we actually model and estimate the cumulative hazard:
+
+
+ ..math::  H(t) = (\lambda t)^\rho,   \lambda >0, \rho > 0,
+
+In lifelines, estimation is available using the ``WeibullFitter`` class:
 
 .. code:: python
 
@@ -472,7 +477,8 @@ Another very popular model for survival data is the Weibull model. In contrast t
     print(wf.lambda_, wf.rho_)
     wf.print_summary()
 
-    wf.
+    wf.plot()
+
 
 
 Other parametric models: Exponential and LogNormal
diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py
index 8f7c1815b..f2c706851 100644
--- a/lifelines/fitters/cox_time_varying_fitter.py
+++ b/lifelines/fitters/cox_time_varying_fitter.py
@@ -16,12 +16,13 @@
 from lifelines.fitters import BaseFitter
 from lifelines.fitters.coxph_fitter import CoxPHFitter
 from lifelines.statistics import chisq_test
-from lifelines.utils import inv_normal_cdf, \
-    significance_code, normalize,\
-    pass_for_numeric_dtypes_or_raise, check_low_var,\
-    check_for_overlapping_intervals, check_complete_separation_low_variance,\
-    ConvergenceWarning, StepSizer, _get_index, check_for_immediate_deaths,\
+from lifelines.utils import (inv_normal_cdf,
+    significance_code, normalize,
+    pass_for_numeric_dtypes_or_raise, check_low_var,
+    check_for_overlapping_intervals, check_complete_separation_low_variance,
+    ConvergenceWarning, StepSizer, _get_index, check_for_immediate_deaths,
     check_for_instantaneous_events, ConvergenceError, check_nans, string_justify
+)
 
 
 class CoxTimeVaryingFitter(BaseFitter):
@@ -179,7 +180,7 @@ def _compute_sandwich_estimator(self, df, stop_times_events, weights):
 
     def _compute_standard_errors(self, df, stop_times_events, weights):
         if self.robust:
-            se = np.sqrt(self._compute_sandwich_estimator(df, stop_times_events, weights).diagonal()) # / self._norm_std
+            se = np.sqrt(self._compute_sandwich_estimator(df, stop_times_events, weights).diagonal())
         else:
             se = np.sqrt(self.variance_matrix_.diagonal())
         return pd.DataFrame(se[None, :],
@@ -300,7 +301,7 @@ def _newton_rhaphson(self, df, stop_times_events, weights, show_progress=False,
                 converging, completed = False, True
             elif i >= max_steps:
                 # 50 iterations steps with N-R is a lot.
-                # Expected convergence is ~10 steps
+                # Expected convergence is less than 10 steps
                 converging, completed = False, False
             elif step_size <= 0.0001:
                 converging, completed = False, False
diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index 5e376ffde..84f57c500 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -15,12 +15,12 @@
 import scipy.stats as stats
 
 from lifelines.fitters import BaseFitter
-from lifelines.utils import survival_table_from_events, inv_normal_cdf, normalize,\
-    significance_code, concordance_index, _get_index, qth_survival_times,\
-    pass_for_numeric_dtypes_or_raise, check_low_var, coalesce,\
-    check_complete_separation, check_nans, StatError, ConvergenceWarning,\
-    StepSizer, ConvergenceError, string_justify
 from lifelines.statistics import chisq_test
+from lifelines.utils import (survival_table_from_events, inv_normal_cdf, normalize,
+    significance_code, concordance_index, _get_index, qth_survival_times,
+    pass_for_numeric_dtypes_or_raise, check_low_var, coalesce,
+    check_complete_separation, check_nans, StatError, ConvergenceWarning,
+    StepSizer, ConvergenceError, string_justify)
 
 
 class CoxPHFitter(BaseFitter):
@@ -167,9 +167,7 @@ def fit(self, df, duration_col, event_col=None,
         self.baseline_hazard_ = self._compute_baseline_hazards(df, T, E, weights)
         self.baseline_cumulative_hazard_ = self._compute_baseline_cumulative_hazard()
         self.baseline_survival_ = self._compute_baseline_survival()
-        self.score_ = concordance_index(self.durations,
-                                        -self.predict_partial_hazard(df).values,
-                                        self.event_observed)
+        self._predicted_partial_hazards_ = self.predict_partial_hazard(df).values
 
         self._train_log_partial_hazard = self.predict_log_partial_hazard(self._norm_mean.to_frame().T)
         return self
@@ -858,3 +856,13 @@ def plot_covariate_groups(self, covariate, groups, **kwargs):
         self.predict_survival_function(X).plot(ax=ax)
         self.baseline_survival_.plot(ax=ax, ls='--')
         return ax
+
+    @property
+    def score_(self):
+        # Concordance index of the fitted model: computed on first access from
+        # the partial hazards cached by `fit`, then stored for later accesses.
+        if not hasattr(self, '_concordance_score_'):
+            self._concordance_score_ = concordance_index(self.durations,
+                                                         -self._predicted_partial_hazards_,
+                                                         self.event_observed)
+        return self._concordance_score_

From 2a23205548e8c69115f145ca195899418dd380ee Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Tue, 13 Nov 2018 18:35:08 -0500
Subject: [PATCH 44/59] damn ctv is like 100% faster vs master now

---
 CHANGELOG.md                                 |  8 ++--
 lifelines/fitters/cox_time_varying_fitter.py | 40 +++++++++++---------
 2 files changed, 26 insertions(+), 22 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7aa6e956d..956fa8931 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,7 +2,7 @@
 
 #### 0.15.0
  - adding `robust` params to Cox models' `fit`. This enables atleast i) using non-integer weights in the model (these could be sampling weights like IPTW), and ii) mis-specified models (ex: non-proportional hazards). Under the hood it's a sandwich estimator. This does not handle ties, so if there are high number of ties, results may significantly differ from other software.
- - `standard_errors_` is now a property on fitted Cox models.
+ - `standard_errors_` is now a property on fitted `CoxPHFitter` which describes the standard errors of the coefficients.
  - `variance_matrix_` is now a property on fitted `CoxPHFitter` which describes the variance matrix of the coefficients.
  - new criteria for convergence of `CoxPHFitter` and `CoxTimeVaryingFitter` called the Newton-decrement. Tests show it is as accurate (w.r.t to previous coefficients) and typically shaves off a single step, resulting in generally faster convergence. See https://www.cs.cmu.edu/~pradeepr/convexopt/Lecture_Slides/Newton_methods.pdf. Details about the Newton-decrement are added to the `show_progress` statements.
  - Minimum suppport for scipy is 1.0
@@ -10,13 +10,13 @@
  - `AalenAdditiveModel` raises `ConvergenceWarning` instead of printing a warning.
  - `KaplanMeierFitter` now has a cumulative plot option. Example `kmf.plot(invert_y_axis=True)`
  - a `weights_col` option has been added to `CoxTimeVaryingFitter` that allows for time-varying weights. 
- - `WeibullFitter` has a new `show_progress` param.
+ - `WeibullFitter` has a new `show_progress` param and additional information if the convergence fails. 
  - `CoxPHFitter`, `ExponentialFitter`, `WeibullFitter` and `CoxTimeVaryFitter` method `print_summary` is updated with new fields. 
  - `WeibullFitter` has renamed the incorrect `_jacobian` to `_hessian_`. 
  - `variance_matrix_` is now a property on fitted `WeibullFitter` which describes the variance matrix of the parameters.
  - The default `WeibullFitter().timeline` has changed from integers between the min and max duration to _n_ floats between the max and min durations, where _n_ is the number of observations. 
- - Performance improvements for `CoxPHFitter` (~15% faster)
- - Performance improvements for `CoxTimeVaryingFitter` (~15% faster)
+ - Performance improvements for `CoxPHFitter` (~20% faster)
+ - Performance improvements for `CoxTimeVaryingFitter` (~100% faster)
  - Univariate models are now serialisable with `pickle`. Thanks @dwilson1988 for the contribution. 
  - `baseline_cumulative_hazard_` (and derivatives of that) on `CoxPHFitter` now correctly incorporate the `weights_col`. 
  - Fixed a bug in `KaplanMeierFitter` when late entry times lined up with death events. Thanks @pzivich
diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py
index f2c706851..28ba2dc36 100644
--- a/lifelines/fitters/cox_time_varying_fitter.py
+++ b/lifelines/fitters/cox_time_varying_fitter.py
@@ -347,33 +347,34 @@ def _get_gradients(self, df, stops_events, weights, beta):
             # I feel like this can be made into some tree-like structure
             ix = (stops_events['start'].values < t) & (t <= stops_events['stop'].values)
 
-            df_at_t = df.loc[ix]
-            weights_at_t = weights.loc[ix]
-            stops_events_at_t = stops_events.loc[ix]
+            df_at_t = df.values[ix]
+            weights_at_t = weights.values[ix]
+            stops_events_at_t = stops_events['stop'].values[ix]
+            events_at_t = stops_events['event'].values[ix]
 
-            phi_i = weights_at_t.values * exp(dot(df_at_t, beta))
+            phi_i = weights_at_t * exp(dot(df_at_t, beta))
             phi_x_i = phi_i * df_at_t
             phi_x_x_i = dot(df_at_t.T, phi_x_i)
 
             # Calculate sums of Risk set
             risk_phi = phi_i.sum()
-            risk_phi_x = phi_x_i.sum(0).values
+            risk_phi_x = phi_x_i.sum(0)
             risk_phi_x_x = phi_x_x_i
 
             # Calculate the sums of Tie set
-            deaths = stops_events_at_t['event'].values & (stops_events_at_t['stop'].values == t)
+            deaths = events_at_t & (stops_events_at_t == t)
 
             ties_counts = deaths.sum()  # should always at least 1
 
-            xi_deaths = df_at_t.loc[deaths]
-            weights_deaths = weights_at_t.loc[deaths].values
+            xi_deaths = df_at_t[deaths]
+            weights_deaths = weights_at_t[deaths]
 
-            x_death_sum = (weights_deaths * xi_deaths).sum(0).values
+            x_death_sum = (weights_deaths * xi_deaths).sum(0)
 
             if ties_counts > 1:
                 # it's faster if we can skip computing these when we don't need to.
                 tie_phi = phi_i[deaths].sum()
-                tie_phi_x = phi_x_i.loc[deaths].sum(0).values
+                tie_phi_x = phi_x_i[deaths].sum(0)
                 tie_phi_x_x = dot(xi_deaths.T, phi_i[deaths] * xi_deaths)
 
             partial_gradient = np.zeros(d)
@@ -559,22 +560,25 @@ def plot(self, standardized=False, columns=None, **kwargs):
 
 
     def _compute_cumulative_baseline_hazard(self, tv_data, stop_times_events, weights):
-        events = stop_times_events.copy()
-        events['hazard'] = self.predict_partial_hazard(tv_data).values
+        hazards = self.predict_partial_hazard(tv_data).values
 
-        unique_death_times = np.unique(events['stop'].loc[events['event']])
+        unique_death_times = np.unique(stop_times_events['stop'].loc[stop_times_events['event']])
         baseline_hazard_ = pd.DataFrame(np.zeros_like(unique_death_times),
                                         index=unique_death_times,
                                         columns=['baseline hazard'])
 
         for t in unique_death_times:
-            ix = (events['start'].values < t) & (t <= events['stop'].values)
-            events_at_t = events.loc[ix]
-            weights_at_t = weights.loc[ix].values
-            deaths = events_at_t['event'].values & (events_at_t['stop'] == t).values
+            ix = (stop_times_events['start'].values < t) & (t <= stop_times_events['stop'].values)
+
+            events_at_t = stop_times_events['event'].values[ix]
+            stops_at_t = stop_times_events['stop'].values[ix]
+            weights_at_t = weights.values[ix]
+            hazards_at_t = hazards[ix]
+
+            deaths = events_at_t & (stops_at_t == t)
 
             death_counts = (weights_at_t.squeeze() * deaths).sum()  # should always be atleast 1.
-            baseline_hazard_.loc[t] = death_counts / events_at_t['hazard'].sum()
+            baseline_hazard_.loc[t] = death_counts / hazards_at_t.sum()
 
         return baseline_hazard_.cumsum()
 

From 37047ee556f6eeb2a89e2aa920f51ec76d990fe3 Mon Sep 17 00:00:00 2001
From: hcarlens <harald.carlens@itg.com>
Date: Wed, 14 Nov 2018 14:02:37 +0000
Subject: [PATCH 45/59] Extend check_nans() to check_nans_or_infs()

---
 lifelines/fitters/aalen_additive_fitter.py   |  6 +++---
 lifelines/fitters/cox_time_varying_fitter.py |  4 ++--
 lifelines/fitters/coxph_fitter.py            |  8 ++++----
 lifelines/fitters/exponential_fitter.py      |  6 +++---
 lifelines/fitters/kaplan_meier_fitter.py     |  6 +++---
 lifelines/fitters/nelson_aalen_fitter.py     |  6 +++---
 lifelines/fitters/weibull_fitter.py          |  6 +++---
 lifelines/utils/__init__.py                  | 11 ++++++++++-
 8 files changed, 31 insertions(+), 22 deletions(-)

diff --git a/lifelines/fitters/aalen_additive_fitter.py b/lifelines/fitters/aalen_additive_fitter.py
index 69b4e0389..fbda1437f 100644
--- a/lifelines/fitters/aalen_additive_fitter.py
+++ b/lifelines/fitters/aalen_additive_fitter.py
@@ -10,7 +10,7 @@
 from lifelines.fitters import BaseFitter
 from lifelines.utils import _get_index, inv_normal_cdf, epanechnikov_kernel, \
     ridge_regression as lr, qth_survival_times, pass_for_numeric_dtypes_or_raise,\
-    concordance_index, check_nans, ConvergenceWarning
+    concordance_index, check_nans_or_infs, ConvergenceWarning
 
 from lifelines.utils.progress_bar import progress_bar
 from lifelines.plotting import fill_between_steps
@@ -315,8 +315,8 @@ def _fit_varying(self, dataframe, duration_col="T", event_col="E",
 
     def _check_values(self, df, T, E):
         pass_for_numeric_dtypes_or_raise(df)
-        check_nans(T)
-        check_nans(E)
+        check_nans_or_infs(T)
+        check_nans_or_infs(E)
 
     def smoothed_hazards_(self, bandwidth=1):
         """
diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py
index 28ba2dc36..37fd814c5 100644
--- a/lifelines/fitters/cox_time_varying_fitter.py
+++ b/lifelines/fitters/cox_time_varying_fitter.py
@@ -21,7 +21,7 @@
     pass_for_numeric_dtypes_or_raise, check_low_var,
     check_for_overlapping_intervals, check_complete_separation_low_variance,
     ConvergenceWarning, StepSizer, _get_index, check_for_immediate_deaths,
-    check_for_instantaneous_events, ConvergenceError, check_nans, string_justify
+    check_for_instantaneous_events, ConvergenceError, check_nans_or_infs, string_justify
 )
 
 
@@ -121,7 +121,7 @@ def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', weights
     @staticmethod
     def _check_values(df, stop_times_events):
         # check_for_overlapping_intervals(df) # this is currenty too slow for production.
-        check_nans(df)
+        check_nans_or_infs(df)
         check_low_var(df)
         check_complete_separation_low_variance(df, stop_times_events['event'])
         pass_for_numeric_dtypes_or_raise(df)
diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index 84f57c500..0a423c657 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -19,7 +19,7 @@
 from lifelines.utils import (survival_table_from_events, inv_normal_cdf, normalize,
     significance_code, concordance_index, _get_index, qth_survival_times,
     pass_for_numeric_dtypes_or_raise, check_low_var, coalesce,
-    check_complete_separation, check_nans, StatError, ConvergenceWarning,
+    check_complete_separation, check_nans_or_infs, StatError, ConvergenceWarning,
     StepSizer, ConvergenceError, string_justify)
 
 
@@ -435,9 +435,9 @@ def _compute_baseline_cumulative_hazard(self):
     @staticmethod
     def _check_values(df, T, E):
         pass_for_numeric_dtypes_or_raise(df)
-        check_nans(T)
-        check_nans(E)
-        check_nans(df)
+        check_nans_or_infs(T)
+        check_nans_or_infs(E)
+        check_nans_or_infs(df)
         check_low_var(df)
         check_complete_separation(df, E, T)
 
diff --git a/lifelines/fitters/exponential_fitter.py b/lifelines/fitters/exponential_fitter.py
index 6c655653c..cb03c7c9f 100644
--- a/lifelines/fitters/exponential_fitter.py
+++ b/lifelines/fitters/exponential_fitter.py
@@ -5,7 +5,7 @@
 from scipy import stats
 
 from lifelines.fitters import UnivariateFitter
-from lifelines.utils import inv_normal_cdf, check_nans, significance_code, string_justify
+from lifelines.utils import inv_normal_cdf, check_nans_or_infs, significance_code, string_justify
 
 
 class ExponentialFitter(UnivariateFitter):
@@ -56,9 +56,9 @@ def fit(self, durations, event_observed=None, timeline=None, entry=None,
 
         """
 
-        check_nans(durations)
+        check_nans_or_infs(durations)
         if event_observed is not None:
-            check_nans(event_observed)
+            check_nans_or_infs(event_observed)
 
         self.durations = np.asarray(durations, dtype=float)
         self.event_observed = np.asarray(event_observed, dtype=int) if event_observed is not None else np.ones_like(self.durations)
diff --git a/lifelines/fitters/kaplan_meier_fitter.py b/lifelines/fitters/kaplan_meier_fitter.py
index cd08cb23b..ea6af248d 100644
--- a/lifelines/fitters/kaplan_meier_fitter.py
+++ b/lifelines/fitters/kaplan_meier_fitter.py
@@ -7,7 +7,7 @@
 
 from lifelines.fitters import UnivariateFitter
 from lifelines.utils import _preprocess_inputs, _additive_estimate, StatError, inv_normal_cdf,\
-    median_survival_times, check_nans
+    median_survival_times, check_nans_or_infs
 from lifelines.plotting import plot_loglogs
 
 
@@ -48,9 +48,9 @@ def fit(self, durations, event_observed=None, timeline=None, entry=None, label='
 
         """
 
-        check_nans(durations)
+        check_nans_or_infs(durations)
         if event_observed is not None:
-            check_nans(event_observed)
+            check_nans_or_infs(event_observed)
 
         if weights is not None:
           if (weights.astype(int) != weights).any():
diff --git a/lifelines/fitters/nelson_aalen_fitter.py b/lifelines/fitters/nelson_aalen_fitter.py
index 5ae5ef7b4..398eeff6a 100644
--- a/lifelines/fitters/nelson_aalen_fitter.py
+++ b/lifelines/fitters/nelson_aalen_fitter.py
@@ -8,7 +8,7 @@
 
 from lifelines.fitters import UnivariateFitter
 from lifelines.utils import _preprocess_inputs, _additive_estimate, epanechnikov_kernel,\
-    inv_normal_cdf, check_nans
+    inv_normal_cdf, check_nans_or_infs
 
 
 class NelsonAalenFitter(UnivariateFitter):
@@ -65,9 +65,9 @@ def fit(self, durations, event_observed=None, timeline=None, entry=None,
 
         """
 
-        check_nans(durations)
+        check_nans_or_infs(durations)
         if event_observed is not None:
-            check_nans(event_observed)
+            check_nans_or_infs(event_observed)
 
         if weights is not None:
           if (weights.astype(int) != weights).any():
diff --git a/lifelines/fitters/weibull_fitter.py b/lifelines/fitters/weibull_fitter.py
index 7060d9d7f..a24b32549 100644
--- a/lifelines/fitters/weibull_fitter.py
+++ b/lifelines/fitters/weibull_fitter.py
@@ -8,7 +8,7 @@
 from scipy import stats as stats
 from numpy.linalg import solve, norm, inv
 from lifelines.fitters import UnivariateFitter
-from lifelines.utils import inv_normal_cdf, check_nans, ConvergenceError, string_justify, significance_code,\
+from lifelines.utils import inv_normal_cdf, check_nans_or_infs, ConvergenceError, string_justify, significance_code,\
                             ConvergenceWarning
 
 
@@ -89,9 +89,9 @@ def fit(self, durations, event_observed=None, timeline=None, entry=None,
 
         """
 
-        check_nans(durations)
+        check_nans_or_infs(durations)
         if event_observed is not None:
-            check_nans(event_observed)
+            check_nans_or_infs(event_observed)
 
         self.durations = np.asarray(durations, dtype=float)
         # check for negative or 0 durations - these are not allowed in a weibull model.
diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py
index 896387d3a..5280381c8 100644
--- a/lifelines/utils/__init__.py
+++ b/lifelines/utils/__init__.py
@@ -1105,7 +1105,7 @@ def check_complete_separation(df, events, durations):
     check_complete_separation_close_to_perfect_correlation(df, durations)
 
 
-def check_nans(df_or_array):
+def check_nans_or_infs(df_or_array):
     nulls = pd.isnull(df_or_array)
     if hasattr(nulls, 'values'):
         if nulls.values.any():
@@ -1113,6 +1113,15 @@ def check_nans(df_or_array):
     else:
         if nulls.any():
             raise TypeError("NaNs were detected in the dataset. Try using pd.isnull to find the problematic values.")
+    # isinf check is done after isnull check since np.isinf doesn't work on None values
+    infs = np.isinf(df_or_array)
+    if hasattr(infs, 'values'):
+        if infs.values.any():
+            raise TypeError("Infs were detected in the dataset. Try using np.isinf to find the problematic values.")
+    else:
+        if infs.any():
+            raise TypeError("Infs were detected in the dataset. Try using np.isinf to find the problematic values.")
+
 
 def to_long_format(df, duration_col):
     """

From 149197da83a02b7797bc1425a4d50ff0b56b76a1 Mon Sep 17 00:00:00 2001
From: hcarlens <harald.carlens@itg.com>
Date: Wed, 14 Nov 2018 14:50:39 +0000
Subject: [PATCH 46/59] Inf check: deal with pandas data types separately from
 arrays

---
 lifelines/utils/__init__.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py
index 5280381c8..12aface59 100644
--- a/lifelines/utils/__init__.py
+++ b/lifelines/utils/__init__.py
@@ -1114,7 +1114,11 @@ def check_nans_or_infs(df_or_array):
         if nulls.any():
             raise TypeError("NaNs were detected in the dataset. Try using pd.isnull to find the problematic values.")
     # isinf check is done after isnull check since np.isinf doesn't work on None values
-    infs = np.isinf(df_or_array)
+    # pandas input: match +inf and -inf by value (np.isinf is not supported on object-dtype data).
+    if isinstance(df_or_array, (pd.Series, pd.DataFrame)):
+        infs = df_or_array.isin([np.inf, -np.inf])
+    else:
+        infs = np.isinf(df_or_array)
     if hasattr(infs, 'values'):
         if infs.values.any():
             raise TypeError("Infs were detected in the dataset. Try using np.isinf to find the problematic values.")

From 8ba97a8f080c4e1dc88874055023ef7b43656528 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Fri, 16 Nov 2018 13:23:59 -0500
Subject: [PATCH 47/59] a bit sad, but can only pickle in py3

---
 CHANGELOG.md                  |  2 +-
 lifelines/fitters/__init__.py | 19 +++++++++++++------
 tests/test_estimation.py      |  2 ++
 3 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 956fa8931..9a07ff839 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,7 +17,7 @@
  - The default `WeibullFitter().timeline` has changed from integers between the min and max duration to _n_ floats between the max and min durations, where _n_ is the number of observations. 
  - Performance improvements for `CoxPHFitter` (~20% faster)
  - Performance improvements for `CoxTimeVaryingFitter` (~100% faster)
- - Univariate models are now serialisable with `pickle`. Thanks @dwilson1988 for the contribution. 
+ - In Python3, Univariate models are now serialisable with `pickle`. Thanks @dwilson1988 for the contribution. For Python2, `dill` is still the preferred method.
  - `baseline_cumulative_hazard_` (and derivatives of that) on `CoxPHFitter` now correctly incorporate the `weights_col`. 
  - Fixed a bug in `KaplanMeierFitter` when late entry times lined up with death events. Thanks @pzivich
 
diff --git a/lifelines/fitters/__init__.py b/lifelines/fitters/__init__.py
index 7e9021066..f23859bf9 100644
--- a/lifelines/fitters/__init__.py
+++ b/lifelines/fitters/__init__.py
@@ -2,13 +2,14 @@
 from __future__ import print_function
 import collections
 from functools import wraps
+import sys
 
 import numpy as np
 import pandas as pd
 
 from lifelines.plotting import plot_estimate
 from lifelines.utils import qth_survival_times, _to_array
-
+from lifelines.compat import PY2, PY3
 
 def must_call_fit_first(func):
     @wraps(func)
@@ -43,10 +44,16 @@ class UnivariateFitter(BaseFitter):
     @must_call_fit_first
     def _update_docstrings(self):
         # Update their docstrings
-        self.__class__.subtract.__doc__ = self.subtract.__doc__.format(self._estimate_name, self.__class__.__name__)
-        self.__class__.divide.__doc__ = self.divide.__doc__.format(self._estimate_name, self.__class__.__name__)
-        self.__class__.predict.__doc__ = self.predict.__doc__.format(self.__class__.__name__)
-        self.__class__.plot.__doc__ = plot_estimate.__doc__.format(self.__class__.__name__, self._estimate_name)
+        if PY2:
+            self.__class__.subtract.__func__.__doc__ = self.subtract.__doc__.format(self._estimate_name, self.__class__.__name__)
+            self.__class__.divide.__func__.__doc__ = self.divide.__doc__.format(self._estimate_name, self.__class__.__name__)
+            self.__class__.predict.__func__.__doc__ = self.predict.__doc__.format(self.__class__.__name__)
+            self.__class__.plot.__func__.__doc__ = plot_estimate.__doc__.format(self.__class__.__name__, self._estimate_name)
+        elif PY3:
+            self.__class__.subtract.__doc__ = self.subtract.__doc__.format(self._estimate_name, self.__class__.__name__)
+            self.__class__.divide.__doc__ = self.divide.__doc__.format(self._estimate_name, self.__class__.__name__)
+            self.__class__.predict.__doc__ = self.predict.__doc__.format(self.__class__.__name__)
+            self.__class__.plot.__doc__ = plot_estimate.__doc__.format(self.__class__.__name__, self._estimate_name)
 
     @must_call_fit_first
     def plot(self, *args, **kwargs):
@@ -66,7 +73,7 @@ def subtract(self, other):
         new_index = np.unique(new_index)
         return pd.DataFrame(
             self_estimate.reindex(new_index, method='ffill').values -
-            other_estimate.reindex(new_index, method='ffill').values,
+              other_estimate.reindex(new_index, method='ffill').values,
             index=new_index,
             columns=['diff']
         )
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index 098afac26..ff81104ab 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -20,6 +20,7 @@
 import numpy.testing as npt
 from numpy.linalg.linalg import LinAlgError
 
+from lifelines.compat import PY2, PY3
 from lifelines.utils import k_fold_cross_validation, StatError, concordance_index, ConvergenceWarning, to_long_format
 from lifelines.estimation import CoxPHFitter, AalenAdditiveFitter, KaplanMeierFitter, \
     NelsonAalenFitter, BreslowFlemingHarringtonFitter, ExponentialFitter, \
@@ -300,6 +301,7 @@ def test_typeerror_is_thrown_if_there_is_nans_in_the_event_col(self, univariate_
             with pytest.raises(TypeError):
                 fitter().fit(T, E)
 
+    @pytest.mark.skipif(PY2, reason="requires python3 or higher")
     def test_pickle_serialization(self, positive_sample_lifetimes, univariate_fitters):
          T = positive_sample_lifetimes[0]
          for f in univariate_fitters:

From c5492a183aa396803cb122c0354e7f367c059e70 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Fri, 16 Nov 2018 13:24:20 -0500
Subject: [PATCH 48/59] new compat file

---
 lifelines/compat.py | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 lifelines/compat.py

diff --git a/lifelines/compat.py b/lifelines/compat.py
new file mode 100644
index 000000000..af8b13ae3
--- /dev/null
+++ b/lifelines/compat.py
@@ -0,0 +1,4 @@
+import sys
+
+PY2 = sys.version_info[0] == 2
+PY3 = sys.version_info[0] >= 3

From 10b59949856b948a082af8438e5f164be0cbcd82 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Fri, 16 Nov 2018 13:46:29 -0500
Subject: [PATCH 49/59] fixing issue #535

---
 lifelines/plotting.py       |  2 +-
 lifelines/utils/__init__.py |  8 +++++++-
 tests/utils/test_utils.py   | 10 ++++++++++
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/lifelines/plotting.py b/lifelines/plotting.py
index b2218ff0a..1417b18ea 100644
--- a/lifelines/plotting.py
+++ b/lifelines/plotting.py
@@ -2,7 +2,7 @@
 from __future__ import print_function
 
 import numpy as np
-from .utils import coalesce
+from lifelines.utils import coalesce
 
 
 def is_latex_enabled():
diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py
index 12aface59..5e27e7433 100644
--- a/lifelines/utils/__init__.py
+++ b/lifelines/utils/__init__.py
@@ -91,8 +91,14 @@ def qth_survival_times(q, survival_functions, cdf=False):
 
 def qth_survival_time(q, survival_function, cdf=False):
     """
-    Expects a Pandas series, returns the time when the qth probability is reached.
+    Expects a Pandas series or single-column dataframe, returns the time when the qth probability is reached.
     """
+    if isinstance(survival_function, pd.DataFrame):
+        if survival_function.shape[1] > 1:
+            raise ValueError("Expecting a dataframe (or series) with a single column. Provide that or use utils.qth_survival_times.")
+
+        survival_function = survival_function.T.squeeze()
+
     if cdf:
         if survival_function.iloc[0] > q:
             return np.inf
diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py
index 27b06ef10..2adf9d3cc 100644
--- a/tests/utils/test_utils.py
+++ b/tests/utils/test_utils.py
@@ -148,6 +148,16 @@ def test_qth_survival_time_returns_inf():
     sf = pd.Series([1., 0.7, 0.6])
     assert utils.qth_survival_time(0.5, sf) == np.inf
 
+def test_qth_survival_time_with_dataframe():
+    sf_df_no_index = pd.DataFrame([1.0, 0.75, 0.5, 0.25, 0.0])
+    sf_df_index = pd.DataFrame([1.0, 0.75, 0.5, 0.25, 0.0], index=[10, 20, 30, 40, 50])
+    sf_df_too_many_columns = pd.DataFrame([[1,2], [3,4]])
+
+    assert utils.qth_survival_time(0.5, sf_df_no_index) == 2
+    assert utils.qth_survival_time(0.5, sf_df_index) == 30
+
+    with pytest.raises(ValueError):
+        utils.qth_survival_time(0.5, sf_df_too_many_columns)
 
 def test_qth_survival_times_with_multivariate_q():
     sf = np.linspace(1, 0, 50)

From f3ccf2a594d595a898c0b6f604be67e2d5fe05bd Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Sat, 17 Nov 2018 22:05:02 -0500
Subject: [PATCH 50/59] adding cluster arg to coxph

---
 lifelines/fitters/coxph_fitter.py | 45 ++++++++++++++++++++++++------
 tests/test_estimation.py          | 46 +++++++++++++++++++++++++------
 2 files changed, 74 insertions(+), 17 deletions(-)

diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index 0a423c657..5dc945b09 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -61,7 +61,7 @@ def __init__(self, alpha=0.95, tie_method='Efron', penalizer=0.0, strata=None):
     def fit(self, df, duration_col, event_col=None,
             show_progress=False, initial_beta=None,
             strata=None, step_size=None, weights_col=None,
-            robust=False):
+            cluster_col=None, robust=False):
         """
         Fit the Cox Propertional Hazard model to a dataset. Tied survival times
         are handled using Efron's tie-method.
@@ -93,7 +93,8 @@ def fit(self, df, duration_col, event_col=None,
           robust: Compute the robust errors using the Huber sandwich estimator, aka Wei-Lin estimate. This does not handle
             ties, so if there are high number of ties, results may significantly differ. See
             "The Robust Inference for the Cox Proportional Hazards Model", Journal of the American Statistical Association, Vol. 84, No. 408 (Dec., 1989), pp. 1074- 1078
-
+          cluster_col: specifies what column has ids for clustering covariances. Using this forces the sandwich estimator (robust variance estimator) to
+            be used.
         Returns:
             self, with additional properties: hazards_, confidence_intervals_, baseline_survival_, etc.
 
@@ -108,6 +109,8 @@ def fit(self, df, duration_col, event_col=None,
         self.duration_col = duration_col
         self.event_col = event_col
         self.robust = robust
+        self.cluster_col = cluster_col
+        self.weights_col = weights_col
         self._n_examples = df.shape[0]
         self.strata = coalesce(strata, self.strata)
         if self.strata is not None:
@@ -136,6 +139,9 @@ def fit(self, df, duration_col, event_col=None,
         else:
             weights = pd.Series(np.ones((self._n_examples,)), index=df.index)
 
+        if self.cluster_col:
+            self._clusters = df.pop(self.cluster_col)
+
         self._check_values(df, T, E)
         df = df.astype(float)
 
@@ -453,21 +459,34 @@ def _compute_confidence_intervals(self):
 
     def _compute_sandwich_estimator(self, X, T, E, weights):
 
-        if self.strata is None:
-            score_residuals = self._compute_residuals_within_strata(X.values, T.values, E.values, weights.values)
+        _, d = X.shape
 
-        else:
-            score_residuals = np.empty((0,1))
+        if self.strata is not None:
+            score_residuals = np.empty((0, d))
             for strata in np.unique(X.index):
+                # TODO: use pandas .groupby
                 stratified_X, stratified_T, stratified_E, stratified_W = X.loc[[strata]], T.loc[[strata]], E.loc[[strata]], weights.loc[[strata]]
 
                 score_residuals = np.append(score_residuals,
-                                            self._compute_residuals_within_strata(stratified_X.values, stratified_T.values, stratified_E.values, stratified_W.values),
+                                            self._compute_residuals_within_strata(stratified_X.values, stratified_T.values, stratified_E.values, stratified_W.values) * stratified_W[:, None],
                                             axis=0)
 
+        else:
+            score_residuals = self._compute_residuals_within_strata(X.values, T.values, E.values, weights.values) * weights[:, None]
+
+        if self.cluster_col:
+            score_residuals_ = np.empty((0, d))
+            for cluster in np.unique(self._clusters):
+                ix = self._clusters == cluster
+                weights_ = weights.values[ix]
+
+                score_residuals_ = np.append(score_residuals_,
+                                            (score_residuals[ix, :] * weights_).sum(0).reshape(1, d),
+                                            axis=0)
+            score_residuals = score_residuals_
 
         naive_var = inv(self._hessian_)
-        delta_betas = score_residuals.dot(naive_var) * weights[:, None]
+        delta_betas = score_residuals.dot(naive_var)
         sandwich_estimator = delta_betas.T.dot(delta_betas) / np.outer(self._norm_std, self._norm_std)
         return sandwich_estimator
 
@@ -515,7 +534,7 @@ def _compute_residuals_within_strata(self, X, T, E, weights):
 
 
     def _compute_standard_errors(self, df, T, E, weights):
-        if self.robust:
+        if self.robust or self.cluster_col:
             se = np.sqrt(self._compute_sandwich_estimator(df, T, E, weights).diagonal()) # / self._norm_std
         else:
             se = np.sqrt(self.variance_matrix_.diagonal())
@@ -561,6 +580,14 @@ def print_summary(self):
         print(self)
         print("{} = {}".format(justify('duration col'), self.duration_col))
         print("{} = {}".format(justify('event col'), self.event_col))
+        if self.weights_col:
+            print("{} = {}".format(justify('weights col'), self.weights_col))
+
+        if self.cluster_col:
+            print("{} = {}".format(justify('cluster col'), self.cluster_col))
+
+        if self.robust or self.cluster_col:
+            print("{} = {}".format(justify('robust variance'), True))
 
         if self.strata:
             print('{} = {}'.format(justify('strata'), self.strata))
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index ff81104ab..2b5b5894d 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -1174,6 +1174,34 @@ def test_robust_errors_with_trivial_weights_is_the_same_than_R(self, regression_
         expected = pd.Series({'var1': 2.097, 'var2': 0.827})
         assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False)
 
+    def test_cluster_option(self, regression_dataset):
+        """
+        library(survival)
+        df <- data.frame(
+          "var1" = c(1, 1, 2, 2, 2),
+          "var2" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092),
+          "id" = c(1, 1, 2, 3, 4),
+          "T" = c( 7.335846, 5.269797, 11.684092, 12.678458, 6.601666)
+        )
+        df['E'] = 1
+
+        c = coxph(formula=Surv(T, E) ~ var1 + var2 + cluster(id), data=df)
+        """
+
+        df = pd.DataFrame({
+            "var1": [1, 1, 2, 2, 2],
+            "var2": [0.184677, 0.071893, 1.364646, 0.098375, 1.663092],
+            "T":    [7.335846, 5.269797, 11.684092, 12.678458, 6.601666],
+            "id":   [1, 1, 2, 3, 4],
+        })
+        df['E'] = 1
+
+        cph = CoxPHFitter()
+        cph.fit(df, 'T', 'E', cluster_col='id', show_progress=True)
+        expected = pd.Series({'var1': 5.9752, 'var2': 4.0683})
+        assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False)
+        cph.print_summary()
+
 
     def test_robust_errors_with_less_trival_weights_is_the_same_as_R(self, regression_dataset):
         """
@@ -1818,25 +1846,27 @@ def test_robust_errors_against_R_no_ties(self, regression_dataset):
     def test_robust_errors_with_strata_against_R(self, rossi):
         """
         df <- data.frame(
-            "var1" = c(1, 1, 2, 2, 2),
-            "var2" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092),
-            "T" = c( 7.335846, 5.269797, 11.684092, 12.678458, 6.601666)
+          "var1" = c(1, 1, 2, 2, 2, 1),
+          "var2" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092, 0.5),
+          "var3" = c(1, 2, 3, 2, 1, 2),
+          "T" = c( 7.335846, 5.269797, 11.684092, 12.678458, 6.601666, 8.)
         )
         df['E'] = 1
 
-        coxph(formula=Surv(T, E) ~ strata(var1) + var2, data=df, robust=TRUE)
+        coxph(formula=Surv(T, E) ~ strata(var1) + var2 + var3, data=df, robust=TRUE)
         """
 
         df = pd.DataFrame({
-            "var1": [1, 1, 2, 2, 2],
-            "var2": [0.184677, 0.071893, 1.364646, 0.098375, 1.663092],
-            "T": [7.335846, 5.269797, 11.684092, 12.678458, 6.601666]
+            "var1": [1, 1, 2, 2, 2, 1],
+            "var2": [0.184677, 0.071893, 1.364646, 0.098375, 1.663092, 0.5],
+            "var3": [1, 2, 3, 2, 1, 2],
+            "T": [7.335846, 5.269797, 11.684092, 12.678458, 6.601666, 8.0]
         })
         df['E'] = 1
 
         cf = CoxPHFitter()
         cf.fit(df, duration_col='T', event_col='E', strata=['var1'], robust=True)
-        npt.assert_allclose(cf.summary['se(coef)'].values, 2.78649, rtol=1e-2)
+        npt.assert_allclose(cf.summary['se(coef)'].values, np.array([1.076, 0.680]), rtol=1e-2)
 
 
     @pytest.mark.xfail

From 12d9fe052c4eb0c97015e5742477121a28b6851e Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Sat, 17 Nov 2018 22:10:59 -0500
Subject: [PATCH 51/59] adding another test

---
 CHANGELOG.md                      |  1 +
 lifelines/fitters/coxph_fitter.py |  6 +++++-
 tests/test_estimation.py          | 29 ++++++++++++++++++++++++++++-
 3 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9a07ff839..ce38faeab 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,7 @@
  - In Python3, Univariate models are now serialisable with `pickle`. Thanks @dwilson1988 for the contribution. For Python2, `dill` is still the preferred method.
  - `baseline_cumulative_hazard_` (and derivatives of that) on `CoxPHFitter` now correctly incorporate the `weights_col`. 
  - Fixed a bug in `KaplanMeierFitter` when late entry times lined up with death events. Thanks @pzivich
+ - adding `cluster_col` argument to `CoxPHFitter` so users can specify groups of subjects/rows that may be correlated. 
 
 #### 0.14.6
  - fix for n > 2 groups in `multivariate_logrank_test` (again).
diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index 5dc945b09..fef92bd89 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -461,6 +461,9 @@ def _compute_sandwich_estimator(self, X, T, E, weights):
 
         _, d = X.shape
 
+        if self.strata is not None and self.cluster_col is not None:
+            raise NotImplementedError("Providing clusters and strata is not implemented yet")
+
         if self.strata is not None:
             score_residuals = np.empty((0, d))
             for strata in np.unique(X.index):
@@ -475,13 +478,14 @@ def _compute_sandwich_estimator(self, X, T, E, weights):
             score_residuals = self._compute_residuals_within_strata(X.values, T.values, E.values, weights.values) * weights[:, None]
 
         if self.cluster_col:
+
             score_residuals_ = np.empty((0, d))
             for cluster in np.unique(self._clusters):
                 ix = self._clusters == cluster
                 weights_ = weights.values[ix]
 
                 score_residuals_ = np.append(score_residuals_,
-                                            (score_residuals[ix, :] * weights_).sum(0).reshape(1, d),
+                                            (score_residuals[ix, :] * weights_[:, None]).sum(0).reshape(1, d),
                                             axis=0)
             score_residuals = score_residuals_
 
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index 2b5b5894d..f3a8a9636 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -1200,7 +1200,34 @@ def test_cluster_option(self, regression_dataset):
         cph.fit(df, 'T', 'E', cluster_col='id', show_progress=True)
         expected = pd.Series({'var1': 5.9752, 'var2': 4.0683})
         assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False)
-        cph.print_summary()
+
+    @pytest.mark.xfail(reason="can't do this yet")
+    def test_cluster_option_with_strata(self, regression_dataset):
+        """
+        library(survival)
+        df <- data.frame(
+          "var1" = c(1, 1, 2, 2, 2),
+          "var2" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092),
+          "id" = c(1, 1, 2, 3, 4),
+          "T" = c( 7.335846, 5.269797, 11.684092, 12.678458, 6.601666)
+        )
+        df['E'] = 1
+
+        c = coxph(formula=Surv(T, E) ~ strata(var1) + var2 + cluster(id), data=df)
+        """
+
+        df = pd.DataFrame({
+            "var1": [1, 1, 2, 2, 2],
+            "var2": [0.184677, 0.071893, 1.364646, 0.098375, 1.663092],
+            "T":    [7.335846, 5.269797, 11.684092, 12.678458, 6.601666],
+            "id":   [1, 1, 2, 3, 4],
+        })
+        df['E'] = 1
+
+        cph = CoxPHFitter()
+        cph.fit(df, 'T', 'E', cluster_col='id', strata=['var1'], show_progress=True)
+        expected = pd.Series({'var2': 3.34})
+        assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False)
 
 
     def test_robust_errors_with_less_trival_weights_is_the_same_as_R(self, regression_dataset):

From efc64a0105ad34669de76daf08a4cade151bcc64 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Sat, 17 Nov 2018 23:54:21 -0500
Subject: [PATCH 52/59] fix test

---
 tests/test_estimation.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index f3a8a9636..ca98e69c2 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -2024,6 +2024,7 @@ def test_aalen_additive_median_predictions_split_data(self):
         hz, coef, X = generate_hazard_rates(n, d, timeline)
         T = generate_random_lifetimes(hz, timeline)
         X['T'] = T
+        X = X.replace([np.inf, -np.inf], 10.0)
         # fit it to Aalen's model
         aaf = AalenAdditiveFitter()
         aaf.fit(X, 'T')

From d2beb9a1c84c944150b15d548826bd64acd5f2bb Mon Sep 17 00:00:00 2001
From: Paul Zivich <32672909+pzivich@users.noreply.github.com>
Date: Sun, 18 Nov 2018 13:06:41 -0500
Subject: [PATCH 53/59] Create aalen_johansen_fitter.py (#450)

* Create aalen_johansen_fitter.py

Adding an Aalen-Johansen fitter as I mentioned in #413. Still needs some cleaning up. Items still needed: standard error estimator, tests, check to see how well jitter() works, ensure documentation and formatting match the rest of lifelines, write up an example.

How it works is as follows: it estimates an overall survival curve, calculates discrete-time hazards for the event of interest (event_ind), and calculates the cumulative density function. The survival function can be used to generate the discrete-time hazard: take the minus-log transform of S(t) and S(t-), where t- is the event time right before t, then subtract the two quantities. To estimate F(t,j), multiply S(t-) by the discrete-time hazard and an indicator for j.

Potential addition: warn users not to calculate survival times from this (only generates the cumulative density function / risk) since the interpretation of those survival times is not straightforward.

Some discussion and examples:
https://www.duo.uio.no/bitstream/handle/10852/10287/stat-res-03-97.pdf?sequence=1
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5557056/
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4325676/

* Update aalen_johansen_fitter.py

Added UnivariateFitter to initial call
Fixed internal function _jitter()
Changed instances of len() with shape[]
Removed inplace
Updated names to be more clear
Added reference to _bounds()

* Update aalen_johansen_fitter.py

Added the following:
Recursive function within _jitter() in case a random tie still exists after jittering
Confidence interval and variance for F(t). Note that the way they are implemented may not be the most efficient
Added items to self, to be consistent with lifelines KaplanMeierFitter
Updated documentation to be consistent with the remainder of lifelines

To do:
Figure out why confidence intervals don't match exactly with SAS (they agree to the 3rd or 4th decimal place)
Revise plot functionality so it can plot this
Write up tests to include in test.py
Write up example motivating use and for website

* Added plot and minor tweaks

Verified the plot functionality with AalenJohansenFitter. Added to the documentation. Cleaned up the code a bit.

For the future, might consider adding the Aalen confidence interval method. Currently uses the Delta method. Not a big difference between the two. Both are in the SAS document I linked in the _bounds docs

* Fix the plot functionality for v0.15.0

* Added AalenJohansenFitter

* Update estimation.py

* Update aalen_johansen_fitter.py

Seed link fixed. ``event_observed`` is now ``event_of_interest``. Fixed death typo. Fixed spacing

* Update aalen_johansen_fitter.py

Fixed issue with list inputs (now converts to pd.Series() since attributes like .duplicated() and .loc are used in fit() and _jitter(), respectively). Few tweaks to have the data types better correspond with Kaplan Meier.

* Update __init__.py

* Added Aalen-Johansen tests

The Aalen-Johansen tests cover the following: times are jittered properly; jittered times are different; the event table equals the expectation; Aalen-Johansen is always less than or equal to Kaplan-Meier in settings with competing risks; when there are no competing risks, Aalen-Johansen is equal to Kaplan-Meier; the variance is compared to SAS v9.4 using the delta method; the confidence intervals are compared against SAS v9.4 using the transformed delta method for the standard error.

* Update test_estimation.py
---
 lifelines/__init__.py                      |   4 +-
 lifelines/estimation.py                    |   1 +
 lifelines/fitters/aalen_johansen_fitter.py | 175 +++++++++++++++++++++
 tests/test_estimation.py                   |  84 +++++++++-
 4 files changed, 261 insertions(+), 3 deletions(-)
 create mode 100644 lifelines/fitters/aalen_johansen_fitter.py

diff --git a/lifelines/__init__.py b/lifelines/__init__.py
index 910da8868..206e72312 100644
--- a/lifelines/__init__.py
+++ b/lifelines/__init__.py
@@ -1,10 +1,10 @@
 # -*- coding: utf-8 -*-
 from .estimation import KaplanMeierFitter, NelsonAalenFitter, \
     AalenAdditiveFitter, BreslowFlemingHarringtonFitter, CoxPHFitter, \
-    WeibullFitter, ExponentialFitter, CoxTimeVaryingFitter
+    WeibullFitter, ExponentialFitter, CoxTimeVaryingFitter, AalenJohansenFitter
 
 from .version import __version__
 
 __all__ = ['KaplanMeierFitter', 'NelsonAalenFitter', 'AalenAdditiveFitter',
            'BreslowFlemingHarringtonFitter', 'CoxPHFitter', 'WeibullFitter',
-           'ExponentialFitter', 'CoxTimeVaryingFitter']
+           'ExponentialFitter', 'CoxTimeVaryingFitter', 'AalenJohansenFitter']
diff --git a/lifelines/estimation.py b/lifelines/estimation.py
index 226309ca0..553331cb1 100644
--- a/lifelines/estimation.py
+++ b/lifelines/estimation.py
@@ -8,3 +8,4 @@
 from lifelines.fitters.coxph_fitter import CoxPHFitter
 from lifelines.fitters.cox_time_varying_fitter import CoxTimeVaryingFitter
 from lifelines.fitters.aalen_additive_fitter import AalenAdditiveFitter
+from lifelines.fitters.aalen_johansen_fitter import AalenJohansenFitter
diff --git a/lifelines/fitters/aalen_johansen_fitter.py b/lifelines/fitters/aalen_johansen_fitter.py
new file mode 100644
index 000000000..6f6075f0d
--- /dev/null
+++ b/lifelines/fitters/aalen_johansen_fitter.py
@@ -0,0 +1,175 @@
+from __future__ import print_function
+from __future__ import division
+import numpy as np
+import pandas as pd
+import warnings
+
+from lifelines.fitters import UnivariateFitter
+from lifelines.utils import _preprocess_inputs, inv_normal_cdf
+from lifelines.fitters.kaplan_meier_fitter import KaplanMeierFitter
+
+class AalenJohansenFitter(UnivariateFitter):
+    """Class for fitting the Aalen-Johansen estimate for the cumulative incidence function in a competing risks framework.
+    Treating competing risks as censoring can result in over-estimated cumulative density functions. Using the Kaplan
+    Meier estimator with competing risks as censored is akin to estimating the cumulative density if all competing risks
+    had been prevented. If you are interested in learning more, I (Paul Zivich) recommend the following open-access
+    paper; Edwards JK, Hester LL, Gokhale M, Lesko CR. Methodologic Issues When Estimating Risks in
+    Pharmacoepidemiology. Curr Epidemiol Rep. 2016;3(4):285-296.
+    
+    AalenJohansenFitter(alpha=0.95, jitter_level=0.00001, seed=None)
+    
+    Aalen-Johansen cannot deal with tied times. We can get around this by randomy jittering the event times 
+    slightly. This will be done automatically and generates a warning.
+    """
+    def __init__(self, jitter_level=0.0001, seed=None, alpha=0.95):
+        UnivariateFitter.__init__(self, alpha=alpha)
+        self._jitter_level = jitter_level
+        self._seed = seed  # Seed is for the jittering process
+
+    def fit(self, durations, event_observed, event_of_interest, timeline=None, entry=None, label='AJ_estimate', 
+            alpha=None, ci_labels=None, weights=None):
+        """
+        Parameters:
+          durations: an array or pd.Series of length n -- duration of subject was observed for 
+          event_observed: an array, or pd.Series, of length n. Integer indicator of distinct events. Must be 
+             only positive integers, where 0 indicates censoring.
+          event_of_interest: integer -- indicator for event of interest. All other integers are considered competing events
+             Ex) event_observed contains 0, 1, 2 where 0:censored, 1:lung cancer, and 2:death. If event_of_interest=1, then death (2)
+             is considered a competing event. The returned cumulative incidence function corresponds to risk of lung cancer
+          timeline: return the best estimate at the values in timelines (postively increasing)
+          entry: an array, or pd.Series, of length n -- relative time when a subject entered the study. This is
+             useful for left-truncated (not left-censored) observations. If None, all members of the population
+             were born at time 0.
+          label: a string to name the column of the estimate.
+          alpha: the alpha value in the confidence intervals. Overrides the initializing
+             alpha for this call to fit only.
+          ci_labels: add custom column names to the generated confidence intervals
+                as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>
+          weights: n array, or pd.Series, of length n, if providing a weighted dataset. For example, instead
+              of providing every subject as a single element of `durations` and `event_observed`, one could
+              weigh subject differently.
+
+        Returns:
+          self, with new properties like 'cumulative_incidence_'.
+        """
+        # Checking for tied event times
+        if np.sum(pd.Series(durations).duplicated()) > 0:
+            # Seeing if there is a large amount of ties in the data (>20%)
+            if np.sum(pd.Series(durations).duplicated()) / len(durations) > 0.2:
+                warnings.warn('''It looks like there are many tied events in your data set. The Aalen-Johansen 
+                              estimator should only be used when there are no/few tied events''', Warning)
+                # I am unaware of a recommended cut-off, but 20% would be suggestive of issues
+            # Raise warning if duplicated times, then randomly jitter times
+            warnings.warn('''Tied event times were detected. The Aalen-Johansen estimator cannot handle tied event times. 
+                To resolve ties, data is randomly jittered.''', Warning)  
+            durations = self._jitter(durations=pd.Series(durations), event=pd.Series(event_observed), 
+                                     jitter_level=self._jitter_level, seed=self._seed)
+        
+        # Creating label for event of interest & indicator for that event
+        cmprisk_label = 'CIF_' + str(int(event_of_interest))
+        self.label_cmprisk = 'observed_' + str(int(event_of_interest))
+        
+        # Fitting Kaplan-Meier for either event of interest OR competing risk
+        km = KaplanMeierFitter()
+        km.fit(durations, event_observed=event_observed, timeline=timeline, entry=entry, weights=weights)
+        aj = km.event_table
+        aj['overall_survival'] = km.survival_function_
+        aj['lagged_overall_survival'] = aj['overall_survival'].shift()
+        
+        # Setting up table for calculations and to return to user
+        event_spec = np.where(pd.Series(event_observed) == event_of_interest, 1, 0)
+        event_spec_proc = _preprocess_inputs(durations=durations, event_observed=event_spec, timeline=timeline,
+                                             entry=entry, weights=weights)
+        event_spec_times = event_spec_proc[-1]['observed']
+        event_spec_times = event_spec_times.rename(self.label_cmprisk)
+        aj = pd.concat([aj, event_spec_times], axis=1).reset_index()
+
+        # Estimator of Cumulative Incidence (Density) Function
+        aj[cmprisk_label] = ((aj[self.label_cmprisk]) / (aj['at_risk']) * aj['lagged_overall_survival']).cumsum()
+        aj.loc[0, cmprisk_label] = 0  # Setting initial CIF to be zero
+        aj = aj.set_index('event_at')
+        
+        # Setting attributes
+        self._estimation_method = "cumulative_density_"
+        self._estimate_name = "cumulative_density_"
+        self._predict_label = label
+        self._update_docstrings()
+
+        alpha = alpha if alpha else self.alpha
+        self._label = label
+        self.cumulative_density_ = pd.DataFrame(aj[cmprisk_label])  
+        # Technically, cumulative incidence, but consistent with KaplanMeierFitter
+        self.event_table = aj[['removed', 'observed', self.label_cmprisk, 'censored', 'entrance', 'at_risk']]  # Event table
+        self.variance, self.confidence_interval_ = self._bounds(aj['lagged_overall_survival'],
+                                                                alpha=alpha, ci_labels=ci_labels)
+        return self
+    
+    def _jitter(self, durations, event, jitter_level, seed=None):
+        """Determine extent to jitter tied event times. Automatically called by fit if tied event times are detected
+        """
+        if jitter_level <= 0:
+            raise ValueError('The jitter level is less than zero, please select a jitter value greater than 0')
+        if seed is not None:
+            np.random.seed(seed)
+        
+        event_time = durations.loc[event != 0].copy()
+        # Determining whether to randomly shift event times up or down
+        mark = np.random.choice([-1, 1], size=event_time.shape[0])
+        # Determining extent to jitter event times up or down
+        shift = np.random.uniform(size=event_time.shape[0])*jitter_level
+        # Jittering times
+        event_time += mark*shift
+        durations_jitter = event_time.align(durations)[0].fillna(durations)
+        
+        # Recursive call if event times are still tied after jitter
+        if np.sum(event_time.duplicated()) > 0: 
+            return self._jitter(durations=durations_jitter, event=event, jitter_level=jitter_level, seed=seed)
+        else:
+            return durations_jitter
+    
+    def _bounds(self, lagged_survival, alpha, ci_labels):
+        """Bounds are based on pg411 of "Modelling Survival Data in Medical Research" David Collett 3rd Edition, which
+        is derived from Greenwood's variance estimator. Confidence intervals are obtained using the delta method
+        transformation of SE(log(-log(F_j))). This ensures that the confidence intervals all lie between 0 and 1.
+        
+        Formula for the variance follows:
+        Var(F_j) = sum((F_j(t) - F_j(t_i))**2 * d/(n*(n-d) + S(t_i-1)**2 * ((d*(n-d))/n**3) +
+                    -2 * sum((F_j(t) - F_j(t_i)) * S(t_i-1) * (d/n**2)
+
+        Delta method transformation:
+        SE(log(-log(F_j) = SE(F_j) / (F_j * absolute(log(F_j)))
+        
+        More information can be found at: https://support.sas.com/documentation/onlinedoc/stat/141/lifetest.pdf
+        There is also an alternative method (Aalen) but this is not currently implemented
+        """
+        # Preparing environment
+        df = self.event_table.copy()
+        df['Ft'] = self.cumulative_density_
+        df['lagS'] = lagged_survival.fillna(1)
+        if ci_labels is None:
+            ci_labels = ["%s_upper_%.2f" % (self._predict_label, alpha), "%s_lower_%.2f" % (self._predict_label, alpha)]
+        assert len(ci_labels) == 2, "ci_labels should be a length 2 array."
+
+        # Have to loop through each time independently. Don't think there is a faster way
+        all_vars = []
+        for i, r in df.iterrows():
+            sf = df.loc[df.index <= r.name].copy()
+            F_t = float(r['Ft'])
+            sf['part1'] = ((F_t - sf['Ft'])**2) * (sf['observed'] / (sf['at_risk']*(sf['at_risk'] - sf['observed'])))
+            sf['part2'] = ((sf['lagS'])**2) * sf[self.label_cmprisk] * ((sf['at_risk']-
+                                                                         sf[self.label_cmprisk]))/(sf['at_risk']**3)
+            sf['part3'] = (F_t - sf['Ft']) * sf['lagS'] * (sf[self.label_cmprisk] / (sf['at_risk']**2))
+            variance = (np.sum(sf['part1'])) + (np.sum(sf['part2'])) - 2*(np.sum(sf['part3']))
+            all_vars.append(variance)
+        df['variance'] = all_vars        
+        
+        # Calculating Confidence Intervals
+        df['F_transformed'] = np.log(-np.log(df['Ft']))
+        df['se_transformed'] = np.sqrt(df['variance']) / (df['Ft'] * np.absolute(np.log(df['Ft'])))
+        zalpha = inv_normal_cdf((1. + alpha) / 2.)
+        df[ci_labels[0]] = np.exp(-np.exp(df['F_transformed']+zalpha*df['se_transformed']))
+        df[ci_labels[1]] = np.exp(-np.exp(df['F_transformed']-zalpha*df['se_transformed']))
+        return df['variance'], df[ci_labels]
+                
+
+
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index ca98e69c2..4d1d728f5 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -24,7 +24,7 @@
 from lifelines.utils import k_fold_cross_validation, StatError, concordance_index, ConvergenceWarning, to_long_format
 from lifelines.estimation import CoxPHFitter, AalenAdditiveFitter, KaplanMeierFitter, \
     NelsonAalenFitter, BreslowFlemingHarringtonFitter, ExponentialFitter, \
-    WeibullFitter, BaseFitter, CoxTimeVaryingFitter
+    WeibullFitter, BaseFitter, CoxTimeVaryingFitter, AalenJohansenFitter
 from lifelines.datasets import load_larynx, load_waltons, load_kidney_transplant, load_rossi,\
     load_panel_test, load_g3, load_holly_molly_polly, load_regression_dataset,\
     load_stanford_heart_transplants
@@ -2427,3 +2427,85 @@ def test_print_summary(self, ctv, heart):
                 assert output[i] == expected[i]
         finally:
             sys.stdout = saved_stdout
+
+class TestAalenJohansenFitter:
+
+    @pytest.fixture  # pytest fixtures are functions that are "executed" before every test
+    def duration(self):
+        return [1, 2, 3, 4, 5, 6]
+
+    @pytest.fixture
+    def event_observed(self):
+        return [0, 1, 1, 2, 2, 0]
+
+    @pytest.fixture
+    def fitter(self):
+        return AalenJohansenFitter()
+
+    @pytest.fixture
+    def kmfitter(self):
+        return KaplanMeierFitter()
+
+    def test_jitter(self, fitter):
+        d = pd.Series([1, 1, 1])
+        e = fitter._jitter(durations=d, event=pd.Series([1, 1, 1]), jitter_level=0.01)
+
+        npt.assert_equal(np.any(np.not_equal(d, e)), True)
+
+    def test_tied_input_data(self, fitter):
+        d = [1, 2, 2, 4, 5, 6]
+        fitter.fit(durations=d,
+                   event_observed=[0, 1, 2, 1, 2, 0],
+                   event_of_interest=2)
+        npt.assert_equal(np.any(np.not_equal([0]+d, fitter.event_table.index)), True)
+
+    def test_event_table_is_correct(self, fitter, duration, event_observed):
+        fitter.fit(duration, event_observed, event_of_interest=2)
+
+        expected_event_table = pd.DataFrame.from_records([
+            {'event_at': 0, 'removed': 0, 'observed': 0, 'observed_2': 0, 'censored': 0, 'entrance': 6, 'at_risk': 6},
+            {'event_at': 1, 'removed': 1, 'observed': 0, 'observed_2': 0, 'censored': 1, 'entrance': 0, 'at_risk': 6},
+            {'event_at': 2, 'removed': 1, 'observed': 1, 'observed_2': 0, 'censored': 0, 'entrance': 0, 'at_risk': 5},
+            {'event_at': 3, 'removed': 1, 'observed': 1, 'observed_2': 0, 'censored': 0, 'entrance': 0, 'at_risk': 4},
+            {'event_at': 4, 'removed': 1, 'observed': 1, 'observed_2': 1, 'censored': 0, 'entrance': 0, 'at_risk': 3},
+            {'event_at': 5, 'removed': 1, 'observed': 1, 'observed_2': 1, 'censored': 0, 'entrance': 0, 'at_risk': 2},
+            {'event_at': 6, 'removed': 1, 'observed': 0, 'observed_2': 0, 'censored': 1, 'entrance': 0, 'at_risk': 1}
+        ]).set_index('event_at')[['removed', 'observed', 'observed_2', 'censored', 'entrance', 'at_risk']]
+        # pandas util for checking if two dataframes are equal
+        assert_frame_equal(fitter.event_table, expected_event_table,
+                           check_dtype=False, check_like=True)  # Ignores dtype to avoid int32 vs int64 difference
+
+    def test_aj_less_than_km(self, fitter, kmfitter, duration, event_observed):
+        # In the presence of competing risks, CIF_{AJ} <= CIF_{KM} (where CIF_{KM} = 1 - KM survival)
+        fitter.fit(duration, event_observed, event_of_interest=2)  # Aalen-Johansen
+        kmfitter.fit(duration, event_observed)
+
+        x = np.all(np.where(np.array(1 - kmfitter.survival_function_) >= np.array(fitter.cumulative_density_),
+                            True, False))
+        assert x
+
+    def test_no_competing_risk(self, fitter, kmfitter, duration):
+        # In presence of no competing risk, CIF_{AJ} == CIF_{KM}
+        same_events = [0, 2, 2, 2, 2, 0]
+        fitter.fit(duration, same_events, event_of_interest=2)  # Aalen-Johansen
+        kmfitter.fit(duration, same_events)  # Kaplan-Meier
+        npt.assert_allclose(np.array(1 - kmfitter.survival_function_),
+                            np.array(fitter.cumulative_density_))
+
+    def test_variance_calculation_against_sas(self, fitter, duration, event_observed):
+        variance_from_sas = np.array([0., 0., 0., 0., 0.032, 0.048, 0.048])
+
+        fitter.fit(duration, event_observed, event_of_interest=2)
+        npt.assert_allclose(variance_from_sas, np.array(fitter.variance))
+
+    def test_ci_calculation_against_sas(self, fitter, duration, event_observed):
+        ci_from_sas = np.array([[np.nan, np.nan],
+                                [np.nan, np.nan],
+                                [np.nan, np.nan],
+                                [np.nan, np.nan],
+                                [0.00836904, 0.58185303],
+                                [0.05197575, 0.75281579],
+                                [0.05197575, 0.75281579]])
+
+        fitter.fit(duration, event_observed, event_of_interest=2)
+        npt.assert_allclose(ci_from_sas, np.array(fitter.confidence_interval_))

From 1b4d15605af20143653f7c3536ba0f9d08e3bc52 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Mon, 19 Nov 2018 11:37:26 -0500
Subject: [PATCH 54/59] shifting p-value codes down an order of magnitude

---
 CHANGELOG.md                                 |  3 ++-
 docs/Examples.rst                            | 12 +++++++++++-
 lifelines/fitters/cox_time_varying_fitter.py |  7 +++----
 lifelines/fitters/coxph_fitter.py            | 12 ++++++------
 lifelines/fitters/exponential_fitter.py      | 11 +++++------
 lifelines/fitters/weibull_fitter.py          |  5 ++---
 lifelines/statistics.py                      |  4 ++--
 lifelines/utils/__init__.py                  | 17 +++++++++++++----
 tests/test_estimation.py                     | 17 ++---------------
 9 files changed, 46 insertions(+), 42 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ce38faeab..babc596ae 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,7 +20,8 @@
  - In Python3, Univariate models are now serialisable with `pickle`. Thanks @dwilson1988 for the contribution. For Python2, `dill` is still the preferred method.
  - `baseline_cumulative_hazard_` (and derivatives of that) on `CoxPHFitter` now correctly incorporate the `weights_col`. 
  - Fixed a bug in `KaplanMeierFitter` when late entry times lined up with death events. Thanks @pzivich
- - adding `cluster_col` argument to `CoxPHFitter` so users can specify groups of subjects/rows that may be correlated. 
+ - Adding `cluster_col` argument to `CoxPHFitter` so users can specify groups of subjects/rows that may be correlated. 
+ - Shifting the "significance codes" for p-values down an order of magnitude. (For example, p-values between 0.05 and 0.1 are no longer noted at all, and p-values between 0.01 and 0.05 are noted with `.`, etc.). This deviates from how they are presented in other software. There is an argument to be made to remove p-values from lifelines altogether (_become the changes you want to see in the world_ lol), but I worry that people could compute the p-values by hand incorrectly, a worse outcome I think. So, this is my stance. P-values between 0.1 and 0.05 offer _very_ little information, so they are removed. There is a growing movement in statistics to shift "significant" findings to p-values less than 0.01 anyways. 
 
 #### 0.14.6
  - fix for n > 2 groups in `multivariate_logrank_test` (again).
diff --git a/docs/Examples.rst b/docs/Examples.rst
index eb658a34c..0bed14bb5 100644
--- a/docs/Examples.rst
+++ b/docs/Examples.rst
@@ -598,7 +598,7 @@ Since the estimation of the coefficients in the Cox proportional hazard model is
     3. Related to above, the relationship between a covariate and the duration may be completely determined. For example, if the rank correlation between a covariate and the duration is very close to 1 or -1, then the log-likelihood can be increased arbitrarly using just that covariate. Look for a ``ConvergenceWarning`` after the ``fit`` call.
     4. Another problem may be a co-linear relationship in your dataset. See point 2. above. 
 
- 4. Adding a very small ``penalizer_coef`` significantly changes the results. This probably means that the step size is too large. Try decreasing it, and returning the ``penalizer_coef`` term to 0. 
+ 4. If adding a very small ``penalizer`` significantly changes the results (``CoxPHFitter(penalizer=0.0001)``), then this probably means that the step size in the iterative algorithm is too large. Try decreasing it (``.fit(..., step_size=0.50)`` or smaller), and returning the ``penalizer`` term to 0. 
 
  5. If using the ``strata`` arugment, make sure your stratification group sizes are not too small. Try ``df.groupby(strata).size()``.
 
@@ -635,3 +635,13 @@ The fitting should be faster, and the results identical to the unweighted datase
 The second use of weights is sampling weights. These are typically positive, non-integer weights that represent some artifical under/over sampling of observations (ex: inverse probability of treatment weights). It is recommened to set ``robust=True`` in the call to the ``fit`` as the usual standard error is incorrect for sampling weights. The ``robust`` flag will use the sandwich estimator for the standard error. 
 
 .. warning:: The implementation of the sandwich estimator does not handle ties correctly (under the Efron handling of ties), and will give slightly or significantly different results from other software depending on the frequeny of ties. g
+
+
+Correlations between subjects in a Cox model
+###################################################
+
+There are cases when your dataset contains correlated subjects, which breaks the independent-and-identically-distributed assumption. What are some cases when this may happen?
+
+1. If a subject appears more than once in the dataset (common when subjects can have the event more than once)
+2. If using a matching technique, like prospensity-score matching, there is a correlation between pairs. 
+3. 
\ No newline at end of file
diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py
index 37fd814c5..4d4d313f8 100644
--- a/lifelines/fitters/cox_time_varying_fitter.py
+++ b/lifelines/fitters/cox_time_varying_fitter.py
@@ -17,11 +17,11 @@
 from lifelines.fitters.coxph_fitter import CoxPHFitter
 from lifelines.statistics import chisq_test
 from lifelines.utils import (inv_normal_cdf,
-    significance_code, normalize,
+    significance_code, normalize, significance_codes_as_text,
     pass_for_numeric_dtypes_or_raise, check_low_var,
     check_for_overlapping_intervals, check_complete_separation_low_variance,
     ConvergenceWarning, StepSizer, _get_index, check_for_immediate_deaths,
-    check_for_instantaneous_events, ConvergenceError, check_nans_or_infs, string_justify
+    check_for_instantaneous_events, ConvergenceError, check_nans_or_infs, string_justify,
 )
 
 
@@ -483,8 +483,7 @@ def print_summary(self):
         print(df.to_string(float_format=lambda f: '{:4.4f}'.format(f)))
         # Significance code explanation
         print('---')
-        print("Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ",
-              end='\n\n')
+        print(significance_codes_as_text(), end='\n\n')
         print("Likelihood ratio test = {:.3f} on {} df, p={:.5f}".format(*self._compute_likelihood_ratio_test()))
         return
 
diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index fef92bd89..800f0e62b 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -17,7 +17,7 @@
 from lifelines.fitters import BaseFitter
 from lifelines.statistics import chisq_test
 from lifelines.utils import (survival_table_from_events, inv_normal_cdf, normalize,
-    significance_code, concordance_index, _get_index, qth_survival_times,
+    significance_code, significance_codes_as_text, concordance_index, _get_index, qth_survival_times,
     pass_for_numeric_dtypes_or_raise, check_low_var, coalesce,
     check_complete_separation, check_nans_or_infs, StatError, ConvergenceWarning,
     StepSizer, ConvergenceError, string_justify)
@@ -93,7 +93,7 @@ def fit(self, df, duration_col, event_col=None,
           robust: Compute the robust errors using the Huber sandwich estimator, aka Wei-Lin estimate. This does not handle
             ties, so if there are high number of ties, results may significantly differ. See
             "The Robust Inference for the Cox Proportional Hazards Model", Journal of the American Statistical Association, Vol. 84, No. 408 (Dec., 1989), pp. 1074- 1078
-          cluster_col: specifies what column has ids for clustering covariances. Using this forces the sandwich estimator (robust variance estimator) to
+          cluster_col: specifies what column has unique identifiers for clustering covariances. Using this forces the sandwich estimator (robust variance estimator) to
             be used.
         Returns:
             self, with additional properties: hazards_, confidence_intervals_, baseline_survival_, etc.
@@ -462,6 +462,7 @@ def _compute_sandwich_estimator(self, X, T, E, weights):
         _, d = X.shape
 
         if self.strata is not None and self.cluster_col is not None:
+            # TODO
             raise NotImplementedError("Providing clusters and strata is not implemented yet")
 
         if self.strata is not None:
@@ -498,7 +499,7 @@ def _compute_residuals_within_strata(self, X, T, E, weights):
         # https://www.stat.tamu.edu/~carroll/ftp/gk001.pdf
         # lin1989
         # https://www.ics.uci.edu/~dgillen/STAT255/Handouts/lecture10.pdf
-        # doesn't handle ties.
+        # TODO: doesn't handle ties.
 
         n, d = X.shape
 
@@ -609,8 +610,7 @@ def print_summary(self):
         print(df.to_string(float_format=lambda f: '{:4.4f}'.format(f)))
         # Significance code explanation
         print('---')
-        print("Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ",
-              end='\n\n')
+        print(significance_codes_as_text(), end='\n\n')
         print("Concordance = {:.3f}".format(self.score_))
         print("Likelihood ratio test = {:.3f} on {} df, p={:.5f}".format(*self._compute_likelihood_ratio_test()))
         return
@@ -875,7 +875,7 @@ def plot_covariate_groups(self, covariate, groups, **kwargs):
         """
         from matplotlib import pyplot as plt
 
-        if covariate not in self.summary.index:
+        if covariate not in self.hazards_.columns:
             raise KeyError('covariate `%s` is not present in the original dataset' % covariate)
 
         ax = kwargs.get('ax', None) or plt.figure().add_subplot(111)
diff --git a/lifelines/fitters/exponential_fitter.py b/lifelines/fitters/exponential_fitter.py
index cb03c7c9f..d67dadc76 100644
--- a/lifelines/fitters/exponential_fitter.py
+++ b/lifelines/fitters/exponential_fitter.py
@@ -5,7 +5,7 @@
 from scipy import stats
 
 from lifelines.fitters import UnivariateFitter
-from lifelines.utils import inv_normal_cdf, check_nans_or_infs, significance_code, string_justify
+from lifelines.utils import inv_normal_cdf, check_nans_or_infs, significance_code, string_justify, significance_codes_as_text
 
 
 class ExponentialFitter(UnivariateFitter):
@@ -75,7 +75,7 @@ def fit(self, durations, event_observed=None, timeline=None, entry=None,
         self.survival_function_ = pd.DataFrame(np.exp(-self.lambda_ * self.timeline), columns=[self._label], index=self.timeline)
         self.confidence_interval_ = self._bounds(alpha if alpha else self.alpha, ci_labels)
         self.median_ = 1. / self.lambda_ * (np.log(2))
-       
+
         # estimation methods
         self._estimate_name = "survival_function_"
         self._predict_label = label
@@ -85,10 +85,10 @@ def fit(self, durations, event_observed=None, timeline=None, entry=None,
         self.plot_survival_function_ = self.plot
 
         return self
-    
+
     def _estimation_method(self,t):
         return np.exp(-self.lambda_ * t)
-    
+
     def _bounds(self, alpha, ci_labels):
         alpha2 = inv_normal_cdf((1. + alpha) / 2.)
         df = pd.DataFrame(index=self.timeline)
@@ -158,6 +158,5 @@ def print_summary(self):
         df[''] = [significance_code(p) for p in df['p']]
         print(df.to_string(float_format=lambda f: '{:4.4f}'.format(f)))
         print('---')
-        print("Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ",
-              end='\n\n')
+        print(significance_codes_as_text(), end='\n\n')
         return
diff --git a/lifelines/fitters/weibull_fitter.py b/lifelines/fitters/weibull_fitter.py
index a24b32549..5ba0e86d2 100644
--- a/lifelines/fitters/weibull_fitter.py
+++ b/lifelines/fitters/weibull_fitter.py
@@ -9,7 +9,7 @@
 from numpy.linalg import solve, norm, inv
 from lifelines.fitters import UnivariateFitter
 from lifelines.utils import inv_normal_cdf, check_nans_or_infs, ConvergenceError, string_justify, significance_code,\
-                            ConvergenceWarning
+                            ConvergenceWarning, significance_codes_as_text
 
 
 def _negative_log_likelihood(lambda_rho, T, E):
@@ -269,6 +269,5 @@ def print_summary(self):
         df[''] = [significance_code(p) for p in df['p']]
         print(df.to_string(float_format=lambda f: '{:4.4f}'.format(f)))
         print('---')
-        print("Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ",
-              end='\n\n')
+        print(significance_codes_as_text(), end='\n\n')
         return
diff --git a/lifelines/statistics.py b/lifelines/statistics.py
index 84f2b2b19..f976c7536 100644
--- a/lifelines/statistics.py
+++ b/lifelines/statistics.py
@@ -6,7 +6,7 @@
 from scipy import stats
 import pandas as pd
 
-from lifelines.utils import group_survival_table_from_events, significance_code
+from lifelines.utils import group_survival_table_from_events, significance_code, significance_codes_as_text
 
 
 def sample_size_necessary_under_cph(power, ratio_of_participants, p_exp, p_con,
@@ -291,7 +291,7 @@ def __unicode__(self):
         s += df.to_string(float_format=lambda f: '{:4.4f}'.format(f), index=False)
 
         s += '\n---'
-        s += "\nSignif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 "
+        s += "\n" + significance_codes_as_text()
         return s
 
     def _pretty_print_meta_data(self, dictionary):
diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py
index 5e27e7433..ece15a18d 100644
--- a/lifelines/utils/__init__.py
+++ b/lifelines/utils/__init__.py
@@ -617,17 +617,26 @@ def epanechnikov_kernel(t, T, bandwidth=1.):
 
 
 def significance_code(p):
-    if p < 0.001:
+    """
+    v0.15.0:
+        p-values between 0.05 and 0.1 have such little information gain. For that reason, I am deviating
+        from the traditional "asterisks" in R and making everything an order-of-magnitude less.
+    """
+    if p < 0.0001:
         return '***'
-    elif p < 0.01:
+    elif p < 0.001:
         return '**'
-    elif p < 0.05:
+    elif p < 0.01:
         return '*'
-    elif p < 0.1:
+    elif p < 0.05:
         return '.'
     else:
         return ' '
 
+def significance_codes_as_text():
+    p_values = [0, 0.0001, 0.001, 0.01, 0.05]
+    return "Signif. codes: " + " ".join(["%s '%s'" % (p, significance_code(p)) for p in p_values]) + " 1"
+
 
 def ridge_regression(X, Y, c1=0.0, c2=0.0, offset=None):
     """
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index ca98e69c2..f64627b5f 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -898,7 +898,7 @@ def test_print_summary(self, rossi):
 paro -0.0849     0.9186    0.1958 -0.4336 0.6646     -0.4685      0.2988
 prio  0.0915     1.0958    0.0286  3.1939 0.0014      0.0353      0.1476  **
 ---
-Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+Signif. codes:  0 '***' 0.0001 '**' 0.001 '*' 0.01 '.' 0.05 ' ' 1
 
 Concordance = 0.640
 Likelihood ratio test = 33.266 on 7 df, p=0.00002
@@ -1464,19 +1464,6 @@ def test_doubling_the_weights_halves_the_variance(self, rossi):
         assert_frame_equal(cf2.standard_errors_ ** 2, w * cf1.standard_errors_ ** 2, check_like=True)
 
 
-    def test_adding_non_integer_weights_without_robust_flag_raises_a_warning(self, rossi):
-        rossi['weights'] = np.random.exponential(1, rossi.shape[0])
-
-        cox = CoxPHFitter()
-
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            cox.fit(rossi, 'week', 'arrest', weights_col='weights')
-
-            assert len(w) == 1
-            assert "naive variance estimates" in str(w[0].message)
-
-
     def test_adding_non_integer_weights_is_fine_if_robust_is_on(self, rossi):
         rossi['weights'] = np.random.exponential(1, rossi.shape[0])
 
@@ -2419,7 +2406,7 @@ def test_print_summary(self, ctv, heart):
 surgery    -0.6372     0.5288    0.3672 -1.7352 0.0827     -1.3570      0.0825  .
 transplant -0.0103     0.9898    0.3138 -0.0327 0.9739     -0.6252      0.6047
 ---
-Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+Signif. codes:  0 '***' 0.0001 '**' 0.001 '*' 0.01 '.' 0.05 ' ' 1
 
 Likelihood ratio test = 15.111 on 4 df, p=0.00448
 """).strip().split()

From 31d5c551d0933dcb3d4799476dcdf3f0e640cf0d Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Mon, 19 Nov 2018 11:55:03 -0500
Subject: [PATCH 55/59] cluster documenation

---
 docs/Examples.rst | 39 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/docs/Examples.rst b/docs/Examples.rst
index 0bed14bb5..37f15921e 100644
--- a/docs/Examples.rst
+++ b/docs/Examples.rst
@@ -644,4 +644,41 @@ There are cases when your dataset contains correlated subjects, which breaks the
 
 1. If a subject appears more than once in the dataset (common when subjects can have the event more than once)
 2. If using a matching technique, like prospensity-score matching, there is a correlation between pairs. 
-3. 
\ No newline at end of file
+
+In both cases, the reported standard errors from a unadjusted Cox model will be wrong. In order to adjust for these correlations, there is a ``cluster_col`` keyword in `CoxPHFitter.fit` that allows you to specify the column in the dataframe that contains designations for correlated subjects. For example, if subjects in rows 1 & 2 are correlated, but no other subjects are correlated, then ``cluster_col`` column should have the same value for rows 1 & 2, and all others unique. Another example: for matched pairs, each subject in the pair should have the same value. 
+    
+    from lifelines.datasets import load_rossi
+    from lifelines import CoxPHFitter
+
+    rossi = load_rossi()
+
+    # this may come from a database, or other libraries that specialize in matching
+    matched_pairs = [
+        (156, 230),
+        (275, 228),
+        (61, 252),
+        (364, 201),
+        (54, 340),
+        (130, 33),
+        (183, 145),
+        (268, 140),
+        (332, 259),
+        (314, 413),
+        (330, 211),
+        (372, 255),
+        # ...
+    ]
+
+    rossi['id'] = None  # we will populate this column
+
+    for i, pair in enumerate(matched_pairs):
+        subjectA, subjectB = pair
+        rossi.loc[subjectA, 'id'] = i
+        rossi.loc[subjectB, 'id'] = i
+
+    rossi = rossi.dropna(subset=['id'])
+
+    cph = CoxPHFitter()
+    cph.fit(rossi, 'week', 'arrest', cluster_col='id')
+
+Specifying ``cluster_col`` will handle correlations, and invoke the robust sandwich estimator for standard errors (the same as setting `robust=True`).
\ No newline at end of file

From 94ae8c30a36803f22e3f04a4c00ba679f1862a6d Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Wed, 21 Nov 2018 20:58:41 -0500
Subject: [PATCH 56/59] cleaning up some docs, and adding support for
 prediction with a series

---
 docs/Examples.rst                 |  8 +++++---
 docs/Survival Regression.rst      |  6 +++---
 lifelines/fitters/coxph_fitter.py | 34 +++++++++++++++++++++++++++++--
 perf_tests/aaf_perf_test.py       | 17 ++++++++++++++++
 perf_tests/cp_perf_test.py        | 15 ++++++++++++++
 perf_tests/ctv_perf_test.py       | 13 ++++++++++++
 tests/test_estimation.py          |  6 ++++++
 7 files changed, 91 insertions(+), 8 deletions(-)
 create mode 100644 perf_tests/aaf_perf_test.py
 create mode 100644 perf_tests/cp_perf_test.py
 create mode 100644 perf_tests/ctv_perf_test.py

diff --git a/docs/Examples.rst b/docs/Examples.rst
index 37f15921e..2036456f0 100644
--- a/docs/Examples.rst
+++ b/docs/Examples.rst
@@ -415,7 +415,7 @@ id                   T                      E
 Example SQL queries and transformations to get time varying data
 ####################################################################
 
-For Cox time-varying models, we discussed what the dataset should look like in :ref:`Dataset for time-varying regression`. Typically we have a base dataset, and then we fold in the covariate datasets. Below are some SQL queries and Python transformations from end-to-end.
+For Cox time-varying models, we discussed what the dataset should look like in :ref:`Dataset creation for time-varying regression`. Typically we have a base dataset, and then we fold in the covariate datasets. Below are some SQL queries and Python transformations from end-to-end.
 
 
 Base dataset: ``base_df``
@@ -494,7 +494,7 @@ Initially, this can't be added to our baseline dataframe. Using ``utils.covariat
 Example cumulative total using and time-varying covariates
 ############################################################
 
-Often we have either __transactional covariate datasets__ or __state covariate datasets__. In a transactional dataset, it may make sense to sum up the covariates to represent administration of a treatment over time. For example, in the risky world of start-ups, we may want to sum up the funding amount recieved at a certain time. We also may be interested in the amount of the last round of funding. Below is an example to do just that:
+Often we have either transactional covariate datasets or state covariate datasets. In a transactional dataset, it may make sense to sum up the covariates to represent administration of a treatment over time. For example, in the risky world of start-ups, we may want to sum up the funding amount recieved at a certain time. We also may be interested in the amount of the last round of funding. Below is an example to do just that:
 
 Suppose we have an initial DataFrame of start-ups like:
 
@@ -646,7 +646,9 @@ There are cases when your dataset contains correlated subjects, which breaks the
 2. If using a matching technique, like prospensity-score matching, there is a correlation between pairs. 
 
 In both cases, the reported standard errors from a unadjusted Cox model will be wrong. In order to adjust for these correlations, there is a ``cluster_col`` keyword in `CoxPHFitter.fit` that allows you to specify the column in the dataframe that contains designations for correlated subjects. For example, if subjects in rows 1 & 2 are correlated, but no other subjects are correlated, then ``cluster_col`` column should have the same value for rows 1 & 2, and all others unique. Another example: for matched pairs, each subject in the pair should have the same value. 
-    
+
+.. code-block:: python    
+
     from lifelines.datasets import load_rossi
     from lifelines import CoxPHFitter
 
diff --git a/docs/Survival Regression.rst b/docs/Survival Regression.rst
index 84bf0a73c..105af0eaf 100644
--- a/docs/Survival Regression.rst	
+++ b/docs/Survival Regression.rst	
@@ -35,7 +35,7 @@ Cox's Proportional Hazard model
 Lifelines has an implementation of the Cox propotional hazards regression model (implemented in 
 R under ``coxph``). The idea behind the model is that the log-hazard of an individual is a linear function of their static covariates *and* a population-level baseline hazard that changes over time. Mathematically:
 
-.. math::  \lambda(t | x) = \overbrace{b_0(t)}^{\text{baseline}}\underbrace{\exp \overbrace{\left(\sum_{i=1}^n b_i x_i \right)}^{\text{log-partial hazard}}}_ {\text{partial hazard}}
+.. math::  \lambda(t | x) = \overbrace{b_0(t)}^{\text{baseline}}\underbrace{\exp \overbrace{\left(\sum_{i=1}^n b_i (x_i - \overline{x_i})\right)}^{\text{log-partial hazard}}}_ {\text{partial hazard}}
 
 Note a few facts about this model: the only time component is in the baseline hazard, :math:`b_0(t)`. In the above product, the partial hazard is a time-invariant scalar factor that only increases or decreases the baseline hazard. Thus a changes in covariates will only increase or decrease this baseline hazard. 
 
@@ -83,7 +83,7 @@ This example data is from the paper `here <http://socserv.socsci.mcmaster.ca/jfo
     Likelihood ratio test = 33.266 on 7 df, p=0.00002
     """
 
-To access the coefficients and the baseline hazard directly, you can use ``cph.hazards_`` and ``cph.baseline_hazard_`` respectively. 
+To access the coefficients and the baseline hazard directly, you can use ``cph.hazards_`` and ``cph.baseline_hazard_`` respectively.
 
 
 Convergence 
@@ -594,7 +594,7 @@ Often an individual will have a covariate change over time. An example of this i
 
 We can incorporate changes over time into our survival analysis by using a modification of the Cox model above. The general mathematical description is:
 
-.. math::  \lambda(t | x) = \overbrace{b_0(t)}^{\text{baseline}}\underbrace{\exp \overbrace{\left(\sum_{i=1}^n \beta_i x_i(t) \right)}^{\text{log-partial hazard}}}_ {\text{partial hazard}}
+.. math::  \lambda(t | x) = \overbrace{b_0(t)}^{\text{baseline}}\underbrace{\exp \overbrace{\left(\sum_{i=1}^n \beta_i (x_i(t) - \overline{x_i}) \right)}^{\text{log-partial hazard}}}_ {\text{partial hazard}}
 
 Note the time-varying :math:`x_i(t)` to denote that covariates can change over time. This model is implemented in lifelines as ``CoxTimeVaryingFitter``. The dataset schema required is different than previous models, so we will spend some time describing this. 
 
diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index 800f0e62b..25876c84c 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -668,13 +668,25 @@ def predict_log_partial_hazard(self, X):
         if X is an array, then the column ordering is assumed to be the
         same as the training dataset.
         """
+
+        hazard_names = self.hazards_.columns
         if isinstance(X, pd.DataFrame):
-            order = self.hazards_.columns
+            order = hazard_names
+            X = X[order]
+            pass_for_numeric_dtypes_or_raise(X)
+        elif isinstance(X, pd.Series) and ((X.shape[0] == len(hazard_names) + 2) or (X.shape[0] == len(hazard_names))):
+            X = X.to_frame().T
+            order = hazard_names
             X = X[order]
             pass_for_numeric_dtypes_or_raise(X)
+        elif isinstance(X, pd.Series):
+            assert len(hazard_names) == 1, 'Series not the correct arugment'
+            X = pd.DataFrame(X)  # one row per Series entry; the single column is the lone covariate
+            pass_for_numeric_dtypes_or_raise(X)
 
         X = X.astype(float)
         index = _get_index(X)
+
         X = normalize(X, self._norm_mean.values, 1)
         return pd.DataFrame(np.dot(X, self.hazards_.T), index=index)
 
@@ -716,8 +728,8 @@ def predict_cumulative_hazard(self, X, times=None):
                 cumulative_hazard_ = cumulative_hazard_.merge(pd.DataFrame(np.dot(c_0, v.T), index=c_0.index, columns=col), how='outer', right_index=True, left_index=True)
         else:
             c_0 = self.baseline_cumulative_hazard_
-            col = _get_index(X)
             v = self.predict_partial_hazard(X)
+            col = _get_index(v)
             cumulative_hazard_ = pd.DataFrame(np.dot(c_0, v.T), columns=col, index=c_0.index)
 
         if times is not None:
@@ -808,6 +820,24 @@ def _compute_baseline_hazards(self, df, T, E, weights):
             return self._compute_baseline_hazard(data=df, durations=T, event_observed=E, weights=weights, name='baseline hazard')
 
     def _compute_baseline_survival(self):
+        """
+        Importantly, this agrees with what the KaplanMeierFitter produces. Ex:
+        from lifelines.datasets import load_rossi
+        from lifelines import CoxPHFitter, KaplanMeierFitter
+        rossi = load_rossi()
+
+        kmf = KaplanMeierFitter()
+        kmf.fit(rossi['week'], rossi['arrest'])
+
+        rossi2 = rossi[['week', 'arrest']].copy()
+        rossi2['var1'] = np.random.randn(432)
+
+        cph = CoxPHFitter()
+        cph.fit(rossi2, 'week', 'arrest')
+
+        ax = cph.baseline_survival_.plot()
+        kmf.plot(ax=ax)
+        """
         survival_df = exp(-self.baseline_cumulative_hazard_)
         if self.strata is None:
             survival_df.columns = ['baseline survival']
diff --git a/perf_tests/aaf_perf_test.py b/perf_tests/aaf_perf_test.py
new file mode 100644
index 000000000..e6e99a543
--- /dev/null
+++ b/perf_tests/aaf_perf_test.py
@@ -0,0 +1,17 @@
+#aalen additive
+
+
+if __name__ == "__main__":
+    import pandas as pd
+    import time
+
+    from lifelines.estimation import AalenAdditiveFitter
+    from lifelines.datasets import load_rossi
+    df = load_rossi()
+    df = pd.concat([df] * 5).reset_index(drop=True)
+    print("Size: ", df.shape)
+    aaf = AalenAdditiveFitter()
+    start_time = time.time()
+    aaf.fit(df, duration_col='week', event_col="arrest")
+    print("--- %s seconds ---" % (time.time() - start_time))
+    print(aaf.score_)
diff --git a/perf_tests/cp_perf_test.py b/perf_tests/cp_perf_test.py
new file mode 100644
index 000000000..6a628c44d
--- /dev/null
+++ b/perf_tests/cp_perf_test.py
@@ -0,0 +1,15 @@
+#cox regression
+
+
+if __name__ == "__main__":
+    import pandas as pd
+    import time
+
+    from lifelines.estimation import CoxPHFitter
+    from lifelines.datasets import load_rossi
+    df = load_rossi()
+    df = pd.concat([df] * 20)
+    cp = CoxPHFitter()
+    start_time = time.time()
+    cp.fit(df, duration_col='week', event_col="arrest")
+    print("--- %s seconds ---" % (time.time() - start_time))
diff --git a/perf_tests/ctv_perf_test.py b/perf_tests/ctv_perf_test.py
new file mode 100644
index 000000000..e619dc515
--- /dev/null
+++ b/perf_tests/ctv_perf_test.py
@@ -0,0 +1,13 @@
+if __name__ == "__main__":
+    import time
+    import pandas as pd
+    from lifelines.estimation import CoxTimeVaryingFitter
+    from lifelines.datasets import load_stanford_heart_transplants
+    dfcv = load_stanford_heart_transplants()
+    dfcv = pd.concat([dfcv]*50)
+    ctv = CoxTimeVaryingFitter()
+    start_time = time.time()
+    ctv.fit(dfcv, id_col="id", event_col="event", start_col='start', stop_col='stop')
+    time_took = (time.time() - start_time)
+    print("--- %s seconds ---" % time_took)
+    ctv.print_summary()
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index 4ef32167b..4e0b5aa67 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -1003,6 +1003,12 @@ def test_data_normalization(self, data_pred2):
 
         assert ci_org == ci_trn
 
+    def test_cox_ph_prediction_with_series(self, rossi):
+        cf = CoxPHFitter()
+        cf.fit(rossi, duration_col='week', event_col='arrest')
+        rossi_mean = rossi.mean()
+        cf.predict_survival_function(rossi_mean)
+
     @pytest.mark.xfail
     def test_cox_ph_prediction_monotonicity(self, data_pred2):
         # Concordance wise, all prediction methods should be monotonic versions

From 79ae1b61d6d9cb6c04362c5f2aa9a373fc23b080 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Wed, 21 Nov 2018 21:06:32 -0500
Subject: [PATCH 57/59] add check to confirm that predicting at mean equals
 baseline survival

---
 tests/test_estimation.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index 4e0b5aa67..877934cd8 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -1007,7 +1007,8 @@ def test_cox_ph_prediction_with_series(self, rossi):
         cf = CoxPHFitter()
         cf.fit(rossi, duration_col='week', event_col='arrest')
         rossi_mean = rossi.mean()
-        cf.predict_survival_function(rossi_mean)
+        result = cf.predict_survival_function(rossi_mean)
+        assert_series_equal(cf.baseline_survival_['baseline survival'], result[0], check_names=False)
 
     @pytest.mark.xfail
     def test_cox_ph_prediction_monotonicity(self, data_pred2):

From 51b006e02076e22a7aba15bf261725a8da270395 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Wed, 21 Nov 2018 21:11:34 -0500
Subject: [PATCH 58/59] I can delete this after I've computed with it

---
 lifelines/fitters/coxph_fitter.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index 25876c84c..8fb01c45a 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -926,4 +926,5 @@ def score_(self):
             self._concordance_score_ = concordance_index(self.durations,
                                      -self._predicted_partial_hazards_,
                                      self.event_observed)
+            del self._predicted_partial_hazards_
             return self._concordance_score_

From 815588f467e948d7f93d02f9a7e0e7c05645b556 Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon <cam.davidson.pilon@gmail.com>
Date: Thu, 22 Nov 2018 13:45:09 -0500
Subject: [PATCH 59/59] let's cut it here. So what's _not_ included in 0.15.0
 is lognormal, and robust to ctv models

---
 CHANGELOG.md                              | 3 ++-
 docs/Survival analysis with lifelines.rst | 5 +----
 tests/test_plotting.py                    | 1 -
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index babc596ae..8a19f4243 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,7 @@
 ### Changelogs
 
 #### 0.15.0
- - adding `robust` params to Cox models' `fit`. This enables atleast i) using non-integer weights in the model (these could be sampling weights like IPTW), and ii) mis-specified models (ex: non-proportional hazards). Under the hood it's a sandwich estimator. This does not handle ties, so if there are high number of ties, results may significantly differ from other software.
+ - adding `robust` params to `CoxPHFitter`'s `fit`. This enables at least i) using non-integer weights in the model (these could be sampling weights like IPTW), and ii) mis-specified models (ex: non-proportional hazards). Under the hood it's a sandwich estimator. This does not handle ties, so if there is a high number of ties, results may significantly differ from other software.
  - `standard_errors_` is now a property on fitted `CoxPHFitter` which describes the standard errors of the coefficients.
  - `variance_matrix_` is now a property on fitted `CoxPHFitter` which describes the variance matrix of the coefficients.
  - new criteria for convergence of `CoxPHFitter` and `CoxTimeVaryingFitter` called the Newton-decrement. Tests show it is as accurate (w.r.t to previous coefficients) and typically shaves off a single step, resulting in generally faster convergence. See https://www.cs.cmu.edu/~pradeepr/convexopt/Lecture_Slides/Newton_methods.pdf. Details about the Newton-decrement are added to the `show_progress` statements.
@@ -22,6 +22,7 @@
  - Fixed a bug in `KaplanMeierFitter` when late entry times lined up with death events. Thanks @pzivich
  - Adding `cluster_col` argument to `CoxPHFitter` so users can specify groups of subjects/rows that may be correlated. 
  - Shifting the "signficance codes" for p-values down an order of magnitude. (Example, p-values between 0.1 and 0.05 are not noted at all and p-values between 0.05 and 0.1 are noted with `.`, etc.). This deviates with how they are presented in other software. There is an argument to be made to remove p-values from lifelines altogether (_become the changes you want to see in the world_ lol), but I worry that people could compute the p-values by hand incorrectly, a worse outcome I think. So, this is my stance. P-values between 0.1 and 0.05 offer _very_ little information, so they are removed. There is a growing movement in statistics to shift "signficant" findings to p-values less than 0.01 anyways. 
+ - New fitter for cumulative incidence of multiple risks `AalenJohansenFitter`. Thanks @pzivich! See "Methodologic Issues When Estimating Risks in Pharmacoepidemiology" for a nice overview of the model. 
 
 #### 0.14.6
  - fix for n > 2 groups in `multivariate_logrank_test` (again).
diff --git a/docs/Survival analysis with lifelines.rst b/docs/Survival analysis with lifelines.rst
index 78ea39c11..92eaf0065 100644
--- a/docs/Survival analysis with lifelines.rst	
+++ b/docs/Survival analysis with lifelines.rst	
@@ -481,7 +481,7 @@ In lifelines, estimation is available using the ``WeibullFitter`` class:
 
 
 
-Other parametric models: Exponential and LogNormal
+Other parametric models: Exponential
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 Similarly, there are other parametric models in lifelines. Generally, which parametric model to choose is determined by either knowledge of the distribution of durations, or some sort of model goodness-of-fit. Below are three parametric models of the same data. 
@@ -490,18 +490,15 @@ Similarly, there are other parametric models in lifelines. Generally, which para
 
     from lifelines import WeibullFitter
     from lifelines import ExponentialFitter
-    from lifelines import LogNormalFitter
   
     T = data['duration']
     E = data['observed']
 
     wf = WeibullFitter().fit(T, E, label='WeibullFitter')
     exf = ExponentialFitter().fit(T, E, label='ExponentalFitter')
-    lnf = LogNormalFitter().fit(T, E, label='LogNormalFitter')
 
     ax = wf.plot()
     ax = exf.plot(ax=ax)
-    ax = lnf.plot(ax=ax)
 
 
 Estimating hazard rates using Nelson-Aalen
diff --git a/tests/test_plotting.py b/tests/test_plotting.py
index 36a51b0bc..947ad3811 100644
--- a/tests/test_plotting.py
+++ b/tests/test_plotting.py
@@ -13,7 +13,6 @@
 from lifelines.generate_datasets import cumulative_integral
 
 
-@pytest.mark.plottest
 @pytest.mark.skipif("DISPLAY" not in os.environ, reason="requires display")
 class TestPlotting():