added multiclassification_report method

alvinthai · Dec 27, 2017 · 1e58940 · 1e58940
1 parent 75bfaf8
commit 1e58940
Show file tree

Hide file tree

Showing 4 changed files with 251 additions and 160 deletions.
diff --git a/OrderedOVRClassifier/classifier.py b/OrderedOVRClassifier/classifier.py
@@ -692,7 +692,7 @@ def _pred_cleanup(self, X, drop_cols):
             Data used for predictions.
 
         drop_cols: list of str, optional
-            Labels of columns to ignore in modeling, only applicable to pandas
+            Labels of columns ignored in modeling, only applicable to pandas
             DataFrame X input.
 
         Returns
@@ -761,7 +761,8 @@ def _skater_extract(self, X, n_jobs):
         else:
             col_names = None
 
-        no_multipooling = ['lightgbm.sklearn', 'sklearn.linear_model.logistic']
+        no_multipooling = ['lightgbm.sklearn', 'xgboost.sklearn',
+                           'sklearn.linear_model.logistic']
 
         for _, clf in self.pipeline:
             if clf.__module__ in no_multipooling:
@@ -770,7 +771,8 @@ def _skater_extract(self, X, n_jobs):
 
         return X, n_jobs, col_names
 
-    def _xg_cleanup(self, clf):
+    @staticmethod
+    def _xg_cleanup(clf):
         '''
         Utility function to delete the Booster.feature_names attributes in
         XGBClassifier. Deleting this attribute allows XGBClassifier to make
@@ -1152,8 +1154,34 @@ def fit_test_ovr(self, model, ovr_val, X, y=None, eval_set=None,
 
         return model
 
+    def multiclassification_report(self, X, y=None, drop_cols=None):
+        '''
+        Wrapper function for extended_classification_report, which is an
+        extension of sklearn.metrics.classification_report. Builds a text
+        report showing the main classification metrics and the total count of
+        multiclass predictions per class.
+
+        Parameters
+        ----------
+        X: array-like, shape = [n_samples, n_features]
+            Data used for predictions.
+
+        y: array-like, shape = [n_samples, ], optional
+            True labels for X. If not provided and X is a DataFrame, will
+            extract y column from X with the provided self.target value.
+
+        drop_cols: list of str, optional
+            Labels of columns ignored in modeling, only applicable to pandas
+            DataFrame X input.
+        '''
+        X, y, _ = self._xy_transform(X, y, drop_cols)
+        y_pred = self.predict(X)
+
+        return u.extended_classification_report(y, y_pred)
+
     def plot_feature_importance(self, X, y=None, filter_class=None, n_jobs=-1,
-                                progressbar=True, drop_cols=None):
+                                n_samples=5000, progressbar=True,
+                                drop_cols=None):
         '''
         Wrapper function for calling the plot_feature_importance function from
         skater, which estimates the feature importance of all columns based on
@@ -1186,20 +1214,23 @@ def plot_feature_importance(self, X, y=None, filter_class=None, n_jobs=-1,
             The number of CPUs to use to compute the feature importances. -1
             means 'all CPUs' (default).
 
+        n_samples: int, optional, default: 5000
+            How many samples to use when computing importance.
+
         progressbar: bool, optional, default: True
             Whether to display progress. This affects which function we use to
             multipool the function execution, where including the progress bar
             results in 10-20% slowdowns.
 
         drop_cols: list of str, optional
-            Labels of columns to ignore in modeling, only applicable to pandas
+            Labels of columns ignored in modeling, only applicable to pandas
             DataFrame X input.
         '''
         X, y, _ = self._xy_transform(X, y, drop_cols)
         X, n_jobs, col_names = self._skater_extract(X, n_jobs)
 
         return u.plot_feature_importance(self, X, y, col_names, filter_class,
-                                         n_jobs, progressbar)
+                                         n_jobs, n_samples, progressbar)
 
     def plot_oovr_dependencies(self, ovr_val, X, y=None, comp_vals=None,
                                drop_cols=None):
@@ -1226,15 +1257,16 @@ def plot_oovr_dependencies(self, ovr_val, X, y=None, comp_vals=None,
             ovr_val class.
 
         drop_cols: list of str, optional
-            Labels of columns to ignore in modeling, only applicable to pandas
+            Labels of columns ignored in modeling, only applicable to pandas
             DataFrame X input.
         '''
         X, y, _ = self._xy_transform(X, y, drop_cols)
         return u.plot_oovr_dependencies(self, ovr_val, X, y, comp_vals)
 
     def plot_partial_dependence(self, X, col, grid_resolution=100,
                                 grid_range=(.05, 0.95), n_jobs=-1,
-                                progressbar=True, drop_cols=None):
+                                n_samples=1000, progressbar=True,
+                                drop_cols=None):
         '''
         Wrapper function for calling the plot_partial_dependence function from
         skater, which estimates the partial dependence of a column based on a
@@ -1271,21 +1303,24 @@ def plot_partial_dependence(self, X, col, grid_resolution=100,
             The number of CPUs to use to compute the partial dependence. -1
             means 'all CPUs' (default).
 
+        n_samples: int, optional, default: 1000
+            How many samples to use when computing partial dependence.
+
         progressbar: bool, optional, default: True
             Whether to display progress. This affects which function we use to
             multipool the function execution, where including the progress bar
             results in 10-20% slowdowns.
 
         drop_cols: list of str, optional
-            Labels of columns to ignore in modeling, only applicable to pandas
+            Labels of columns ignored in modeling, only applicable to pandas
             DataFrame X input.
         '''
         X = self._pred_cleanup(X, drop_cols)
         X, n_jobs, col_names = self._skater_extract(X, n_jobs)
 
         return u.plot_2d_partial_dependence(self, X, col, col_names,
                                             grid_resolution, grid_range,
-                                            n_jobs, progressbar)
+                                            n_jobs, n_samples, progressbar)
 
     def predict(self, X, start=0, drop_cols=None):
         '''
@@ -1304,7 +1339,7 @@ def predict(self, X, start=0, drop_cols=None):
             prediction through full pipeline).
 
         drop_cols: list of str, optional
-            Labels of columns to ignore in modeling, only applicable to pandas
+            Labels of columns ignored in modeling, only applicable to pandas
             DataFrame X input.
 
         Returns
@@ -1358,7 +1393,7 @@ def predict_json(self, row):
             Predicted multi-class target for input row data.
         '''
         row = self._json_transform(row)
-        pred = self.predict2(row)[0]
+        pred = self.predict(row)[0]
         return pred
 
     def predict_proba(self, X, score_type='uniform', drop_cols=None):
@@ -1397,7 +1432,7 @@ def predict_proba(self, X, score_type='uniform', drop_cols=None):
             Acceptable inputs are 'raw', 'chained', and 'uniform'.
 
         drop_cols: list of str, optional
-            Labels of columns to ignore in modeling, only applicable to pandas
+            Labels of columns ignored in modeling, only applicable to pandas
             DataFrame X input.
 
         Returns
@@ -1548,7 +1583,7 @@ def score(self, X, y=None, sample_weight=None, drop_cols=None):
             Sample weights.
 
         drop_cols: list of str, optional
-            Labels of columns to ignore in modeling, only applicable to pandas
+            Labels of columns ignored in modeling, only applicable to pandas
             DataFrame X input.
 
         Returns

diff --git a/OrderedOVRClassifier/oovr_utils.py b/OrderedOVRClassifier/oovr_utils.py
@@ -17,6 +17,12 @@
 except ImportError:
     skater_loaded = False
 
+try:
+    from tqdm import tqdm
+    tqdm_loaded = True
+except ImportError:
+    tqdm_loaded = False
+
 
 class OOVR_Model(object):
     '''
@@ -157,7 +163,7 @@ def extended_classification_report(y_true, y_pred):
 
 def plot_2d_partial_dependence(oovr, X, col, col_names=None,
                                grid_resolution=100, grid_range=(.05, 0.95),
-                               n_jobs=-1, progressbar=True):
+                               n_jobs=-1, n_samples=1000, progressbar=True):
     '''
     Wrapper function for calling the plot_partial_dependence function from
     skater, which estimates the partial dependence of a column based on a
@@ -200,6 +206,9 @@ def plot_2d_partial_dependence(oovr, X, col, col_names=None,
         The number of CPUs to use to compute the partial dependence. -1 means
         'all CPUs' (default).
 
+    n_samples: int, optional, default: 1000
+        How many samples to use when computing partial dependence.
+
     progressbar: bool, optional, default: True
         Whether to display progress. This affects which function we use to
         multipool the function execution, where including the progress bar
@@ -221,7 +230,8 @@ def plot_2d_partial_dependence(oovr, X, col, col_names=None,
 
     fig = pdep.plot_partial_dependence([col], pyint_model, with_variance=False,
                                        grid_resolution=grid_resolution,
-                                       grid_range=grid_range, n_jobs=n_jobs,
+                                       grid_range=grid_range,
+                                       n_jobs=n_jobs, n_samples=n_samples,
                                        progressbar=progressbar)
 
     for i, f in enumerate(fig[0]):
@@ -233,7 +243,7 @@ def plot_2d_partial_dependence(oovr, X, col, col_names=None,
 
 
 def plot_feature_importance(oovr, X, y, col_names=None, filter_class=None,
-                            n_jobs=-1, progressbar=True):
+                            n_jobs=-1, n_samples=5000, progressbar=True):
     '''
     Wrapper function for calling the plot_feature_importance function from
     skater, which estimates the feature importance of all columns based on a
@@ -271,6 +281,9 @@ def plot_feature_importance(oovr, X, y, col_names=None, filter_class=None,
         The number of CPUs to use to compute the feature importances. -1 means
         'all CPUs' (default).
 
+    n_samples: int, optional, default: 5000
+        How many samples to use when computing importance.
+
     progressbar: bool, optional, default: True
         Whether to display progress. This affects which function we use to
         multipool the function execution, where including the progress bar
@@ -296,9 +309,11 @@ def plot_feature_importance(oovr, X, y, col_names=None, filter_class=None,
     pyint_model = InMemoryModel(oovr.predict_proba, target_names=target_names,
                                 examples=X)
 
-    fig, ax = feat.plot_feature_importance(pyint_model, n_jobs=n_jobs,
-                                           progressbar=progressbar,
-                                           filter_classes=filter_class)
+    fig, ax = feat.plot_feature_importance(pyint_model,
+                                           filter_classes=filter_class,
+                                           n_jobs=n_jobs,
+                                           n_samples=n_samples,
+                                           progressbar=progressbar)
     fig.set_size_inches(18.5, max(ax.get_ylim()[1] / 4, 10.5))
     ax.set_title(title)
 
@@ -382,21 +397,32 @@ def plot_oovr_dependencies(oovr, ovr_val, X, y, comp_vals=None):
     pred_partial = pred_partial.reshape(-1, 100).T
 
     # ============================================================
-    def accuracy_compute(y_pred):
+    def accuracy_compute(y_pred, pbar=None):
         # note: y and mask are not local to this function
         accs = m.accuracy_score(y[~mask], y_pred[~mask])
+        if pbar is not None:
+            pbar.update()
         return accs
 
-    def classification_compute(y_pred):
+    def classification_compute(y_pred, pbar=None):
         # note: y and mask are not local to this function
         prf = m.precision_recall_fscore_support(y[~mask], y_pred[~mask],
+                                                warn_for=(),
                                                 pos_label=None)[0:3]
         prf = np.ravel(np.vstack(prf)[:, cols_slice].T)
+        if pbar is not None:
+            pbar.update()
         return prf
     # ============================================================
 
     # Calculate accuracy scores across thresholds
-    accs = np.apply_along_axis(accuracy_compute, 1, pred_partial)
+    if tqdm_loaded:
+        pbar1 = tqdm(total=100, desc='Calculating accuracy')
+        accs = np.apply_along_axis(accuracy_compute, 1, pred_partial,
+                                   pbar=pbar1)
+        pbar1.close()
+    else:
+        accs = np.apply_along_axis(accuracy_compute, 1, pred_partial)
 
     # Plot accuracy as a function of threshold for OVR classifier
     pd.DataFrame(accs, index=np.arange(0, 1.00, 0.01),
@@ -412,7 +438,14 @@ def classification_compute(y_pred):
     plt.show(block=False)
 
     # Calculate precision, recall, f1 scores across thresholds
-    prf = np.apply_along_axis(classification_compute, 1, pred_partial)
+    if tqdm_loaded:
+        pbar2 = tqdm(total=100, desc='Calculating precision, recall, and f1')
+        prf = np.apply_along_axis(classification_compute, 1, pred_partial,
+                                  pbar=pbar2)
+        pbar2.close()
+    else:
+        prf = np.apply_along_axis(classification_compute, 1, pred_partial)
+
     cols = [['precision_' + str(l), 'recall_' + str(l), 'f1_' + str(l)]
             for l in oovr._le.inverse_transform(cols_slice)]
     cols = np.ravel(cols)