Skip to content

Commit

Permalink
added multiclassification_report method
Browse files Browse the repository at this point in the history
  • Loading branch information
alvinthai committed Dec 27, 2017
1 parent 75bfaf8 commit 1e58940
Show file tree
Hide file tree
Showing 4 changed files with 251 additions and 160 deletions.
63 changes: 49 additions & 14 deletions OrderedOVRClassifier/classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -692,7 +692,7 @@ def _pred_cleanup(self, X, drop_cols):
Data used for predictions.
drop_cols: list of str, optional
Labels of columns to ignore in modeling, only applicable to pandas
Labels of columns ignored in modeling, only applicable to pandas
DataFrame X input.
Returns
Expand Down Expand Up @@ -761,7 +761,8 @@ def _skater_extract(self, X, n_jobs):
else:
col_names = None

no_multipooling = ['lightgbm.sklearn', 'sklearn.linear_model.logistic']
no_multipooling = ['lightgbm.sklearn', 'xgboost.sklearn',
'sklearn.linear_model.logistic']

for _, clf in self.pipeline:
if clf.__module__ in no_multipooling:
Expand All @@ -770,7 +771,8 @@ def _skater_extract(self, X, n_jobs):

return X, n_jobs, col_names

def _xg_cleanup(self, clf):
@staticmethod
def _xg_cleanup(clf):
'''
Utility function to delete the Booster.feature_names attributes in
XGBClassifier. Deleting this attribute allows XGBClassifier to make
Expand Down Expand Up @@ -1152,8 +1154,34 @@ def fit_test_ovr(self, model, ovr_val, X, y=None, eval_set=None,

return model

def multiclassification_report(self, X, y=None, drop_cols=None):
    '''
    Produce an extended classification report for this classifier's
    predictions on X. Delegates to extended_classification_report (an
    extension of sklearn.metrics.classification_report), which builds a
    text report showing the main classification metrics together with the
    total count of multiclass predictions per class.

    Parameters
    ----------
    X: array-like, shape = [n_samples, n_features]
        Data used for predictions.

    y: array-like, shape = [n_samples, ], optional
        True labels for X. If not provided and X is a DataFrame, will
        extract y column from X with the provided self.target value.

    drop_cols: list of str, optional
        Labels of columns ignored in modeling, only applicable to pandas
        DataFrame X input.

    Returns
    -------
    The value returned by extended_classification_report for the true
    labels and the predicted labels (presumably the text report; confirm
    against oovr_utils).
    '''
    # Normalise the inputs first so the true labels and the features fed
    # to predict() come from the same transformed data.
    features, true_labels, _ = self._xy_transform(X, y, drop_cols)
    return u.extended_classification_report(true_labels,
                                            self.predict(features))

def plot_feature_importance(self, X, y=None, filter_class=None, n_jobs=-1,
progressbar=True, drop_cols=None):
n_samples=5000, progressbar=True,
drop_cols=None):
'''
Wrapper function for calling the plot_feature_importance function from
skater, which estimates the feature importance of all columns based on
Expand Down Expand Up @@ -1186,20 +1214,23 @@ def plot_feature_importance(self, X, y=None, filter_class=None, n_jobs=-1,
The number of CPUs to use to compute the feature importances. -1
means 'all CPUs' (default).
n_samples: int, optional, default: 5000
How many samples to use when computing importance.
progressbar: bool, optional, default: True
Whether to display progress. This affects which function we use to
multipool the function execution, where including the progress bar
results in 10-20% slowdowns.
drop_cols: list of str, optional
Labels of columns to ignore in modeling, only applicable to pandas
Labels of columns ignored in modeling, only applicable to pandas
DataFrame X input.
'''
X, y, _ = self._xy_transform(X, y, drop_cols)
X, n_jobs, col_names = self._skater_extract(X, n_jobs)

return u.plot_feature_importance(self, X, y, col_names, filter_class,
n_jobs, progressbar)
n_jobs, n_samples, progressbar)

def plot_oovr_dependencies(self, ovr_val, X, y=None, comp_vals=None,
drop_cols=None):
Expand All @@ -1226,15 +1257,16 @@ def plot_oovr_dependencies(self, ovr_val, X, y=None, comp_vals=None,
ovr_val class.
drop_cols: list of str, optional
Labels of columns to ignore in modeling, only applicable to pandas
Labels of columns ignored in modeling, only applicable to pandas
DataFrame X input.
'''
X, y, _ = self._xy_transform(X, y, drop_cols)
return u.plot_oovr_dependencies(self, ovr_val, X, y, comp_vals)

def plot_partial_dependence(self, X, col, grid_resolution=100,
grid_range=(.05, 0.95), n_jobs=-1,
progressbar=True, drop_cols=None):
n_samples=1000, progressbar=True,
drop_cols=None):
'''
Wrapper function for calling the plot_partial_dependence function from
skater, which estimates the partial dependence of a column based on a
Expand Down Expand Up @@ -1271,21 +1303,24 @@ def plot_partial_dependence(self, X, col, grid_resolution=100,
The number of CPUs to use to compute the partial dependence. -1
means 'all CPUs' (default).
n_samples: int, optional, default: 1000
How many samples to use when computing partial dependence.
progressbar: bool, optional, default: True
Whether to display progress. This affects which function we use to
multipool the function execution, where including the progress bar
results in 10-20% slowdowns.
drop_cols: list of str, optional
Labels of columns to ignore in modeling, only applicable to pandas
Labels of columns ignored in modeling, only applicable to pandas
DataFrame X input.
'''
X = self._pred_cleanup(X, drop_cols)
X, n_jobs, col_names = self._skater_extract(X, n_jobs)

return u.plot_2d_partial_dependence(self, X, col, col_names,
grid_resolution, grid_range,
n_jobs, progressbar)
n_jobs, n_samples, progressbar)

def predict(self, X, start=0, drop_cols=None):
'''
Expand All @@ -1304,7 +1339,7 @@ def predict(self, X, start=0, drop_cols=None):
prediction through full pipeline).
drop_cols: list of str, optional
Labels of columns to ignore in modeling, only applicable to pandas
Labels of columns ignored in modeling, only applicable to pandas
DataFrame X input.
Returns
Expand Down Expand Up @@ -1358,7 +1393,7 @@ def predict_json(self, row):
Predicted multi-class target for input row data.
'''
row = self._json_transform(row)
pred = self.predict2(row)[0]
pred = self.predict(row)[0]
return pred

def predict_proba(self, X, score_type='uniform', drop_cols=None):
Expand Down Expand Up @@ -1397,7 +1432,7 @@ def predict_proba(self, X, score_type='uniform', drop_cols=None):
Acceptable inputs are 'raw', 'chained', and 'uniform'.
drop_cols: list of str, optional
Labels of columns to ignore in modeling, only applicable to pandas
Labels of columns ignored in modeling, only applicable to pandas
DataFrame X input.
Returns
Expand Down Expand Up @@ -1548,7 +1583,7 @@ def score(self, X, y=None, sample_weight=None, drop_cols=None):
Sample weights.
drop_cols: list of str, optional
Labels of columns to ignore in modeling, only applicable to pandas
Labels of columns ignored in modeling, only applicable to pandas
DataFrame X input.
Returns
Expand Down
53 changes: 43 additions & 10 deletions OrderedOVRClassifier/oovr_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@
except ImportError:
skater_loaded = False

try:
from tqdm import tqdm
tqdm_loaded = True
except ImportError:
tqdm_loaded = False


class OOVR_Model(object):
'''
Expand Down Expand Up @@ -157,7 +163,7 @@ def extended_classification_report(y_true, y_pred):

def plot_2d_partial_dependence(oovr, X, col, col_names=None,
grid_resolution=100, grid_range=(.05, 0.95),
n_jobs=-1, progressbar=True):
n_jobs=-1, n_samples=1000, progressbar=True):
'''
Wrapper function for calling the plot_partial_dependence function from
skater, which estimates the partial dependence of a column based on a
Expand Down Expand Up @@ -200,6 +206,9 @@ def plot_2d_partial_dependence(oovr, X, col, col_names=None,
The number of CPUs to use to compute the partial dependence. -1 means
'all CPUs' (default).
n_samples: int, optional, default: 1000
How many samples to use when computing partial dependence.
progressbar: bool, optional, default: True
Whether to display progress. This affects which function we use to
multipool the function execution, where including the progress bar
Expand All @@ -221,7 +230,8 @@ def plot_2d_partial_dependence(oovr, X, col, col_names=None,

fig = pdep.plot_partial_dependence([col], pyint_model, with_variance=False,
grid_resolution=grid_resolution,
grid_range=grid_range, n_jobs=n_jobs,
grid_range=grid_range,
n_jobs=n_jobs, n_samples=n_samples,
progressbar=progressbar)

for i, f in enumerate(fig[0]):
Expand All @@ -233,7 +243,7 @@ def plot_2d_partial_dependence(oovr, X, col, col_names=None,


def plot_feature_importance(oovr, X, y, col_names=None, filter_class=None,
n_jobs=-1, progressbar=True):
n_jobs=-1, n_samples=5000, progressbar=True):
'''
Wrapper function for calling the plot_feature_importance function from
skater, which estimates the feature importance of all columns based on a
Expand Down Expand Up @@ -271,6 +281,9 @@ def plot_feature_importance(oovr, X, y, col_names=None, filter_class=None,
The number of CPUs to use to compute the feature importances. -1 means
'all CPUs' (default).
n_samples: int, optional, default: 5000
How many samples to use when computing importance.
progressbar: bool, optional, default: True
Whether to display progress. This affects which function we use to
multipool the function execution, where including the progress bar
Expand All @@ -296,9 +309,11 @@ def plot_feature_importance(oovr, X, y, col_names=None, filter_class=None,
pyint_model = InMemoryModel(oovr.predict_proba, target_names=target_names,
examples=X)

fig, ax = feat.plot_feature_importance(pyint_model, n_jobs=n_jobs,
progressbar=progressbar,
filter_classes=filter_class)
fig, ax = feat.plot_feature_importance(pyint_model,
filter_classes=filter_class,
n_jobs=n_jobs,
n_samples=n_samples,
progressbar=progressbar)
fig.set_size_inches(18.5, max(ax.get_ylim()[1] / 4, 10.5))
ax.set_title(title)

Expand Down Expand Up @@ -382,21 +397,32 @@ def plot_oovr_dependencies(oovr, ovr_val, X, y, comp_vals=None):
pred_partial = pred_partial.reshape(-1, 100).T

# ============================================================
def accuracy_compute(y_pred):
def accuracy_compute(y_pred, pbar=None):
# note: y and mask are not local to this function (taken from enclosing scope)
accs = m.accuracy_score(y[~mask], y_pred[~mask])
if pbar is not None:
pbar.update()
return accs

def classification_compute(y_pred):
def classification_compute(y_pred, pbar=None):
# note: y, mask and cols_slice are not local to this function
prf = m.precision_recall_fscore_support(y[~mask], y_pred[~mask],
warn_for=(),
pos_label=None)[0:3]
prf = np.ravel(np.vstack(prf)[:, cols_slice].T)
if pbar is not None:
pbar.update()
return prf
# ============================================================

# Calculate accuracy scores across thresholds
accs = np.apply_along_axis(accuracy_compute, 1, pred_partial)
if tqdm_loaded:
pbar1 = tqdm(total=100, desc='Calculating accuracy')
accs = np.apply_along_axis(accuracy_compute, 1, pred_partial,
pbar=pbar1)
pbar1.close()
else:
accs = np.apply_along_axis(accuracy_compute, 1, pred_partial)

# Plot accuracy as a function of threshold for OVR classifier
pd.DataFrame(accs, index=np.arange(0, 1.00, 0.01),
Expand All @@ -412,7 +438,14 @@ def classification_compute(y_pred):
plt.show(block=False)

# Calculate precision, recall, f1 scores across thresholds
prf = np.apply_along_axis(classification_compute, 1, pred_partial)
if tqdm_loaded:
pbar2 = tqdm(total=100, desc='Calculating precision, recall, and f1')
prf = np.apply_along_axis(classification_compute, 1, pred_partial,
pbar=pbar2)
pbar2.close()
else:
prf = np.apply_along_axis(classification_compute, 1, pred_partial)

cols = [['precision_' + str(l), 'recall_' + str(l), 'f1_' + str(l)]
for l in oovr._le.inverse_transform(cols_slice)]
cols = np.ravel(cols)
Expand Down
Loading

0 comments on commit 1e58940

Please sign in to comment.