Skip to content

Commit

Permalink
add options to turn off categorical sorting and sorting by gradients (not updates)
Browse files Browse the repository at this point in the history
  • Loading branch information
paulbkoch committed Dec 31, 2024
1 parent 15c55ac commit 8bb0f56
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 23 deletions.
8 changes: 6 additions & 2 deletions docs/benchmarks/ebm-benchmark.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -206,8 +206,9 @@
" max_samples = None\n",
" n_calibration_folds = 4 # 4 uses all cores on the containers\n",
"\n",
" from interpret.develop import set_option\n",
" from interpret.glassbox import ExplainableBoostingClassifier, ExplainableBoostingRegressor\n",
" from interpret.develop import set_option\n",
" from interpret.utils._native import Native\n",
" from xgboost import XGBClassifier, XGBRegressor, XGBRFClassifier, XGBRFRegressor\n",
" from lightgbm import LGBMClassifier, LGBMRegressor\n",
" from catboost import CatBoostClassifier, CatBoostRegressor\n",
Expand Down Expand Up @@ -235,6 +236,9 @@
" import re\n",
" import random\n",
"\n",
" # turn off AVX512F support since it adds variability in results given spotty architecture support\n",
" set_option(\"acceleration\", ~Native.AccelerationFlags_AVX512F)\n",
" \n",
" random.seed(seed)\n",
" np.random.seed(seed)\n",
"\n",
Expand Down Expand Up @@ -996,7 +1000,7 @@
"#results_df = results_df[results_df['type'] != 'multiclass']\n",
"#results_df = results_df[results_df['type'] != 'regression']\n",
"#\n",
"#results_df = results_df[(results_df['method'] != 'ebm') | (results_df['meta'] == '{}')]\n",
"#results_df = results_df[(results_df['method'] != 'ebm') | (results_df['meta'] == '{\"interactions\": 0}') | (results_df['meta'] == '{}')]\n",
"#results_df = results_df[((results_df['method'] == 'ebm') & (results_df['meta'] == '{}')) | (results_df['method'] == 'xgb')]\n",
"#\n",
"#results_df = results_df[\n",
Expand Down
52 changes: 32 additions & 20 deletions shared/libebm/PartitionOneDimensionalBoosting.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -919,17 +919,24 @@ template<bool bHessian, size_t cCompilerScores> class CompareBin final {
// NEVER check for exact equality (as a precondition is ok), since then we'd violate the weak ordering rule
// https://medium.com/@shiansu/strict-weak-ordering-and-the-c-stl-f7dcfa4d4e07

const bool bUpdateWithHessian = bHessian && m_bHessianRuntime;

const FloatCalc hess1 =
static_cast<FloatCalc>(bUpdateWithHessian ? lhs->GetGradientPairs()[0].GetHess() : lhs->GetWeight());
const FloatCalc val1 =
static_cast<FloatCalc>(lhs->GetGradientPairs()[0].m_sumGradients) / (hess1 + m_categoricalSmoothing);

const FloatCalc hess2 =
static_cast<FloatCalc>(bUpdateWithHessian ? rhs->GetGradientPairs()[0].GetHess() : rhs->GetWeight());
const FloatCalc val2 =
static_cast<FloatCalc>(rhs->GetGradientPairs()[0].m_sumGradients) / (hess2 + m_categoricalSmoothing);
EBM_ASSERT(!std::isnan(m_categoricalSmoothing));

FloatCalc val1 = static_cast<FloatCalc>(lhs->GetGradientPairs()[0].m_sumGradients);
FloatCalc val2 = static_cast<FloatCalc>(rhs->GetGradientPairs()[0].m_sumGradients);
if(!std::isinf(m_categoricalSmoothing)) {
FloatCalc hess1;
FloatCalc hess2;
const bool bUpdateWithHessian = bHessian && m_bHessianRuntime;
if(!bUpdateWithHessian) {
hess1 = static_cast<FloatCalc>(lhs->GetWeight());
hess2 = static_cast<FloatCalc>(rhs->GetWeight());
} else {
hess1 = static_cast<FloatCalc>(lhs->GetGradientPairs()[0].GetHess());
hess2 = static_cast<FloatCalc>(rhs->GetGradientPairs()[0].GetHess());
}
val1 /= (hess1 + m_categoricalSmoothing);
val2 /= (hess2 + m_categoricalSmoothing);
}

if(val1 == val2) {
return lhs < rhs;
Expand Down Expand Up @@ -1130,17 +1137,22 @@ template<bool bHessian, size_t cCompilerScores> class PartitionOneDimensionalBoo
return error;
}

// shuffle
while(size_t{1} != cRemaining) {
const size_t iSwap = pRng->NextFast(cRemaining);
auto* const pTemp = apBins[iSwap];
--cRemaining;
apBins[iSwap] = apBins[cRemaining];
apBins[cRemaining] = pTemp;
const bool bShuffle = 1 != cCompilerScores || std::isnan(categoricalSmoothing);
const bool bSort = 1 == cCompilerScores && !std::isnan(categoricalSmoothing);

EBM_ASSERT(bShuffle || bSort);

if(bShuffle) {
while(size_t{1} != cRemaining) {
const size_t iSwap = pRng->NextFast(cRemaining);
auto* const pTemp = apBins[iSwap];
--cRemaining;
apBins[iSwap] = apBins[cRemaining];
apBins[cRemaining] = pTemp;
}
}

static constexpr bool bSingleScore = 1 == cCompilerScores;
if(bSingleScore) {
if(bSort) {
// there isn't a single key to sort on with multiple grad/hess pairs, so use random ordering otherwise.
std::sort(apBins,
ppBin,
Expand Down
2 changes: 1 addition & 1 deletion shared/libebm/tests/boosting_unusual_inputs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2380,7 +2380,7 @@ static double RandomizedTesting(const AccelerationFlags acceleration) {
}

TEST_CASE("stress test, boosting") {
const double expected = 15044531333054.148;
const double expected = 15111161995602.100;

double validationMetricExact = RandomizedTesting(AccelerationFlags_NONE);
CHECK(validationMetricExact == expected);
Expand Down

0 comments on commit 8bb0f56

Please sign in to comment.