
Commit

add runtime CPU big-performance-little core
huangzhengxiang committed Dec 13, 2024
1 parent 0046e50 commit 68a53de
Showing 16 changed files with 150 additions and 76 deletions.
11 changes: 7 additions & 4 deletions include/MNN/Interpreter.hpp
@@ -219,10 +219,9 @@ class MNN_PUBLIC Interpreter {
// 0: Close dynamic quant; 1: per batch quant; 2: per tensor quant
DYNAMIC_QUANT_OPTIONS = 5,

// For Mobile CPU with big-litter core, set decrease rate to let MNN divide task differential by CPU's performance
// 0-100, 50 means litter core has 50% capacity of large core
// Default is 50
CPU_LITTLECORE_DECREASE_RATE = 6,
// For Mobile CPU with big-little core, set decrease rate to let MNN divide task differential by CPU's performance
// 0-100, e.g., 70 means performance core has 70% capacity of large core
CPU_PERFORMANCECORE_DECREASE_RATE = 6,

// 0: Do not quantize
// 1: Only quantize key, use int8 asymmetric quantization
@@ -236,6 +235,10 @@
KVCACHE_SIZE_LIMIT = 8,
// Op encoder number for commit
OP_ENCODER_NUMBER_FOR_COMMIT = 9,

// For Mobile CPU with big-little core, set decrease rate to let MNN divide task differential by CPU's performance
// 0-100, e.g., 20 means little core has 20% capacity of large core
CPU_LITTLECORE_DECREASE_RATE = 10,
};

enum ExternalPathType {
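For context, here is a minimal sketch of how these two hints might be applied from user code, assuming the usual MNN session flow and the setSessionHint API declared in this header; the concrete values 70 and 20 are illustrative tuning choices, not taken from this commit:

#include <MNN/Interpreter.hpp>

// Hypothetical tuning: middle/performance cores assumed to run at ~70% and
// little cores at ~20% of a big core; set before the session is created.
void configureCoreRates(MNN::Interpreter* net) {
    net->setSessionHint(MNN::Interpreter::CPU_PERFORMANCECORE_DECREASE_RATE, 70);
    net->setSessionHint(MNN::Interpreter::CPU_LITTLECORE_DECREASE_RATE, 20);
    MNN::ScheduleConfig config;
    config.numThread = 4;   // threads may land on mixed core tiers
    net->createSession(config);
}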
106 changes: 76 additions & 30 deletions source/backend/cpu/CPUBackend.cpp
@@ -37,6 +37,13 @@
#include "x86_x64/AVX2Backend.hpp"
#endif

#ifdef MNN_USE_THREAD_POOL
#include <sys/types.h>
#include <sched.h>
#include "core/Concurrency.h"
#endif

// #define UNIFORM_DIVIDE
#define MNN_CPU_MAX_BUFFER_INDEX 2
#define MNN_CPU_CHECK_NAN 1
#define MNN_CPU_USE_DEFAULT_BACKEND 4
@@ -48,8 +55,18 @@ ErrorCode CastWrapExecution::onExecute(const std::vector<Tensor*>& inputs, const
CPUCastCreator::cast(inputs[0], outputs[0], cpuBackend, convertType);
return NO_ERROR;
}

float getTotalRate(const std::vector<float>& rate, const std::vector<int>& cpuids) {
float totalRate = 0.0f;
for (auto id : cpuids) {
totalRate += rate[id];
}
return totalRate;
}

// Shall be called in onExecute!!!
void CPUBackend::computeDivideSizes(int size, int* dst, float avgDiv) const {
if (mGroupWithComputeRate.size() <= 1 || (avgDiv > 0 && avgDiv < mComputeI)) {
if (mGroupWithComputeRate.size() <= 1 || (avgDiv > 0.0f && avgDiv < mComputeI)) {
// Avg divide
int length = UP_DIV(size, mThreadNumber);
int cur = length;
@@ -61,18 +78,43 @@ void CPUBackend::computeDivideSizes(int size, int* dst, float avgDiv) const {
return;
}

#if defined(MNN_USE_THREAD_POOL) && !defined(UNIFORM_DIVIDE)
// probe cpuid
std::vector<int> cpuids(mThreadNumber);
Backend* this_ptr = (Backend*)this;
std::function<void(int)> mGetCPUId = [&](int tId) {
// MNN_PRINT("Current id: %d, tid: %d, cpuid: %d\n", tId, gettid(), sched_getcpu());
cpuids[tId] = sched_getcpu();
};
std::function<Backend*(void)> backend = [=](void) {
return this_ptr;
};
MNN_CONCURRENCY_BEGIN(tId, mThreadNumber) {
mGetCPUId((int)tId);
}
MNN_CONCURRENCY_END();

// assign workload according to physical cpu power.
float totalRate = getTotalRate(mGroupWithComputeRate, cpuids);
int cur = 0;
int curPos = 0;
for (auto& group : mGroupWithComputeRate) {
int currentGroupTotal = (int)(ceilf((float)size*group.first));
int length = UP_DIV(currentGroupTotal, group.second);
for (int i=0; i<group.second; ++i) {
cur = cur + length;
cur = ALIMIN(cur, size);
dst[curPos+i] = cur;
}
curPos += group.second;
for (int i=0; i<mThreadNumber; ++i) {
// MNN_PRINT("totalRate: %.4f, rate: %.4f\n", totalRate, mGroupWithComputeRate[cpuids[i]]/totalRate);
int length = (int)(ceilf((float)size*(mGroupWithComputeRate[cpuids[i]]/totalRate)));
cur += length;
cur = ALIMIN(cur, size);
dst[i] = cur;
}
MNN_ASSERT(cur==size);
#else
// Avg divide
int length = UP_DIV(size, mThreadNumber);
int cur = length;
for (int i=0; i<mThreadNumber; ++i) {
dst[i] = cur;
cur = cur + length;
cur = ALIMIN(cur, size);
}
#endif
}
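In words: when the thread pool is enabled (and UNIFORM_DIVIDE is not defined), each worker first probes sched_getcpu() to learn which physical core it is actually running on, and the workload is then split in proportion to that core's recorded rate; the avgDiv < mComputeI guard appears to keep the plain even split for low arithmetic-intensity, memory-bound work, where skewing toward big cores buys little. A standalone sketch of the proportional split, simplified from the code above (rates stands in for mGroupWithComputeRate indexed by the probed cpu ids):

#include <algorithm>
#include <cmath>
#include <vector>

// dst[i] becomes the exclusive end offset of thread i's slice of `size`.
std::vector<int> divideByRate(int size, const std::vector<float>& rates) {
    float total = 0.f;
    for (float r : rates) total += r;
    std::vector<int> dst(rates.size());
    int cur = 0;
    for (size_t i = 0; i < rates.size(); ++i) {
        cur += (int)std::ceil(size * (rates[i] / total));
        cur = std::min(cur, size);   // clamp so the last slice ends at size
        dst[i] = cur;
    }
    return dst;
}
// Example: size = 100, rates {1.0, 1.0, 0.2, 0.2} -> dst = {42, 84, 93, 100}.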

void CPURuntime::_bindCPUCore() const {
@@ -372,8 +414,12 @@ CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode p
if (mThreadNumber <= 1 || mRuntime->mPower == BackendConfig::Power_Low) {
break;
}
auto rate = mRuntime->hint().cpuDecreaseRate;
if (rate >= 100 || rate <= 0) {
auto middleRate = mRuntime->hint().cpuMiddleDecreaseRate;
auto littleRate = mRuntime->hint().cpuLittleDecreaseRate;
if (middleRate >= 100 || middleRate <= 0) {
break;
}
if (littleRate >= 100 || littleRate <= 0) {
break;
}
auto cpuInfo = MNNGetCPUInfo();
Expand All @@ -387,24 +433,24 @@ CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode p
} else {
mComputeI = 7.f;
}
// initialize rate.
mGroupWithComputeRate.clear();
float decreaseRate = (float)(rate) / 100.0f;
int validCpuSize = (int)(cpuInfo->groups[cpuInfo->groups.size()-1].ids.size());
int groupIndex = (int)cpuInfo->groups.size()-2;
validCpuSize = ALIMIN(validCpuSize, mThreadNumber);
float totalComputeRate = 1.0f * validCpuSize;
mGroupWithComputeRate.emplace_back(std::make_pair(totalComputeRate, validCpuSize));
float currentRate = 1.0f;
while (validCpuSize < mThreadNumber && groupIndex >= 0) {
auto& group = cpuInfo->groups[groupIndex];
int selectSize = ALIMIN(mThreadNumber - validCpuSize, (int)group.ids.size());
validCpuSize += group.ids.size();
currentRate *= decreaseRate;
totalComputeRate += currentRate * selectSize;
mGroupWithComputeRate.emplace_back(std::make_pair(currentRate * selectSize, selectSize));
}
for (auto& g : mGroupWithComputeRate) {
g.first = g.first / totalComputeRate;
mGroupWithComputeRate.resize(cpuInfo->cpuNumber, 1.0f);
int groupIndex = (int)cpuInfo->groups.size()-1;
// initialize to be little-core.
std::vector<float> rate((int)cpuInfo->groups.size(), (float)(littleRate) / 100.0f);
// set big core
rate[(int)cpuInfo->groups.size()-1] = 1.0f;
// set middle core
if ((int)cpuInfo->groups.size() >= 2) {
rate[(int)cpuInfo->groups.size()-2] = (float)(middleRate) / 100.0f;
}
while (groupIndex >= 0) {
auto& group = cpuInfo->groups[groupIndex].ids;
for (auto cpuid : group) {
mGroupWithComputeRate[cpuid] = rate[groupIndex];
}
groupIndex--;
}
} while (false);
auto dynamicAlloc = mRuntime->mSharedDmaInfo;
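A worked example of the new per-cpu rate table (hypothetical hardware, not taken from the commit): suppose MNNGetCPUInfo reports three groups, little cores {0,1,2,3}, middle cores {4,5,6} and one big core {7}, and the hints give littleRate = 20 and middleRate = 70. The loop above then fills mGroupWithComputeRate = {0.2, 0.2, 0.2, 0.2, 0.7, 0.7, 0.7, 1.0}, indexed by cpu id. If a four-thread run later probes its workers onto cpus 7, 6, 5 and 4, the total rate is 3.1, so a workload of size 310 splits as roughly 100 / 70 / 70 / 70 items per thread.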
2 changes: 1 addition & 1 deletion source/backend/cpu/CPUBackend.hpp
@@ -183,7 +183,7 @@ class CPUBackend : public Backend {
CoreInt8Functions* mInt8CoreFunctions;
private:
int mThreadNumber;
std::vector<std::pair<float, int>> mGroupWithComputeRate;
std::vector<float> mGroupWithComputeRate;
float mComputeI = 0.f;

std::shared_ptr<CPURuntime::DynamicAllocator> mDmaInfo;
14 changes: 3 additions & 11 deletions source/backend/cpu/CPUConvolutionDepthwise.cpp
@@ -171,17 +171,9 @@ ErrorCode CPUConvolutionDepthwise::BasicFloatExecution::onResize(const std::vect
auto batch = inputs[0]->batch();
int total = batch * dst_depth_quad;
int numberThread = ((CPUBackend*)backend())->threadNumber();
std::vector<int> divides(numberThread+1);
divides[0] = 0;
static_cast<CPUBackend *>(backend())->computeDivideSizes(total, divides.data()+1);
mTotalWork = total;
divides.resize(numberThread+1, 0);
mNumber = numberThread;
for (int i=1; i<numberThread; ++i) {
if (divides[i+1] <= divides[i]) {
// Only 0-(i-1) thread has work
mNumber = i;
break;
}
}
MNN_ASSERT(mNumber > 0);
auto postData = getPostParameters();
if (static_cast<CPUBackend*>(backend())->functions()->bytes < 4) {
@@ -204,7 +196,6 @@ ErrorCode CPUConvolutionDepthwise::BasicFloatExecution::onResize(const std::vect
src_y_step = paddedWidth * unit;
}
mExecutor = [=](const uint8_t* inputPtr, uint8_t* outputPtr, int tId) {
MNN_ASSERT(divides[tId] < divides[tId+1]);
const auto inputPadPtr = mInputPad->host<uint8_t>() + mInputPad->stride(0) * tId * bytes;
::memset(inputPadPtr, 0, mInputPad->stride(0) * bytes);
auto biasP = inputs[2]->host<uint8_t>();
@@ -239,6 +230,7 @@ ErrorCode CPUConvolutionDepthwise::BasicFloatExecution::onExecute(const std::vec
auto outputTensor = outputs[0];
const auto srcOrigin = inputTensor->host<uint8_t>();
auto dstOrigin = outputTensor->host<uint8_t>();
static_cast<CPUBackend *>(backend())->computeDivideSizes(mTotalWork, divides.data()+1);
MNN_CONCURRENCY_BEGIN(tId, mNumber) {
mExecutor(srcOrigin, dstOrigin, (int)tId);
}
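The same refactoring repeats in the executors below: onResize now only records the total amount of work and zero-fills the divides buffer, while the actual computeDivideSizes call moves into onExecute, where the sched_getcpu() probe reflects the cores the pool threads are really running on, so the split can differ from run to run (the earlier onResize-time trimming of mNumber for threads with empty ranges is dropped for the same reason). A minimal sketch of the pattern, with simplified stand-in types rather than the real MNN classes:

#include <functional>
#include <vector>

// Sketch of the resize/execute split used by the touched executors.
struct ExecutorSketch {
    std::vector<int> divides;   // prefix-sum end offsets; divides[0] stays 0
    int totalWork = 0;

    void onResize(int work, int threadNumber) {
        totalWork = work;
        divides.assign(threadNumber + 1, 0);   // size once, fill at run time
    }

    void onExecute(const std::function<void(int, int*)>& computeDivideSizes) {
        // Re-split on every run: the thread-to-core mapping is only known now.
        computeDivideSizes(totalWork, divides.data() + 1);
        // worker i then processes the half-open range [divides[i], divides[i+1])
    }
};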
2 changes: 2 additions & 0 deletions source/backend/cpu/CPUConvolutionDepthwise.hpp
@@ -30,6 +30,8 @@ class CPUConvolutionDepthwise {
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep)> mFastKernel;
int mNumber = 1;
std::vector<int> divides;
int mTotalWork;
std::shared_ptr<Tensor> mInputPad;
bool mFastKernelApply = false;
};
30 changes: 21 additions & 9 deletions source/backend/cpu/compute/ConvInt8TiledExecutor.cpp
@@ -515,6 +515,17 @@ void DenseConvInt8TiledExecutor::getPackParameter(int* Unit, int* srcUnit, int*
core->MNNGetGemmUnit(Unit, srcUnit, DestUnit);
}

void DenseConvInt8TiledExecutor::computeDivideSizes() {
if (mSplitByOc) {
static_cast<CPUBackend *>(backend())->computeDivideSizes(mTotalWork, mDivides.data() + 1, mflops / mios);
for (int i = 0; i < mDivides.size(); ++i) {
mDivides[i] *= mPart;
}
} else {
static_cast<CPUBackend *>(backend())->computeDivideSizes(mTotalWork, mDivides.data() + 1, mflops / mios);
}
}


ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
mUseBatchQuan = (static_cast<CPUBackend*>(backend())->getRuntime()->hint().dynamicQuantOption == 1);
@@ -631,21 +642,21 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& input
}
mThreadNums = ALIMIN(threads, threadNeed);
mSplitByOc = true;
mTotalWork = totalWork;
mPart = part;

mDivides.resize(threads+1);
mDivides[0] = 0;
static_cast<CPUBackend *>(backend())->computeDivideSizes(totalWork, mDivides.data() + 1, flop / ios);
for (int i = 0; i < mDivides.size(); ++i) {
mDivides[i] *= part;
}
}

if (!mSplitByOc) {
mThreadNums = ALIMIN(threads, mTileCount);
mDivides.resize(threads+1);
mDivides[0] = 0;
static_cast<CPUBackend *>(backend())->computeDivideSizes(mTileCount, mDivides.data() + 1, flop / ios);
mTotalWork = mTileCount;
}

// record flop and ios and prepare for divide compute.
mDivides.resize(threads+1, 0);
mflops = flop, mios = ios;


int ocUp4 = ROUND_UP(outC, gcore->pack);
int alphaSize = mResourceInt8->mOriginScale->size() / (sizeof(float) * 2);
int k = mThreadNums;
@@ -1239,6 +1250,7 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu

};
const int threads = static_cast<CPUBackend*>(backend())->threadNumber();
computeDivideSizes();
if (!mSplitByOc) {
MNN_CONCURRENCY_BEGIN(tId, threads) {
ThreadFunction((int)tId, mDivides[tId], mDivides[tId + 1], 1, 0);
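Two details are worth noting in this executor: the flop and byte counts computed at resize time are kept as mflops and mios, and their ratio is passed to CPUBackend::computeDivideSizes as the arithmetic-intensity hint (avgDiv) on every execute; and when splitting by output channel (mSplitByOc), each prefix-sum entry is scaled by mPart afterwards so the offsets are expressed in units of part. For example, with two equally rated threads, mTotalWork = 8 and mPart = 4, mDivides goes from {0, 4, 8} to {0, 16, 32}.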
3 changes: 3 additions & 0 deletions source/backend/cpu/compute/ConvInt8TiledExecutor.hpp
@@ -72,7 +72,10 @@ class DenseConvInt8TiledExecutor : public ConvInt8TiledExecutor {
std::shared_ptr<Tensor> mTempMaxMinValueBuffer;
std::vector<uint8_t> mTempSrcSum;
std::vector<int32_t> mDivides;
void computeDivideSizes();

float mflops, mios;
int mTotalWork, mPart;
int mThreadNums;
int mBlockNum = 1;
int mOcPerThread;
15 changes: 7 additions & 8 deletions source/backend/cpu/compute/ConvolutionPackWinograd.cpp
@@ -127,11 +127,13 @@ bool ConvolutionPackWinograd::onClone(Backend* bn, const Op* op, Execution** dst
}

ErrorCode ConvolutionPackWinograd::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
static_cast<CPUBackend *>(backend())->computeDivideSizes(mTotalWork, divides.data()+1);
MNN_CONCURRENCY_BEGIN(tId, mMainFunction.first) {
mMainFunction.second(tId, inputs[0]->host<uint8_t>(), outputs[0]->host<uint8_t>());
};
MNN_CONCURRENCY_END();

static_cast<CPUBackend *>(backend())->computeDivideSizes(mPostWork, divides.data()+1);
MNN_CONCURRENCY_BEGIN(tId, mPostFunction.first) {
mPostFunction.second(tId, outputs[0]->host<uint8_t>());
};
@@ -261,9 +263,8 @@ ErrorCode ConvolutionPackWinograd::onResize(const std::vector<Tensor *> &inputs,
auto totalCount = wUnit * hUnit * batch;
// MNN_PRINT("ow=%d, oh=%d\n", ow, oh);

std::vector<int> divides(threadNumber+1);
static_cast<CPUBackend *>(backend())->computeDivideSizes(totalCount, divides.data()+1);
divides[0] = 0;
divides.resize(threadNumber+1, 0);
mTotalWork = totalCount;
auto midBuffer0Bytes = srcUnit2 * pack * bytes;
bool allow_x86_bf16_winograd = true;
#ifdef MNN_USE_SSE
@@ -541,15 +542,13 @@ ErrorCode ConvolutionPackWinograd::onResize(const std::vector<Tensor *> &inputs,
/*Dest Transform And Post Treat End*/
}
};
std::vector<int> postDivides(threadNumber+1);
static_cast<CPUBackend *>(backend())->computeDivideSizes(dc_4, postDivides.data()+1);
postDivides[0] = 0;
mPostWork = dc_4;

mPostFunction.first = threadNumber;
mPostFunction.second = [=](int tId, uint8_t* outputOrigin) {
auto dstOrigin = outputOrigin;
int tSta = postDivides[tId];
int tFin = postDivides[tId+1];
int tSta = divides[tId];
int tFin = divides[tId+1];
for (int dy=tSta; dy < tFin; ++dy) {
auto dataFloatPtr = (float*)(dstOrigin + ow * oh * batch * dy * pack * bytes);
auto biasFloatPtr = (const float*)(bias + pack * dy * bytes);
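One subtlety in the Winograd change: the single member vector divides is now reused for both phases. onExecute fills it from mTotalWork (the number of output tiles) before the main function and then overwrites it from mPostWork (dc_4, the output-channel blocks) before the post function, replacing the separate postDivides vector that onResize previously captured by value.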
2 changes: 2 additions & 0 deletions source/backend/cpu/compute/ConvolutionPackWinograd.hpp
@@ -36,6 +36,8 @@ class ConvolutionPackWinograd : public ConvolutionWinogradImpl {
}
std::pair<int, std::function<void(int tId, const uint8_t*, uint8_t*)>> mMainFunction;
std::pair<int, std::function<void(int, uint8_t*)>> mPostFunction;
std::vector<int> divides;
int mTotalWork, mPostWork;

};
} // namespace MNN
12 changes: 5 additions & 7 deletions source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp
@@ -539,9 +539,6 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs

if (mConvPerfconfig.isParallelInner) {
auto rt = static_cast<const CPURuntime*>(backend()->getRuntime());
std::vector<int> ocC4ParralSize(threadNumber + 1);
ocC4ParralSize[0] = 0;
static_cast<CPUBackend *>(backend())->computeDivideSizes(oC4, ocC4ParralSize.data()+1);
mFunction.second = [=](int placeholder) {
const float* biasPtr = bias ? bias->host<float>() : nullptr;
auto gemmBuffer = mTempBufferTranspose.host<uint8_t>() + mTempBufferTranspose.stride(0) * 0;
@@ -601,6 +598,8 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
}
MNN_CONCURRENCY_END();

std::vector<int> ocC4ParralSize(threadNumber + 1, 0);
static_cast<CPUBackend *>(backend())->computeDivideSizes(oC4, ocC4ParralSize.data()+1);
if (xC == eP) {
MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
size_t paraParameters[PARAMETERSIZE];
@@ -669,10 +668,8 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
};

} else {
std::vector<int> divides(threadNumber + 1);
divides[0] = 0;

static_cast<CPUBackend *>(backend())->computeDivideSizes(tileCount, divides.data() + 1);
divides.resize(threadNumber + 1, 0);
mTotalWork = tileCount;

mFunction.second = [=](int tId) {
const float* biasPtr = bias ? bias->host<float>() : nullptr;
@@ -757,6 +754,7 @@ ErrorCode DenseConvolutionTiledImpl::onExecute(const std::vector<Tensor*>& input
if (mConvPerfconfig.isParallelInner) {
mFunction.second(0);
} else {
static_cast<CPUBackend *>(backend())->computeDivideSizes(mTotalWork, divides.data() + 1);
MNN_CONCURRENCY_BEGIN(tId, mFunction.first) {
mFunction.second((int)tId);
}
2 changes: 2 additions & 0 deletions source/backend/cpu/compute/DenseConvolutionTiledExecutor.hpp
@@ -31,6 +31,8 @@ class DenseConvolutionTiledImpl : public ConvolutionTiledImpl {
static PerfConfig bestTileConvolutionConfig(const Convolution2DCommon *common, const Tensor *inputTensor,
const Tensor *outputTensor, int threadNumber, Backend* b);
protected:
std::vector<int> divides;
int mTotalWork;
};
class DenseConvolutionTiledExecutor : public ConvolutionTiledExecutor {
public:
