
Commit

add runtime CPU big-performance-little core
huangzhengxiang committed Dec 13, 2024
1 parent 0046e50 commit 68a53de
Showing 16 changed files with 150 additions and 76 deletions.
11 changes: 7 additions & 4 deletions include/MNN/Interpreter.hpp
@@ -219,10 +219,9 @@ class MNN_PUBLIC Interpreter {
// 0: Close dynamic quant; 1: per batch quant; 2: per tensor quant
DYNAMIC_QUANT_OPTIONS = 5,

// For Mobile CPU with big-litter core, set decrease rate to let MNN divide task differential by CPU's performance
// 0-100, 50 means litter core has 50% capacity of large core
// Default is 50
CPU_LITTLECORE_DECREASE_RATE = 6,
// For Mobile CPU with big-little core, set decrease rate to let MNN divide task differential by CPU's performance
// 0-100, e.g., 70 means performance core has 70% capacity of large core
CPU_PERFORMANCECORE_DECREASE_RATE = 6,

// 0: Do not quantize
// 1: Only quantize key, use int8 asymmetric quantization
@@ -236,6 +235,10 @@
KVCACHE_SIZE_LIMIT = 8,
// Op encoder number for commit
OP_ENCODER_NUMBER_FOR_COMMIT = 9,

// For Mobile CPU with big-little core, set decrease rate to let MNN divide task differential by CPU's performance
// 0-100, e.g., 20 means little core has 20% capacity of large core
CPU_LITTLECORE_DECREASE_RATE = 10,
};

enum ExternalPathType {
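For context, here is a minimal sketch of how these two hints might be applied from user code, assuming the usual MNN session flow and the setSessionHint API declared in this header; the concrete values 70 and 20 are illustrative tuning choices, not taken from this commit:

#include <MNN/Interpreter.hpp>

// Hypothetical tuning: middle/performance cores assumed to run at ~70% and
// little cores at ~20% of a big core; set before the session is created.
void configureCoreRates(MNN::Interpreter* net) {
    net->setSessionHint(MNN::Interpreter::CPU_PERFORMANCECORE_DECREASE_RATE, 70);
    net->setSessionHint(MNN::Interpreter::CPU_LITTLECORE_DECREASE_RATE, 20);
    MNN::ScheduleConfig config;
    config.numThread = 4;   // threads may land on mixed core tiers
    net->createSession(config);
}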
106 changes: 76 additions & 30 deletions source/backend/cpu/CPUBackend.cpp
@@ -37,6 +37,13 @@
#include "x86_x64/AVX2Backend.hpp"
#endif

#ifdef MNN_USE_THREAD_POOL
#include <sys/types.h>
#include <sched.h>
#include "core/Concurrency.h"
#endif

// #define UNIFORM_DIVIDE
#define MNN_CPU_MAX_BUFFER_INDEX 2
#define MNN_CPU_CHECK_NAN 1
#define MNN_CPU_USE_DEFAULT_BACKEND 4
@@ -48,8 +55,18 @@ ErrorCode CastWrapExecution::onExecute(const std::vector<Tensor*>& inputs, const
CPUCastCreator::cast(inputs[0], outputs[0], cpuBackend, convertType);
return NO_ERROR;
}

float getTotalRate(const std::vector<float>& rate, const std::vector<int>& cpuids) {
float totalRate = 0.0f;
for (auto id : cpuids) {
totalRate += rate[id];
}
return totalRate;
}

// Shall be called in onExecute!!!
void CPUBackend::computeDivideSizes(int size, int* dst, float avgDiv) const {
if (mGroupWithComputeRate.size() <= 1 || (avgDiv > 0 && avgDiv < mComputeI)) {
if (mGroupWithComputeRate.size() <= 1 || (avgDiv > 0.0f && avgDiv < mComputeI)) {
// Avg divide
int length = UP_DIV(size, mThreadNumber);
int cur = length;
@@ -61,18 +78,43 @@ void CPUBackend::computeDivideSizes(int size, int* dst, float avgDiv) const {
return;
}

#if defined(MNN_USE_THREAD_POOL) && !defined(UNIFORM_DIVIDE)
// probe cpuid
std::vector<int> cpuids(mThreadNumber);
Backend* this_ptr = (Backend*)this;
std::function<void(int)> mGetCPUId = [&](int tId) {
// MNN_PRINT("Current id: %d, tid: %d, cpuid: %d\n", tId, gettid(), sched_getcpu());
cpuids[tId] = sched_getcpu();
};
std::function<Backend*(void)> backend = [=](void) {
return this_ptr;
};
MNN_CONCURRENCY_BEGIN(tId, mThreadNumber) {
mGetCPUId((int)tId);
}
MNN_CONCURRENCY_END();

// assign workload according to physical cpu power.
float totalRate = getTotalRate(mGroupWithComputeRate, cpuids);
int cur = 0;
int curPos = 0;
for (auto& group : mGroupWithComputeRate) {
int currentGroupTotal = (int)(ceilf((float)size*group.first));
int length = UP_DIV(currentGroupTotal, group.second);
for (int i=0; i<group.second; ++i) {
cur = cur + length;
cur = ALIMIN(cur, size);
dst[curPos+i] = cur;
}
curPos += group.second;
for (int i=0; i<mThreadNumber; ++i) {
// MNN_PRINT("totalRate: %.4f, rate: %.4f\n", totalRate, mGroupWithComputeRate[cpuids[i]]/totalRate);
int length = (int)(ceilf((float)size*(mGroupWithComputeRate[cpuids[i]]/totalRate)));
cur += length;
cur = ALIMIN(cur, size);
dst[i] = cur;
}
MNN_ASSERT(cur==size);
#else
// Avg divide
int length = UP_DIV(size, mThreadNumber);
int cur = length;
for (int i=0; i<mThreadNumber; ++i) {
dst[i] = cur;
cur = cur + length;
cur = ALIMIN(cur, size);
}
#endif
}
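In words: when the thread pool is enabled (and UNIFORM_DIVIDE is not defined), each worker first probes sched_getcpu() to learn which physical core it is actually running on, and the workload is then split in proportion to that core's recorded rate; the avgDiv < mComputeI guard appears to keep the plain even split for low arithmetic-intensity, memory-bound work, where skewing toward big cores buys little. A standalone sketch of the proportional split, simplified from the code above (rates stands in for mGroupWithComputeRate indexed by the probed cpu ids):

#include <algorithm>
#include <cmath>
#include <vector>

// dst[i] becomes the exclusive end offset of thread i's slice of `size`.
std::vector<int> divideByRate(int size, const std::vector<float>& rates) {
    float total = 0.f;
    for (float r : rates) total += r;
    std::vector<int> dst(rates.size());
    int cur = 0;
    for (size_t i = 0; i < rates.size(); ++i) {
        cur += (int)std::ceil(size * (rates[i] / total));
        cur = std::min(cur, size);   // clamp so the last slice ends at size
        dst[i] = cur;
    }
    return dst;
}
// Example: size = 100, rates {1.0, 1.0, 0.2, 0.2} -> dst = {42, 84, 93, 100}.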

void CPURuntime::_bindCPUCore() const {
@@ -372,8 +414,12 @@ CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode p
if (mThreadNumber <= 1 || mRuntime->mPower == BackendConfig::Power_Low) {
break;
}
auto rate = mRuntime->hint().cpuDecreaseRate;
if (rate >= 100 || rate <= 0) {
auto middleRate = mRuntime->hint().cpuMiddleDecreaseRate;
auto littleRate = mRuntime->hint().cpuLittleDecreaseRate;
if (middleRate >= 100 || middleRate <= 0) {
break;
}
if (littleRate >= 100 || littleRate <= 0) {
break;
}
auto cpuInfo = MNNGetCPUInfo();
Expand All @@ -387,24 +433,24 @@ CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode p
} else {
mComputeI = 7.f;
}
// initialize rate.
mGroupWithComputeRate.clear();
float decreaseRate = (float)(rate) / 100.0f;
int validCpuSize = (int)(cpuInfo->groups[cpuInfo->groups.size()-1].ids.size());
int groupIndex = (int)cpuInfo->groups.size()-2;
validCpuSize = ALIMIN(validCpuSize, mThreadNumber);
float totalComputeRate = 1.0f * validCpuSize;
mGroupWithComputeRate.emplace_back(std::make_pair(totalComputeRate, validCpuSize));
float currentRate = 1.0f;
while (validCpuSize < mThreadNumber && groupIndex >= 0) {
auto& group = cpuInfo->groups[groupIndex];
int selectSize = ALIMIN(mThreadNumber - validCpuSize, (int)group.ids.size());
validCpuSize += group.ids.size();
currentRate *= decreaseRate;
totalComputeRate += currentRate * selectSize;
mGroupWithComputeRate.emplace_back(std::make_pair(currentRate * selectSize, selectSize));
}
for (auto& g : mGroupWithComputeRate) {
g.first = g.first / totalComputeRate;
mGroupWithComputeRate.resize(cpuInfo->cpuNumber, 1.0f);
int groupIndex = (int)cpuInfo->groups.size()-1;
// initialize to be little-core.
std::vector<float> rate((int)cpuInfo->groups.size(), (float)(littleRate) / 100.0f);
// set big core
rate[(int)cpuInfo->groups.size()-1] = 1.0f;
// set middle core
if ((int)cpuInfo->groups.size() >= 2) {
rate[(int)cpuInfo->groups.size()-2] = (float)(middleRate) / 100.0f;
}
while (groupIndex >= 0) {
auto& group = cpuInfo->groups[groupIndex].ids;
for (auto cpuid : group) {
mGroupWithComputeRate[cpuid] = rate[groupIndex];
}
groupIndex--;
}
} while (false);
auto dynamicAlloc = mRuntime->mSharedDmaInfo;
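A worked example of the new per-cpu rate table (hypothetical hardware, not taken from the commit): suppose MNNGetCPUInfo reports three groups, little cores {0,1,2,3}, middle cores {4,5,6} and one big core {7}, and the hints give littleRate = 20 and middleRate = 70. The loop above then fills mGroupWithComputeRate = {0.2, 0.2, 0.2, 0.2, 0.7, 0.7, 0.7, 1.0}, indexed by cpu id. If a four-thread run later probes its workers onto cpus 7, 6, 5 and 4, the total rate is 3.1, so a workload of size 310 splits as roughly 100 / 70 / 70 / 70 items per thread.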
2 changes: 1 addition & 1 deletion source/backend/cpu/CPUBackend.hpp
@@ -183,7 +183,7 @@ class CPUBackend : public Backend {
CoreInt8Functions* mInt8CoreFunctions;
private:
int mThreadNumber;
std::vector<std::pair<float, int>> mGroupWithComputeRate;
std::vector<float> mGroupWithComputeRate;
float mComputeI = 0.f;

std::shared_ptr<CPURuntime::DynamicAllocator> mDmaInfo;
14 changes: 3 additions & 11 deletions source/backend/cpu/CPUConvolutionDepthwise.cpp
@@ -171,17 +171,9 @@ ErrorCode CPUConvolutionDepthwise::BasicFloatExecution::onResize(const std::vect
auto batch = inputs[0]->batch();
int total = batch * dst_depth_quad;
int numberThread = ((CPUBackend*)backend())->threadNumber();
std::vector<int> divides(numberThread+1);
divides[0] = 0;
static_cast<CPUBackend *>(backend())->computeDivideSizes(total, divides.data()+1);
mTotalWork = total;
divides.resize(numberThread+1, 0);
mNumber = numberThread;
for (int i=1; i<numberThread; ++i) {
if (divides[i+1] <= divides[i]) {
// Only 0-(i-1) thread has work
mNumber = i;
break;
}
}
MNN_ASSERT(mNumber > 0);
auto postData = getPostParameters();
if (static_cast<CPUBackend*>(backend())->functions()->bytes < 4) {
@@ -204,7 +196,6 @@ ErrorCode CPUConvolutionDepthwise::BasicFloatExecution::onResize(const std::vect
src_y_step = paddedWidth * unit;
}
mExecutor = [=](const uint8_t* inputPtr, uint8_t* outputPtr, int tId) {
MNN_ASSERT(divides[tId] < divides[tId+1]);
const auto inputPadPtr = mInputPad->host<uint8_t>() + mInputPad->stride(0) * tId * bytes;
::memset(inputPadPtr, 0, mInputPad->stride(0) * bytes);
auto biasP = inputs[2]->host<uint8_t>();
@@ -239,6 +230,7 @@ ErrorCode CPUConvolutionDepthwise::BasicFloatExecution::onExecute(const std::vec
auto outputTensor = outputs[0];
const auto srcOrigin = inputTensor->host<uint8_t>();
auto dstOrigin = outputTensor->host<uint8_t>();
static_cast<CPUBackend *>(backend())->computeDivideSizes(mTotalWork, divides.data()+1);
MNN_CONCURRENCY_BEGIN(tId, mNumber) {
mExecutor(srcOrigin, dstOrigin, (int)tId);
}
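The same refactoring repeats in the executors below: onResize now only records the total amount of work and zero-fills the divides buffer, while the actual computeDivideSizes call moves into onExecute, where the sched_getcpu() probe reflects the cores the pool threads are really running on, so the split can differ from run to run (the earlier onResize-time trimming of mNumber for threads with empty ranges is dropped for the same reason). A minimal sketch of the pattern, with simplified stand-in types rather than the real MNN classes:

#include <functional>
#include <vector>

// Sketch of the resize/execute split used by the touched executors.
struct ExecutorSketch {
    std::vector<int> divides;   // prefix-sum end offsets; divides[0] stays 0
    int totalWork = 0;

    void onResize(int work, int threadNumber) {
        totalWork = work;
        divides.assign(threadNumber + 1, 0);   // size once, fill at run time
    }

    void onExecute(const std::function<void(int, int*)>& computeDivideSizes) {
        // Re-split on every run: the thread-to-core mapping is only known now.
        computeDivideSizes(totalWork, divides.data() + 1);
        // worker i then processes the half-open range [divides[i], divides[i+1])
    }
};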
2 changes: 2 additions & 0 deletions source/backend/cpu/CPUConvolutionDepthwise.hpp
@@ -30,6 +30,8 @@ class CPUConvolutionDepthwise {
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep)> mFastKernel;
int mNumber = 1;
std::vector<int> divides;
int mTotalWork;
std::shared_ptr<Tensor> mInputPad;
bool mFastKernelApply = false;
};
30 changes: 21 additions & 9 deletions source/backend/cpu/compute/ConvInt8TiledExecutor.cpp
@@ -515,6 +515,17 @@ void DenseConvInt8TiledExecutor::getPackParameter(int* Unit, int* srcUnit, int*
core->MNNGetGemmUnit(Unit, srcUnit, DestUnit);
}

void DenseConvInt8TiledExecutor::computeDivideSizes() {
if (mSplitByOc) {
static_cast<CPUBackend *>(backend())->computeDivideSizes(mTotalWork, mDivides.data() + 1, mflops / mios);
for (int i = 0; i < mDivides.size(); ++i) {
mDivides[i] *= mPart;
}
} else {
static_cast<CPUBackend *>(backend())->computeDivideSizes(mTotalWork, mDivides.data() + 1, mflops / mios);
}
}


ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
mUseBatchQuan = (static_cast<CPUBackend*>(backend())->getRuntime()->hint().dynamicQuantOption == 1);
@@ -631,21 +642,21 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& input
}
mThreadNums = ALIMIN(threads, threadNeed);
mSplitByOc = true;
mTotalWork = totalWork;
mPart = part;

mDivides.resize(threads+1);
mDivides[0] = 0;
static_cast<CPUBackend *>(backend())->computeDivideSizes(totalWork, mDivides.data() + 1, flop / ios);
for (int i = 0; i < mDivides.size(); ++i) {
mDivides[i] *= part;
}
}

if (!mSplitByOc) {
mThreadNums = ALIMIN(threads, mTileCount);
mDivides.resize(threads+1);
mDivides[0] = 0;
static_cast<CPUBackend *>(backend())->computeDivideSizes(mTileCount, mDivides.data() + 1, flop / ios);
mTotalWork = mTileCount;
}

// record flop and ios and prepare for divide compute.
mDivides.resize(threads+1, 0);
mflops = flop, mios = ios;


int ocUp4 = ROUND_UP(outC, gcore->pack);
int alphaSize = mResourceInt8->mOriginScale->size() / (sizeof(float) * 2);
int k = mThreadNums;
@@ -1239,6 +1250,7 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu

};
const int threads = static_cast<CPUBackend*>(backend())->threadNumber();
computeDivideSizes();
if (!mSplitByOc) {
MNN_CONCURRENCY_BEGIN(tId, threads) {
ThreadFunction((int)tId, mDivides[tId], mDivides[tId + 1], 1, 0);
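Two details are worth noting in this executor: the flop and byte counts computed at resize time are kept as mflops and mios, and their ratio is passed to CPUBackend::computeDivideSizes as the arithmetic-intensity hint (avgDiv) on every execute; and when splitting by output channel (mSplitByOc), each prefix-sum entry is scaled by mPart afterwards so the offsets are expressed in units of part. For example, with two equally rated threads, mTotalWork = 8 and mPart = 4, mDivides goes from {0, 4, 8} to {0, 16, 32}.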
3 changes: 3 additions & 0 deletions source/backend/cpu/compute/ConvInt8TiledExecutor.hpp
@@ -72,7 +72,10 @@ class DenseConvInt8TiledExecutor : public ConvInt8TiledExecutor {
std::shared_ptr<Tensor> mTempMaxMinValueBuffer;
std::vector<uint8_t> mTempSrcSum;
std::vector<int32_t> mDivides;
void computeDivideSizes();

float mflops, mios;
int mTotalWork, mPart;
int mThreadNums;
int mBlockNum = 1;
int mOcPerThread;
15 changes: 7 additions & 8 deletions source/backend/cpu/compute/ConvolutionPackWinograd.cpp
@@ -127,11 +127,13 @@ bool ConvolutionPackWinograd::onClone(Backend* bn, const Op* op, Execution** dst
}

ErrorCode ConvolutionPackWinograd::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
static_cast<CPUBackend *>(backend())->computeDivideSizes(mTotalWork, divides.data()+1);
MNN_CONCURRENCY_BEGIN(tId, mMainFunction.first) {
mMainFunction.second(tId, inputs[0]->host<uint8_t>(), outputs[0]->host<uint8_t>());
};
MNN_CONCURRENCY_END();

static_cast<CPUBackend *>(backend())->computeDivideSizes(mPostWork, divides.data()+1);
MNN_CONCURRENCY_BEGIN(tId, mPostFunction.first) {
mPostFunction.second(tId, outputs[0]->host<uint8_t>());
};
@@ -261,9 +263,8 @@ ErrorCode ConvolutionPackWinograd::onResize(const std::vector<Tensor *> &inputs,
auto totalCount = wUnit * hUnit * batch;
// MNN_PRINT("ow=%d, oh=%d\n", ow, oh);

std::vector<int> divides(threadNumber+1);
static_cast<CPUBackend *>(backend())->computeDivideSizes(totalCount, divides.data()+1);
divides[0] = 0;
divides.resize(threadNumber+1, 0);
mTotalWork = totalCount;
auto midBuffer0Bytes = srcUnit2 * pack * bytes;
bool allow_x86_bf16_winograd = true;
#ifdef MNN_USE_SSE
@@ -541,15 +542,13 @@ ErrorCode ConvolutionPackWinograd::onResize(const std::vector<Tensor *> &inputs,
/*Dest Transform And Post Treat End*/
}
};
std::vector<int> postDivides(threadNumber+1);
static_cast<CPUBackend *>(backend())->computeDivideSizes(dc_4, postDivides.data()+1);
postDivides[0] = 0;
mPostWork = dc_4;

mPostFunction.first = threadNumber;
mPostFunction.second = [=](int tId, uint8_t* outputOrigin) {
auto dstOrigin = outputOrigin;
int tSta = postDivides[tId];
int tFin = postDivides[tId+1];
int tSta = divides[tId];
int tFin = divides[tId+1];
for (int dy=tSta; dy < tFin; ++dy) {
auto dataFloatPtr = (float*)(dstOrigin + ow * oh * batch * dy * pack * bytes);
auto biasFloatPtr = (const float*)(bias + pack * dy * bytes);
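One subtlety in the Winograd change: the single member vector divides is now reused for both phases. onExecute fills it from mTotalWork (the number of output tiles) before the main function and then overwrites it from mPostWork (dc_4, the output-channel blocks) before the post function, replacing the separate postDivides vector that onResize previously captured by value.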
2 changes: 2 additions & 0 deletions source/backend/cpu/compute/ConvolutionPackWinograd.hpp
@@ -36,6 +36,8 @@ class ConvolutionPackWinograd : public ConvolutionWinogradImpl {
}
std::pair<int, std::function<void(int tId, const uint8_t*, uint8_t*)>> mMainFunction;
std::pair<int, std::function<void(int, uint8_t*)>> mPostFunction;
std::vector<int> divides;
int mTotalWork, mPostWork;

};
} // namespace MNN
12 changes: 5 additions & 7 deletions source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp
@@ -539,9 +539,6 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs

if (mConvPerfconfig.isParallelInner) {
auto rt = static_cast<const CPURuntime*>(backend()->getRuntime());
std::vector<int> ocC4ParralSize(threadNumber + 1);
ocC4ParralSize[0] = 0;
static_cast<CPUBackend *>(backend())->computeDivideSizes(oC4, ocC4ParralSize.data()+1);
mFunction.second = [=](int placeholder) {
const float* biasPtr = bias ? bias->host<float>() : nullptr;
auto gemmBuffer = mTempBufferTranspose.host<uint8_t>() + mTempBufferTranspose.stride(0) * 0;
@@ -601,6 +598,8 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
}
MNN_CONCURRENCY_END();

std::vector<int> ocC4ParralSize(threadNumber + 1, 0);
static_cast<CPUBackend *>(backend())->computeDivideSizes(oC4, ocC4ParralSize.data()+1);
if (xC == eP) {
MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
size_t paraParameters[PARAMETERSIZE];
@@ -669,10 +668,8 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
};

} else {
std::vector<int> divides(threadNumber + 1);
divides[0] = 0;

static_cast<CPUBackend *>(backend())->computeDivideSizes(tileCount, divides.data() + 1);
divides.resize(threadNumber + 1, 0);
mTotalWork = tileCount;

mFunction.second = [=](int tId) {
const float* biasPtr = bias ? bias->host<float>() : nullptr;
@@ -757,6 +754,7 @@ ErrorCode DenseConvolutionTiledImpl::onExecute(const std::vector<Tensor*>& input
if (mConvPerfconfig.isParallelInner) {
mFunction.second(0);
} else {
static_cast<CPUBackend *>(backend())->computeDivideSizes(mTotalWork, divides.data() + 1);
MNN_CONCURRENCY_BEGIN(tId, mFunction.first) {
mFunction.second((int)tId);
}
2 changes: 2 additions & 0 deletions source/backend/cpu/compute/DenseConvolutionTiledExecutor.hpp
@@ -31,6 +31,8 @@ class DenseConvolutionTiledImpl : public ConvolutionTiledImpl {
static PerfConfig bestTileConvolutionConfig(const Convolution2DCommon *common, const Tensor *inputTensor,
const Tensor *outputTensor, int threadNumber, Backend* b);
protected:
std::vector<int> divides;
int mTotalWork;
};
class DenseConvolutionTiledExecutor : public ConvolutionTiledExecutor {
public:
