feat(kernel): implement slice reform and cuda kernel invocation
Signed-off-by: YdrMaster <[email protected]>
YdrMaster committed Nov 13, 2023
1 parent 928cdb8 commit 0951b5b
Showing 7 changed files with 146 additions and 7 deletions.
20 changes: 20 additions & 0 deletions src/04kernel/cuda/include/kernel/cuda/slice.cuh
@@ -0,0 +1,20 @@
#ifndef KERNEL_CUDA_SLICE_CUH
#define KERNEL_CUDA_SLICE_CUH

#include "threads_distributer.cuh"

namespace refactor::kernel::cuda {

struct DimInfo {
unsigned int countStride, sizeStart;// countStride: place value of this dim in the output block index; sizeStart: source offset in bytes
int sizeStride;// source stride in bytes; signed, negative for reversed slices
};

void launchSlice(
KernelLaunchParameters const &,
void const *src, DimInfo const *dims, void *output,
unsigned int blockSize);

}// namespace refactor::kernel::cuda

#endif// KERNEL_CUDA_SLICE_CUH
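
For context, a hedged usage sketch of the contract declared here (inside namespace refactor::kernel::cuda; blockCount, blockSize, and the device pointers dSrc, dDims, dDst are assumptions of the sketch; ThreadsDistributer comes from threads_distributer.cuh and produces the KernelLaunchParameters, exactly as the lowering further below does):

auto params = ThreadsDistributer()(blockCount);// grid geometry plus total count n
launchSlice(params, dSrc, dDims, dDst, blockSize);// dDims: DimInfo[] in device memory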
28 changes: 28 additions & 0 deletions src/04kernel/cuda/src/slice.cu
@@ -0,0 +1,28 @@
#include "kernel/cuda/slice.cuh"
#include <cstdint>

namespace refactor::kernel::cuda {

__global__ static void sliceKernel(
unsigned long long n,
uint8_t const *src, DimInfo const *dims, uint8_t *output,
unsigned int blockSize) {
// stub: the copy loop is not implemented yet; see the sketch after this file
}

void launchSlice(
KernelLaunchParameters const &params,
void const *src, DimInfo const *dims, void *output,
unsigned int blockSize) {
sliceKernel<<<
params.gridSize,
params.blockSize,
params.dynamicSharedBytes,
reinterpret_cast<cudaStream_t>(params.stream)>>>(
params.n,
reinterpret_cast<uint8_t const *>(src),
dims,
reinterpret_cast<uint8_t *>(output),
blockSize);
}

}// namespace refactor::kernel::cuda
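
The stub above compiles and launches but copies nothing yet. As a reference for the intended semantics, here is a minimal sketch of a possible body, assuming DimInfo decomposes each output block index digit by digit (countStride as place value) into a byte offset on the already baseOffset-shifted source. The explicit rank parameter is an assumption of the sketch; the stub's signature does not yet carry a dimension count.

#include "kernel/cuda/slice.cuh"
#include <cstdint>

namespace refactor::kernel::cuda {

    // sketch only, not this commit's implementation
    __global__ static void sliceKernelSketch(
        unsigned long long n,
        uint8_t const *src, DimInfo const *dims, unsigned int rank,
        uint8_t *output, unsigned int blockSize) {
        auto step = (unsigned long long) blockDim.x * gridDim.x;
        for (auto tid = (unsigned long long) blockIdx.x * blockDim.x + threadIdx.x;
             tid < n; tid += step) {
            auto rem = tid;
            long long offset = 0;
            for (unsigned int i = 0; i < rank; ++i) {// mixed-radix decomposition
                auto const &d = dims[i];
                offset += d.sizeStart + (long long) (rem / d.countStride) * d.sizeStride;
                rem %= d.countStride;
            }
            for (unsigned int b = 0; b < blockSize; ++b) {// byte-wise block copy
                output[tid * blockSize + b] = src[offset + b];
            }
        }
    }

}// namespace refactor::kernel::cuda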
3 changes: 3 additions & 0 deletions src/04kernel/include/kernel/attributes/slice_info.h
@@ -24,7 +24,10 @@ namespace refactor::kernel {
std::vector<Dim> dims;
dim_t blockCount, blockSize, baseOffset;

SliceInfo(std::vector<Dim> dims, dim_t blockCount, dim_t blockSize, dim_t baseOffset) noexcept;
SliceInfo(Dimensions const &, Tensor const &) noexcept;
SliceInfo reform(dim_t maxblockSize) const noexcept;
void reformAssign(dim_t maxblockSize) noexcept;
};

}// namespace refactor::kernel
47 changes: 47 additions & 0 deletions src/04kernel/src/attributes/slice_info.cc
@@ -1,4 +1,5 @@
#include "kernel/attributes/slice_info.h"
#include <numeric>

namespace refactor::kernel {

@@ -11,6 +12,16 @@ namespace refactor::kernel {
return !operator==(rhs);
}

SliceInfo::SliceInfo(
std::vector<Dim> dims_,
dim_t blockCount_,
dim_t blockSize_,
dim_t baseOffset_) noexcept
: dims(std::move(dims_)),
blockCount(blockCount_),
blockSize(blockSize_),
baseOffset(baseOffset_) {}

SliceInfo::SliceInfo(Dimensions const &dims_, Tensor const &input) noexcept
: blockCount(1),
blockSize(input.dataType.size()),
@@ -53,4 +64,40 @@ namespace refactor::kernel {
dims.shrink_to_fit();
}

SliceInfo SliceInfo::reform(dim_t maxblockSize) const noexcept {
auto blockSize_ = std::gcd(blockSize, maxblockSize);
if (blockSize_ == blockSize) { return *this; }
auto times = blockSize / blockSize_;
SliceInfo ans{
std::vector<Dim>(dims.size() + 1),
blockCount * times,
blockSize_,
baseOffset,
};
for (auto i : range0_(dims.size())) {
auto const &d = dims[i];
ans.dims[i] = {
d.countStride * times,
d.sizeStart,
d.sizeStride,
};
}
ans.dims.back() = {1, 0, static_cast<sdim_t>(blockSize_)};
return ans;
}

void SliceInfo::reformAssign(dim_t maxblockSize) noexcept {
auto blockSize_ = std::gcd(blockSize, maxblockSize);
if (blockSize_ == blockSize) { return; }
auto times = blockSize / blockSize_;
blockCount *= times;
blockSize = blockSize_;
for (auto &d : dims) {
d.countStride *= times;
}
dims.resize(dims.size() + 1);
dims.back() = {1, 0, static_cast<sdim_t>(blockSize_)};
}


}// namespace refactor::kernel
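
To make the reform arithmetic concrete, here is a standalone sketch (not part of the commit) that reproduces the numbers checked in test_slice_info.cpp below, assuming Dim is an aggregate of {countStride, sizeStart, sizeStride}, the same form the test's expectations already use:

#include "kernel/attributes/slice_info.h"
using namespace refactor::kernel;

void reformExample() {
    SliceInfo info(
        {{2, 900 * 4, -360 * 4},// countStride 2; start/stride in bytes, reversed dim
         {1, 60 * 4, 90 * 4}},
        6,   // blockCount
        96,  // blockSize: 24 floats = 96 bytes
        24); // baseOffset in bytes
    auto reformed = info.reform(16);
    // gcd(96, 16) = 16 and times = 96 / 16 = 6, hence:
    // blockCount 6 * 6 = 36, blockSize 16, every countStride scaled by 6
    // ({12, ...} and {6, ...}), plus a trailing {1, 0, 16} that walks the
    // six 16-byte sub-blocks of each original block. baseOffset is unchanged.
}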
21 changes: 17 additions & 4 deletions src/04kernel/src/kernels/slice/cuda_kernel.cu
@@ -1,14 +1,27 @@
#include "cuda_kernel.hh"
#include "kernel/cuda/split.cuh"
#include "mem_manager/foreign_blob.hh"
#include "runtime/mem_manager.hh"
#include "kernel/cuda/slice.cuh"
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

namespace refactor::kernel {
using namespace runtime;

Routine SliceCuda::lower(Resources &) const noexcept {
auto reformed = info.reform(16);
thrust::host_vector<cuda::DimInfo> dims(reformed.dims.size());
std::transform(reformed.dims.begin(), reformed.dims.end(),
dims.begin(),
[](auto const &d) { return cuda::DimInfo{
d.countStride,
d.sizeStart,
d.sizeStride,
}; });
return [dims = thrust::device_vector<cuda::DimInfo>(dims),
params = cuda::ThreadsDistributer()(reformed.blockCount),
blockSize = reformed.blockSize,
baseOffset = reformed.baseOffset](Resources &, void const **inputs, void **outputs) {
auto src = reinterpret_cast<uint8_t const *>(inputs[0]) + baseOffset;
cuda::launchSlice(params, src, dims.data().get(), outputs[0], blockSize);
};
}

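The returned Routine is invoked with raw pointer arrays, mirroring the CPU test below; a hypothetical GPU call site (res, kernel, and the device buffers dSrc and dDst are assumptions of the sketch):

auto routine = kernel->lower(res);
void const *inputs[]{dSrc}; // device pointer to the input tensor
void *outputs[]{dDst};      // device pointer to the sliced output
routine(res, inputs, outputs);// ends up in cuda::launchSlice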
14 changes: 14 additions & 0 deletions src/04kernel/test/attributes/test_slice_info.cpp
@@ -26,4 +26,18 @@ TEST(kernel, SliceInfo) {
})
// clang-format on
);

auto reformed = info.reform(16);
EXPECT_EQ(reformed.blockCount, 36);
EXPECT_EQ(reformed.blockSize, 16);
EXPECT_EQ(reformed.baseOffset, 24);
EXPECT_EQ(reformed.dims,
// clang-format off
(decltype(reformed.dims){
{48 / 24 * 6, 900 * 4, -360 * 4},
{24 / 24 * 6, 60 * 4, 90 * 4},
{ 1, 0, 16},
})
// clang-format on
);
}
20 changes: 17 additions & 3 deletions src/04kernel/test/kernels/slice/test_cpu.cpp
@@ -27,9 +27,11 @@ TEST(kernel, SliceCpu) {
result(output->elementsSize());
std::iota(data.begin(), data.end(), 0);
// inference
void const *inputs[]{data.data()};
void *outputs[]{result.data()};
routine(res, inputs, outputs);
{
void const *inputs[]{data.data()};
void *outputs[]{result.data()};
routine(res, inputs, outputs);
}
// check
dim_t
di[]{5, 3, 1},
@@ -49,4 +51,16 @@
}
}
}
// test reform
auto kernelReformed = SliceCpu::build(SliceInfo(dims, *input).reform(16));
ASSERT_TRUE(kernelReformed);
auto routineReformed = kernelReformed->lower(res);
std::vector<float> resultReformed(result.size());
{
void const *inputs[]{data.data()};
void *outputs[]{resultReformed.data()};
routineReformed(res, inputs, outputs);
}
// check
ASSERT_EQ(result, resultReformed);
}
