feat(kernel): implement slice reform and cuda kernel invocation
Signed-off-by: YdrMaster <[email protected]>
YdrMaster committed Nov 13, 2023
1 parent 928cdb8 commit 0951b5b
Showing 7 changed files with 146 additions and 7 deletions.
20 changes: 20 additions & 0 deletions src/04kernel/cuda/include/kernel/cuda/slice.cuh
@@ -0,0 +1,20 @@
#ifndef KERNEL_CUDA_SLICE_CUH
#define KERNEL_CUDA_SLICE_CUH

#include "threads_distributer.cuh"

namespace refactor::kernel::cuda {

struct DimInfo {
unsigned int countStride, sizeStart;// countStride: place value of this dim in the output block index; sizeStart: source offset in bytes
int sizeStride;// source stride in bytes; signed, negative for reversed slices
};

void launchSlice(
KernelLaunchParameters const &,
void const *src, DimInfo const *dims, void *output,
unsigned int blockSize);

}// namespace refactor::kernel::cuda

#endif// KERNEL_CUDA_SLICE_CUH
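
For context, a hedged usage sketch of the contract declared here (inside namespace refactor::kernel::cuda; blockCount, blockSize, and the device pointers dSrc, dDims, dDst are assumptions of the sketch; ThreadsDistributer comes from threads_distributer.cuh and produces the KernelLaunchParameters, exactly as the lowering further below does):

auto params = ThreadsDistributer()(blockCount);// grid geometry plus total count n
launchSlice(params, dSrc, dDims, dDst, blockSize);// dDims: DimInfo[] in device memory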
28 changes: 28 additions & 0 deletions src/04kernel/cuda/src/slice.cu
@@ -0,0 +1,28 @@
#include "kernel/cuda/slice.cuh"
#include <cstdint>

namespace refactor::kernel::cuda {

__global__ static void sliceKernel(
unsigned long long n,
uint8_t const *src, DimInfo const *dims, uint8_t *output,
unsigned int blockSize) {
// stub: the copy loop is not implemented yet; see the sketch after this file
}

void launchSlice(
KernelLaunchParameters const &params,
void const *src, DimInfo const *dims, void *output,
unsigned int blockSize) {
sliceKernel<<<
params.gridSize,
params.blockSize,
params.dynamicSharedBytes,
reinterpret_cast<cudaStream_t>(params.stream)>>>(
params.n,
reinterpret_cast<uint8_t const *>(src),
dims,
reinterpret_cast<uint8_t *>(output),
blockSize);
}

}// namespace refactor::kernel::cuda
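
The stub above compiles and launches but copies nothing yet. As a reference for the intended semantics, here is a minimal sketch of a possible body, assuming DimInfo decomposes each output block index digit by digit (countStride as place value) into a byte offset on the already baseOffset-shifted source. The explicit rank parameter is an assumption of the sketch; the stub's signature does not yet carry a dimension count.

#include "kernel/cuda/slice.cuh"
#include <cstdint>

namespace refactor::kernel::cuda {

    // sketch only, not this commit's implementation
    __global__ static void sliceKernelSketch(
        unsigned long long n,
        uint8_t const *src, DimInfo const *dims, unsigned int rank,
        uint8_t *output, unsigned int blockSize) {
        auto step = (unsigned long long) blockDim.x * gridDim.x;
        for (auto tid = (unsigned long long) blockIdx.x * blockDim.x + threadIdx.x;
             tid < n; tid += step) {
            auto rem = tid;
            long long offset = 0;
            for (unsigned int i = 0; i < rank; ++i) {// mixed-radix decomposition
                auto const &d = dims[i];
                offset += d.sizeStart + (long long) (rem / d.countStride) * d.sizeStride;
                rem %= d.countStride;
            }
            for (unsigned int b = 0; b < blockSize; ++b) {// byte-wise block copy
                output[tid * blockSize + b] = src[offset + b];
            }
        }
    }

}// namespace refactor::kernel::cuda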
3 changes: 3 additions & 0 deletions src/04kernel/include/kernel/attributes/slice_info.h
@@ -24,7 +24,10 @@ namespace refactor::kernel {
std::vector<Dim> dims;
dim_t blockCount, blockSize, baseOffset;

SliceInfo(std::vector<Dim> dims, dim_t blockCount, dim_t blockSize, dim_t baseOffset) noexcept;
SliceInfo(Dimensions const &, Tensor const &) noexcept;
SliceInfo reform(dim_t maxblockSize) const noexcept;
void reformAssign(dim_t maxblockSize) noexcept;
};

}// namespace refactor::kernel
47 changes: 47 additions & 0 deletions src/04kernel/src/attributes/slice_info.cc
@@ -1,4 +1,5 @@
#include "kernel/attributes/slice_info.h"
#include <numeric>

namespace refactor::kernel {

@@ -11,6 +12,16 @@ namespace refactor::kernel {
return !operator==(rhs);
}

SliceInfo::SliceInfo(
std::vector<Dim> dims_,
dim_t blockCount_,
dim_t blockSize_,
dim_t baseOffset_) noexcept
: dims(std::move(dims_)),
blockCount(blockCount_),
blockSize(blockSize_),
baseOffset(baseOffset_) {}

SliceInfo::SliceInfo(Dimensions const &dims_, Tensor const &input) noexcept
: blockCount(1),
blockSize(input.dataType.size()),
@@ -53,4 +64,40 @@ namespace refactor::kernel {
dims.shrink_to_fit();
}

SliceInfo SliceInfo::reform(dim_t maxblockSize) const noexcept {
auto blockSize_ = std::gcd(blockSize, maxblockSize);
if (blockSize_ == blockSize) { return *this; }
auto times = blockSize / blockSize_;
SliceInfo ans{
std::vector<Dim>(dims.size() + 1),
blockCount * times,
blockSize_,
baseOffset,
};
for (auto i : range0_(dims.size())) {
auto const &d = dims[i];
ans.dims[i] = {
d.countStride * times,
d.sizeStart,
d.sizeStride,
};
}
ans.dims.back() = {1, 0, static_cast<sdim_t>(blockSize_)};
return ans;
}

void SliceInfo::reformAssign(dim_t maxblockSize) noexcept {
auto blockSize_ = std::gcd(blockSize, maxblockSize);
if (blockSize_ == blockSize) { return; }
auto times = blockSize / blockSize_;
blockCount *= times;
blockSize = blockSize_;
for (auto &d : dims) {
d.countStride *= times;
}
dims.resize(dims.size() + 1);
dims.back() = {1, 0, static_cast<sdim_t>(blockSize_)};
}


}// namespace refactor::kernel
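
To make the reform arithmetic concrete, here is a standalone sketch (not part of the commit) that reproduces the numbers checked in test_slice_info.cpp below, assuming Dim is an aggregate of {countStride, sizeStart, sizeStride}, the same form the test's expectations already use:

#include "kernel/attributes/slice_info.h"
using namespace refactor::kernel;

void reformExample() {
    SliceInfo info(
        {{2, 900 * 4, -360 * 4},// countStride 2; start/stride in bytes, reversed dim
         {1, 60 * 4, 90 * 4}},
        6,   // blockCount
        96,  // blockSize: 24 floats = 96 bytes
        24); // baseOffset in bytes
    auto reformed = info.reform(16);
    // gcd(96, 16) = 16 and times = 96 / 16 = 6, hence:
    // blockCount 6 * 6 = 36, blockSize 16, every countStride scaled by 6
    // ({12, ...} and {6, ...}), plus a trailing {1, 0, 16} that walks the
    // six 16-byte sub-blocks of each original block. baseOffset is unchanged.
}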
21 changes: 17 additions & 4 deletions src/04kernel/src/kernels/slice/cuda_kernel.cu
@@ -1,14 +1,27 @@
#include "cuda_kernel.hh"
#include "kernel/cuda/split.cuh"
#include "mem_manager/foreign_blob.hh"
#include "runtime/mem_manager.hh"
#include "kernel/cuda/slice.cuh"
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

namespace refactor::kernel {
using namespace runtime;

Routine SliceCuda::lower(Resources &) const noexcept {
auto reformed = info.reform(16);
thrust::host_vector<cuda::DimInfo> dims(reformed.dims.size());
std::transform(reformed.dims.begin(), reformed.dims.end(),
dims.begin(),
[](auto const &d) { return cuda::DimInfo{
d.countStride,
d.sizeStart,
d.sizeStride,
}; });
return [dims = thrust::device_vector<cuda::DimInfo>(dims),
params = cuda::ThreadsDistributer()(reformed.blockCount),
blockSize = reformed.blockSize,
baseOffset = reformed.baseOffset](Resources &, void const **inputs, void **outputs) {
auto src = reinterpret_cast<uint8_t const *>(inputs[0]) + baseOffset;
cuda::launchSlice(params, src, dims.data().get(), outputs[0], blockSize);
};
}

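The returned Routine is invoked with raw pointer arrays, mirroring the CPU test below; a hypothetical GPU call site (res, kernel, and the device buffers dSrc and dDst are assumptions of the sketch):

auto routine = kernel->lower(res);
void const *inputs[]{dSrc}; // device pointer to the input tensor
void *outputs[]{dDst};      // device pointer to the sliced output
routine(res, inputs, outputs);// ends up in cuda::launchSlice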
14 changes: 14 additions & 0 deletions src/04kernel/test/attributes/test_slice_info.cpp
@@ -26,4 +26,18 @@ TEST(kernel, SliceInfo) {
})
// clang-format on
);

auto reformed = info.reform(16);
EXPECT_EQ(reformed.blockCount, 36);
EXPECT_EQ(reformed.blockSize, 16);
EXPECT_EQ(reformed.baseOffset, 24);
EXPECT_EQ(reformed.dims,
// clang-format off
(decltype(reformed.dims){
{48 / 24 * 6, 900 * 4, -360 * 4},
{24 / 24 * 6, 60 * 4, 90 * 4},
{ 1, 0, 16},
})
// clang-format on
);
}
20 changes: 17 additions & 3 deletions src/04kernel/test/kernels/slice/test_cpu.cpp
@@ -27,9 +27,11 @@ TEST(kernel, SliceCpu) {
result(output->elementsSize());
std::iota(data.begin(), data.end(), 0);
// inference
void const *inputs[]{data.data()};
void *outputs[]{result.data()};
routine(res, inputs, outputs);
{
void const *inputs[]{data.data()};
void *outputs[]{result.data()};
routine(res, inputs, outputs);
}
// check
dim_t
di[]{5, 3, 1},
@@ -49,4 +51,16 @@
}
}
}
// test reform
auto kernelReformed = SliceCpu::build(SliceInfo(dims, *input).reform(16));
ASSERT_TRUE(kernelReformed);
auto routineReformed = kernelReformed->lower(res);
std::vector<float> resultReformed(result.size());
{
void const *inputs[]{data.data()};
void *outputs[]{resultReformed.data()};
routineReformed(res, inputs, outputs);
}
// check
ASSERT_EQ(result, resultReformed);
}
