From 40d623e5c334d15256725e42a79b89649cb1e3bc Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Mon, 13 Nov 2023 12:57:56 +0800 Subject: [PATCH] =?UTF-8?q?feat(kernel):=20=E5=AE=9E=E7=8E=B0=20slice=20cp?= =?UTF-8?q?u=20kernel?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YdrMaster --- .../include/kernel/attributes/slice_info.h | 2 +- src/04kernel/src/attributes/slice_info.cc | 23 +++++--- src/04kernel/src/collectors/slice.cc | 16 +++--- src/04kernel/src/kernels/slice/cpu_kernel.cc | 46 ++++++++++++++++ src/04kernel/src/kernels/slice/cpu_kernel.hh | 24 +++++++++ src/04kernel/src/kernels/slice/cuda_kernel.cc | 27 ++++++++++ src/04kernel/src/kernels/slice/cuda_kernel.cu | 15 ++++++ src/04kernel/src/kernels/slice/cuda_kernel.hh | 26 ++++++++++ .../test/attributes/test_slice_info.cpp | 15 +++--- src/04kernel/test/kernels/slice/test_cpu.cpp | 52 +++++++++++++++++++ src/04kernel/test/kernels/slice/test_cuda.cpp | 16 ++++++ 11 files changed, 241 insertions(+), 21 deletions(-) create mode 100644 src/04kernel/src/kernels/slice/cpu_kernel.cc create mode 100644 src/04kernel/src/kernels/slice/cpu_kernel.hh create mode 100644 src/04kernel/src/kernels/slice/cuda_kernel.cc create mode 100644 src/04kernel/src/kernels/slice/cuda_kernel.cu create mode 100644 src/04kernel/src/kernels/slice/cuda_kernel.hh create mode 100644 src/04kernel/test/kernels/slice/test_cpu.cpp create mode 100644 src/04kernel/test/kernels/slice/test_cuda.cpp diff --git a/src/04kernel/include/kernel/attributes/slice_info.h b/src/04kernel/include/kernel/attributes/slice_info.h index 8f276379..e8c919f3 100644 --- a/src/04kernel/include/kernel/attributes/slice_info.h +++ b/src/04kernel/include/kernel/attributes/slice_info.h @@ -22,7 +22,7 @@ namespace refactor::kernel { bool operator!=(Dim const &) const noexcept; }; std::vector dims; - dim_t blockSize; + dim_t blockCount, blockSize, baseOffset; SliceInfo(Dimensions const &, Tensor const &) noexcept; }; diff --git a/src/04kernel/src/attributes/slice_info.cc b/src/04kernel/src/attributes/slice_info.cc index 350d6948..83ed40c9 100644 --- a/src/04kernel/src/attributes/slice_info.cc +++ b/src/04kernel/src/attributes/slice_info.cc @@ -12,7 +12,10 @@ namespace refactor::kernel { } SliceInfo::SliceInfo(Dimensions const &dims_, Tensor const &input) noexcept - : blockSize(input.dataType.size()), dims(1) { + : blockCount(1), + blockSize(input.dataType.size()), + baseOffset(0), + dims(1) { ASSERT(dims_.size() == input.rank(), "Unreachable"); auto continuous = true; @@ -21,14 +24,13 @@ namespace refactor::kernel { for (auto i : range0_(input.rank()).rev()) { auto l = input.shape[i]; auto const &d = dims_[i]; - if (continuous && d.step == 1) { - auto &it = dims.back(); + if (auto &it = dims.back(); continuous && d.step == 1) { it.countStride *= d.length; it.sizeStart = d.start * stride; it.sizeStride *= l; } else { dims.push_back(Dim{ - static_cast(dims.back().countStride * d.length), + static_cast(it.countStride * d.length), static_cast(d.start * stride), static_cast(d.step * stride), }); @@ -36,12 +38,19 @@ namespace refactor::kernel { continuous = d.length == l; stride *= l; } - auto blockCount = dims[0].countStride; - blockSize *= blockCount; + baseOffset = dims[0].sizeStart; + auto elementCount = dims[0].countStride; + blockSize *= elementCount; for (auto &d : dims) { - d.countStride /= blockCount; + d.countStride /= elementCount; } std::reverse(dims.begin(), dims.end()); + blockCount = dims[0].countStride; + for (auto i : range(1ul, dims.size())) { + dims[i - 1].countStride = dims[i].countStride; + } + dims.pop_back(); + dims.shrink_to_fit(); } }// namespace refactor::kernel diff --git a/src/04kernel/src/collectors/slice.cc b/src/04kernel/src/collectors/slice.cc index 88774620..d2224786 100644 --- a/src/04kernel/src/collectors/slice.cc +++ b/src/04kernel/src/collectors/slice.cc @@ -1,4 +1,6 @@ #include "kernel/collectors/slice.h" +#include "../kernels/slice/cpu_kernel.hh" +#include "../kernels/slice/cuda_kernel.hh" namespace refactor::kernel { @@ -11,17 +13,19 @@ namespace refactor::kernel { std::vector SliceCollector::filter(TensorRefs inputs, TensorRefs outputs) const { + SliceInfo info(dimentions, inputs[0]); + std::vector ans; switch (target) { case Target::Cpu: - // if (auto ptr = SliceCpu::build(info); ptr) { - // ans.emplace_back(std::move(ptr)); - // } + if (auto ptr = SliceCpu::build(info); ptr) { + ans.emplace_back(std::move(ptr)); + } break; case Target::NvidiaGpu: - // if (auto ptr = SliceCuda::build(info); ptr) { - // ans.emplace_back(std::move(ptr)); - // } + if (auto ptr = SliceCuda::build(info); ptr) { + ans.emplace_back(std::move(ptr)); + } break; default: UNREACHABLEX(void, "Unknown target"); diff --git a/src/04kernel/src/kernels/slice/cpu_kernel.cc b/src/04kernel/src/kernels/slice/cpu_kernel.cc new file mode 100644 index 00000000..2dcbfc88 --- /dev/null +++ b/src/04kernel/src/kernels/slice/cpu_kernel.cc @@ -0,0 +1,46 @@ +#include "cpu_kernel.hh" +#include + +namespace refactor::kernel { + using K = SliceCpu; + + K::SliceCpu(SliceInfo info_) noexcept + : Kernel(), info(std::move(info_)) {} + + auto K::build(SliceInfo info) noexcept -> KernelBox { + return std::make_unique(std::move(info)); + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto K::description() const noexcept -> std::string_view { + return "Performing slice operation on generic cpu"; + } + + Routine K::lower(Resources &) const noexcept { + using namespace runtime; + return [info = this->info](Resources &, void const **inputs, void **outputs) { + auto src = reinterpret_cast(inputs[0]) + info.baseOffset; + auto dst = reinterpret_cast(outputs[0]); + std::for_each_n(std::execution::par_unseq, + natural_t(0), info.blockCount, + [=, &info](auto i) { + long rem = i; + auto src_ = src; + auto dst_ = dst + i * info.blockSize; + for (auto const &dim : info.dims) { + auto d = std::div(rem, dim.countStride); + src_ += d.quot * dim.sizeStride + dim.sizeStart; + rem = d.rem; + } + std::memcpy(dst_, src_, info.blockSize); + }); + }; + } + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/slice/cpu_kernel.hh b/src/04kernel/src/kernels/slice/cpu_kernel.hh new file mode 100644 index 00000000..bd524886 --- /dev/null +++ b/src/04kernel/src/kernels/slice/cpu_kernel.hh @@ -0,0 +1,24 @@ +#ifndef KERNEL_SPLIT_CPU_KERNEL_HH +#define KERNEL_SPLIT_CPU_KERNEL_HH + +#include "kernel/attributes/slice_info.h" +#include "kernel/kernel.h" + +namespace refactor::kernel { + + struct SliceCpu final : public Kernel { + SliceInfo info; + + explicit SliceCpu(SliceInfo) noexcept; + + static KernelBox build(SliceInfo) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; + Routine lower(Resources &) const noexcept final; + }; + +}// namespace refactor::kernel + +#endif// KERNEL_SPLIT_CPU_KERNEL_HH diff --git a/src/04kernel/src/kernels/slice/cuda_kernel.cc b/src/04kernel/src/kernels/slice/cuda_kernel.cc new file mode 100644 index 00000000..3fe28f3f --- /dev/null +++ b/src/04kernel/src/kernels/slice/cuda_kernel.cc @@ -0,0 +1,27 @@ +#include "cuda_kernel.hh" + +namespace refactor::kernel { + using K = SliceCuda; + + K::SliceCuda(SliceInfo info_) noexcept + : Kernel(), info(std::move(info_)) {} + + auto K::build(SliceInfo info) noexcept -> KernelBox { +#ifndef USE_CUDA + return nullptr; +#endif + return std::make_unique(std::move(info)); + } + auto K::typeId() noexcept -> size_t { + static uint8_t ID = 1; + return reinterpret_cast(&ID); + } + + auto K::kernelTypeId() const noexcept -> size_t { + return typeId(); + } + auto K::description() const noexcept -> std::string_view { + return "Performing slice operation using CUDA"; + } + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/slice/cuda_kernel.cu b/src/04kernel/src/kernels/slice/cuda_kernel.cu new file mode 100644 index 00000000..27b6fff9 --- /dev/null +++ b/src/04kernel/src/kernels/slice/cuda_kernel.cu @@ -0,0 +1,15 @@ +#include "cuda_kernel.hh" +#include "kernel/cuda/split.cuh" +#include "mem_manager/foreign_blob.hh" +#include "runtime/mem_manager.hh" +#include + +namespace refactor::kernel { + using namespace runtime; + + Routine SliceCuda::lower(Resources &) const noexcept { + return [](Resources &, void const **inputs, void **outputs) { + }; + } + +}// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/slice/cuda_kernel.hh b/src/04kernel/src/kernels/slice/cuda_kernel.hh new file mode 100644 index 00000000..43516c33 --- /dev/null +++ b/src/04kernel/src/kernels/slice/cuda_kernel.hh @@ -0,0 +1,26 @@ +#ifndef KERNEL_SPLIT_CUDA_KERNEL_HH +#define KERNEL_SPLIT_CUDA_KERNEL_HH + +#include "kernel/attributes/slice_info.h" +#include "kernel/kernel.h" + +namespace refactor::kernel { + + struct SliceCuda final : public Kernel { + SliceInfo info; + + explicit SliceCuda(SliceInfo) noexcept; + + static KernelBox build(SliceInfo) noexcept; + static size_t typeId() noexcept; + + size_t kernelTypeId() const noexcept final; + std::string_view description() const noexcept final; +#ifdef USE_CUDA + Routine lower(Resources &) const noexcept final; +#endif + }; + +}// namespace refactor::kernel + +#endif// KERNEL_SPLIT_CUDA_KERNEL_HH diff --git a/src/04kernel/test/attributes/test_slice_info.cpp b/src/04kernel/test/attributes/test_slice_info.cpp index 6fdd7c8b..2621d8dd 100644 --- a/src/04kernel/test/attributes/test_slice_info.cpp +++ b/src/04kernel/test/attributes/test_slice_info.cpp @@ -7,21 +7,22 @@ using namespace kernel; TEST(kernel, SliceInfo) { auto input = Tensor::share(DataType::F32, Shape{7, 6, 5, 1, 2, 3}); Dimensions dims{ - {5, -2, 3},// 7 -> {5, 3, 1} -> {108, 900, -360} - {2, 3, 2}, // 6 -> {2, 5} -> { 36, 60, 90} - {1, 1, 3}, // 5 -> {1, 2, 3} -> { 18, 6, 30} + {5, -2, 3},// 7 -> {5, 3, 1} -> {144, 900, -360} + {2, 3, 2}, // 6 -> {2, 5} -> { 48, 60, 90} + {1, 1, 4}, // 5 -> {1, 2, 3, 4} -> { 24, 6, 30} {0, 1, 1}, // 1 -> {0} {0, 1, 2}, // 2 -> {0, 1} {0, 1, 3}, // 3 -> {0, 1, 2} }; SliceInfo info(dims, *input); - EXPECT_EQ(info.blockSize, 72); + EXPECT_EQ(info.blockCount, 6); + EXPECT_EQ(info.blockSize, 96); + EXPECT_EQ(info.baseOffset, 24); EXPECT_EQ(info.dims, // clang-format off (decltype(info.dims){ - {108 / 18, 900 * 4, -360 * 4}, - { 36 / 18, 60 * 4, 90 * 4}, - { 18 / 18, 6 * 4, 30 * 4}, + {48 / 24, 900 * 4, -360 * 4}, + {24 / 24, 60 * 4, 90 * 4}, }) // clang-format on ); diff --git a/src/04kernel/test/kernels/slice/test_cpu.cpp b/src/04kernel/test/kernels/slice/test_cpu.cpp new file mode 100644 index 00000000..e554c16f --- /dev/null +++ b/src/04kernel/test/kernels/slice/test_cpu.cpp @@ -0,0 +1,52 @@ +#include "../../../src/kernels/slice/cpu_kernel.hh" +#include +#include + +using namespace refactor; +using namespace kernel; + +TEST(kernel, SliceCpu) { + // build routine + Dimensions dims{ + {5, -2, 3},// 7 -> {5, 3, 1} -> {108, 900, -360} + {2, 3, 2}, // 6 -> {2, 5} -> { 36, 60, 90} + {1, 1, 3}, // 5 -> {1, 2, 3} -> { 18, 6, 30} + {0, 1, 1}, // 1 -> {0} + {0, 1, 2}, // 2 -> {0, 1} + {0, 1, 3}, // 3 -> {0, 1, 2} + }; + auto input = Tensor::share(DataType::F32, Shape{7, 6, 5, 1, 2, 3}), + output = Tensor::share(DataType::F32, Shape{3, 2, 3, 1, 2, 3}); + auto kernel = SliceCpu::build(SliceInfo(dims, *input)); + ASSERT_TRUE(kernel); + auto res = runtime::Resources(); + auto routine = kernel->lower(res); + // put input data + std::vector + data(input->elementsSize()), + result(output->elementsSize()); + std::iota(data.begin(), data.end(), 0); + // inference + void const *inputs[]{data.data()}; + void *outputs[]{result.data()}; + routine(res, inputs, outputs); + // check + dim_t + di[]{5, 3, 1}, + dj[]{2, 5}, + dk[]{1, 2, 3}; + auto n = 6; + for (auto i : range0_(3)) { + for (auto j : range0_(2)) { + for (auto k : range0_(3)) { + // clang-format off + auto src = di[i] * 6 * 5 * n + dj[j] * 5 * n + dk[k] * n; + auto dst = i * 2 * 3 * n + j * 3 * n + k * n; + // clang-format on + for (auto l : range0_(n)) { + EXPECT_EQ(data[src + l], result[dst + l]); + } + } + } + } +} diff --git a/src/04kernel/test/kernels/slice/test_cuda.cpp b/src/04kernel/test/kernels/slice/test_cuda.cpp new file mode 100644 index 00000000..11ac7f44 --- /dev/null +++ b/src/04kernel/test/kernels/slice/test_cuda.cpp @@ -0,0 +1,16 @@ +#ifdef USE_CUDA + +#include "../../../src/kernels/slice/cpu_kernel.hh" +#include "../../../src/kernels/slice/cuda_kernel.hh" +#include "kernel/target.h" +#include "runtime/mem_manager.hh" +#include +#include + +using namespace refactor; +using namespace kernel; + +TEST(kernel, SliceCuda) { +} + +#endif