feat(kernel): implement slice cpu kernel
Signed-off-by: YdrMaster <[email protected]>
YdrMaster committed Nov 13, 2023
1 parent d55c074 commit 40d623e
Showing 11 changed files with 241 additions and 21 deletions.
2 changes: 1 addition & 1 deletion src/04kernel/include/kernel/attributes/slice_info.h
@@ -22,7 +22,7 @@ namespace refactor::kernel {
            bool operator!=(Dim const &) const noexcept;
        };
        std::vector<Dim> dims;
-        dim_t blockSize;
+        dim_t blockCount, blockSize, baseOffset;

        SliceInfo(Dimensions const &, Tensor const &) noexcept;
    };
23 changes: 16 additions & 7 deletions src/04kernel/src/attributes/slice_info.cc
@@ -12,7 +12,10 @@ namespace refactor::kernel {
    }

    SliceInfo::SliceInfo(Dimensions const &dims_, Tensor const &input) noexcept
-        : blockSize(input.dataType.size()), dims(1) {
+        : blockCount(1),
+          blockSize(input.dataType.size()),
+          baseOffset(0),
+          dims(1) {
        ASSERT(dims_.size() == input.rank(), "Unreachable");

        auto continuous = true;
@@ -21,27 +24,33 @@ namespace refactor::kernel {
        for (auto i : range0_(input.rank()).rev()) {
            auto l = input.shape[i];
            auto const &d = dims_[i];
-            if (continuous && d.step == 1) {
-                auto &it = dims.back();
+            if (auto &it = dims.back(); continuous && d.step == 1) {
                it.countStride *= d.length;
                it.sizeStart = d.start * stride;
                it.sizeStride *= l;
            } else {
                dims.push_back(Dim{
-                    static_cast<dim_t>(dims.back().countStride * d.length),
+                    static_cast<dim_t>(it.countStride * d.length),
                    static_cast<dim_t>(d.start * stride),
                    static_cast<sdim_t>(d.step * stride),
                });
            }
            continuous = d.length == l;
            stride *= l;
        }
-        auto blockCount = dims[0].countStride;
-        blockSize *= blockCount;
+        baseOffset = dims[0].sizeStart;
+        auto elementCount = dims[0].countStride;
+        blockSize *= elementCount;
        for (auto &d : dims) {
-            d.countStride /= blockCount;
+            d.countStride /= elementCount;
        }
        std::reverse(dims.begin(), dims.end());
+        blockCount = dims[0].countStride;
+        for (auto i : range(1ul, dims.size())) {
+            dims[i - 1].countStride = dims[i].countStride;
+        }
+        dims.pop_back();
+        dims.shrink_to_fit();
    }

}// namespace refactor::kernel
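The constructor above folds contiguous trailing dimensions into one block and keeps a {countStride, sizeStart, sizeStride} record per remaining axis. As a reading aid (not part of this commit; names are illustrative), here is a minimal sequential sketch of how such a description is consumed: each block index is decoded digit-by-digit against countStride, each digit contributes sizeStart + quot * sizeStride bytes to the source offset, and baseOffset covers the merged-away innermost dims.

#include <cstdint>
#include <cstring>
#include <vector>

// Illustrative mirror of SliceInfo's per-dimension record.
struct DimSketch { int64_t countStride, sizeStart, sizeStride; };

// Sequential reference for the block copy (cf. SliceCpu::lower below):
// block i of `blockSize` bytes is gathered from src and written to dst.
void sliceReference(uint8_t const *src, uint8_t *dst,
                    int64_t blockCount, int64_t blockSize, int64_t baseOffset,
                    std::vector<DimSketch> const &dims) {
    src += baseOffset;// contribution of the merged innermost dimensions
    for (int64_t i = 0; i < blockCount; ++i) {
        auto src_ = src;
        auto rem = i;
        for (auto const &d : dims) {
            auto quot = rem / d.countStride;// position along this source axis
            rem %= d.countStride;
            src_ += quot * d.sizeStride + d.sizeStart;
        }
        std::memcpy(dst + i * blockSize, src_, blockSize);
    }
}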
16 changes: 10 additions & 6 deletions src/04kernel/src/collectors/slice.cc
@@ -1,4 +1,6 @@
#include "kernel/collectors/slice.h"
#include "../kernels/slice/cpu_kernel.hh"
#include "../kernels/slice/cuda_kernel.hh"

namespace refactor::kernel {

@@ -11,17 +13,19 @@ namespace refactor::kernel {

    std::vector<KernelBox>
    SliceCollector::filter(TensorRefs inputs, TensorRefs outputs) const {
+        SliceInfo info(dimentions, inputs[0]);
+
        std::vector<KernelBox> ans;
        switch (target) {
            case Target::Cpu:
-                // if (auto ptr = SliceCpu::build(info); ptr) {
-                //     ans.emplace_back(std::move(ptr));
-                // }
+                if (auto ptr = SliceCpu::build(info); ptr) {
+                    ans.emplace_back(std::move(ptr));
+                }
                break;
            case Target::NvidiaGpu:
-                // if (auto ptr = SliceCuda::build(info); ptr) {
-                //     ans.emplace_back(std::move(ptr));
-                // }
+                if (auto ptr = SliceCuda::build(info); ptr) {
+                    ans.emplace_back(std::move(ptr));
+                }
                break;
            default:
                UNREACHABLEX(void, "Unknown target");
46 changes: 46 additions & 0 deletions src/04kernel/src/kernels/slice/cpu_kernel.cc
@@ -0,0 +1,46 @@
#include "cpu_kernel.hh"
#include <execution>

namespace refactor::kernel {
using K = SliceCpu;

K::SliceCpu(SliceInfo info_) noexcept
: Kernel(), info(std::move(info_)) {}

auto K::build(SliceInfo info) noexcept -> KernelBox {
return std::make_unique<K>(std::move(info));
}
auto K::typeId() noexcept -> size_t {
static uint8_t ID = 1;
return reinterpret_cast<size_t>(&ID);
}

auto K::kernelTypeId() const noexcept -> size_t {
return typeId();
}
auto K::description() const noexcept -> std::string_view {
return "Performing slice operation on generic cpu";
}

Routine K::lower(Resources &) const noexcept {
using namespace runtime;
return [info = this->info](Resources &, void const **inputs, void **outputs) {
auto src = reinterpret_cast<uint8_t const *>(inputs[0]) + info.baseOffset;
auto dst = reinterpret_cast<uint8_t *>(outputs[0]);
std::for_each_n(std::execution::par_unseq,
natural_t(0), info.blockCount,
[=, &info](auto i) {
long rem = i;
auto src_ = src;
auto dst_ = dst + i * info.blockSize;
for (auto const &dim : info.dims) {
auto d = std::div(rem, dim.countStride);
src_ += d.quot * dim.sizeStride + dim.sizeStart;
rem = d.rem;
}
std::memcpy(dst_, src_, info.blockSize);
});
};
}

}// namespace refactor::kernel
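To check the div/mod decode by hand, take the SliceInfo exercised in test_slice_info.cpp below: blockCount 6, blockSize 96, baseOffset 24, dims {{2, 3600, -1440}, {1, 240, 360}}. A verification sketch (not part of the commit):

#include <cstdint>

// Block 5 (the last block), decoded as SliceCpu::lower does:
constexpr int64_t block = 5;
constexpr int64_t q0 = block / 2, r0 = block % 2;// dim {2, 3600, -1440}: q0 = 2, r0 = 1
constexpr int64_t off0 = q0 * -1440 + 3600;      // 720 bytes
constexpr int64_t q1 = r0 / 1;                   // dim {1, 240, 360}: q1 = 1
constexpr int64_t off1 = q1 * 360 + 240;         // 600 bytes
// Source offset = baseOffset + off0 + off1; 96 bytes then go to dst + 5 * 96.
static_assert(24 + off0 + off1 == 1344, "block 5 starts at byte 1344, i.e. float #336");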
24 changes: 24 additions & 0 deletions src/04kernel/src/kernels/slice/cpu_kernel.hh
@@ -0,0 +1,24 @@
#ifndef KERNEL_SLICE_CPU_KERNEL_HH
#define KERNEL_SLICE_CPU_KERNEL_HH

#include "kernel/attributes/slice_info.h"
#include "kernel/kernel.h"

namespace refactor::kernel {

    struct SliceCpu final : public Kernel {
        SliceInfo info;

        explicit SliceCpu(SliceInfo) noexcept;

        static KernelBox build(SliceInfo) noexcept;
        static size_t typeId() noexcept;

        size_t kernelTypeId() const noexcept final;
        std::string_view description() const noexcept final;
        Routine lower(Resources &) const noexcept final;
    };

}// namespace refactor::kernel

#endif// KERNEL_SLICE_CPU_KERNEL_HH
27 changes: 27 additions & 0 deletions src/04kernel/src/kernels/slice/cuda_kernel.cc
@@ -0,0 +1,27 @@
#include "cuda_kernel.hh"

namespace refactor::kernel {
using K = SliceCuda;

K::SliceCuda(SliceInfo info_) noexcept
: Kernel(), info(std::move(info_)) {}

auto K::build(SliceInfo info) noexcept -> KernelBox {
#ifndef USE_CUDA
return nullptr;
#endif
return std::make_unique<K>(std::move(info));
}
auto K::typeId() noexcept -> size_t {
static uint8_t ID = 1;
return reinterpret_cast<size_t>(&ID);
}

auto K::kernelTypeId() const noexcept -> size_t {
return typeId();
}
auto K::description() const noexcept -> std::string_view {
return "Performing slice operation using CUDA";
}

}// namespace refactor::kernel
15 changes: 15 additions & 0 deletions src/04kernel/src/kernels/slice/cuda_kernel.cu
@@ -0,0 +1,15 @@
#include "cuda_kernel.hh"
#include "kernel/cuda/split.cuh"
#include "mem_manager/foreign_blob.hh"
#include "runtime/mem_manager.hh"
#include <thrust/device_vector.h>

namespace refactor::kernel {
using namespace runtime;

Routine SliceCuda::lower(Resources &) const noexcept {
return [](Resources &, void const **inputs, void **outputs) {
};
}

}// namespace refactor::kernel
26 changes: 26 additions & 0 deletions src/04kernel/src/kernels/slice/cuda_kernel.hh
@@ -0,0 +1,26 @@
#ifndef KERNEL_SLICE_CUDA_KERNEL_HH
#define KERNEL_SLICE_CUDA_KERNEL_HH

#include "kernel/attributes/slice_info.h"
#include "kernel/kernel.h"

namespace refactor::kernel {

    struct SliceCuda final : public Kernel {
        SliceInfo info;

        explicit SliceCuda(SliceInfo) noexcept;

        static KernelBox build(SliceInfo) noexcept;
        static size_t typeId() noexcept;

        size_t kernelTypeId() const noexcept final;
        std::string_view description() const noexcept final;
#ifdef USE_CUDA
        Routine lower(Resources &) const noexcept final;
#endif
    };

}// namespace refactor::kernel

#endif// KERNEL_SLICE_CUDA_KERNEL_HH
15 changes: 8 additions & 7 deletions src/04kernel/test/attributes/test_slice_info.cpp
@@ -7,21 +7,22 @@ using namespace kernel;
TEST(kernel, SliceInfo) {
    auto input = Tensor::share(DataType::F32, Shape{7, 6, 5, 1, 2, 3});
    Dimensions dims{
-        {5, -2, 3},// 7 -> {5, 3, 1}    -> {108, 900, -360}
-        {2, 3, 2}, // 6 -> {2, 5}       -> { 36,  60,   90}
-        {1, 1, 3}, // 5 -> {1, 2, 3}    -> { 18,   6,   30}
+        {5, -2, 3},// 7 -> {5, 3, 1}    -> {144, 900, -360}
+        {2, 3, 2}, // 6 -> {2, 5}       -> { 48,  60,   90}
+        {1, 1, 4}, // 5 -> {1, 2, 3, 4} -> { 24,   6,   30}
        {0, 1, 1}, // 1 -> {0}
        {0, 1, 2}, // 2 -> {0, 1}
        {0, 1, 3}, // 3 -> {0, 1, 2}
    };
    SliceInfo info(dims, *input);
-    EXPECT_EQ(info.blockSize, 72);
+    EXPECT_EQ(info.blockCount, 6);
+    EXPECT_EQ(info.blockSize, 96);
+    EXPECT_EQ(info.baseOffset, 24);
    EXPECT_EQ(info.dims,
              // clang-format off
              (decltype(info.dims){
-                  {108 / 18, 900 * 4, -360 * 4},
-                  { 36 / 18,  60 * 4,   90 * 4},
-                  { 18 / 18,   6 * 4,   30 * 4},
+                  {48 / 24, 900 * 4, -360 * 4},
+                  {24 / 24,  60 * 4,   90 * 4},
              })
              // clang-format on
    );
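The updated expectations can be reproduced from the shapes alone. A verification sketch (not part of the commit) for the input {7, 6, 5, 1, 2, 3} with F32 elements:

#include <cstdint>

// Dims 3..5 are full-range step-1 slices and dim 2 has step 1, so they all
// merge into one contiguous block; dims 0 and 1 stay as counted axes.
constexpr int64_t elemSize = 4;
constexpr int64_t blockElems = 4 * 1 * 2 * 3;       // dim 2 keeps 4 indices; dims 3..5 are whole
static_assert(blockElems * elemSize == 96);         // info.blockSize
static_assert(3 * 2 == 6);                          // info.blockCount: 3 picks on dim 0 x 2 on dim 1
constexpr int64_t dim2Stride = 1 * 2 * 3 * elemSize;// bytes per step along dim 2
static_assert(1 * dim2Stride == 24);                // info.baseOffset: dim 2 starts at index 1
constexpr int64_t dim0Stride = 6 * 5 * 1 * 2 * 3 * elemSize;// 720 bytes per step along dim 0
static_assert(5 * dim0Stride == 3600 && -2 * dim0Stride == -1440);// dims[0].{sizeStart, sizeStride}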
52 changes: 52 additions & 0 deletions src/04kernel/test/kernels/slice/test_cpu.cpp
@@ -0,0 +1,52 @@
#include "../../../src/kernels/slice/cpu_kernel.hh"
#include <gtest/gtest.h>
#include <numeric>

using namespace refactor;
using namespace kernel;

TEST(kernel, SliceCpu) {
// build routine
Dimensions dims{
{5, -2, 3},// 7 -> {5, 3, 1} -> {108, 900, -360}
{2, 3, 2}, // 6 -> {2, 5} -> { 36, 60, 90}
{1, 1, 3}, // 5 -> {1, 2, 3} -> { 18, 6, 30}
{0, 1, 1}, // 1 -> {0}
{0, 1, 2}, // 2 -> {0, 1}
{0, 1, 3}, // 3 -> {0, 1, 2}
};
auto input = Tensor::share(DataType::F32, Shape{7, 6, 5, 1, 2, 3}),
output = Tensor::share(DataType::F32, Shape{3, 2, 3, 1, 2, 3});
auto kernel = SliceCpu::build(SliceInfo(dims, *input));
ASSERT_TRUE(kernel);
auto res = runtime::Resources();
auto routine = kernel->lower(res);
// put input data
std::vector<float>
data(input->elementsSize()),
result(output->elementsSize());
std::iota(data.begin(), data.end(), 0);
// inference
void const *inputs[]{data.data()};
void *outputs[]{result.data()};
routine(res, inputs, outputs);
// check
dim_t
di[]{5, 3, 1},
dj[]{2, 5},
dk[]{1, 2, 3};
auto n = 6;
for (auto i : range0_(3)) {
for (auto j : range0_(2)) {
for (auto k : range0_(3)) {
// clang-format off
auto src = di[i] * 6 * 5 * n + dj[j] * 5 * n + dk[k] * n;
auto dst = i * 2 * 3 * n + j * 3 * n + k * n;
// clang-format on
for (auto l : range0_(n)) {
EXPECT_EQ(data[src + l], result[dst + l]);
}
}
}
}
}
16 changes: 16 additions & 0 deletions src/04kernel/test/kernels/slice/test_cuda.cpp
@@ -0,0 +1,16 @@
#ifdef USE_CUDA

#include "../../../src/kernels/slice/cpu_kernel.hh"
#include "../../../src/kernels/slice/cuda_kernel.hh"
#include "kernel/target.h"
#include "runtime/mem_manager.hh"
#include <gtest/gtest.h>
#include <numeric>

using namespace refactor;
using namespace kernel;

TEST(kernel, SliceCuda) {
}

#endif
