Skip to content

Commit

Permalink
Merge branch 'dev' of github.com:InfiniTensor/RefactorGraph into dev
Browse files — browse the repository at this point in the history
  • Loading branch information
bitzyz committed Nov 13, 2023
2 parents 030a059 + 10712b5 commit 65f9c5e
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 10 deletions.
1 change: 1 addition & 0 deletions src/04kernel/cuda/src/slice.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "kernel/cuda/slice.cuh"
#include <cstdint>
#include <cstdio>

namespace refactor::kernel::cuda {

Expand Down
2 changes: 1 addition & 1 deletion src/04kernel/src/kernels/slice/cuda_kernel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ namespace refactor::kernel {
using K = SliceCuda;

K::SliceCuda(SliceInfo info_) noexcept
: Kernel(), info(std::move(info_)) {}
: Kernel(), info(info_.reform(16)) {}

auto K::build(SliceInfo info) noexcept -> KernelBox {
#ifndef USE_CUDA
Expand Down
19 changes: 10 additions & 9 deletions src/04kernel/src/kernels/slice/cuda_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,20 @@ namespace refactor::kernel {
using namespace runtime;

Routine SliceCuda::lower(Resources &) const noexcept {
auto reformed = info.reform(16);
thrust::host_vector<cuda::DimInfo> dims(info.dims.size());
std::transform(info.dims.begin(), info.dims.end(),
dims.begin(),
[](auto const &d) { return cuda::DimInfo{
d.countStride,
d.sizeStart,
d.sizeStride,
}; });
[](auto const &d) {
return cuda::DimInfo{
d.countStride,
d.sizeStart,
d.sizeStride,
};
});
return [dims = thrust::device_vector<cuda::DimInfo>(dims),
params = cuda::ThreadsDistributer()(reformed.blockCount),
blockSize = reformed.blockSize,
baseOffset = reformed.baseOffset](Resources &, void const **inputs, void **outputs) {
params = cuda::ThreadsDistributer()(info.blockCount),
blockSize = info.blockSize,
baseOffset = info.baseOffset](Resources &, void const **inputs, void **outputs) {
auto src = reinterpret_cast<uint8_t const *>(inputs[0]) + baseOffset;
cuda::launchSlice(params, src, dims.data().get(), outputs[0],
dims.size(),
Expand Down
44 changes: 44 additions & 0 deletions src/04kernel/test/kernels/slice/test_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,50 @@ using namespace refactor;
using namespace kernel;

TEST(kernel, SliceCuda) {
    // Build the CUDA kernel and the CPU reference kernel from the same
    // SliceInfo, run both on identical data, and compare the results.
    Dimensions dims{
        {5, -2, 3},// 7 -> {5, 3, 1} -> {108, 900, -360}
        {2, 3, 2}, // 6 -> {2, 5} -> { 36, 60, 90}
        {1, 1, 3}, // 5 -> {1, 2, 3} -> { 18, 6, 30}
        {0, 1, 1}, // 1 -> {0}
        {0, 1, 2}, // 2 -> {0, 1}
        {0, 1, 3}, // 3 -> {0, 1, 2}
    };
    auto input = Tensor::share(DataType::F32, Shape{7, 6, 5, 1, 2, 3}),
         output = Tensor::share(DataType::F32, Shape{3, 2, 3, 1, 2, 3});
    SliceInfo info(dims, *input);
    // Both backends are lowered from the same info; either failing to build
    // aborts the test early.
    auto kGpu = SliceCuda::build(info);
    auto kCpu = SliceCpu::build(info);
    ASSERT_TRUE(kGpu && kCpu);
    auto res = runtime::Resources();
    auto rGpu = kGpu->lower(res);
    auto rCpu = kCpu->lower(res);
    // Device buffers sized from the tensors' byte sizes.
    auto manager = Target(Target::NvidiaGpu).memManager();
    Arc<mem_manager::ForeignBlob>
        gpuIn = mem_manager::ForeignBlob::share(manager, input->bytesSize()),
        gpuOut = mem_manager::ForeignBlob::share(manager, output->bytesSize());
    // Host-side buffers: sequential input values, the CPU reference answer,
    // and a landing area for the device output.
    std::vector<float>
        source(input->elementsSize()),
        expected(output->elementsSize()),
        actual(expected.size());
    std::iota(source.begin(), source.end(), 0);
    gpuIn->copyIn(source.data(), input->bytesSize());
    {// reference result computed on the host
        void const *inputs[]{source.data()};
        void *outputs[]{expected.data()};
        rCpu(res, inputs, outputs);
    }
    {// the same slice executed on the device
        void const *inputs[]{*gpuIn};
        void *outputs[]{*gpuOut};
        rGpu(res, inputs, outputs);
    }
    // Copy the device result back and require an exact element-wise match.
    gpuOut->copyOut(actual.data(), output->bytesSize());
    EXPECT_EQ(actual, expected);
}

#endif

0 comments on commit 65f9c5e

Please sign in to comment.