From ab270e4158a3beedc1342e6ed39971e24b5a4ee1 Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Fri, 26 Jan 2024 14:27:30 +0800 Subject: [PATCH] =?UTF-8?q?todo:=20=E6=89=93=E5=BC=80=E6=9B=B4=E5=A4=9A?= =?UTF-8?q?=E4=B8=9C=E8=A5=BF=E4=BB=A5=E7=BC=A9=E5=B0=8F=E9=97=AE=E9=A2=98?= =?UTF-8?q?=E8=8C=83=E5=9B=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YdrMaster --- 3rd-party/cccl | 1 + src/04kernel/src/kernels/pad/cuda_kernel.cu | 53 +++++++++++---------- src/04kernel/test/kernels/pad/test_cuda.cpp | 32 ++++++------- 3 files changed, 43 insertions(+), 43 deletions(-) create mode 160000 3rd-party/cccl diff --git a/3rd-party/cccl b/3rd-party/cccl new file mode 160000 index 000000000..b7d4228ab --- /dev/null +++ b/3rd-party/cccl @@ -0,0 +1 @@ +Subproject commit b7d4228ab7268ed928984cd61096079bd671d25d diff --git a/src/04kernel/src/kernels/pad/cuda_kernel.cu b/src/04kernel/src/kernels/pad/cuda_kernel.cu index b246ab4fd..4f14cfa3c 100644 --- a/src/04kernel/src/kernels/pad/cuda_kernel.cu +++ b/src/04kernel/src/kernels/pad/cuda_kernel.cu @@ -8,32 +8,33 @@ namespace refactor::kernel { auto PadCuda::lower(Resources &) const noexcept -> RoutineWorkspace { thrust::host_vector dims(info.dims.size()); - std::transform(info.dims.begin(), info.dims.end(), - dims.begin(), - [](auto const &d) { - return cuda::DimInfo{ - d.strideI, - d.strideO, - d.padS, - d.dimI, - }; - }); - return [dims = thrust::device_vector(dims), - params = cuda::ThreadsDistributer()(info.blockCount), - blockSize = info.blockSize, - value = this->valueLength](Resources &, void *workspace, void const *const *inputs, void *const *outputs) { - auto src = reinterpret_cast(inputs[0]); - thrust::device_vector defaultValue(blockSize, 0); - if (value != 0) { - auto constValue = reinterpret_cast(inputs[2]); - for (auto i : range0_(blockSize / value)) { - cudaMemcpy(defaultValue.data().get() + i * value, constValue, value, cudaMemcpyDeviceToDevice); - } - } - cuda::launchPad(params, src, defaultValue.data().get(), dims.data().get(), outputs[0], - dims.size(), - blockSize); - }; + exit(999); + // std::transform(info.dims.begin(), info.dims.end(), + // dims.begin(), + // [](auto const &d) { + // return cuda::DimInfo{ + // d.strideI, + // d.strideO, + // d.padS, + // d.dimI, + // }; + // }); + // return [dims = thrust::device_vector(dims), + // params = cuda::ThreadsDistributer()(info.blockCount), + // blockSize = info.blockSize, + // value = this->valueLength](Resources &, void *workspace, void const *const *inputs, void *const *outputs) { + // auto src = reinterpret_cast(inputs[0]); + // thrust::device_vector defaultValue(blockSize, 0); + // if (value != 0) { + // auto constValue = reinterpret_cast(inputs[2]); + // for (auto i : range0_(blockSize / value)) { + // cudaMemcpy(defaultValue.data().get() + i * value, constValue, value, cudaMemcpyDeviceToDevice); + // } + // } + // cuda::launchPad(params, src, defaultValue.data().get(), dims.data().get(), outputs[0], + // dims.size(), + // blockSize); + // }; } }// namespace refactor::kernel diff --git a/src/04kernel/test/kernels/pad/test_cuda.cpp b/src/04kernel/test/kernels/pad/test_cuda.cpp index 4c490cd85..0bf3ff4dc 100644 --- a/src/04kernel/test/kernels/pad/test_cuda.cpp +++ b/src/04kernel/test/kernels/pad/test_cuda.cpp @@ -23,36 +23,34 @@ TEST(kernel, PadCuda) { auto yTensor = Tensor::share(DataType::F32, Shape{4, 5, 1, 8}); PadType type = PadType::Constant; auto kCpu = PadCpu::build(PadInfo(dims, *t1Tensor), type, std::make_optional(std::reference_wrapper(*t3Tensor))); - // auto kernel = PadCuda::build(PadInfo(dims, *t1Tensor), type, std::make_optional(std::reference_wrapper(*t3Tensor))); - // ASSERT_TRUE(kernel && kCpu); + auto kernel = PadCuda::build(PadInfo(dims, *t1Tensor), type, std::make_optional(std::reference_wrapper(*t3Tensor))); + ASSERT_TRUE(kernel && kCpu); auto res = runtime::Resources(); // auto routine = kernel->lower(res).routine, // rCpu = kCpu->lower(res).routine; auto rCpu = kCpu->lower(res).routine; // malloc auto &dev = *device::init(Device::Type::Nvidia, 0, ""); - // auto gpuIn = dev.malloc(t1Tensor->bytesSize()), - // gpuIn2 = dev.malloc(t2Tensor->bytesSize()), - // gpuIn3 = dev.malloc(t3Tensor->bytesSize()), - // gpuOut = dev.malloc(yTensor->bytesSize()); + auto gpuIn = dev.malloc(t1Tensor->bytesSize()), + gpuIn2 = dev.malloc(t2Tensor->bytesSize()), + gpuIn3 = dev.malloc(t3Tensor->bytesSize()), + gpuOut = dev.malloc(yTensor->bytesSize()); // put input data std::vector data(t1Tensor->elementsSize()), constvalue(1, 1.2f), cpuOut(yTensor->elementsSize()); std::vector pads{1, 1, 0, 2, 1, 1, 0, 2}; - for (auto i : range0_(data.size())) { data[i] = i; } - // gpuIn->copyFromHost(data.data(), t1Tensor->bytesSize()); - // gpuIn2->copyFromHost(pads.data(), t2Tensor->bytesSize()); - // gpuIn3->copyFromHost(constvalue.data(), t3Tensor->bytesSize()); - + gpuIn->copyFromHost(data.data(), t1Tensor->bytesSize()); + gpuIn2->copyFromHost(pads.data(), t2Tensor->bytesSize()); + gpuIn3->copyFromHost(constvalue.data(), t3Tensor->bytesSize()); // inference - // { - // void const *inputs[]{*gpuIn, *gpuIn2, *gpuIn3}; - // void *outputs[]{*gpuOut}; - // routine(res, nullptr, inputs, outputs); - // } + { + void const *inputs[]{*gpuIn, *gpuIn2, *gpuIn3}; + void *outputs[]{*gpuOut}; + // routine(res, nullptr, inputs, outputs); + } { void const *inputs[]{data.data(), pads.data(), constvalue.data()}; void *outputs[]{cpuOut.data()}; @@ -60,7 +58,7 @@ TEST(kernel, PadCuda) { } // take output data std::vector result(yTensor->elementsSize()); - // gpuOut->copyToHost(result.data(), yTensor->bytesSize()); + gpuOut->copyToHost(result.data(), yTensor->bytesSize()); // check for (auto i : range0_(cpuOut.size())) { // fmt::println("i = {}, cpuout = {}, gpuout = {}", i, cpuOut[i], result[i]);