From 784bea65c241d0db1ca5e15bd7ecf34088780b5a Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Wed, 15 Nov 2023 17:55:25 +0800 Subject: [PATCH] =?UTF-8?q?feat(kernel):=20=E4=BD=BF=E7=94=A8=E7=AD=89?= =?UTF-8?q?=E5=8F=B7=E6=9B=BF=E6=8D=A2=20memcpy=20=E4=BB=A5=E8=A7=A6?= =?UTF-8?q?=E5=8F=91=E6=8C=87=E4=BB=A4=E7=BA=A7=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YdrMaster --- src/04kernel/cuda/src/concat.cu | 3 +- src/04kernel/cuda/src/expand.cu | 17 +++--- src/04kernel/cuda/src/gather.cu | 3 +- src/04kernel/cuda/src/macro.cuh | 22 ++++++++ src/04kernel/cuda/src/slice.cu | 4 +- src/04kernel/cuda/src/split.cu | 3 +- src/04kernel/cuda/src/transpose.cu | 3 +- src/04kernel/cuda/src/where.cu | 4 +- src/04kernel/src/attributes/expand_info.cc | 5 +- .../src/kernels/expand/cuda_kernel.cc | 4 +- .../src/kernels/mat_mul/cublas_kernel.cc | 44 +++++++++++++-- .../src/kernels/mat_mul/cublas_kernel.cu | 53 +++++-------------- .../src/kernels/mat_mul/cublas_kernel.hh | 5 +- src/04kernel/src/kernels/slice/cuda_kernel.cc | 4 +- 14 files changed, 109 insertions(+), 65 deletions(-) diff --git a/src/04kernel/cuda/src/concat.cu b/src/04kernel/cuda/src/concat.cu index 1ccc8801..9d4c586a 100644 --- a/src/04kernel/cuda/src/concat.cu +++ b/src/04kernel/cuda/src/concat.cu @@ -1,4 +1,5 @@ #include "kernel/cuda/concat.cuh" +#include "macro.cuh" #include namespace refactor::kernel::cuda { @@ -15,7 +16,7 @@ namespace refactor::kernel::cuda { tid += step) { auto i = tid % sum, j = i * sub, k = 0u; while (j >= segments[k]) { j -= segments[k++]; } - memcpy(output + tid * sub, inputs[k] + (tid / sum) * segments[k] + j, sub); + MEMCPY(output + tid * sub, inputs[k] + (tid / sum) * segments[k] + j, sub); } } diff --git a/src/04kernel/cuda/src/expand.cu b/src/04kernel/cuda/src/expand.cu index 82e4904a..a1125877 100644 --- a/src/04kernel/cuda/src/expand.cu +++ b/src/04kernel/cuda/src/expand.cu @@ -1,4 +1,5 @@ #include "kernel/cuda/expand.cuh" +#include "macro.cuh" #include namespace refactor::kernel::cuda { @@ -8,20 +9,22 @@ namespace refactor::kernel::cuda { uint8_t const *data, expand::DimStride const *strides, uint8_t *output, unsigned int rank, unsigned int eleSize) { + extern __shared__ expand::DimStride shared[]; + for (auto i = threadIdx.x; i < rank; i += blockDim.x) { + shared[i] = strides[i]; + } + __syncthreads(); for (auto tid = blockIdx.x * blockDim.x + threadIdx.x, step = blockDim.x * gridDim.x; tid < n; tid += step) { long rem = tid, i = 0; for (auto j = 0; j < rank; ++j) { - auto const &s = strides[j]; - if (s.i) { - i += rem / s.o * s.i; - } + auto s = shared[j]; + i += rem / s.o * s.i; rem %= s.o; } - - memcpy(output + tid * eleSize, data + i * eleSize, eleSize); + MEMCPY(output + tid * eleSize, data + i * eleSize, eleSize); } } @@ -33,7 +36,7 @@ namespace refactor::kernel::cuda { expandKernel<<< params.gridSize, params.blockSize, - 0, + rank * sizeof(expand::DimStride), reinterpret_cast(params.stream)>>>( params.n, reinterpret_cast(data), diff --git a/src/04kernel/cuda/src/gather.cu b/src/04kernel/cuda/src/gather.cu index 0c4f5337..35010185 100644 --- a/src/04kernel/cuda/src/gather.cu +++ b/src/04kernel/cuda/src/gather.cu @@ -1,4 +1,5 @@ #include "kernel/cuda/gather.cuh" +#include "macro.cuh" #include namespace refactor::kernel::cuda { @@ -24,7 +25,7 @@ namespace refactor::kernel::cuda { tid += step) { auto i = tid / batch, j = tid % batch; - memcpy(unit * tid + output, + MEMCPY(unit * tid + output, unit * (batch * (i / midSizeO * midSizeI + shared[i % midSizeO]) + j) + data, unit); } diff --git a/src/04kernel/cuda/src/macro.cuh b/src/04kernel/cuda/src/macro.cuh index de31880f..efbffef1 100644 --- a/src/04kernel/cuda/src/macro.cuh +++ b/src/04kernel/cuda/src/macro.cuh @@ -10,4 +10,26 @@ cudaGetErrorString(status))); \ } +#define MEMCPY(DST, SRC, ELE_SIZE) \ + switch (ELE_SIZE) { \ + case 1: \ + *reinterpret_cast(DST) = *reinterpret_cast(SRC); \ + break; \ + case 2: \ + *reinterpret_cast(DST) = *reinterpret_cast(SRC); \ + break; \ + case 4: \ + *reinterpret_cast(DST) = *reinterpret_cast(SRC); \ + break; \ + case 8: \ + *reinterpret_cast(DST) = *reinterpret_cast(SRC); \ + break; \ + case 16: \ + *reinterpret_cast(DST) = *reinterpret_cast(SRC); \ + break; \ + default: \ + memcpy((DST), (SRC), (ELE_SIZE)); \ + break; \ + } + #endif// KERNEL_CUDA_MACRO_CUH diff --git a/src/04kernel/cuda/src/slice.cu b/src/04kernel/cuda/src/slice.cu index 28ff7aaa..827639ef 100644 --- a/src/04kernel/cuda/src/slice.cu +++ b/src/04kernel/cuda/src/slice.cu @@ -1,6 +1,6 @@ #include "kernel/cuda/slice.cuh" +#include "macro.cuh" #include -#include namespace refactor::kernel::cuda { @@ -26,7 +26,7 @@ namespace refactor::kernel::cuda { src_ += rem / dim.countStride * dim.sizeStride + dim.sizeStart; rem %= dim.countStride; } - memcpy(dst_, src_, blockSize); + MEMCPY(dst_, src_, blockSize); } } diff --git a/src/04kernel/cuda/src/split.cu b/src/04kernel/cuda/src/split.cu index 000bd99a..043fe155 100644 --- a/src/04kernel/cuda/src/split.cu +++ b/src/04kernel/cuda/src/split.cu @@ -1,4 +1,5 @@ #include "kernel/cuda/split.cuh" +#include "macro.cuh" #include namespace refactor::kernel::cuda { @@ -20,7 +21,7 @@ namespace refactor::kernel::cuda { tid += step) { auto i = tid % sum, j = i * sub, k = 0u; while (j >= shared[k]) { j -= shared[k++]; } - memcpy(outputs[k] + (tid / sum) * shared[k] + j, data + tid * sub, sub); + MEMCPY(outputs[k] + (tid / sum) * shared[k] + j, data + tid * sub, sub); } } diff --git a/src/04kernel/cuda/src/transpose.cu b/src/04kernel/cuda/src/transpose.cu index a60f3934..3bf3a672 100644 --- a/src/04kernel/cuda/src/transpose.cu +++ b/src/04kernel/cuda/src/transpose.cu @@ -1,4 +1,5 @@ #include "kernel/cuda/transpose.cuh" +#include "macro.cuh" #include namespace refactor::kernel::cuda { @@ -19,7 +20,7 @@ namespace refactor::kernel::cuda { rem %= d.o; } - memcpy(output + tid * eleSize, data + j * eleSize, eleSize); + MEMCPY(output + tid * eleSize, data + j * eleSize, eleSize); } } diff --git a/src/04kernel/cuda/src/where.cu b/src/04kernel/cuda/src/where.cu index 9906ecee..cb7cf903 100644 --- a/src/04kernel/cuda/src/where.cu +++ b/src/04kernel/cuda/src/where.cu @@ -1,4 +1,5 @@ #include "kernel/cuda/where.cuh" +#include "macro.cuh" #include namespace refactor::kernel::cuda { @@ -25,7 +26,8 @@ namespace refactor::kernel::cuda { ix += quot * dim[1]; iy += quot * dim[2]; } - memcpy(output + tid * eleSize, + + MEMCPY(output + tid * eleSize, c[ic] ? x + ix * eleSize : y + iy * eleSize, diff --git a/src/04kernel/src/attributes/expand_info.cc b/src/04kernel/src/attributes/expand_info.cc index 7ce05330..3aa5d47d 100644 --- a/src/04kernel/src/attributes/expand_info.cc +++ b/src/04kernel/src/attributes/expand_info.cc @@ -77,8 +77,9 @@ namespace refactor::kernel { s.i *= times; s.o *= times; } - strides.resize(strides.size() + 1); - strides.back() = {1, 1}; + strides.resize(strides.size() + 2); + strides.rbegin()[1] = {times, times}; + strides.rbegin()[0] = {1, 1}; } } diff --git a/src/04kernel/src/kernels/expand/cuda_kernel.cc b/src/04kernel/src/kernels/expand/cuda_kernel.cc index a84c8f7f..c4058b33 100644 --- a/src/04kernel/src/kernels/expand/cuda_kernel.cc +++ b/src/04kernel/src/kernels/expand/cuda_kernel.cc @@ -4,14 +4,14 @@ namespace refactor::kernel { using K = ExpandCuda; K::ExpandCuda(ExpandInfo info_) noexcept - : Kernel(), info(std::move(info_)) {} + : Kernel(), info(info_.reform(16)) {} auto K::build(ExpandInfo info) noexcept -> KernelBox { #ifndef USE_CUDA return nullptr; #endif - return std::make_unique(info.reform(16)); + return std::make_unique(std::move(info)); } auto K::typeId() noexcept -> size_t { static uint8_t ID = 1; diff --git a/src/04kernel/src/kernels/mat_mul/cublas_kernel.cc b/src/04kernel/src/kernels/mat_mul/cublas_kernel.cc index 1c7cf114..3c612c47 100644 --- a/src/04kernel/src/kernels/mat_mul/cublas_kernel.cc +++ b/src/04kernel/src/kernels/mat_mul/cublas_kernel.cc @@ -4,8 +4,8 @@ namespace refactor::kernel { using K = MatMulCublas; using DT = DataType; - K::MatMulCublas(decltype(info) info_) noexcept - : Kernel(), info(std::move(info_)) {} + K::MatMulCublas(decltype(info) info_, decltype(biasExpand) biasExpand_) noexcept + : Kernel(), info(std::move(info_)), biasExpand(std::move(biasExpand_)) {} auto K::build(Tensor const &a, Tensor const &b, Tensor const &y, MatMulInfo info) noexcept -> KernelBox { static const std::unordered_set TYPE{DT::F32, DT::F64, DT::FP16}; @@ -20,7 +20,45 @@ namespace refactor::kernel { return nullptr; } - return std::make_unique(std::move(info)); + dim_t inputs[2]; + switch (info.biasType) { + case BiasType::NoBias: + return std::make_unique(std::move(info), std::nullopt); + case BiasType::Scalar: + inputs[0] = 1; + inputs[1] = 1; + break; + case BiasType::RowVector: + inputs[0] = 1; + inputs[1] = info.n; + break; + case BiasType::ColVector: + inputs[0] = info.m; + inputs[1] = 1; + break; + case BiasType::Matrix: + inputs[0] = info.m; + inputs[1] = info.n; + break; + default: + break; + } + + std::vector outputShape(std::max(a.rank(), b.rank())); + for (auto i : range0_(outputShape.size() - 2)) { + auto a_ = i < a.rank() ? a.shape[i] : 1; + auto b_ = i < b.rank() ? b.shape[i] : 1; + outputShape[i] = std::max(a_, b_); + } + outputShape.rbegin()[1] = info.m; + outputShape.rbegin()[0] = info.n; + + return std::make_unique( + std::move(info), + std::make_optional(ExpandInfo( + dataType, + slice(inputs, 2), + slice(outputShape.data(), outputShape.size())))); } auto K::typeId() noexcept -> size_t { diff --git a/src/04kernel/src/kernels/mat_mul/cublas_kernel.cu b/src/04kernel/src/kernels/mat_mul/cublas_kernel.cu index 444527e1..e2fd8814 100644 --- a/src/04kernel/src/kernels/mat_mul/cublas_kernel.cu +++ b/src/04kernel/src/kernels/mat_mul/cublas_kernel.cu @@ -1,4 +1,5 @@ #include "../../utilities/cuda/cublas_context.hh" +#include "../expand/cuda_kernel.hh" #include "cublas_kernel.hh" #include #include @@ -9,27 +10,10 @@ namespace refactor::kernel { using namespace cublas; template - struct MatMulBroadcastBiasFunctor { - T const *src; - size_t const n, strideC0, strideC1; - - __device__ T operator()(size_t i) const noexcept { - return src[i / n * strideC0 + i % n * strideC1]; - } - }; - - template - struct MatMulCopyBiasFunctor { - T const *src; - size_t blockSize; - - __device__ T operator()(size_t i) const noexcept { - return src[i % blockSize]; - } - }; - - template - Routine lowerTyped(cudaDataType_t cudaDataType, MatMulInfo info, size_t strideC0, size_t strideC1) noexcept { + Routine lowerTyped(cudaDataType_t cudaDataType, MatMulInfo info, Resources &res, std::optional biasExpand, size_t strideC0, size_t strideC1) noexcept { + auto biasEx = biasExpand + ? std::make_optional(ExpandCuda(*biasExpand).lower(res)) + : std::nullopt; return [cudaDataType, alpha = static_cast(info.alpha), beta = static_cast(info.biasType != BiasType::NoBias ? info.beta : 0.0f), @@ -42,29 +26,16 @@ namespace refactor::kernel { strideC0, strideC1, lda = info.transA ? info.m : info.k, ldb = info.transB ? info.k : info.n, + biasEx, broadcaster = info.broadcaster](Resources &res, void const **inputs, void **outputs) { auto a = reinterpret_cast(inputs[0]); auto b = reinterpret_cast(inputs[1]); auto y = reinterpret_cast(outputs[0]); if (beta != (T) 0) { - // Expand bias to 2D and store in final output Y - { - auto c = reinterpret_cast(inputs[2]); - thrust::tabulate( - thrust::device, - y, - y + strideY, - MatMulBroadcastBiasFunctor{c, n, strideC0, strideC1}); - } - // Copy 2D bias to each batch - if (broadcaster.outputsCount > 1) { - thrust::tabulate( - thrust::device, - y + strideY, - y + strideY * broadcaster.outputsCount, - MatMulCopyBiasFunctor{y, strideY}); - } + void const *inputs_[]{inputs[2]}; + void *outputs_[]{outputs[0]}; + (*biasEx)(res, inputs_, outputs_); } auto handle = res.fetchOrStore()->handle; @@ -102,11 +73,11 @@ namespace refactor::kernel { res.fetchOrStore(); switch (info.dataType) { case DataType::F32: - return lowerTyped(CUDA_R_32F, info, strideC0, strideC1); + return lowerTyped(CUDA_R_32F, info, res, biasExpand, strideC0, strideC1); case DataType::F64: - return lowerTyped(CUDA_R_64F, info, strideC0, strideC1); + return lowerTyped(CUDA_R_64F, info, res, biasExpand, strideC0, strideC1); case DataType::FP16: - return lowerTyped(CUDA_R_16F, info, strideC0, strideC1); + return lowerTyped(CUDA_R_16F, info, res, biasExpand, strideC0, strideC1); default: UNREACHABLE(); } diff --git a/src/04kernel/src/kernels/mat_mul/cublas_kernel.hh b/src/04kernel/src/kernels/mat_mul/cublas_kernel.hh index df26c813..89ed816c 100644 --- a/src/04kernel/src/kernels/mat_mul/cublas_kernel.hh +++ b/src/04kernel/src/kernels/mat_mul/cublas_kernel.hh @@ -1,16 +1,19 @@ #ifndef KERNEL_MATMUL_CUBLAS_KERNEL_HH #define KERNEL_MATMUL_CUBLAS_KERNEL_HH +#include "kernel/attributes/expand_info.h" #include "kernel/attributes/matmul_info.h" #include "kernel/kernel.h" #include "kernel/tensor.h" +#include namespace refactor::kernel { struct MatMulCublas final : public Kernel { MatMulInfo info; + std::optional biasExpand; - explicit MatMulCublas(MatMulInfo) noexcept; + explicit MatMulCublas(MatMulInfo, std::optional) noexcept; static KernelBox build(Tensor const &, Tensor const &, Tensor const &, MatMulInfo) noexcept; static size_t typeId() noexcept; diff --git a/src/04kernel/src/kernels/slice/cuda_kernel.cc b/src/04kernel/src/kernels/slice/cuda_kernel.cc index 7e3a6f19..de12569e 100644 --- a/src/04kernel/src/kernels/slice/cuda_kernel.cc +++ b/src/04kernel/src/kernels/slice/cuda_kernel.cc @@ -4,14 +4,14 @@ namespace refactor::kernel { using K = SliceCuda; K::SliceCuda(SliceInfo info_) noexcept - : Kernel(), info(std::move(info_)) {} + : Kernel(), info(info_.reform(16)) {} auto K::build(SliceInfo info) noexcept -> KernelBox { #ifndef USE_CUDA return nullptr; #endif - return std::make_unique(info.reform(16)); + return std::make_unique(std::move(info)); } auto K::typeId() noexcept -> size_t { static uint8_t ID = 1;