
Commit

temp
Signed-off-by: YdrMaster <[email protected]>
YdrMaster committed Nov 15, 2023
1 parent 3f4d976 commit d0078ba
Showing 6 changed files with 90 additions and 34 deletions.
15 changes: 10 additions & 5 deletions src/04kernel/cuda/src/expand.cu
@@ -1,5 +1,6 @@
 #include "kernel/cuda/expand.cuh"
 #include <cstdint>
+#include <cstdio>
 
 namespace refactor::kernel::cuda {
 
@@ -8,16 +9,19 @@ namespace refactor::kernel::cuda {
         uint8_t const *data, expand::DimStride const *strides, uint8_t *output,
         unsigned int rank,
         unsigned int eleSize) {
+        extern __shared__ expand::DimStride shared[];
+        for (auto i = threadIdx.x; i < rank; i += blockDim.x) {
+            shared[i] = strides[i];
+        }
+        __syncthreads();
         for (auto tid = blockIdx.x * blockDim.x + threadIdx.x,
                   step = blockDim.x * gridDim.x;
              tid < n;
              tid += step) {
             long rem = tid, i = 0;
             for (auto j = 0; j < rank; ++j) {
-                auto const &s = strides[j];
-                if (s.i) {
-                    i += rem / s.o * s.i;
-                }
+                auto s = shared[j];
+                i += rem / s.o * s.i;
                 rem %= s.o;
             }
 
@@ -30,10 +34,11 @@ namespace refactor::kernel::cuda {
         void const *data, expand::DimStride const *strides, void *output,
         unsigned int rank,
         unsigned int eleSize) {
+        printf("launchExpand %d %d %d\n", params.gridSize, params.blockSize, eleSize);
         expandKernel<<<
             params.gridSize,
             params.blockSize,
-            0,
+            rank * sizeof(expand::DimStride),
             reinterpret_cast<cudaStream_t>(params.stream)>>>(
             params.n,
             reinterpret_cast<uint8_t const *>(data),
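
Note: the change above stages the per-dimension strides in dynamic shared memory (sized at launch via the third <<<>>> parameter, rank * sizeof(expand::DimStride)) instead of re-reading them from global memory for every output element, and it drops the `if (s.i)` guard, presumably because an unconditional multiply is cheaper than a divergent branch once the strides are cached. A minimal, self-contained CUDA sketch of the same pattern follows; `Stride` and `copyExpanded` are illustrative names, not the repository's:

// Sketch only: cache a small per-launch table in dynamic shared memory.
struct Stride { unsigned o, i; };

__global__ void copyExpanded(unsigned n, Stride const *strides, unsigned rank,
                             int const *src, int *dst) {
    extern __shared__ Stride cached[];                 // byte size supplied at launch
    for (auto j = threadIdx.x; j < rank; j += blockDim.x)
        cached[j] = strides[j];                        // one global read per dim per block
    __syncthreads();

    for (auto tid = blockIdx.x * blockDim.x + threadIdx.x; tid < n;
         tid += blockDim.x * gridDim.x) {
        unsigned rem = tid, i = 0;
        for (unsigned j = 0; j < rank; ++j) {          // map output index -> input index
            i += rem / cached[j].o * cached[j].i;
            rem %= cached[j].o;
        }
        dst[tid] = src[i];
    }
}

// Launch: the third launch-config argument is the dynamic shared-memory byte count.
// copyExpanded<<<grid, block, rank * sizeof(Stride), stream>>>(n, strides, rank, src, dst);
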
4 changes: 2 additions & 2 deletions src/04kernel/src/kernels/expand/cuda_kernel.cc
@@ -4,14 +4,14 @@ namespace refactor::kernel {
     using K = ExpandCuda;
 
     K::ExpandCuda(ExpandInfo info_) noexcept
-        : Kernel(), info(std::move(info_)) {}
+        : Kernel(), info(info_.reform(16)) {}
 
     auto K::build(ExpandInfo info) noexcept -> KernelBox {
 #ifndef USE_CUDA
         return nullptr;
 #endif
 
-        return std::make_unique<K>(info.reform(16));
+        return std::make_unique<K>(std::move(info));
     }
     auto K::typeId() noexcept -> size_t {
         static uint8_t ID = 1;
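
Note: reform(16) moves from build() into the constructor, so the coarsened ExpandInfo is applied no matter how the kernel object is created; this matters because the MatMulCublas change below constructs ExpandCuda directly when lowering its bias expansion. A generic sketch of the pattern, with illustrative stand-in names (Info, KernelX) and an assumed reform() that normalizes the descriptor:

// Sketch: normalize in the constructor so every construction path agrees.
#include <memory>
#include <utility>

struct Info {
    // Stand-in: in the real project, reform(16) is assumed to coarsen the copy
    // description (e.g. merging contiguous dimensions into up-to-16-byte units).
    Info reform(int /*unit*/) const { return *this; }
};

struct KernelX {
    Info info;
    explicit KernelX(Info i) noexcept : info(i.reform(16)) {}  // normalization lives here...

    static std::unique_ptr<KernelX> build(Info i) {
        return std::make_unique<KernelX>(std::move(i));        // ...so build() just forwards
    }
};
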
46 changes: 43 additions & 3 deletions src/04kernel/src/kernels/mat_mul/cublas_kernel.cc
@@ -4,8 +4,8 @@ namespace refactor::kernel {
     using K = MatMulCublas;
     using DT = DataType;
 
-    K::MatMulCublas(decltype(info) info_) noexcept
-        : Kernel(), info(std::move(info_)) {}
+    K::MatMulCublas(decltype(info) info_, decltype(biasExpand) biasExpand_) noexcept
+        : Kernel(), info(std::move(info_)), biasExpand(std::move(biasExpand_)) {}
 
     auto K::build(Tensor const &a, Tensor const &b, Tensor const &y, MatMulInfo info) noexcept -> KernelBox {
         static const std::unordered_set<decltype(DT::internal)> TYPE{DT::F32, DT::F64, DT::FP16};
@@ -20,7 +20,47 @@ namespace refactor::kernel {
             return nullptr;
         }
 
-        return std::make_unique<K>(std::move(info));
+        dim_t inputs[2];
+        switch (info.biasType) {
+            case BiasType::NoBias:
+                return std::make_unique<K>(std::move(info), std::nullopt);
+            case BiasType::Scalar:
+                inputs[0] = 1;
+                inputs[1] = 1;
+                break;
+            case BiasType::RowVector:
+                inputs[0] = 1;
+                inputs[1] = info.n;
+                break;
+            case BiasType::ColVector:
+                inputs[0] = info.m;
+                inputs[1] = 1;
+                break;
+            case BiasType::Matrix:
+                inputs[0] = info.m;
+                inputs[1] = info.n;
+                break;
+            default:
+                break;
+        }
+
+        std::vector<dim_t> outputShape(std::max(a.rank(), b.rank()));
+        for (auto i : range0_(outputShape.size() - 2)) {
+            auto a_ = i < a.rank() ? a.shape[i] : 1;
+            auto b_ = i < b.rank() ? b.shape[i] : 1;
+            outputShape[i] = std::max(a_, b_);
+        }
+        outputShape.rbegin()[1] = info.m;
+        outputShape.rbegin()[0] = info.n;
+
+        fmt::println("inputs {} outputs {}", vec2str(slice(inputs, 2)), vec2str(slice(outputShape.data(), outputShape.size())));
+
+        return std::make_unique<K>(
+            std::move(info),
+            std::make_optional(ExpandInfo(
+                dataType,
+                slice(inputs, 2),
+                slice(outputShape.data(), outputShape.size()))));
     }
 
     auto K::typeId() noexcept -> size_t {
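
Note: build() now derives a bias-expansion plan alongside the MatMulInfo. The bias is described as a 2-D input shape ({1, 1} for a scalar, {1, n} for a row vector, {m, 1} for a column vector, {m, n} for a full matrix) and paired with the broadcast output shape (batch dims taken from a and b, then m, n), so the generic expand kernel can broadcast the bias into Y. For example, a row-vector bias with m = 4, n = 3 and a batched A of shape {2, 4, k} yields an expand from {1, 3} to {2, 4, 3}. A standalone sketch of the shape computation, using simplified stand-in types rather than the project's MatMulInfo:

// Sketch: compute (bias shape, output shape) for broadcasting a GEMM bias.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

enum class BiasType { NoBias, Scalar, RowVector, ColVector, Matrix };

static std::pair<std::vector<uint32_t>, std::vector<uint32_t>>
biasExpandShapes(BiasType bias, uint32_t m, uint32_t n,
                 std::vector<uint32_t> const &aShape,
                 std::vector<uint32_t> const &bShape) {
    std::vector<uint32_t> in{1, 1};
    switch (bias) {
        case BiasType::RowVector: in = {1, n}; break;
        case BiasType::ColVector: in = {m, 1}; break;
        case BiasType::Matrix:    in = {m, n}; break;
        default: break;  // Scalar keeps {1, 1}; NoBias is handled by the caller
    }
    // Output shape: broadcast the batch dims of A and B, then append m x n.
    std::vector<uint32_t> out(std::max(aShape.size(), bShape.size()));
    for (std::size_t i = 0; i + 2 < out.size(); ++i) {
        uint32_t da = i < aShape.size() ? aShape[i] : 1;
        uint32_t db = i < bShape.size() ? bShape[i] : 1;
        out[i] = std::max(da, db);
    }
    out[out.size() - 2] = m;
    out[out.size() - 1] = n;
    return {in, out};
}
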
50 changes: 29 additions & 21 deletions src/04kernel/src/kernels/mat_mul/cublas_kernel.cu
@@ -1,4 +1,5 @@
 #include "../../utilities/cuda/cublas_context.hh"
+#include "../expand/cuda_kernel.hh"
 #include "cublas_kernel.hh"
 #include <cublas_v2.h>
 #include <thrust/execution_policy.h>
@@ -29,7 +30,10 @@ namespace refactor::kernel {
     };
 
     template<class T>
-    Routine lowerTyped(cudaDataType_t cudaDataType, MatMulInfo info, size_t strideC0, size_t strideC1) noexcept {
+    Routine lowerTyped(cudaDataType_t cudaDataType, MatMulInfo info, Resources &res, std::optional<ExpandInfo> biasExpand, size_t strideC0, size_t strideC1) noexcept {
+        auto biasEx = biasExpand
+                          ? std::make_optional(ExpandCuda(*biasExpand).lower(res))
+                          : std::nullopt;
         return [cudaDataType,
                 alpha = static_cast<T>(info.alpha),
                 beta = static_cast<T>(info.biasType != BiasType::NoBias ? info.beta : 0.0f),
@@ -42,29 +46,33 @@ namespace refactor::kernel {
                 strideC0, strideC1,
                 lda = info.transA ? info.m : info.k,
                 ldb = info.transB ? info.k : info.n,
+                biasEx,
                 broadcaster = info.broadcaster](Resources &res, void const **inputs, void **outputs) {
             auto a = reinterpret_cast<T const *>(inputs[0]);
             auto b = reinterpret_cast<T const *>(inputs[1]);
             auto y = reinterpret_cast<T *>(outputs[0]);
 
             if (beta != (T) 0) {
-                // Expand bias to 2D and store in final output Y
-                {
-                    auto c = reinterpret_cast<T const *>(inputs[2]);
-                    thrust::tabulate(
-                        thrust::device,
-                        y,
-                        y + strideY,
-                        MatMulBroadcastBiasFunctor<T>{c, n, strideC0, strideC1});
-                }
-                // Copy 2D bias to each batch
-                if (broadcaster.outputsCount > 1) {
-                    thrust::tabulate(
-                        thrust::device,
-                        y + strideY,
-                        y + strideY * broadcaster.outputsCount,
-                        MatMulCopyBiasFunctor<T>{y, strideY});
-                }
+                // // Expand bias to 2D and store in final output Y
+                // {
+                //     auto c = reinterpret_cast<T const *>(inputs[2]);
+                //     thrust::tabulate(
+                //         thrust::device,
+                //         y,
+                //         y + strideY,
+                //         MatMulBroadcastBiasFunctor<T>{c, n, strideC0, strideC1});
+                // }
+                // // Copy 2D bias to each batch
+                // if (broadcaster.outputsCount > 1) {
+                //     thrust::tabulate(
+                //         thrust::device,
+                //         y + strideY,
+                //         y + strideY * broadcaster.outputsCount,
+                //         MatMulCopyBiasFunctor<T>{y, strideY});
+                // }
+                void const *inputs_[]{inputs[2]};
+                void *outputs_[]{outputs[0]};
+                (*biasEx)(res, inputs_, outputs_);
             }
 
             auto handle = res.fetchOrStore<CublasContext>()->handle;
@@ -102,11 +110,11 @@ namespace refactor::kernel {
         res.fetchOrStore<CublasContext>();
         switch (info.dataType) {
             case DataType::F32:
-                return lowerTyped<float>(CUDA_R_32F, info, strideC0, strideC1);
+                return lowerTyped<float>(CUDA_R_32F, info, res, biasExpand, strideC0, strideC1);
             case DataType::F64:
-                return lowerTyped<double>(CUDA_R_64F, info, strideC0, strideC1);
+                return lowerTyped<double>(CUDA_R_64F, info, res, biasExpand, strideC0, strideC1);
             case DataType::FP16:
-                return lowerTyped<half>(CUDA_R_16F, info, strideC0, strideC1);
+                return lowerTyped<half>(CUDA_R_16F, info, res, biasExpand, strideC0, strideC1);
             default:
                 UNREACHABLE();
         }
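
Note: the thrust::tabulate bias-broadcast path is commented out and replaced by the pre-lowered expand routine: when beta is non-zero, the bias tensor (inputs[2]) is expanded straight into Y, and the strided-batched GEMM then accumulates on top of it with that beta. Dereferencing *biasEx in that branch is safe because beta is forced to 0 when biasType is NoBias, and build() only omits the ExpandInfo in that case. A small sketch of the composition, with Routine, expandBias, and gemmStridedBatched as illustrative stand-ins for the lowered routines:

// Sketch: run "expand bias into Y" before "GEMM accumulating into Y".
#include <functional>
#include <optional>
#include <utility>

using Routine = std::function<void(void const **inputs, void **outputs)>;

Routine makeMatMulRoutine(std::optional<Routine> expandBias, float beta,
                          Routine gemmStridedBatched) {
    return [expandBias = std::move(expandBias), beta,
            gemm = std::move(gemmStridedBatched)](void const **inputs, void **outputs) {
        if (beta != 0 && expandBias) {
            // Broadcast the bias (inputs[2]) into the output buffer first ...
            void const *in[]{inputs[2]};
            void *out[]{outputs[0]};
            (*expandBias)(in, out);
        }
        // ... then Y = alpha * A x B + beta * Y accumulates on the expanded bias.
        gemm(inputs, outputs);
    };
}
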
5 changes: 4 additions & 1 deletion src/04kernel/src/kernels/mat_mul/cublas_kernel.hh
@@ -1,16 +1,19 @@
 #ifndef KERNEL_MATMUL_CUBLAS_KERNEL_HH
 #define KERNEL_MATMUL_CUBLAS_KERNEL_HH
 
+#include "kernel/attributes/expand_info.h"
 #include "kernel/attributes/matmul_info.h"
 #include "kernel/kernel.h"
 #include "kernel/tensor.h"
+#include <optional>
 
 namespace refactor::kernel {
 
     struct MatMulCublas final : public Kernel {
         MatMulInfo info;
+        std::optional<ExpandInfo> biasExpand;
 
-        explicit MatMulCublas(MatMulInfo) noexcept;
+        explicit MatMulCublas(MatMulInfo, std::optional<ExpandInfo>) noexcept;
 
         static KernelBox build(Tensor const &, Tensor const &, Tensor const &, MatMulInfo) noexcept;
         static size_t typeId() noexcept;
4 changes: 2 additions & 2 deletions src/04kernel/src/kernels/slice/cuda_kernel.cc
@@ -4,14 +4,14 @@ namespace refactor::kernel {
     using K = SliceCuda;
 
     K::SliceCuda(SliceInfo info_) noexcept
-        : Kernel(), info(std::move(info_)) {}
+        : Kernel(), info(info_.reform(16)) {}
 
     auto K::build(SliceInfo info) noexcept -> KernelBox {
 #ifndef USE_CUDA
         return nullptr;
 #endif
 
-        return std::make_unique<K>(info.reform(16));
+        return std::make_unique<K>(std::move(info));
     }
     auto K::typeId() noexcept -> size_t {
         static uint8_t ID = 1;
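
Note: the SliceCuda change mirrors the ExpandCuda one above: reform(16) now runs in the constructor rather than in build(), so every construction path yields the reformed SliceInfo.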
