
Commit

temp
Signed-off-by: YdrMaster <[email protected]>
YdrMaster committed Nov 15, 2023
1 parent 3f4d976 commit d0078ba
Showing 6 changed files with 90 additions and 34 deletions.
15 changes: 10 additions & 5 deletions src/04kernel/cuda/src/expand.cu
@@ -1,5 +1,6 @@
 #include "kernel/cuda/expand.cuh"
 #include <cstdint>
+#include <cstdio>
 
 namespace refactor::kernel::cuda {
 
@@ -8,16 +9,19 @@ namespace refactor::kernel::cuda {
         uint8_t const *data, expand::DimStride const *strides, uint8_t *output,
         unsigned int rank,
         unsigned int eleSize) {
+        extern __shared__ expand::DimStride shared[];
+        for (auto i = threadIdx.x; i < rank; i += blockDim.x) {
+            shared[i] = strides[i];
+        }
+        __syncthreads();
         for (auto tid = blockIdx.x * blockDim.x + threadIdx.x,
                   step = blockDim.x * gridDim.x;
              tid < n;
              tid += step) {
             long rem = tid, i = 0;
             for (auto j = 0; j < rank; ++j) {
-                auto const &s = strides[j];
-                if (s.i) {
-                    i += rem / s.o * s.i;
-                }
+                auto s = shared[j];
+                i += rem / s.o * s.i;
                 rem %= s.o;
             }
 
@@ -30,10 +34,11 @@ namespace refactor::kernel::cuda {
         void const *data, expand::DimStride const *strides, void *output,
         unsigned int rank,
         unsigned int eleSize) {
+        printf("launchExpand %d %d %d\n", params.gridSize, params.blockSize, eleSize);
         expandKernel<<<
             params.gridSize,
             params.blockSize,
-            0,
+            rank * sizeof(expand::DimStride),
             reinterpret_cast<cudaStream_t>(params.stream)>>>(
             params.n,
             reinterpret_cast<uint8_t const *>(data),
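
Note: the change above stages the per-dimension strides in dynamic shared memory (sized at launch via the third <<<>>> parameter, rank * sizeof(expand::DimStride)) instead of re-reading them from global memory for every output element, and it drops the `if (s.i)` guard, presumably because an unconditional multiply is cheaper than a divergent branch once the strides are cached. A minimal, self-contained CUDA sketch of the same pattern follows; `Stride` and `copyExpanded` are illustrative names, not the repository's:

// Sketch only: cache a small per-launch table in dynamic shared memory.
struct Stride { unsigned o, i; };

__global__ void copyExpanded(unsigned n, Stride const *strides, unsigned rank,
                             int const *src, int *dst) {
    extern __shared__ Stride cached[];                 // byte size supplied at launch
    for (auto j = threadIdx.x; j < rank; j += blockDim.x)
        cached[j] = strides[j];                        // one global read per dim per block
    __syncthreads();

    for (auto tid = blockIdx.x * blockDim.x + threadIdx.x; tid < n;
         tid += blockDim.x * gridDim.x) {
        unsigned rem = tid, i = 0;
        for (unsigned j = 0; j < rank; ++j) {          // map output index -> input index
            i += rem / cached[j].o * cached[j].i;
            rem %= cached[j].o;
        }
        dst[tid] = src[i];
    }
}

// Launch: the third launch-config argument is the dynamic shared-memory byte count.
// copyExpanded<<<grid, block, rank * sizeof(Stride), stream>>>(n, strides, rank, src, dst);
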
4 changes: 2 additions & 2 deletions src/04kernel/src/kernels/expand/cuda_kernel.cc
@@ -4,14 +4,14 @@ namespace refactor::kernel {
     using K = ExpandCuda;
 
     K::ExpandCuda(ExpandInfo info_) noexcept
-        : Kernel(), info(std::move(info_)) {}
+        : Kernel(), info(info_.reform(16)) {}
 
     auto K::build(ExpandInfo info) noexcept -> KernelBox {
 #ifndef USE_CUDA
         return nullptr;
 #endif
 
-        return std::make_unique<K>(info.reform(16));
+        return std::make_unique<K>(std::move(info));
     }
     auto K::typeId() noexcept -> size_t {
         static uint8_t ID = 1;
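
Note: reform(16) moves from build() into the constructor, so the coarsened ExpandInfo is applied no matter how the kernel object is created; this matters because the MatMulCublas change below constructs ExpandCuda directly when lowering its bias expansion. A generic sketch of the pattern, with illustrative stand-in names (Info, KernelX) and an assumed reform() that normalizes the descriptor:

// Sketch: normalize in the constructor so every construction path agrees.
#include <memory>
#include <utility>

struct Info {
    // Stand-in: in the real project, reform(16) is assumed to coarsen the copy
    // description (e.g. merging contiguous dimensions into up-to-16-byte units).
    Info reform(int /*unit*/) const { return *this; }
};

struct KernelX {
    Info info;
    explicit KernelX(Info i) noexcept : info(i.reform(16)) {}  // normalization lives here...

    static std::unique_ptr<KernelX> build(Info i) {
        return std::make_unique<KernelX>(std::move(i));        // ...so build() just forwards
    }
};
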
46 changes: 43 additions & 3 deletions src/04kernel/src/kernels/mat_mul/cublas_kernel.cc
@@ -4,8 +4,8 @@ namespace refactor::kernel {
     using K = MatMulCublas;
     using DT = DataType;
 
-    K::MatMulCublas(decltype(info) info_) noexcept
-        : Kernel(), info(std::move(info_)) {}
+    K::MatMulCublas(decltype(info) info_, decltype(biasExpand) biasExpand_) noexcept
+        : Kernel(), info(std::move(info_)), biasExpand(std::move(biasExpand_)) {}
 
     auto K::build(Tensor const &a, Tensor const &b, Tensor const &y, MatMulInfo info) noexcept -> KernelBox {
         static const std::unordered_set<decltype(DT::internal)> TYPE{DT::F32, DT::F64, DT::FP16};
@@ -20,7 +20,47 @@ namespace refactor::kernel {
             return nullptr;
         }
 
-        return std::make_unique<K>(std::move(info));
+        dim_t inputs[2];
+        switch (info.biasType) {
+            case BiasType::NoBias:
+                return std::make_unique<K>(std::move(info), std::nullopt);
+            case BiasType::Scalar:
+                inputs[0] = 1;
+                inputs[1] = 1;
+                break;
+            case BiasType::RowVector:
+                inputs[0] = 1;
+                inputs[1] = info.n;
+                break;
+            case BiasType::ColVector:
+                inputs[0] = info.m;
+                inputs[1] = 1;
+                break;
+            case BiasType::Matrix:
+                inputs[0] = info.m;
+                inputs[1] = info.n;
+                break;
+            default:
+                break;
+        }
+
+        std::vector<dim_t> outputShape(std::max(a.rank(), b.rank()));
+        for (auto i : range0_(outputShape.size() - 2)) {
+            auto a_ = i < a.rank() ? a.shape[i] : 1;
+            auto b_ = i < b.rank() ? b.shape[i] : 1;
+            outputShape[i] = std::max(a_, b_);
+        }
+        outputShape.rbegin()[1] = info.m;
+        outputShape.rbegin()[0] = info.n;
+
+        fmt::println("inputs {} outputs {}", vec2str(slice(inputs, 2)), vec2str(slice(outputShape.data(), outputShape.size())));
+
+        return std::make_unique<K>(
+            std::move(info),
+            std::make_optional(ExpandInfo(
+                dataType,
+                slice(inputs, 2),
+                slice(outputShape.data(), outputShape.size()))));
     }
 
     auto K::typeId() noexcept -> size_t {
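
Note: build() now derives a bias-expansion plan alongside the MatMulInfo. The bias is described as a 2-D input shape ({1, 1} for a scalar, {1, n} for a row vector, {m, 1} for a column vector, {m, n} for a full matrix) and paired with the broadcast output shape (batch dims taken from a and b, then m, n), so the generic expand kernel can broadcast the bias into Y. For example, a row-vector bias with m = 4, n = 3 and a batched A of shape {2, 4, k} yields an expand from {1, 3} to {2, 4, 3}. A standalone sketch of the shape computation, using simplified stand-in types rather than the project's MatMulInfo:

// Sketch: compute (bias shape, output shape) for broadcasting a GEMM bias.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

enum class BiasType { NoBias, Scalar, RowVector, ColVector, Matrix };

static std::pair<std::vector<uint32_t>, std::vector<uint32_t>>
biasExpandShapes(BiasType bias, uint32_t m, uint32_t n,
                 std::vector<uint32_t> const &aShape,
                 std::vector<uint32_t> const &bShape) {
    std::vector<uint32_t> in{1, 1};
    switch (bias) {
        case BiasType::RowVector: in = {1, n}; break;
        case BiasType::ColVector: in = {m, 1}; break;
        case BiasType::Matrix:    in = {m, n}; break;
        default: break;  // Scalar keeps {1, 1}; NoBias is handled by the caller
    }
    // Output shape: broadcast the batch dims of A and B, then append m x n.
    std::vector<uint32_t> out(std::max(aShape.size(), bShape.size()));
    for (std::size_t i = 0; i + 2 < out.size(); ++i) {
        uint32_t da = i < aShape.size() ? aShape[i] : 1;
        uint32_t db = i < bShape.size() ? bShape[i] : 1;
        out[i] = std::max(da, db);
    }
    out[out.size() - 2] = m;
    out[out.size() - 1] = n;
    return {in, out};
}
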
50 changes: 29 additions & 21 deletions src/04kernel/src/kernels/mat_mul/cublas_kernel.cu
@@ -1,4 +1,5 @@
 #include "../../utilities/cuda/cublas_context.hh"
+#include "../expand/cuda_kernel.hh"
 #include "cublas_kernel.hh"
 #include <cublas_v2.h>
 #include <thrust/execution_policy.h>
@@ -29,7 +30,10 @@ namespace refactor::kernel {
     };
 
     template<class T>
-    Routine lowerTyped(cudaDataType_t cudaDataType, MatMulInfo info, size_t strideC0, size_t strideC1) noexcept {
+    Routine lowerTyped(cudaDataType_t cudaDataType, MatMulInfo info, Resources &res, std::optional<ExpandInfo> biasExpand, size_t strideC0, size_t strideC1) noexcept {
+        auto biasEx = biasExpand
+                          ? std::make_optional(ExpandCuda(*biasExpand).lower(res))
+                          : std::nullopt;
         return [cudaDataType,
                 alpha = static_cast<T>(info.alpha),
                 beta = static_cast<T>(info.biasType != BiasType::NoBias ? info.beta : 0.0f),
@@ -42,29 +46,33 @@ namespace refactor::kernel {
                 strideC0, strideC1,
                 lda = info.transA ? info.m : info.k,
                 ldb = info.transB ? info.k : info.n,
+                biasEx,
                 broadcaster = info.broadcaster](Resources &res, void const **inputs, void **outputs) {
             auto a = reinterpret_cast<T const *>(inputs[0]);
             auto b = reinterpret_cast<T const *>(inputs[1]);
             auto y = reinterpret_cast<T *>(outputs[0]);
 
             if (beta != (T) 0) {
-                // Expand bias to 2D and store in final output Y
-                {
-                    auto c = reinterpret_cast<T const *>(inputs[2]);
-                    thrust::tabulate(
-                        thrust::device,
-                        y,
-                        y + strideY,
-                        MatMulBroadcastBiasFunctor<T>{c, n, strideC0, strideC1});
-                }
-                // Copy 2D bias to each batch
-                if (broadcaster.outputsCount > 1) {
-                    thrust::tabulate(
-                        thrust::device,
-                        y + strideY,
-                        y + strideY * broadcaster.outputsCount,
-                        MatMulCopyBiasFunctor<T>{y, strideY});
-                }
+                // // Expand bias to 2D and store in final output Y
+                // {
+                //     auto c = reinterpret_cast<T const *>(inputs[2]);
+                //     thrust::tabulate(
+                //         thrust::device,
+                //         y,
+                //         y + strideY,
+                //         MatMulBroadcastBiasFunctor<T>{c, n, strideC0, strideC1});
+                // }
+                // // Copy 2D bias to each batch
+                // if (broadcaster.outputsCount > 1) {
+                //     thrust::tabulate(
+                //         thrust::device,
+                //         y + strideY,
+                //         y + strideY * broadcaster.outputsCount,
+                //         MatMulCopyBiasFunctor<T>{y, strideY});
+                // }
+                void const *inputs_[]{inputs[2]};
+                void *outputs_[]{outputs[0]};
+                (*biasEx)(res, inputs_, outputs_);
             }
 
             auto handle = res.fetchOrStore<CublasContext>()->handle;
@@ -102,11 +110,11 @@ namespace refactor::kernel {
         res.fetchOrStore<CublasContext>();
         switch (info.dataType) {
             case DataType::F32:
-                return lowerTyped<float>(CUDA_R_32F, info, strideC0, strideC1);
+                return lowerTyped<float>(CUDA_R_32F, info, res, biasExpand, strideC0, strideC1);
             case DataType::F64:
-                return lowerTyped<double>(CUDA_R_64F, info, strideC0, strideC1);
+                return lowerTyped<double>(CUDA_R_64F, info, res, biasExpand, strideC0, strideC1);
             case DataType::FP16:
-                return lowerTyped<half>(CUDA_R_16F, info, strideC0, strideC1);
+                return lowerTyped<half>(CUDA_R_16F, info, res, biasExpand, strideC0, strideC1);
             default:
                 UNREACHABLE();
         }
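
Note: the thrust::tabulate bias-broadcast path is commented out and replaced by the pre-lowered expand routine: when beta is non-zero, the bias tensor (inputs[2]) is expanded straight into Y, and the strided-batched GEMM then accumulates on top of it with that beta. Dereferencing *biasEx in that branch is safe because beta is forced to 0 when biasType is NoBias, and build() only omits the ExpandInfo in that case. A small sketch of the composition, with Routine, expandBias, and gemmStridedBatched as illustrative stand-ins for the lowered routines:

// Sketch: run "expand bias into Y" before "GEMM accumulating into Y".
#include <functional>
#include <optional>
#include <utility>

using Routine = std::function<void(void const **inputs, void **outputs)>;

Routine makeMatMulRoutine(std::optional<Routine> expandBias, float beta,
                          Routine gemmStridedBatched) {
    return [expandBias = std::move(expandBias), beta,
            gemm = std::move(gemmStridedBatched)](void const **inputs, void **outputs) {
        if (beta != 0 && expandBias) {
            // Broadcast the bias (inputs[2]) into the output buffer first ...
            void const *in[]{inputs[2]};
            void *out[]{outputs[0]};
            (*expandBias)(in, out);
        }
        // ... then Y = alpha * A x B + beta * Y accumulates on the expanded bias.
        gemm(inputs, outputs);
    };
}
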
5 changes: 4 additions & 1 deletion src/04kernel/src/kernels/mat_mul/cublas_kernel.hh
@@ -1,16 +1,19 @@
 #ifndef KERNEL_MATMUL_CUBLAS_KERNEL_HH
 #define KERNEL_MATMUL_CUBLAS_KERNEL_HH
 
+#include "kernel/attributes/expand_info.h"
 #include "kernel/attributes/matmul_info.h"
 #include "kernel/kernel.h"
 #include "kernel/tensor.h"
+#include <optional>
 
 namespace refactor::kernel {
 
     struct MatMulCublas final : public Kernel {
         MatMulInfo info;
+        std::optional<ExpandInfo> biasExpand;
 
-        explicit MatMulCublas(MatMulInfo) noexcept;
+        explicit MatMulCublas(MatMulInfo, std::optional<ExpandInfo>) noexcept;
 
         static KernelBox build(Tensor const &, Tensor const &, Tensor const &, MatMulInfo) noexcept;
         static size_t typeId() noexcept;
4 changes: 2 additions & 2 deletions src/04kernel/src/kernels/slice/cuda_kernel.cc
@@ -4,14 +4,14 @@ namespace refactor::kernel {
     using K = SliceCuda;
 
     K::SliceCuda(SliceInfo info_) noexcept
-        : Kernel(), info(std::move(info_)) {}
+        : Kernel(), info(info_.reform(16)) {}
 
     auto K::build(SliceInfo info) noexcept -> KernelBox {
 #ifndef USE_CUDA
         return nullptr;
 #endif
 
-        return std::make_unique<K>(info.reform(16));
+        return std::make_unique<K>(std::move(info));
     }
     auto K::typeId() noexcept -> size_t {
         static uint8_t ID = 1;
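
Note: the SliceCuda change mirrors the ExpandCuda one above: reform(16) now runs in the constructor rather than in build(), so every construction path yields the reformed SliceInfo.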
