Skip to content

Commit

Permalink
test(kernel): 添加 attention 单测
Browse files Browse the repository at this point in the history
Signed-off-by: YdrMaster <[email protected]>
  • Loading branch information
YdrMaster committed Feb 1, 2024
1 parent fddac13 commit d2d43a3
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 4 deletions.
8 changes: 5 additions & 3 deletions src/04kernel/src/kernels/attention/cuda_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ namespace refactor::kernel {
auto att = reinterpret_cast<half *>(workspace);
auto workspaceQK = reinterpret_cast<uint8_t *>(workspace) + hardware::alignBytes(info.attSize(0), 256);
auto workspaceAV = workspaceQK + hardware::alignBytes(d->workspaceSizeQK, 256);
auto stream = cudaStreamLegacy;
{
half alpha = rsqrtf(info.headDim), beta = 0;
cublasLtMatmul(
Expand All @@ -152,13 +153,14 @@ namespace refactor::kernel {
att, d->att.get(),
&d->algoQK,
workspaceQK, d->workspaceSizeQK,
cudaStreamLegacy);
stream);
}
auto attLen = info.attLen(0);
auto bufLen = attLen;
softmax<<<dim3(info.batch * info.nHead, info.seqLen),
std::min(1024u, attLen),
attLen * sizeof(float)>>>(
attLen * sizeof(float),
stream>>>(
att, causualMask, attLen, bufLen);
{
half alpha = 1, beta = 0;
Expand All @@ -172,7 +174,7 @@ namespace refactor::kernel {
o, d->q.get(),
&d->algoAV,
workspaceAV, d->workspaceSizeAV,
cudaStreamLegacy);
stream);
};
};
Expand Down
2 changes: 1 addition & 1 deletion src/04kernel/src/utilities/cuda/cublaslt_utils.cu
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ namespace refactor::kernel::cublas {
CUDA_ASSERT(cudaGetDeviceProperties(&prop, device));

auto workspace = std::numeric_limits<uint64_t>::max();
auto alignment = prop.textureAlignment;
uint32_t alignment = prop.textureAlignment;

cublasLtMatmulPreference_t preference;
CUBLASLT_ASSERT(cublasLtMatmulPreferenceCreate(&preference));
Expand Down
48 changes: 48 additions & 0 deletions src/04kernel/test/kernels/attention/test_cuda.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#ifdef USE_CUDA

#include "../../../src/kernels/attention/cuda_kernel.hh"
#include "hardware/device_manager.h"
#include <gtest/gtest.h>
#include <numeric>

using namespace refactor;
using namespace kernel;
using namespace hardware;

TEST(kernel, AttentionCudaNoKvCache) {
    // Smoke test: build + lower the CUDA attention kernel for the
    // no-kv-cache configuration and launch it once on uninitialized
    // device buffers. No numerical result is checked; the test only
    // verifies that the kernel builds, lowers, and executes.
    AttentionInfo info{
        .dataType = DataType::FP16,
        .batch = 1,
        .nHead = 4,
        .nKVHead = 4,
        .seqLen = 31,
        .headDim = 256,
        .cacheLen = 0,
        .concatCache = false,
        .resetCache = false,
    };
    // Host-side tensor descriptors for q/k/v; the output shares q's
    // shape descriptor (same batch/heads/seq/dim layout).
    auto query = Tensor::share(DataType::FP16, Shape{info.batch, info.nHead, info.seqLen, info.headDim});
    auto key = Tensor::share(DataType::FP16, Shape{info.batch, info.nKVHead, info.seqLen, info.headDim});
    auto value = Tensor::share(DataType::FP16, Shape{info.batch, info.nKVHead, info.seqLen, info.headDim});
    auto output = query;
    // Build the kernel and lower it to an executable routine.
    auto kernel = AttentionCuda::build(info);
    ASSERT_TRUE(kernel);
    auto res = runtime::Resources();
    auto [routine, workspaceSize] = kernel->lower(res);
    // Allocate device memory for inputs, output, and scratch workspace.
    auto &dev = *device::init(Device::Type::Nvidia, 0, "");
    auto devQ = dev.malloc(query->bytesSize());
    auto devK = dev.malloc(key->bytesSize());
    auto devV = dev.malloc(value->bytesSize());
    auto devO = dev.malloc(output->bytesSize());
    auto devWorkspace = dev.malloc(workspaceSize);
    // Launch once; completing without error is the pass condition.
    void const *inputs[]{*devQ, *devK, *devV};
    void *outputs[]{*devO};
    routine(res, *devWorkspace, inputs, outputs);
}

#endif

0 comments on commit d2d43a3

Please sign in to comment.