Skip to content

Commit

Permalink
test(kernel): 添加 attention 单测
Browse files Browse the repository at this point in the history
Signed-off-by: YdrMaster <[email protected]>
  • Loading branch information
YdrMaster committed Feb 1, 2024
1 parent fddac13 commit d2d43a3
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 4 deletions.
8 changes: 5 additions & 3 deletions src/04kernel/src/kernels/attention/cuda_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ namespace refactor::kernel {
auto att = reinterpret_cast<half *>(workspace);
auto workspaceQK = reinterpret_cast<uint8_t *>(workspace) + hardware::alignBytes(info.attSize(0), 256);
auto workspaceAV = workspaceQK + hardware::alignBytes(d->workspaceSizeQK, 256);
auto stream = cudaStreamLegacy;
{
half alpha = rsqrtf(info.headDim), beta = 0;
cublasLtMatmul(
Expand All @@ -152,13 +153,14 @@ namespace refactor::kernel {
att, d->att.get(),
&d->algoQK,
workspaceQK, d->workspaceSizeQK,
cudaStreamLegacy);
stream);
}
auto attLen = info.attLen(0);
auto bufLen = attLen;
softmax<<<dim3(info.batch * info.nHead, info.seqLen),
std::min(1024u, attLen),
attLen * sizeof(float)>>>(
attLen * sizeof(float),
stream>>>(
att, causualMask, attLen, bufLen);
{
half alpha = 1, beta = 0;
Expand All @@ -172,7 +174,7 @@ namespace refactor::kernel {
o, d->q.get(),
&d->algoAV,
workspaceAV, d->workspaceSizeAV,
cudaStreamLegacy);
stream);
};
};
Expand Down
2 changes: 1 addition & 1 deletion src/04kernel/src/utilities/cuda/cublaslt_utils.cu
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ namespace refactor::kernel::cublas {
CUDA_ASSERT(cudaGetDeviceProperties(&prop, device));

auto workspace = std::numeric_limits<uint64_t>::max();
auto alignment = prop.textureAlignment;
uint32_t alignment = prop.textureAlignment;

cublasLtMatmulPreference_t preference;
CUBLASLT_ASSERT(cublasLtMatmulPreferenceCreate(&preference));
Expand Down
48 changes: 48 additions & 0 deletions src/04kernel/test/kernels/attention/test_cuda.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#ifdef USE_CUDA

#include "../../../src/kernels/attention/cuda_kernel.hh"
#include "hardware/device_manager.h"
#include <gtest/gtest.h>
#include <numeric>

using namespace refactor;
using namespace kernel;
using namespace hardware;

TEST(kernel, AttentionCudaNoKvCache) {
    // Smoke test: build + lower the CUDA attention kernel for the
    // no-kv-cache configuration and launch it once on uninitialized
    // device buffers. No numerical result is checked; the test only
    // verifies that the kernel builds, lowers, and executes.
    AttentionInfo info{
        .dataType = DataType::FP16,
        .batch = 1,
        .nHead = 4,
        .nKVHead = 4,
        .seqLen = 31,
        .headDim = 256,
        .cacheLen = 0,
        .concatCache = false,
        .resetCache = false,
    };
    // Host-side tensor descriptors for q/k/v; the output shares q's
    // shape descriptor (same batch/heads/seq/dim layout).
    auto query = Tensor::share(DataType::FP16, Shape{info.batch, info.nHead, info.seqLen, info.headDim});
    auto key = Tensor::share(DataType::FP16, Shape{info.batch, info.nKVHead, info.seqLen, info.headDim});
    auto value = Tensor::share(DataType::FP16, Shape{info.batch, info.nKVHead, info.seqLen, info.headDim});
    auto output = query;
    // Build the kernel and lower it to an executable routine.
    auto kernel = AttentionCuda::build(info);
    ASSERT_TRUE(kernel);
    auto res = runtime::Resources();
    auto [routine, workspaceSize] = kernel->lower(res);
    // Allocate device memory for inputs, output, and scratch workspace.
    auto &dev = *device::init(Device::Type::Nvidia, 0, "");
    auto devQ = dev.malloc(query->bytesSize());
    auto devK = dev.malloc(key->bytesSize());
    auto devV = dev.malloc(value->bytesSize());
    auto devO = dev.malloc(output->bytesSize());
    auto devWorkspace = dev.malloc(workspaceSize);
    // Launch once; completing without error is the pass condition.
    void const *inputs[]{*devQ, *devK, *devV};
    void *outputs[]{*devO};
    routine(res, *devWorkspace, inputs, outputs);
}

#endif

0 comments on commit d2d43a3

Please sign in to comment.