From 8c6922b011e06f120ace5d1c4025d4bcb8781f06 Mon Sep 17 00:00:00 2001 From: DefTruth <31974251+DefTruth@users.noreply.github.com> Date: Thu, 17 Oct 2024 10:07:16 +0800 Subject: [PATCH] [SGEMM] Update SGEMM TF32 Benchmark (#87) * Update README.md * Update hgemm_wmma_stage.cu * Update README.md * Update README.md * Update sgemm.py --- hgemm/hgemm_wmma_stage.cu | 7 +- sgemm/README.md | 815 ++++++++++++++++++++++---------------- sgemm/sgemm.py | 8 +- 3 files changed, 491 insertions(+), 339 deletions(-) diff --git a/hgemm/hgemm_wmma_stage.cu b/hgemm/hgemm_wmma_stage.cu index ff8b4648..381e0d9d 100644 --- a/hgemm/hgemm_wmma_stage.cu +++ b/hgemm/hgemm_wmma_stage.cu @@ -31,7 +31,6 @@ using namespace nvcuda; // Support A and B matrix with row-major inorder to compare with the kernels using CUDA Cores in // hgemm.cu and hgemm_async.cu. - HOST_DEVICE_INLINE int div_ceil(int a, int b) { return (a % b != 0) ? (a / b + 1) : (a / b); } @@ -41,7 +40,7 @@ int div_ceil(int a, int b) { return (a % b != 0) ? (a / b + 1) : (a / b); } // 共享内存,调用kernel时 需要指定动态共享内存大小,且smem的寻址 // 方式需要按照一维数组来使用 2. 提高L2 Cache的局部性(Thread // Block Swizzle): https://zhuanlan.zhihu.com/p/555339335 -// 3. nedd __launch_bounds__ to avoid error 'too many resources required for launch' +// 3. __launch_bounds__: avoid error 'too many resources required for launch' // reference: https://blog.csdn.net/feng__shuai/article/details/124395023 template