-
Notifications
You must be signed in to change notification settings - Fork 1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[QST] Example 06_splitK_gemm
error in A100.
#1182
Comments
could you please show me your code? |
Sure, first, I created two files: #pragma once
#include <iostream>
#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm_splitk_parallel.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/device/gemm.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/tensor_view_io.h"
#include "helper.h"
int test_cutlass_gemm_split_K();
#include "gemm/cutlass_split_K.cuh"
// The code section below describes datatype for input, output matrices and
// computation between elements in input matrices.
using ElementAccumulator = float; // <- data type of accumulator
using ElementComputeEpilogue =
ElementAccumulator; // <- data type of epilogue operations
using ElementInputA =
cutlass::half_t; // <- data type of elements in input matrix A
using ElementInputB =
cutlass::half_t; // <- data type of elements in input matrix B
using ElementOutput = float; // <- data type of elements in output matrix D
// The code section below describes matrix layout of input and output matrices.
// Column Major for Matrix A, Row Major for Matrix B and Row Major for Matrix C
using LayoutInputA = cutlass::layout::ColumnMajor;
using LayoutInputB = cutlass::layout::RowMajor;
using LayoutOutput = cutlass::layout::RowMajor;
// This code section describes whether you want to use tensor cores or regular
// SIMT cores on GPU SM
using MMAOp = cutlass::arch::OpClassTensorOp;
// This code section describes CUDA SM architecture number
using SmArch = cutlass::arch::Sm80;
// This code section describes the tile size a thread block will compute
using ShapeMMAThreadBlock =
cutlass::gemm::GemmShape<128, 128, 64>; // <- threadblock tile M = 128, N =
// 128, K = 64
// This code section describes tile size a warp will compute
using ShapeMMAWarp =
cutlass::gemm::GemmShape<64, 64,
64>; // <- warp tile M = 64, N = 64, K = 64
// This code section describes the size of MMA op
using ShapeMMAOp =
cutlass::gemm::GemmShape<16, 8, 16>; // <- MMA Op tile M = 16, N = 8, K = 16
// (the Sm80 f16 tensor-op instruction shape)
// This code section describes the epilogue: D = alpha * accumulator + beta * C
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
ElementOutput, // <- data type of output matrix
128 / cutlass::sizeof_bits<
ElementOutput>::value, // <- This is the number of elements per
// vectorized memory access. For the float
// output used here it's 4 elements (128
// bits). This becomes the vector width of
// math instructions in the epilogue too
ElementAccumulator, // <- data type of accumulator
ElementComputeEpilogue>; // <- data type for alpha/beta in linear
// combination function
// Put all the created template variables to create GemmSplitKParallel template
// variable
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
ElementInputA, LayoutInputA, ElementInputB, LayoutInputB, ElementOutput,
LayoutOutput, ElementAccumulator, MMAOp, SmArch, ShapeMMAThreadBlock,
ShapeMMAWarp, ShapeMMAOp, EpilogueOp>;
int test_cutlass_gemm_split_K() {
cudaDeviceProp props;
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (error != cudaSuccess) {
std::cerr << "cudaGetDeviceProperties() returned an error: "
<< cudaGetErrorString(error) << std::endl;
return -1;
}
// if (props.major != 7) {
// std::cerr << "Volta Tensor Ops must be run on a machine with compute
// "
// "capability of 70, 72, or 75."
// << std::endl;
// // Return 0 so tests pass if run on unsupported architectures or CUDA
// // Toolkits.
// return 0;
// }
//
// Define problem size
//
const int length_m = 5120;
const int length_n = 4096;
const int length_k = 4096;
// Create a tuple of problem size for matrix multiplication
cutlass::gemm::GemmCoord problem_size(length_m, length_n, length_k);
// Initialize tensors using CUTLASS helper functions
cutlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(
problem_size.mk()); // <- Create matrix A with dimensions M x K
cutlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(
problem_size.kn()); // <- Create matrix B with dimensions K x N
cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(
problem_size.mn()); // <- Create matrix C with dimensions M x N
cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_d(
problem_size.mn()); // <- Create matrix D with dimensions M x N used to
// store output from CUTLASS kernel
cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_d(
problem_size.mn()); // <- Create matrix D with dimensions M x N used to
// store output from reference kernel
// Fill input and output matrices on host using CUTLASS helper functions
cutlass::reference::host::TensorFillRandomUniform(
tensor_a.host_view(), 1, ElementInputA(4), ElementInputA(-4),
0); // <- Fill matrix A on host with uniform-distribution random data
cutlass::reference::host::TensorFillRandomUniform(
tensor_b.host_view(), 1, ElementInputB(4), ElementInputB(-4),
0); // <- Fill matrix B on host with uniform-distribution random data
cutlass::reference::host::TensorFillRandomUniform(
tensor_c.host_view(), 1, ElementOutput(4), ElementOutput(-4),
0); // <- Fill matrix C on host with uniform-distribution random data
cutlass::reference::host::TensorFill(
tensor_d.host_view()); // <- fill matrix D on host with zeros
cutlass::reference::host::TensorFill(
tensor_ref_d
.host_view()); // <- fill matrix D for reference on host with zeros
// Copy data from host to GPU
tensor_a.sync_device();
tensor_b.sync_device();
tensor_c.sync_device();
tensor_d.sync_device();
tensor_ref_d.sync_device();
// Initialize alpha and beta for dot product computation
ElementComputeEpilogue alpha = ElementComputeEpilogue(1);
ElementComputeEpilogue beta = ElementComputeEpilogue(0);
// Split K dimension into 16 partitions
int split_k_slices = 16;
// Create a tuple of gemm kernel arguments. This is later passed as
// arguments to launch instantiated CUTLASS kernel
typename Gemm::Arguments arguments{
problem_size, // <- problem size of matrix multiplication
tensor_a.device_ref(), // <- reference to matrix A on device
tensor_b.device_ref(), // <- reference to matrix B on device
tensor_c.device_ref(), // <- reference to matrix C on device
tensor_d.device_ref(), // <- reference to matrix D on device
{alpha, beta}, // <- tuple of alpha and beta
split_k_slices}; // <- k-dimension split factor
// Using the arguments, query for extra workspace required for matrix
// multiplication computation
size_t workspace_size = Gemm::get_workspace_size(arguments);
// Allocate workspace memory
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
// Instantiate CUTLASS kernel depending on templates
Gemm gemm_op;
// Initialize CUTLASS kernel with arguments and workspace pointer
cutlass::Status status = gemm_op.initialize(arguments, workspace.get());
CUTLASS_CHECK(status);
// Launch initialized CUTLASS kernel
status = gemm_op();
CUTLASS_CHECK(status);
// Create instantiation for device reference gemm kernel
cutlass::reference::device::Gemm<
ElementInputA, LayoutInputA, ElementInputB, LayoutInputB, ElementOutput,
LayoutOutput, ElementComputeEpilogue, ElementComputeEpilogue>
gemm_device;
// Launch device reference gemm kernel
gemm_device(problem_size, alpha, tensor_a.device_ref(),
tensor_b.device_ref(), beta, tensor_c.device_ref(),
tensor_ref_d.device_ref());
// Wait for kernels to finish
cudaDeviceSynchronize();
// Copy output data from CUTLASS and reference kernel to host for comparison
tensor_d.sync_host();
tensor_ref_d.sync_host();
// Check if output from CUTLASS kernel and reference kernel are equal or not
bool passed = cutlass::reference::host::TensorEquals(
tensor_d.host_view(), tensor_ref_d.host_view());
std::cout << (passed ? "Passed" : "Failed") << std::endl;
return (passed ? 0 : -1);
}
Then, I called the test function in main.cu: #include <cuda.h>
#include <cuda_runtime.h>

#include <iostream>

#include <gemm/kernels.h>
// Entry point: run the split-K GEMM test and report its outcome on stdout.
// Exit code 0 on success, -1 on failure.
int main() {
  const int rc = test_cutlass_gemm_split_K();
  if (rc == 0) {
    std::cout << "test_cutlass_gemm_split_K passed" << std::endl;
    return 0;
  }
  std::cout << "test_cutlass_gemm_split_K failed" << std::endl;
  return -1;
}
Run:
I cloned cutlass into the 3rd-party directory and compiled it following the instructions in |
i think you need to specify sm80 in your |
Thank you very much for your response. I added |
What is your question?
I am a newbie to CUTLASS, and I'm trying to run the example
examples/06_splitK_gemm
on an NVIDIA A100. I made modifications based on issue #1141, including changing SmArch to cutlass::arch::Sm80 and modifying variables like ShapeMMAThreadBlock, ShapeMMAWarp, and ShapeMMAOp according to the file https://github.com/NVIDIA/cutlass/blob/main/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu. However, I encountered the following runtime error. The screenshot of the runtime output is as follows:
I would like to ask how to modify it in order to run this example on A100?
The text was updated successfully, but these errors were encountered: