diff --git a/gemm/.clangd b/gemm/.clangd
new file mode 100644
index 0000000..6c495d4
--- /dev/null
+++ b/gemm/.clangd
@@ -0,0 +1,20 @@
+---
+CompileFlags:
+  Add:
+    - -std=c++17
+    - --no-cuda-version-check
+    - -Iexternal/com_github_nvidia_cutlass/include
+
+---
+If:
+  PathMatch: .*\.cuh?
+CompileFlags:
+  Add:
+    - -xcu
+
+---
+If:
+  PathMatch: .*/include/(cutlass|cute)/.*
+CompileFlags:
+  Add:
+    - -xcu
diff --git a/gemm/cpu/matmul.hpp b/gemm/cpu/matmul.hpp
index dc41ba3..e5ba72f 100644
--- a/gemm/cpu/matmul.hpp
+++ b/gemm/cpu/matmul.hpp
@@ -20,7 +20,7 @@ void make_string_impl(std::ostringstream& oss, const T& head, Ts... tail) {
 template
 std::string make_string(Ts... args) {
     std::ostringstream oss;
-    detail::make_string_impl(oss, args...);
+    ::detail::make_string_impl(oss, args...);
     return oss.str();
 }
 
diff --git a/gemm/cuda_cute/benchmark_driver.py b/gemm/cuda_cute/benchmark_driver.py
index 4faed45..896dafb 100644
--- a/gemm/cuda_cute/benchmark_driver.py
+++ b/gemm/cuda_cute/benchmark_driver.py
@@ -21,6 +21,8 @@
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
+
+import matmul
 from matmul import *
 
 
@@ -54,7 +56,7 @@ def __str__(self):
 
 
 def run_benchmark(f, size, *, min_iterations=20, max_iterations=100) -> stat:
-    benchmark_seconds= (0.100 / (4096 ** 3)) * size ** 3
+    benchmark_seconds = (0.100 / (4096**3)) * size**3
     a = np.zeros((size, size), dtype=np.float32)
     b = np.zeros((size, size), dtype=np.float32)
     c = np.zeros((size, size), dtype=np.float32)
@@ -141,3 +143,31 @@ def new_benchmark_plot(*dataframes, **kwargs):
     chart = chart.configure_axis(labelFontSize=12, titleFontSize=14)
     chart = chart.configure_legend(labelLimit=0)
     return chart
+
+
+if __name__ == "__main__":
+    # Command-line entry point: benchmark_driver.py [test_name] [size]
+    # Candidates are the attributes of `matmul` named matmul_* or launch_*.
+    prefixes = ("matmul_", "launch_")
+    available = [name for name in dir(matmul) if name.startswith(prefixes)]
+
+    print("Available tests:")
+    for name in available:
+        print(" ", name)
+
+    if len(sys.argv) > 1:
+        selected = sys.argv[1]
+    elif available:
+        selected = available[0]
+    else:
+        # Nothing to default to: exit with a message, not an IndexError.
+        sys.exit("No matmul_* or launch_* functions found in module 'matmul'")
+
+    f = getattr(matmul, selected, None)
+    if f is None:
+        # Unknown name: report it instead of raising AttributeError.
+        sys.exit(f"Unknown test {selected!r}; pick one of the tests listed above")
+
+    size = int(sys.argv[2]) if len(sys.argv) > 2 else 1024
+    print(f"Running {selected} for [{size}x{size}] * [{size}x{size}] ...")
+    print(run_benchmark(f, size))