From e300d6979e92dee5254af7d6f1d84bc942540de2 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 11 Jan 2024 21:17:19 +0100 Subject: [PATCH] Use 20 repetitions with a warmup for CUDA viewcopy --- examples/cuda/viewcopy/viewcopy.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/cuda/viewcopy/viewcopy.cu b/examples/cuda/viewcopy/viewcopy.cu index a36095d309..3980c58c4a 100644 --- a/examples/cuda/viewcopy/viewcopy.cu +++ b/examples/cuda/viewcopy/viewcopy.cu @@ -13,7 +13,7 @@ // No specialized layout aware copy routine is implemented. using Size = std::size_t; -constexpr auto repetitions = 5; +constexpr auto repetitions = 20; // excluding 1 warmup run constexpr auto extents = llama::ArrayExtentsDynamic{200u, 512u, 512u}; // z, y, x constexpr auto threadsPerBlock = Size{256}; @@ -246,6 +246,7 @@ try std::byte* dst = nullptr; cudaMalloc(&src, dataSize); cudaMalloc(&dst, dataSize); + cudaMemcpyAsync(dst, src, dataSize, cudaMemcpyDeviceToDevice); // warmup cudaEventRecord(startEvent); for(auto i = 0; i < repetitions; i++) cudaMemcpyAsync(dst, src, dataSize, cudaMemcpyDeviceToDevice); @@ -272,6 +273,7 @@ try auto benchmarkCopy = [&, srcView = &srcView, srcHash = srcHash](std::string_view algName, auto copy) { auto dstView = llama::allocViewUninitialized(dstMapping, llama::bloballoc::CudaMalloc{}); + copy(*srcView, dstView); // warmup cudaEventRecord(startEvent); for(auto i = 0; i < repetitions; i++) copy(*srcView, dstView);