From e300d6979e92dee5254af7d6f1d84bc942540de2 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Thu, 11 Jan 2024 21:17:19 +0100
Subject: [PATCH] Use 20 repetitions with a warmup for CUDA viewcopy

---
 examples/cuda/viewcopy/viewcopy.cu | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/cuda/viewcopy/viewcopy.cu b/examples/cuda/viewcopy/viewcopy.cu
index a36095d309..3980c58c4a 100644
--- a/examples/cuda/viewcopy/viewcopy.cu
+++ b/examples/cuda/viewcopy/viewcopy.cu
@@ -13,7 +13,7 @@
 // No specialized layout aware copy routine is implemented.
 
 using Size = std::size_t;
-constexpr auto repetitions = 5;
+constexpr auto repetitions = 20; // excluding 1 warmup run
 constexpr auto extents = llama::ArrayExtentsDynamic<Size, 3>{200u, 512u, 512u}; // z, y, x
 constexpr auto threadsPerBlock = Size{256};
 
@@ -246,6 +246,7 @@ try
         std::byte* dst = nullptr;
         cudaMalloc(&src, dataSize);
         cudaMalloc(&dst, dataSize);
+        cudaMemcpyAsync(dst, src, dataSize, cudaMemcpyDeviceToDevice); // warmup
         cudaEventRecord(startEvent);
         for(auto i = 0; i < repetitions; i++)
             cudaMemcpyAsync(dst, src, dataSize, cudaMemcpyDeviceToDevice);
@@ -272,6 +273,7 @@ try
         auto benchmarkCopy = [&, srcView = &srcView, srcHash = srcHash](std::string_view algName, auto copy)
         {
             auto dstView = llama::allocViewUninitialized(dstMapping, llama::bloballoc::CudaMalloc{});
+            copy(*srcView, dstView); // warmup
             cudaEventRecord(startEvent);
             for(auto i = 0; i < repetitions; i++)
                 copy(*srcView, dstView);