
Commit

Merge branch 'main' into dealloc-tmem
zasdfgbnm authored Feb 3, 2025
2 parents c3521f9 + dbd0d6b commit c509202
Showing 7 changed files with 199 additions and 30 deletions.
5 changes: 3 additions & 2 deletions csrc/device_lower/analysis/tensor_memory.cpp
@@ -17,7 +17,7 @@ namespace nvfuser {
 TensorMemoryInfo computeTMemInfo(Fusion* fusion) {
   TensorMemoryInfo result;
 
-  // Step 1: partition the tensors. Each partition of tensor will become a
+  // Step 1: partition the tensors. Each partition of tensors will become a
   // region, so we use the term partition and region interchangeably. The user
   // may have provided full or partial partitioning information. For the
   // TensorViews that the user has already specified which region they belong
@@ -57,7 +57,8 @@ TensorMemoryInfo computeTMemInfo(Fusion* fusion) {
   // Step 2: Compute the allocation information for tensor memory. That is, for
   // each partition, we create a Region object and fill in the necessary
   // information.
-  auto& regions = result.allocation.regions;
+  using Region = TMemAlllocationInfo::Region;
+  std::vector<Region>& regions = result.allocation.regions;
   for (const auto& partition : partitions) {
     regions.emplace_back();
     auto& region = regions.back();
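The Step 2 loop above creates one Region per partition. Below is a minimal self-contained sketch of that pattern with illustrative stand-in types (the real TMemAlllocationInfo and Region in nvFuser carry more allocation state than shown here; the triple-l spelling matches the identifier in the diff):

#include <vector>

struct TensorView; // opaque in this sketch

// Illustrative stand-ins for the nvFuser structs.
struct TMemAlllocationInfo {
  struct Region {
    std::vector<TensorView*> covered_tensors;
  };
  std::vector<Region> regions;
};

// Same shape as the refactored loop above: alias the nested type for
// readability, then create one Region per partition.
void fillRegions(
    const std::vector<std::vector<TensorView*>>& partitions,
    TMemAlllocationInfo& allocation) {
  using Region = TMemAlllocationInfo::Region;
  std::vector<Region>& regions = allocation.regions;
  for (const auto& partition : partitions) {
    regions.emplace_back();
    Region& region = regions.back();
    region.covered_tensors = partition;
  }
}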
4 changes: 4 additions & 0 deletions csrc/device_lower/pass/allocation.cpp
@@ -494,6 +494,10 @@ class AllocationInserter : public kir::ExprMutator {
         break;
       }
     }
+    NVF_ERROR(
+        alloc_expr->address() != nullptr,
+        "Could not find region for tensor memory allocation of ",
+        info.buffer);
   }
 
   return alloc_expr;
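The new NVF_ERROR turns a silent fall-through (no region claimed the allocation, so its address stays unset) into a hard failure. A hedged sketch of this assert-after-search pattern, with NVF_ERROR approximated by a throwing helper and simplified region/address types:

#include <cstdint>
#include <sstream>
#include <stdexcept>
#include <vector>

struct Region {
  int64_t offset = 0;
  int64_t size = 0;
};

// Rough stand-in for NVF_ERROR: throw with a composed message on failure.
template <typename... Ts>
void checkOrThrow(bool cond, const Ts&... parts) {
  if (!cond) {
    std::ostringstream oss;
    (oss << ... << parts);
    throw std::runtime_error(oss.str());
  }
}

int64_t findAddress(const std::vector<Region>& regions, int64_t wanted) {
  constexpr int64_t kUnset = -1;
  int64_t address = kUnset;
  for (const auto& region : regions) {
    if (wanted >= region.offset && wanted < region.offset + region.size) {
      address = wanted;
      break;
    }
  }
  // Mirrors the added check: fail loudly instead of propagating an unset
  // address when no region matched.
  checkOrThrow(address != kUnset, "Could not find region for offset ", wanted);
  return address;
}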
12 changes: 9 additions & 3 deletions csrc/scheduler/normalization_inner.cpp
@@ -44,6 +44,7 @@ std::pair<int64_t, int64_t> getPersistentBufferSize(
       normalization_scheduler_utils::isProjectBufferToInputs(
           fusion,
           runtime_info,
+          reduction_tvs,
           persistent_buffer_info,
           persistent_buffer_size_info,
           InnerPersistentKernelScheduler::schedulerType(),
@@ -58,9 +59,12 @@
 
   int64_t available_persistent_buffer_size = normalization_scheduler_utils::
       getMaxRegOrSharedMemorySizeForPersistentBuffer(
+          fusion,
           runtime_info,
-          persistent_buffer_info.persistent_buffers,
-          can_use_smem_persistent);
+          reduction_tvs,
+          persistent_buffer_info,
+          can_use_smem_persistent,
+          project_persistent_buffers);
   return std::make_pair(
       persistent_buffer_size, available_persistent_buffer_size);
 }
@@ -148,7 +152,9 @@ int64_t getMaxPersistentBatch(
   // occupancy due to the limitation of the current heuristics. TODO: remove
   // this parameter when we have a better heuristic to select the best
   // persistent batch size.
-  int64_t max_batches_per_block = is_high_bandwidth_flops_ratio ? 12l : 10l;
+  int64_t max_batches_per_block =
+      normalization_scheduler_utils::getInnerPersistentMaxBatchSize(
+          is_high_bandwidth_flops_ratio);
   return std::min(max_batches_per_block, batch_size);
 }
 
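The replaced literal now lives behind normalization_scheduler_utils::getInnerPersistentMaxBatchSize (defined later in this commit; it returns 12 for a high bandwidth-to-FLOPs ratio, 10 otherwise), so the heuristic and any tests read the cap from one place. A minimal sketch of the resulting call pattern:

#include <algorithm>
#include <cstdint>

namespace normalization_scheduler_utils {
// Same body as the helper added in normalization_utils.cpp below.
int64_t getInnerPersistentMaxBatchSize(bool is_high_bandwidth_flops_ratio) {
  return is_high_bandwidth_flops_ratio ? 12l : 10l;
}
} // namespace normalization_scheduler_utils

// Sketch of the tail of getMaxPersistentBatch: clamp the computed batch
// size to the empirical per-block maximum.
int64_t clampPersistentBatch(
    int64_t batch_size,
    bool is_high_bandwidth_flops_ratio) {
  int64_t max_batches_per_block =
      normalization_scheduler_utils::getInnerPersistentMaxBatchSize(
          is_high_bandwidth_flops_ratio);
  return std::min(max_batches_per_block, batch_size);
}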
1 change: 1 addition & 0 deletions csrc/scheduler/normalization_inner_outer.cpp
@@ -218,6 +218,7 @@ PersistentBufferStorageParams getPersistentBufferStorageParams(
       normalization_scheduler_utils::isProjectBufferToInputs(
           fusion,
           runtime_info,
+          reduction_tvs,
           persistent_buffer_info,
           persistent_buffer_size_info,
           InnerOuterPersistentKernelScheduler::schedulerType(),
104 changes: 81 additions & 23 deletions csrc/scheduler/normalization_utils.cpp
@@ -716,37 +716,86 @@ void checkReductionTvForScheduling(Fusion* fusion, TensorView* ref_red_tv) {
       "Tried to schedule a fusion with no tensor inputs, currently not supported.");
 }
 
+namespace {
+// For inner persistent kernels, shared memory is allocated as
+// ceilDiv(N/vect, batch) * vect * batch. The required shared memory size is
+// larger than the buffer size when the split is not divisible; the
+// difference is counted as round-up overhead. This function estimates the
+// maximum possible shared memory size due to this round-up.
+int64_t roundUpSharedMemory(int64_t tv_buffer_size, int64_t data_type_size) {
+  auto dev_prop = at::cuda::getCurrentDeviceProperties();
+  int64_t max_threads_per_block = (int64_t)dev_prop->maxThreadsPerBlock;
+  int64_t max_smem = 0;
+  int64_t max_vectorize_factor =
+      SchedulerRuntimeInfo::max_alignment_size_in_byte / data_type_size;
+  int64_t dim_size = tv_buffer_size / data_type_size;
+  // Check all possible combinations of vectorization factor, batch size, and
+  // threads per block
+  for (int64_t vectorize_factor = 1; vectorize_factor <= max_vectorize_factor;
+       vectorize_factor *= 2) {
+    // The heuristic only uses divisible vectorization factors
+    if (dim_size % vectorize_factor != 0) {
+      continue;
+    }
+    int64_t after_vect = dim_size / vectorize_factor;
+    // For shared memory persistence, the heuristic always uses the maximum
+    // number of threads per block
+    int64_t threads_per_block = max_threads_per_block;
+    int64_t persistent_batch = ceilDiv(after_vect, threads_per_block);
+    max_smem = std::max(
+        max_smem,
+        persistent_batch * vectorize_factor * threads_per_block *
+            data_type_size);
+  }
+  return max_smem;
+}
+int64_t sharedMemoryRoundUpOverhead(
+    SchedulerRuntimeInfo& runtime_info,
+    const scheduler_utils::PersistentBufferInfo& persistent_buffer_info,
+    const bool project_to_inputs) {
+  auto buffers = project_to_inputs
+      ? persistent_buffer_info.projectable_buffer_inputs
+      : persistent_buffer_info.persistent_buffers;
+  int64_t total_smem_overhead = 0;
+  for (auto buffer : buffers) {
+    // Buffer size derived from the shape and dtype of the persistent tensor
+    int64_t logical_buffer_size =
+        scheduler_utils::getPersistentBufferSizeOfTensor(
+            buffer, runtime_info, persistent_buffer_info);
+    // Required shared memory size if that tensor is stored in shared memory
+    int64_t buffer_size_smem = roundUpSharedMemory(
+        logical_buffer_size, dataTypeSize(buffer->getDataType().value()));
+    // The difference is counted as round-up overhead
+    total_smem_overhead += (buffer_size_smem - logical_buffer_size);
+  }
+  return total_smem_overhead;
+}
+} // namespace
+
 int64_t getMaxRegOrSharedMemorySizeForPersistentBuffer(
+    Fusion* fusion,
     SchedulerRuntimeInfo& runtime_info,
-    const std::vector<TensorView*>& persistent_buffers,
-    const bool can_use_smem_persistent) {
+    const std::vector<TensorView*>& reduction_tvs,
+    const scheduler_utils::PersistentBufferInfo& persistent_buffer_info,
+    const bool can_use_smem_persistent,
+    const bool project_to_inputs) {
   // Init to register file size, which is half of the full register file size
   int64_t available_persistent_buffer_size =
       scheduler_utils::register_file_size;
   // shared memory persistent is not implemented for 3D inner reduction
   if (!can_use_smem_persistent) {
     return available_persistent_buffer_size;
   }
-  // Check available shared memory
   const auto dev_prop = at::cuda::getCurrentDeviceProperties();
-  const int64_t max_shared_memory_size =
-      (int64_t)dev_prop->sharedMemPerBlockOptin;
-  // Some shared memories are reserved for kernel launch overhead and
-  // reduction_broadcast_workspace. Estimation is conservative, but should
-  // be good enough. The actual threads per block is set in the heuristics
-  // and it may be smaller than maxThreadsPerBlock.
-  // TODO: More accurate estimation of available shared memory size.
-  const int64_t kernel_overhead = (int64_t)dev_prop->reservedSharedMemPerBlock;
-  int64_t max_buffer_dtype_size = 1;
-  for (auto tv : persistent_buffers) {
-    max_buffer_dtype_size = std::max(
-        max_buffer_dtype_size,
-        dataTypeSize(tv->getDataType().value(), runtime_info.getIndexType()));
-  }
-  const int64_t reduction_broadcast_workspace =
-      (int64_t)(dev_prop->maxThreadsPerBlock) * max_buffer_dtype_size;
-  const int64_t available_shared_memory_size =
-      max_shared_memory_size - kernel_overhead - reduction_broadcast_workspace;
+  int64_t smem_overhead =
+      scheduler_utils::getSharedMemoryOverheadPerBlock(fusion, reduction_tvs);
+
+  smem_overhead += sharedMemoryRoundUpOverhead(
+      runtime_info, persistent_buffer_info, project_to_inputs);
+
+  int64_t available_shared_memory_size =
+      (int64_t)dev_prop->sharedMemPerMultiprocessor - smem_overhead;
+
   available_persistent_buffer_size =
       std::max(available_persistent_buffer_size, available_shared_memory_size);
   return available_persistent_buffer_size;
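To make the round-up concrete, here is a standalone sketch of roundUpSharedMemory with the device query replaced by assumed values (1024 max threads per block and a 16-byte alignment limit, so max_vectorize_factor = 16 / data_type_size; the real values come from at::cuda::getCurrentDeviceProperties and SchedulerRuntimeInfo):

#include <algorithm>
#include <cstdint>
#include <iostream>

constexpr int64_t ceilDiv(int64_t a, int64_t b) {
  return (a + b - 1) / b;
}

int64_t roundUpSharedMemorySketch(
    int64_t tv_buffer_size,
    int64_t data_type_size) {
  const int64_t max_threads_per_block = 1024; // assumed, normally queried
  const int64_t max_alignment_in_bytes = 16; // assumed vectorization limit
  const int64_t max_vectorize_factor = max_alignment_in_bytes / data_type_size;
  const int64_t dim_size = tv_buffer_size / data_type_size;
  int64_t max_smem = 0;
  for (int64_t vect = 1; vect <= max_vectorize_factor; vect *= 2) {
    if (dim_size % vect != 0) {
      continue; // only divisible vectorization factors are considered
    }
    const int64_t after_vect = dim_size / vect;
    const int64_t batch = ceilDiv(after_vect, max_threads_per_block);
    max_smem = std::max(
        max_smem, batch * vect * max_threads_per_block * data_type_size);
  }
  return max_smem;
}

int main() {
  // 10000 fp32 elements (40000 bytes): the worst case is vect = 4, where
  // batch = ceilDiv(2500, 1024) = 3 and 3 * 4 * 1024 * 4 = 49152 bytes are
  // reserved, i.e. 9152 bytes of round-up overhead.
  std::cout << roundUpSharedMemorySketch(10000 * 4, 4) << "\n"; // 49152
}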
Expand All @@ -760,6 +809,7 @@ int64_t getMaxRegOrSharedMemorySizeForPersistentBuffer(
BufferProjectionStrategy isProjectBufferToInputs(
Fusion* fusion,
SchedulerRuntimeInfo& runtime_info,
const std::vector<TensorView*>& reduction_tvs,
const scheduler_utils::PersistentBufferInfo& persistent_buffer_info,
const scheduler_utils::PersistentBufferSizeReturn&
persistent_buffer_size_info,
@@ -790,9 +840,12 @@ BufferProjectionStrategy isProjectBufferToInputs(
   if (scheduler_type != SchedulerType::InnerOuterPersistent) {
     int64_t max_available_buffer =
         getMaxRegOrSharedMemorySizeForPersistentBuffer(
+            fusion,
             runtime_info,
-            persistent_buffer_info.persistent_buffers,
-            can_use_smem_persistent);
+            reduction_tvs,
+            persistent_buffer_info,
+            can_use_smem_persistent,
+            false);
     if (max_available_buffer <
         persistent_buffer_size_info.persistent_buffer_size) {
       return BufferProjectionStrategy::ProjectToInputs;
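This check is the crux of the projection decision for non-InnerOuter schedulers: if the raw persistent buffers exceed what registers or shared memory can hold, recompute them from inputs. A reduced sketch (NoProject is an illustrative placeholder; the real enum defines more strategies):

#include <cstdint>

enum class BufferProjectionStrategy {
  ProjectToInputs,
  NoProject, // illustrative placeholder for the remaining strategies
};

// Mirrors the call site above: project to inputs when the un-projected
// buffers do not fit in the register/shared-memory budget.
BufferProjectionStrategy decideProjection(
    int64_t max_available_buffer,
    int64_t persistent_buffer_size) {
  if (max_available_buffer < persistent_buffer_size) {
    return BufferProjectionStrategy::ProjectToInputs;
  }
  return BufferProjectionStrategy::NoProject;
}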
@@ -911,6 +964,7 @@ PersistentKernelProperties getPersistentKernelProperties(
   auto project_strategy = isProjectBufferToInputs(
       fusion,
       runtime_info,
+      reduction_tvs,
       persistent_buffer_info,
       persistent_buffer_size_info,
       scheduler_type,
@@ -1633,5 +1687,9 @@ std::vector<TensorView*> getResolutionPointsOf(TensorView* persistent_buffer) {
   return PersistentBufferResolution::getResolutionPointsOf(persistent_buffer);
 }
 
+int64_t getInnerPersistentMaxBatchSize(bool is_high_bandwidth_flops_ratio) {
+  return is_high_bandwidth_flops_ratio ? 12l : 10l;
+}
+
 } // namespace normalization_scheduler_utils
 } // namespace nvfuser
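Putting the new budget computation together with worked numbers (the 128 KiB register figure follows the "half of the full register file" comment above; the shared-memory and overhead values are assumed for illustration, not queried):

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  const int64_t register_file_size = 32 * 1024 * 4; // 128 KiB budget
  // Assumed stand-ins for the queried/computed values:
  const int64_t smem_per_sm = 228 * 1024; // dev_prop->sharedMemPerMultiprocessor
  const int64_t static_overhead = 5 * 1024; // getSharedMemoryOverheadPerBlock
  const int64_t round_up_overhead = 9152; // sharedMemoryRoundUpOverhead
  const int64_t available_smem =
      smem_per_sm - static_overhead - round_up_overhead; // 219200
  // As in the function above, take the larger of the two budgets.
  std::cout << std::max(register_file_size, available_smem) << "\n"; // 219200
}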
10 changes: 8 additions & 2 deletions csrc/scheduler/normalization_utils.h
@@ -285,9 +285,12 @@ void schedulePersistentKernel(
 
 // Get max register or shared memory size for persistent buffer
 int64_t getMaxRegOrSharedMemorySizeForPersistentBuffer(
+    Fusion* fusion,
     SchedulerRuntimeInfo& runtime_info,
-    const std::vector<TensorView*>& persistent_buffers,
-    const bool can_use_smem_persistent);
+    const std::vector<TensorView*>& reduction_tvs,
+    const scheduler_utils::PersistentBufferInfo& persistent_buffer_info,
+    const bool can_use_smem_persistent,
+    const bool project_to_inputs);
 
 enum class BufferProjectionStrategy {
   // Recompute persistent buffers from inputs, only need to cache inputs in
@@ -331,6 +334,7 @@ enum class BufferProjectionStrategy {
 BufferProjectionStrategy isProjectBufferToInputs(
     Fusion* fusion,
     SchedulerRuntimeInfo& runtime_info,
+    const std::vector<TensorView*>& reduction_tvs,
     const scheduler_utils::PersistentBufferInfo& persistent_buffer_info,
     const scheduler_utils::PersistentBufferSizeReturn&
         persistent_buffer_size_info,
@@ -375,5 +379,7 @@ std::vector<TensorView*> movePersistentBufferToSmem(
 // PersistentBufferTest.GetResolutionIssue1123 for a concrete example
 std::vector<TensorView*> getResolutionPointsOf(TensorView* persistent_buffer);
 
+// Return empirical maximum persistent batch size for inner persistent scheduler
+int64_t getInnerPersistentMaxBatchSize(bool is_high_bandwidth_flops_ratio);
 } // namespace normalization_scheduler_utils
 } // namespace nvfuser
93 changes: 93 additions & 0 deletions tests/cpp/test_persistent_buffer.cpp
@@ -12,6 +12,7 @@
 #include <logical_domain_map.h>
 #include <ops/all_ops.h>
 #include <scheduler/all_schedulers.h>
+#include <scheduler/normalization_utils.h>
 #include <scheduler/reduction_utils.h>
 #include <scheduler/utils.h>
 #include <tests/cpp/utils.h>
@@ -1363,4 +1364,96 @@ TEST_F(PersistentBufferTest, GetResolutionIssue1123) {
       std::vector<TensorView*>{tv7});
 }
 
+TEST_F(PersistentBufferTest, InnerPersistentNotEnoughSharedMemory) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  auto& fusion = *fusion_ptr;
+  FusionGuard fg(fusion_ptr.get());
+
+  auto tv0 = makeContigTensor(2, DataType::Half);
+  fusion.addInput(tv0);
+  auto tv1 = makeContigTensor(1, DataType::Half);
+  fusion.addInput(tv1);
+  auto tv2 = makeContigTensor(1, DataType::Half);
+  fusion.addInput(tv2);
+
+  auto tv3 = castOp(DataType::Float, tv0);
+  auto tvs = Welford(tv3, {1});
+  auto tv6 = tvs.avg;
+  auto tv7 = tvs.var_sum;
+  auto tv9 = broadcast(tv6, {false, true});
+  TensorView* tv10 = nullptr;
+  auto tv21 = castOp(DataType::Float, tv0);
+  tv10 = sub(tv21, tv9);
+  auto tv11 = broadcast(tv7, {false, true});
+  auto tv13 = add(tv11, IrBuilder::create<Val>(0.001));
+  auto tv14 = rsqrt(tv13);
+  auto tv15 = mul(tv10, tv14);
+  auto tv4 = castOp(DataType::Float, tv1);
+  auto tv16 = broadcast(tv4, {true, false});
+  auto tv17 = mul(tv15, tv16);
+  auto tv5 = castOp(DataType::Float, tv2);
+  auto tv18 = broadcast(tv5, {true, false});
+  auto tv19 = add(tv17, tv18);
+  auto tv20 = castOp(DataType::Half, tv19);
+
+  fusion.addOutput(tv20);
+  fusion.addOutput(tv9);
+  fusion.addOutput(tv14);
+
+  std::vector<int64_t> input_shape{2048, 80 * 1024};
+
+  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
+  auto t0 = at::randn(input_shape, options);
+  auto t1 = at::randn({input_shape[1]}, options);
+  auto t2 = at::randn({input_shape[1]}, options);
+  std::vector<c10::IValue> inputs({t0, t1, t2});
+
+  // The logical size of the persistent buffer in this fusion is 80 * 1024 * 2
+  // bytes. The inner persistent scheduler allows 32 * 1024 * 4 bytes for
+  // register persistence, so it should use a shared memory persistent buffer
+  // if there is enough shared memory. Otherwise, the fusion is segmented.
+  SchedulerRuntimeInfo runtime_info(&fusion, inputs);
+  auto persistent_buffer_info = scheduler_utils::persistentBuffers(&fusion);
+  auto persistent_buffer_size =
+      persistentBufferSize(&fusion, runtime_info, persistent_buffer_info);
+  int64_t logic_buffer_size = 80 * 1024 * dataTypeSize(DataType::Half);
+  EXPECT_EQ(
+      persistent_buffer_size.projected_persistent_buffer_size,
+      logic_buffer_size);
+
+  // If the total shared memory on the device is less than the logical buffer
+  // size, the fusion should be segmented. Otherwise, further calculate the
+  // available shared memory size by removing the overhead due to the
+  // reduction broadcast workspace and non-divisible splits.
+  bool is_segmented = false;
+  const auto dev_prop = at::cuda::getCurrentDeviceProperties();
+  if ((int64_t)dev_prop->sharedMemPerMultiprocessor < logic_buffer_size) {
+    is_segmented = true;
+  } else {
+    int64_t available_buffer_size = normalization_scheduler_utils::
+        getMaxRegOrSharedMemorySizeForPersistentBuffer(
+            &fusion,
+            runtime_info,
+            scheduler_utils::getReductionTvs(&fusion),
+            persistent_buffer_info,
+            /*can_use_smem_persistent*/ true,
+            /*project_to_inputs*/ true);
+    is_segmented = logic_buffer_size >= available_buffer_size;
+  }
+
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  auto outputs = executor_cache.runFusionWithInputs(inputs);
+
+  // Check segmentation; if not segmented, further check shared memory
+  // persistence
+  auto runtime = executor_cache.getMostRecentKernelRuntime();
+  ASSERT_EQ(is_segmented, runtime->isSegmented());
+  if (!is_segmented) {
+    auto& params = runtime->schedulerHeuristics()->heuristicsList().at(0);
+    ASSERT_TRUE(params->isA<ReductionParams>());
+    ASSERT_TRUE(
+        params->as<ReductionParams>()->smem_persistent_buffers.size() > 0);
+  }
+  testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);
+}
 } // namespace nvfuser
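Worked numbers behind the test's expectations (a sketch; the Half element size and the 32 * 1024 * 4 register cap come from the test comments above):

#include <cstdint>
#include <iostream>

int main() {
  const int64_t elems = 80 * 1024; // inner (reduction) extent
  const int64_t half_bytes = 2; // dataTypeSize(DataType::Half)
  const int64_t logic_buffer_size = elems * half_bytes; // 163840 bytes
  const int64_t register_cap = 32 * 1024 * 4; // 131072 bytes
  // The projected buffer exceeds the register-persistent cap, so the
  // scheduler must either move it to shared memory or segment the fusion.
  std::cout << (logic_buffer_size > register_cap) << "\n"; // prints 1
}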