
Commit

Merge branch 'main' into dealloc-tmem
zasdfgbnm authored Feb 3, 2025
2 parents c3521f9 + dbd0d6b commit c509202
Showing 7 changed files with 199 additions and 30 deletions.
5 changes: 3 additions & 2 deletions csrc/device_lower/analysis/tensor_memory.cpp
@@ -17,7 +17,7 @@ namespace nvfuser {
 TensorMemoryInfo computeTMemInfo(Fusion* fusion) {
   TensorMemoryInfo result;
 
-  // Step 1: partition the tensors. Each partition of tensor will become a
+  // Step 1: partition the tensors. Each partition of tensors will become a
   // region, so we use the term partition and region interchangeably. The user
   // may have provided full or partial partitioning information. For the
   // TensorViews that the user has already specified which region they belong
@@ -57,7 +57,8 @@ TensorMemoryInfo computeTMemInfo(Fusion* fusion) {
   // Step 2: Compute the allocation information for tensor memory. That is, for
   // each partition, we create a Region object and fill in the necessary
   // information.
-  auto& regions = result.allocation.regions;
+  using Region = TMemAlllocationInfo::Region;
+  std::vector<Region>& regions = result.allocation.regions;
   for (const auto& partition : partitions) {
     regions.emplace_back();
     auto& region = regions.back();
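The Step 2 loop above creates one Region per partition. Below is a minimal self-contained sketch of that pattern with illustrative stand-in types (the real TMemAlllocationInfo and Region in nvFuser carry more allocation state than shown here; the triple-l spelling matches the identifier in the diff):

#include <vector>

struct TensorView; // opaque in this sketch

// Illustrative stand-ins for the nvFuser structs.
struct TMemAlllocationInfo {
  struct Region {
    std::vector<TensorView*> covered_tensors;
  };
  std::vector<Region> regions;
};

// Same shape as the refactored loop above: alias the nested type for
// readability, then create one Region per partition.
void fillRegions(
    const std::vector<std::vector<TensorView*>>& partitions,
    TMemAlllocationInfo& allocation) {
  using Region = TMemAlllocationInfo::Region;
  std::vector<Region>& regions = allocation.regions;
  for (const auto& partition : partitions) {
    regions.emplace_back();
    Region& region = regions.back();
    region.covered_tensors = partition;
  }
}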
4 changes: 4 additions & 0 deletions csrc/device_lower/pass/allocation.cpp
@@ -494,6 +494,10 @@ class AllocationInserter : public kir::ExprMutator {
         break;
       }
     }
+    NVF_ERROR(
+        alloc_expr->address() != nullptr,
+        "Could not find region for tensor memory allocation of ",
+        info.buffer);
   }
 
   return alloc_expr;
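The new NVF_ERROR turns a silent fall-through (no region claimed the allocation, so its address stays unset) into a hard failure. A hedged sketch of this assert-after-search pattern, with NVF_ERROR approximated by a throwing helper and simplified region/address types:

#include <cstdint>
#include <sstream>
#include <stdexcept>
#include <vector>

struct Region {
  int64_t offset = 0;
  int64_t size = 0;
};

// Rough stand-in for NVF_ERROR: throw with a composed message on failure.
template <typename... Ts>
void checkOrThrow(bool cond, const Ts&... parts) {
  if (!cond) {
    std::ostringstream oss;
    (oss << ... << parts);
    throw std::runtime_error(oss.str());
  }
}

int64_t findAddress(const std::vector<Region>& regions, int64_t wanted) {
  constexpr int64_t kUnset = -1;
  int64_t address = kUnset;
  for (const auto& region : regions) {
    if (wanted >= region.offset && wanted < region.offset + region.size) {
      address = wanted;
      break;
    }
  }
  // Mirrors the added check: fail loudly instead of propagating an unset
  // address when no region matched.
  checkOrThrow(address != kUnset, "Could not find region for offset ", wanted);
  return address;
}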
12 changes: 9 additions & 3 deletions csrc/scheduler/normalization_inner.cpp
@@ -44,6 +44,7 @@ std::pair<int64_t, int64_t> getPersistentBufferSize(
       normalization_scheduler_utils::isProjectBufferToInputs(
           fusion,
           runtime_info,
+          reduction_tvs,
           persistent_buffer_info,
           persistent_buffer_size_info,
           InnerPersistentKernelScheduler::schedulerType(),
@@ -58,9 +59,12 @@
 
   int64_t available_persistent_buffer_size = normalization_scheduler_utils::
       getMaxRegOrSharedMemorySizeForPersistentBuffer(
+          fusion,
           runtime_info,
-          persistent_buffer_info.persistent_buffers,
-          can_use_smem_persistent);
+          reduction_tvs,
+          persistent_buffer_info,
+          can_use_smem_persistent,
+          project_persistent_buffers);
   return std::make_pair(
       persistent_buffer_size, available_persistent_buffer_size);
 }
@@ -148,7 +152,9 @@ int64_t getMaxPersistentBatch(
   // occupancy due to the limitation of the current heuristics. TODO: remove
   // this parameter when we have a better heuristic to select the best
   // persistent batch size.
-  int64_t max_batches_per_block = is_high_bandwidth_flops_ratio ? 12l : 10l;
+  int64_t max_batches_per_block =
+      normalization_scheduler_utils::getInnerPersistentMaxBatchSize(
+          is_high_bandwidth_flops_ratio);
   return std::min(max_batches_per_block, batch_size);
 }
 
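The replaced literal now lives behind normalization_scheduler_utils::getInnerPersistentMaxBatchSize (defined later in this commit; it returns 12 for a high bandwidth-to-FLOPs ratio, 10 otherwise), so the heuristic and any tests read the cap from one place. A minimal sketch of the resulting call pattern:

#include <algorithm>
#include <cstdint>

namespace normalization_scheduler_utils {
// Same body as the helper added in normalization_utils.cpp below.
int64_t getInnerPersistentMaxBatchSize(bool is_high_bandwidth_flops_ratio) {
  return is_high_bandwidth_flops_ratio ? 12l : 10l;
}
} // namespace normalization_scheduler_utils

// Sketch of the tail of getMaxPersistentBatch: clamp the computed batch
// size to the empirical per-block maximum.
int64_t clampPersistentBatch(
    int64_t batch_size,
    bool is_high_bandwidth_flops_ratio) {
  int64_t max_batches_per_block =
      normalization_scheduler_utils::getInnerPersistentMaxBatchSize(
          is_high_bandwidth_flops_ratio);
  return std::min(max_batches_per_block, batch_size);
}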
1 change: 1 addition & 0 deletions csrc/scheduler/normalization_inner_outer.cpp
@@ -218,6 +218,7 @@ PersistentBufferStorageParams getPersistentBufferStorageParams(
       normalization_scheduler_utils::isProjectBufferToInputs(
           fusion,
           runtime_info,
+          reduction_tvs,
           persistent_buffer_info,
           persistent_buffer_size_info,
           InnerOuterPersistentKernelScheduler::schedulerType(),
104 changes: 81 additions & 23 deletions csrc/scheduler/normalization_utils.cpp
@@ -716,37 +716,86 @@ void checkReductionTvForScheduling(Fusion* fusion, TensorView* ref_red_tv) {
       "Tried to schedule a fusion with no tensor inputs, currently not supported.");
 }
 
+namespace {
+// For inner persistent kernels, shared memory is allocated as
+// ceilDiv(N/vect, batch) * vect * batch. The required shared memory size is
+// larger than the buffer size when the split is not divisible; the
+// difference is counted as round-up overhead. This function estimates the
+// maximum possible shared memory size due to this round-up.
+int64_t roundUpSharedMemory(int64_t tv_buffer_size, int64_t data_type_size) {
+  auto dev_prop = at::cuda::getCurrentDeviceProperties();
+  int64_t max_threads_per_block = (int64_t)dev_prop->maxThreadsPerBlock;
+  int64_t max_smem = 0;
+  int64_t max_vectorize_factor =
+      SchedulerRuntimeInfo::max_alignment_size_in_byte / data_type_size;
+  int64_t dim_size = tv_buffer_size / data_type_size;
+  // Check all possible combinations of vectorization factor, batch size, and
+  // threads per block
+  for (int64_t vectorize_factor = 1; vectorize_factor <= max_vectorize_factor;
+       vectorize_factor *= 2) {
+    // The heuristic only uses divisible vectorization factors
+    if (dim_size % vectorize_factor != 0) {
+      continue;
+    }
+    int64_t after_vect = dim_size / vectorize_factor;
+    // For shared memory persistence, the heuristic always uses the maximum
+    // number of threads per block
+    int64_t threads_per_block = max_threads_per_block;
+    int64_t persistent_batch = ceilDiv(after_vect, threads_per_block);
+    max_smem = std::max(
+        max_smem,
+        persistent_batch * vectorize_factor * threads_per_block *
+            data_type_size);
+  }
+  return max_smem;
+}
+int64_t sharedMemoryRoundUpOverhead(
+    SchedulerRuntimeInfo& runtime_info,
+    const scheduler_utils::PersistentBufferInfo& persistent_buffer_info,
+    const bool project_to_inputs) {
+  auto buffers = project_to_inputs
+      ? persistent_buffer_info.projectable_buffer_inputs
+      : persistent_buffer_info.persistent_buffers;
+  int64_t total_smem_overhead = 0;
+  for (auto buffer : buffers) {
+    // Buffer size derived from the shape and dtype of the persistent tensor
+    int64_t logical_buffer_size =
+        scheduler_utils::getPersistentBufferSizeOfTensor(
+            buffer, runtime_info, persistent_buffer_info);
+    // Required shared memory size if that tensor is stored in shared memory
+    int64_t buffer_size_smem = roundUpSharedMemory(
+        logical_buffer_size, dataTypeSize(buffer->getDataType().value()));
+    // The difference is counted as round-up overhead
+    total_smem_overhead += (buffer_size_smem - logical_buffer_size);
+  }
+  return total_smem_overhead;
+}
+} // namespace
+
 int64_t getMaxRegOrSharedMemorySizeForPersistentBuffer(
+    Fusion* fusion,
     SchedulerRuntimeInfo& runtime_info,
-    const std::vector<TensorView*>& persistent_buffers,
-    const bool can_use_smem_persistent) {
+    const std::vector<TensorView*>& reduction_tvs,
+    const scheduler_utils::PersistentBufferInfo& persistent_buffer_info,
+    const bool can_use_smem_persistent,
+    const bool project_to_inputs) {
   // Init to register file size, which is half of the full register file size
   int64_t available_persistent_buffer_size =
       scheduler_utils::register_file_size;
   // shared memory persistent is not implemented for 3D inner reduction
   if (!can_use_smem_persistent) {
     return available_persistent_buffer_size;
   }
-  // Check available shared memory
   const auto dev_prop = at::cuda::getCurrentDeviceProperties();
-  const int64_t max_shared_memory_size =
-      (int64_t)dev_prop->sharedMemPerBlockOptin;
-  // Some shared memories are reserved for kernel launch overhead and
-  // reduction_broadcast_workspace. Estimation is conservative, but should
-  // be good enough. The actual threads per block is set in the heuristics
-  // and it may be smaller than maxThreadsPerBlock.
-  // TODO: More accurate estimation of available shared memory size.
-  const int64_t kernel_overhead = (int64_t)dev_prop->reservedSharedMemPerBlock;
-  int64_t max_buffer_dtype_size = 1;
-  for (auto tv : persistent_buffers) {
-    max_buffer_dtype_size = std::max(
-        max_buffer_dtype_size,
-        dataTypeSize(tv->getDataType().value(), runtime_info.getIndexType()));
-  }
-  const int64_t reduction_broadcast_workspace =
-      (int64_t)(dev_prop->maxThreadsPerBlock) * max_buffer_dtype_size;
-  const int64_t available_shared_memory_size =
-      max_shared_memory_size - kernel_overhead - reduction_broadcast_workspace;
+  int64_t smem_overhead =
+      scheduler_utils::getSharedMemoryOverheadPerBlock(fusion, reduction_tvs);
+
+  smem_overhead += sharedMemoryRoundUpOverhead(
+      runtime_info, persistent_buffer_info, project_to_inputs);
+
+  int64_t available_shared_memory_size =
+      (int64_t)dev_prop->sharedMemPerMultiprocessor - smem_overhead;
+
   available_persistent_buffer_size =
       std::max(available_persistent_buffer_size, available_shared_memory_size);
   return available_persistent_buffer_size;
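To make the round-up concrete, here is a standalone sketch of roundUpSharedMemory with the device query replaced by assumed values (1024 max threads per block and a 16-byte alignment limit, so max_vectorize_factor = 16 / data_type_size; the real values come from at::cuda::getCurrentDeviceProperties and SchedulerRuntimeInfo):

#include <algorithm>
#include <cstdint>
#include <iostream>

constexpr int64_t ceilDiv(int64_t a, int64_t b) {
  return (a + b - 1) / b;
}

int64_t roundUpSharedMemorySketch(
    int64_t tv_buffer_size,
    int64_t data_type_size) {
  const int64_t max_threads_per_block = 1024; // assumed, normally queried
  const int64_t max_alignment_in_bytes = 16; // assumed vectorization limit
  const int64_t max_vectorize_factor = max_alignment_in_bytes / data_type_size;
  const int64_t dim_size = tv_buffer_size / data_type_size;
  int64_t max_smem = 0;
  for (int64_t vect = 1; vect <= max_vectorize_factor; vect *= 2) {
    if (dim_size % vect != 0) {
      continue; // only divisible vectorization factors are considered
    }
    const int64_t after_vect = dim_size / vect;
    const int64_t batch = ceilDiv(after_vect, max_threads_per_block);
    max_smem = std::max(
        max_smem, batch * vect * max_threads_per_block * data_type_size);
  }
  return max_smem;
}

int main() {
  // 10000 fp32 elements (40000 bytes): the worst case is vect = 4, where
  // batch = ceilDiv(2500, 1024) = 3 and 3 * 4 * 1024 * 4 = 49152 bytes are
  // reserved, i.e. 9152 bytes of round-up overhead.
  std::cout << roundUpSharedMemorySketch(10000 * 4, 4) << "\n"; // 49152
}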
Expand All @@ -760,6 +809,7 @@ int64_t getMaxRegOrSharedMemorySizeForPersistentBuffer(
BufferProjectionStrategy isProjectBufferToInputs(
Fusion* fusion,
SchedulerRuntimeInfo& runtime_info,
const std::vector<TensorView*>& reduction_tvs,
const scheduler_utils::PersistentBufferInfo& persistent_buffer_info,
const scheduler_utils::PersistentBufferSizeReturn&
persistent_buffer_size_info,
@@ -790,9 +840,12 @@ BufferProjectionStrategy isProjectBufferToInputs(
   if (scheduler_type != SchedulerType::InnerOuterPersistent) {
     int64_t max_available_buffer =
         getMaxRegOrSharedMemorySizeForPersistentBuffer(
+            fusion,
             runtime_info,
-            persistent_buffer_info.persistent_buffers,
-            can_use_smem_persistent);
+            reduction_tvs,
+            persistent_buffer_info,
+            can_use_smem_persistent,
+            false);
     if (max_available_buffer <
         persistent_buffer_size_info.persistent_buffer_size) {
       return BufferProjectionStrategy::ProjectToInputs;
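This check is the crux of the projection decision for non-InnerOuter schedulers: if the raw persistent buffers exceed what registers or shared memory can hold, recompute them from inputs. A reduced sketch (NoProject is an illustrative placeholder; the real enum defines more strategies):

#include <cstdint>

enum class BufferProjectionStrategy {
  ProjectToInputs,
  NoProject, // illustrative placeholder for the remaining strategies
};

// Mirrors the call site above: project to inputs when the un-projected
// buffers do not fit in the register/shared-memory budget.
BufferProjectionStrategy decideProjection(
    int64_t max_available_buffer,
    int64_t persistent_buffer_size) {
  if (max_available_buffer < persistent_buffer_size) {
    return BufferProjectionStrategy::ProjectToInputs;
  }
  return BufferProjectionStrategy::NoProject;
}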
@@ -911,6 +964,7 @@ PersistentKernelProperties getPersistentKernelProperties(
   auto project_strategy = isProjectBufferToInputs(
       fusion,
       runtime_info,
+      reduction_tvs,
       persistent_buffer_info,
       persistent_buffer_size_info,
       scheduler_type,
@@ -1633,5 +1687,9 @@ std::vector<TensorView*> getResolutionPointsOf(TensorView* persistent_buffer) {
   return PersistentBufferResolution::getResolutionPointsOf(persistent_buffer);
 }
 
+int64_t getInnerPersistentMaxBatchSize(bool is_high_bandwidth_flops_ratio) {
+  return is_high_bandwidth_flops_ratio ? 12l : 10l;
+}
+
 } // namespace normalization_scheduler_utils
 } // namespace nvfuser
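Putting the new budget computation together with worked numbers (the 128 KiB register figure follows the "half of the full register file" comment above; the shared-memory and overhead values are assumed for illustration, not queried):

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  const int64_t register_file_size = 32 * 1024 * 4; // 128 KiB budget
  // Assumed stand-ins for the queried/computed values:
  const int64_t smem_per_sm = 228 * 1024; // dev_prop->sharedMemPerMultiprocessor
  const int64_t static_overhead = 5 * 1024; // getSharedMemoryOverheadPerBlock
  const int64_t round_up_overhead = 9152; // sharedMemoryRoundUpOverhead
  const int64_t available_smem =
      smem_per_sm - static_overhead - round_up_overhead; // 219200
  // As in the function above, take the larger of the two budgets.
  std::cout << std::max(register_file_size, available_smem) << "\n"; // 219200
}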
10 changes: 8 additions & 2 deletions csrc/scheduler/normalization_utils.h
@@ -285,9 +285,12 @@ void schedulePersistentKernel(
 
 // Get max register or shared memory size for persistent buffer
 int64_t getMaxRegOrSharedMemorySizeForPersistentBuffer(
+    Fusion* fusion,
     SchedulerRuntimeInfo& runtime_info,
-    const std::vector<TensorView*>& persistent_buffers,
-    const bool can_use_smem_persistent);
+    const std::vector<TensorView*>& reduction_tvs,
+    const scheduler_utils::PersistentBufferInfo& persistent_buffer_info,
+    const bool can_use_smem_persistent,
+    const bool project_to_inputs);
 
 enum class BufferProjectionStrategy {
   // Recompute persistent buffers from inputs, only need to cache inputs in
@@ -331,6 +334,7 @@ enum class BufferProjectionStrategy {
 BufferProjectionStrategy isProjectBufferToInputs(
     Fusion* fusion,
     SchedulerRuntimeInfo& runtime_info,
+    const std::vector<TensorView*>& reduction_tvs,
     const scheduler_utils::PersistentBufferInfo& persistent_buffer_info,
     const scheduler_utils::PersistentBufferSizeReturn&
         persistent_buffer_size_info,
@@ -375,5 +379,7 @@ std::vector<TensorView*> movePersistentBufferToSmem(
 // PersistentBufferTest.GetResolutionIssue1123 for a concrete example
 std::vector<TensorView*> getResolutionPointsOf(TensorView* persistent_buffer);
 
+// Return empirical maximum persistent batch size for inner persistent scheduler
+int64_t getInnerPersistentMaxBatchSize(bool is_high_bandwidth_flops_ratio);
 } // namespace normalization_scheduler_utils
 } // namespace nvfuser
93 changes: 93 additions & 0 deletions tests/cpp/test_persistent_buffer.cpp
@@ -12,6 +12,7 @@
 #include <logical_domain_map.h>
 #include <ops/all_ops.h>
 #include <scheduler/all_schedulers.h>
+#include <scheduler/normalization_utils.h>
 #include <scheduler/reduction_utils.h>
 #include <scheduler/utils.h>
 #include <tests/cpp/utils.h>
@@ -1363,4 +1364,96 @@ TEST_F(PersistentBufferTest, GetResolutionIssue1123) {
       std::vector<TensorView*>{tv7});
 }
 
+TEST_F(PersistentBufferTest, InnerPersistentNotEnoughSharedMemory) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  auto& fusion = *fusion_ptr;
+  FusionGuard fg(fusion_ptr.get());
+
+  auto tv0 = makeContigTensor(2, DataType::Half);
+  fusion.addInput(tv0);
+  auto tv1 = makeContigTensor(1, DataType::Half);
+  fusion.addInput(tv1);
+  auto tv2 = makeContigTensor(1, DataType::Half);
+  fusion.addInput(tv2);
+
+  auto tv3 = castOp(DataType::Float, tv0);
+  auto tvs = Welford(tv3, {1});
+  auto tv6 = tvs.avg;
+  auto tv7 = tvs.var_sum;
+  auto tv9 = broadcast(tv6, {false, true});
+  TensorView* tv10 = nullptr;
+  auto tv21 = castOp(DataType::Float, tv0);
+  tv10 = sub(tv21, tv9);
+  auto tv11 = broadcast(tv7, {false, true});
+  auto tv13 = add(tv11, IrBuilder::create<Val>(0.001));
+  auto tv14 = rsqrt(tv13);
+  auto tv15 = mul(tv10, tv14);
+  auto tv4 = castOp(DataType::Float, tv1);
+  auto tv16 = broadcast(tv4, {true, false});
+  auto tv17 = mul(tv15, tv16);
+  auto tv5 = castOp(DataType::Float, tv2);
+  auto tv18 = broadcast(tv5, {true, false});
+  auto tv19 = add(tv17, tv18);
+  auto tv20 = castOp(DataType::Half, tv19);
+
+  fusion.addOutput(tv20);
+  fusion.addOutput(tv9);
+  fusion.addOutput(tv14);
+
+  std::vector<int64_t> input_shape{2048, 80 * 1024};
+
+  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
+  auto t0 = at::randn(input_shape, options);
+  auto t1 = at::randn({input_shape[1]}, options);
+  auto t2 = at::randn({input_shape[1]}, options);
+  std::vector<c10::IValue> inputs({t0, t1, t2});
+
+  // The logical size of the persistent buffer in this fusion is 80 * 1024 * 2
+  // bytes. The inner persistent scheduler allows 32 * 1024 * 4 bytes for
+  // register persistence, so it should use a shared memory persistent buffer
+  // if there is enough shared memory. Otherwise, the fusion is segmented.
+  SchedulerRuntimeInfo runtime_info(&fusion, inputs);
+  auto persistent_buffer_info = scheduler_utils::persistentBuffers(&fusion);
+  auto persistent_buffer_size =
+      persistentBufferSize(&fusion, runtime_info, persistent_buffer_info);
+  int64_t logic_buffer_size = 80 * 1024 * dataTypeSize(DataType::Half);
+  EXPECT_EQ(
+      persistent_buffer_size.projected_persistent_buffer_size,
+      logic_buffer_size);
+
+  // If the total shared memory on the device is less than the logical buffer
+  // size, the fusion should be segmented. Otherwise, further calculate the
+  // available shared memory size by removing the overhead due to the
+  // reduction broadcast workspace and non-divisible splits.
+  bool is_segmented = false;
+  const auto dev_prop = at::cuda::getCurrentDeviceProperties();
+  if ((int64_t)dev_prop->sharedMemPerMultiprocessor < logic_buffer_size) {
+    is_segmented = true;
+  } else {
+    int64_t available_buffer_size = normalization_scheduler_utils::
+        getMaxRegOrSharedMemorySizeForPersistentBuffer(
+            &fusion,
+            runtime_info,
+            scheduler_utils::getReductionTvs(&fusion),
+            persistent_buffer_info,
+            /*can_use_smem_persistent*/ true,
+            /*project_to_inputs*/ true);
+    is_segmented = logic_buffer_size >= available_buffer_size;
+  }
+
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  auto outputs = executor_cache.runFusionWithInputs(inputs);
+
+  // Check segmentation; if not segmented, further check shared memory
+  // persistence
+  auto runtime = executor_cache.getMostRecentKernelRuntime();
+  ASSERT_EQ(is_segmented, runtime->isSegmented());
+  if (!is_segmented) {
+    auto& params = runtime->schedulerHeuristics()->heuristicsList().at(0);
+    ASSERT_TRUE(params->isA<ReductionParams>());
+    ASSERT_TRUE(
+        params->as<ReductionParams>()->smem_persistent_buffers.size() > 0);
+  }
+  testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);
+}
 } // namespace nvfuser
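Worked numbers behind the test's expectations (a sketch; the Half element size and the 32 * 1024 * 4 register cap come from the test comments above):

#include <cstdint>
#include <iostream>

int main() {
  const int64_t elems = 80 * 1024; // inner (reduction) extent
  const int64_t half_bytes = 2; // dataTypeSize(DataType::Half)
  const int64_t logic_buffer_size = elems * half_bytes; // 163840 bytes
  const int64_t register_cap = 32 * 1024 * 4; // 131072 bytes
  // The projected buffer exceeds the register-persistent cap, so the
  // scheduler must either move it to shared memory or segment the fusion.
  std::cout << (logic_buffer_size > register_cap) << "\n"; // prints 1
}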