Skip to content

Commit

Permalink
[LLPC] Scalarize only the non-uniform load descriptors of an image call
Browse files Browse the repository at this point in the history
  • Loading branch information
kmitropoulou committed Dec 2, 2023
1 parent 195b936 commit aaa28a5
Show file tree
Hide file tree
Showing 5 changed files with 133 additions and 138 deletions.
166 changes: 70 additions & 96 deletions lgc/builder/BuilderImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,7 @@ BranchInst *BuilderImpl::createIf(Value *condition, bool wantElse, const Twine &
}

#if defined(LLVM_HAVE_BRANCH_AMD_GFX)
// =====================================================================================================================
// A simple memory efficient container that holds up to 64 instructions in a bit vector. It needs two helper data
// structures: 1. instrToIndex that maps an instruction to its index in the bit vector and 2. indexToInstr that maps an
// index back to an instruction.
Expand Down Expand Up @@ -386,13 +387,15 @@ class TinyInstructionSet {
bool empty() const { return bits.empty(); }
};

// =====================================================================================================================
class TraceNonUniformIndex {
// Maps the instruction to its index in the bit vector.
TinyInstructionSet::InstrToIndexMap instrToIndex;
// The instructions used as keys in instrToIndex in program order. It is used to map an index to an instruction.
TinyInstructionSet::IndexToInstructionVec indexToInstr;
// Maps an instruction to its dependencies.
DenseMap<Value *, TinyInstructionSet> instrDeps;
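// Whether the descriptor loads of the non-uniform operands should be scalarized (NOTE(review): presumably set from
// the constructor argument of the same name -- confirm).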
bool scalarizeDescriptorLoads;
unsigned upperLimit;
void insertNewValueInInstrDeps(Value *, Instruction *);
Expand Down Expand Up @@ -441,7 +444,6 @@ void TraceNonUniformIndex::insertNewValueInInstrDeps(Value *newValue, Instructio
setOfInstrs.insert(currentVisitedInstr, instrToIndex);
}

// =====================================================================================================================
// For a non-uniform input, try to trace back through a descriptor load to
// find the non-uniform index used in it. If that fails, we just use the
// operand value as the index.
Expand Down Expand Up @@ -688,109 +690,92 @@ Instruction *BuilderImpl::createWaterfallLoop(Instruction *nonUniformInst, Array
SmallVector<Value *, 2> nonUniformIndices;
// Maps the nonUniformIndex returned by traceNonUniformIndex.run() to the pair (operand of nonUniformInst it was traced
// from, index of that operand).
DenseMap<Value *, std::pair<Value *, unsigned>> nonUniformIndexImageCallOperand;
TraceNonUniformIndex traceNonUniformIndex(nonUniformInst, scalarizeDescriptorLoads, 64);
TraceNonUniformIndex traceNonUniformIndex(nonUniformInst, scalarizeDescriptorLoads);
DenseMap<unsigned, Value *> operandIdxnonUniformIndex;

for (unsigned operandIdx : operandIdxs) {
Value *nonUniformImageCallOperand = nonUniformInst->getOperand(operandIdx);
Value *nonUniformIndex = traceNonUniformIndex.run(nonUniformImageCallOperand);
scalarizeDescriptorLoads = traceNonUniformIndex.foundDependencies();
if (nonUniformIndex) {
nonUniformIndices.push_back(nonUniformIndex);

if (scalarizeDescriptorLoads)
if (scalarizeDescriptorLoads) {
nonUniformIndexImageCallOperand[nonUniformIndex] = std::make_pair(nonUniformImageCallOperand, operandIdx);
operandIdxnonUniformIndex[operandIdx] = nonUniformIndex;
}
}
}

if (nonUniformIndices.empty())
return nonUniformInst;

// Save Builder's insert point
IRBuilder<>::InsertPointGuard guard(*this);
// Insert new code just before nonUniformInst.
SetInsertPoint(nonUniformInst);

// For any index that is 64 bit, change it back to 32 bit for comparison at the top of the waterfall loop.
// At this point the nonUniformVal of nonUniformIndices might change. We also need the original non uniform values for
// the scalarization of the descriptor loads.
DenseMap<Value *, Value *> newOrigNonUniformVal;
for (Value *&nonUniformVal : nonUniformIndices) {
if (nonUniformVal->getType()->isIntegerTy(64)) {
auto sExt = dyn_cast<SExtInst>(nonUniformVal);
Value *origNonUniformVal = nonUniformVal;
// 64-bit index may already be formed from extension of 32-bit value.
if (sExt && sExt->getOperand(0)->getType()->isIntegerTy(32)) {
nonUniformVal = sExt->getOperand(0);
newOrigNonUniformVal[nonUniformVal] = origNonUniformVal;
} else {
nonUniformVal = CreateTrunc(nonUniformVal, getInt32Ty());
newOrigNonUniformVal[nonUniformVal] = origNonUniformVal;
}
DenseMap<Value *, Value *> nonUniformIndex32BitVal;
for (auto nonUniformIndex : nonUniformIndices) {
// Start the waterfall loop using the waterfall index.
if (nonUniformIndex->getType()->isIntegerTy(64)) {
Value *new32BitValue = get32BitNonUniformIndex(nonUniformIndex);
nonUniformIndex32BitVal[nonUniformIndex] = new32BitValue;
}
}

// Find first index instruction and check if index instructions are identical.
Instruction *firstIndexInst = nullptr;
if (scalarizeDescriptorLoads) {
// FIXME: these do not actually need to be identical if we introduce multiple waterfall
// begin and readlane intrinsics for these.
bool identicalIndexes = true;
for (Value *nonUniformVal : nonUniformIndices) {
Instruction *nuInst = dyn_cast<Instruction>(nonUniformVal);
// Note: parent check here guards use of comesBefore below
if (!nuInst || (firstIndexInst && !instructionsEqual(nuInst, firstIndexInst)) ||
(firstIndexInst && nuInst->getParent() != firstIndexInst->getParent())) {
identicalIndexes = false;
break;
}
if (!firstIndexInst || nuInst->comesBefore(firstIndexInst))
firstIndexInst = nuInst;
Value *sharedIndex = nullptr;
if (scalarizeDescriptorLoads)
sharedIndex = getSharedIndex(nonUniformIndices, nonUniformIndex32BitVal, traceNonUniformIndex, nonUniformInst);

Value *readFirstLane = nullptr;

// The first begin contains a null token for the previous token argument
Value *waterfallBegin = ConstantInt::get(getInt32Ty(), 0);
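// Emit the waterfall begin intrinsics: when the descriptor loads are scalarized and a shared index exists, a single
// begin (followed by a readfirstlane) on the shared index is emitted; otherwise one begin is emitted per non-uniform
// index, using its 32-bit value.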
for (auto nonUniformIndex : nonUniformIndices) {
if (scalarizeDescriptorLoads && sharedIndex) {
auto sharedIndexTy = sharedIndex->getType();
waterfallBegin = CreateIntrinsic(Intrinsic::amdgcn_waterfall_begin, sharedIndexTy, {waterfallBegin, sharedIndex},
nullptr, instName);
readFirstLane = CreateIntrinsic(Intrinsic::amdgcn_waterfall_readfirstlane, {sharedIndexTy, sharedIndexTy},
{waterfallBegin, sharedIndex}, nullptr, instName);
break;
}

// Ensure we do not create a waterfall across blocks.
// FIXME: we could use dominator check to allow scalarizing descriptor loads on multi-block spans;
// however, this also requires backend support for multi-block waterfalls to be implemented.
if (!identicalIndexes || !firstIndexInst ||
(firstIndexInst && firstIndexInst->getParent() != nonUniformInst->getParent()))
scalarizeDescriptorLoads = false;
Value *nonUniformIndex32Bit =
nonUniformIndex->getType()->isIntegerTy(64) ? nonUniformIndex32BitVal[nonUniformIndex] : nonUniformIndex;
waterfallBegin = CreateIntrinsic(Intrinsic::amdgcn_waterfall_begin, nonUniformIndex32Bit->getType(),
{waterfallBegin, nonUniformIndex32Bit}, nullptr, instName);
}

// Save Builder's insert point
IRBuilder<>::InsertPointGuard guard(*this);

Value *waterfallBegin;
if (scalarizeDescriptorLoads) {
SetInsertPoint(nonUniformInst);
auto descTy = firstIndexInst->getType();
// Create waterfall.begin and waterfall.readfirstlane intrinsics.
waterfallBegin = ConstantInt::get(getInt32Ty(), 0);
waterfallBegin =
CreateIntrinsic(Intrinsic::amdgcn_waterfall_begin, descTy, {waterfallBegin, firstIndexInst}, nullptr, instName);

// Scalarize shared index.
Value *readFirstLane = CreateIntrinsic(Intrinsic::amdgcn_waterfall_readfirstlane, {descTy, descTy},
{waterfallBegin, firstIndexInst}, nullptr, instName);

for (auto *nonUniformVal : nonUniformIndices) {
// Get the first non uniform instruction of the chain.
auto it1 = newOrigNonUniformVal.find(nonUniformVal);
Value *origNonUniformVal = nonUniformVal;
if (it1 != newOrigNonUniformVal.end())
origNonUniformVal = it1->second;

auto [nonUniformImageCallOperand, operandIdx] = nonUniformIndexImageCallOperand[origNonUniformVal];

if (origNonUniformVal == nonUniformImageCallOperand)
continue;
for (unsigned operandIdx : operandIdxs) {
Value *nonUniformImageCallOperand = nonUniformInst->getOperand(operandIdx);
auto nonUniformImageCallOperandTy = nonUniformImageCallOperand->getType();
Value *nonUniformIndex = operandIdxnonUniformIndex[operandIdx];
const DenseMap<Value *, TinyInstructionSet> &instrDeps = traceNonUniformIndex.getInstrDeps();
auto itDep = instrDeps.find(nonUniformIndex);

if (scalarizeDescriptorLoads && nonUniformIndex != nonUniformImageCallOperand && itDep != instrDeps.end()) {
Value *nonUniformIndex32Bit =
nonUniformIndex->getType()->isIntegerTy(64) ? nonUniformIndex32BitVal[nonUniformIndex] : nonUniformIndex;

if (!sharedIndex) {
auto nonUniformIndex32BitTy = nonUniformIndex32Bit->getType();
readFirstLane =
CreateIntrinsic(Intrinsic::amdgcn_waterfall_readfirstlane, {nonUniformIndex32BitTy, nonUniformIndex32BitTy},
{waterfallBegin, nonUniformIndex32Bit}, nullptr, instName);
}

// Get the instruction chain of the first non uniform instruction.
const DenseMap<Value *, TinyInstructionSet> &instrDeps = traceNonUniformIndex.getInstrDeps();
auto it2 = instrDeps.find(origNonUniformVal);
assert(it2 != instrDeps.end() && "The non-uniform index should be in instrDep map.");
auto &instrsToClone = it2->second;
auto &instrsToClone = itDep->second;
assert(!instrsToClone.empty() && "There are not any instructions to clone.");

// Clone and emit the instructions that we want to push inside the waterfall loop.
std::map<Instruction *, Instruction *> origClonedValuesMap;
Instruction *prevInst = nonUniformInst;
const TinyInstructionSet::IndexToInstructionVec &indexToInstr = traceNonUniformIndex.getIndexToInstr();
for (auto it3 = instrsToClone.begin(indexToInstr), ite = instrsToClone.end(indexToInstr); it3 != ite; ++it3) {
auto *origInst = *it3;
for (auto iti = instrsToClone.begin(indexToInstr), ite = instrsToClone.end(indexToInstr); iti != ite; ++iti) {
auto *origInst = *iti;
auto *newInst = origInst->clone();
newInst->insertBefore(prevInst);
origClonedValuesMap[origInst] = newInst;
Expand All @@ -803,8 +788,9 @@ Instruction *BuilderImpl::createWaterfallLoop(Instruction *nonUniformInst, Array
nonUniformInst->setOperand(operandIdx, newInst);
}
}

// Finally, clone the first non uniform instruction.
auto *origInst = cast<Instruction>(origNonUniformVal);
auto *origInst = cast<Instruction>(nonUniformIndex);
auto *newInst = origInst->clone();
newInst->insertBefore(prevInst);
origClonedValuesMap[origInst] = newInst;
Expand All @@ -826,48 +812,36 @@ Instruction *BuilderImpl::createWaterfallLoop(Instruction *nonUniformInst, Array
// Replace all references to shared index within the waterfall loop with scalarized index.
// (Note: this includes the non-uniform instruction itself.)
// Loads using scalarized index will become scalar loads.
nonUniformVal->replaceUsesWithIf(readFirstLane, [readFirstLane, waterfallBegin, nonUniformInst](Use &U) {
nonUniformIndex32Bit->replaceUsesWithIf(readFirstLane, [readFirstLane, waterfallBegin, nonUniformInst](Use &U) {
Instruction *userInst = cast<Instruction>(U.getUser());
return userInst != waterfallBegin && userInst != readFirstLane &&
userInst->getParent() == nonUniformInst->getParent() &&
(userInst == nonUniformInst || userInst->comesBefore(nonUniformInst)) &&
!userInst->comesBefore(cast<Instruction>(waterfallBegin));
});
}
} else {
// Insert new code just before nonUniformInst.
SetInsertPoint(nonUniformInst);

// The first begin contains a null token for the previous token argument
waterfallBegin = ConstantInt::get(getInt32Ty(), 0);
for (auto nonUniformVal : nonUniformIndices) {
// Start the waterfall loop using the waterfall index.
waterfallBegin = CreateIntrinsic(Intrinsic::amdgcn_waterfall_begin, nonUniformVal->getType(),
{waterfallBegin, nonUniformVal}, nullptr, instName);
}

// Scalarize each non-uniform operand of the instruction.
for (unsigned operandIdx : operandIdxs) {
Value *desc = nonUniformInst->getOperand(operandIdx);
auto descTy = desc->getType();
} else {
Value *desc = nonUniformImageCallOperand;
#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 463892
// Old version of the code
#else
// When the non-uniform use is in a VGPR, we can save a v_mov by not inserting the amdgcn_waterfall_readfirstlane
if (!useVgprForOperands)
#endif
desc = CreateIntrinsic(Intrinsic::amdgcn_waterfall_readfirstlane, {descTy, descTy}, {waterfallBegin, desc},
desc = CreateIntrinsic(Intrinsic::amdgcn_waterfall_readfirstlane,
{nonUniformImageCallOperandTy, nonUniformImageCallOperandTy}, {waterfallBegin, desc},
nullptr, instName);
if (nonUniformInst->getType()->isVoidTy()) {
// The buffer/image operation we are waterfalling is a store with no return value. Use
// llvm.amdgcn.waterfall.last.use on the descriptor.
#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 463892
// Old version of the code
desc = CreateIntrinsic(Intrinsic::amdgcn_waterfall_last_use, descTy, {waterfallBegin, desc}, nullptr, instName);
desc = CreateIntrinsic(Intrinsic::amdgcn_waterfall_last_use, nonUniformImageCallOperandTy,
{waterfallBegin, desc}, nullptr, instName);
#else
desc = CreateIntrinsic(useVgprForOperands ? Intrinsic::amdgcn_waterfall_last_use_vgpr
: Intrinsic::amdgcn_waterfall_last_use,
descTy, {waterfallBegin, desc}, nullptr, instName);
nonUniformImageCallOperandTy, {waterfallBegin, desc}, nullptr, instName);
#endif
}
// Replace the descriptor operand in the buffer/image operation.
Expand Down
11 changes: 7 additions & 4 deletions lgc/test/scalarizationOfDescriptorLoadsTest3.ll
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,12 @@ attributes #2 = { nounwind memory(none) }
; CHECK-NEXT: [[TMP22:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP21]], align 32, !invariant.load !24
; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP19]])
; CHECK-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.v4i32(i32 [[TMP23]], <4 x i32> <i32 42, i32 42, i32 42, i32 42>)
; CHECK-NEXT: [[TMP25:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 [[TMP24]], <8 x i32> [[TMP22]])
; CHECK-NEXT: [[TMP26:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 [[TMP24]], <4 x i32> <i32 42, i32 42, i32 42, i32 42>)
; CHECK-NEXT: [[TMP27:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP25]], <4 x i32> [[TMP26]], i1 false, i32 0, i32 0)
; CHECK-NEXT: [[TMP28:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP24]], <4 x float> [[TMP27]])
; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP24]], i32 [[TMP19]])
; CHECK-NEXT: [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP18]], i64 [[TMP26]]
; CHECK-NEXT: [[TMP28:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP27]], align 32, !invariant.load !24
; CHECK-NEXT: [[TMP29:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 [[TMP24]], <4 x i32> <i32 42, i32 42, i32 42, i32 42>)
; CHECK-NEXT: [[TMP30:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP28]], <4 x i32> [[TMP29]], i1 false, i32 0, i32 0)
; CHECK-NEXT: [[TMP31:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP24]], <4 x float> [[TMP30]])
; CHECK-NEXT: ret void
;
11 changes: 7 additions & 4 deletions lgc/test/scalarizationOfDescriptorLoadsTest4.ll
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,12 @@ attributes #2 = { nounwind memory(none) }
; CHECK-NEXT: [[TMP23:%.*]] = call <4 x i32> @foo1(<4 x i32> [[TMP12]])
; CHECK-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP19]])
; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.v4i32(i32 [[TMP24]], <4 x i32> [[TMP23]])
; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 [[TMP25]], <8 x i32> [[TMP22]])
; CHECK-NEXT: [[TMP27:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 [[TMP25]], <4 x i32> [[TMP23]])
; CHECK-NEXT: [[TMP28:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP26]], <4 x i32> [[TMP27]], i1 false, i32 0, i32 0)
; CHECK-NEXT: [[TMP29:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP25]], <4 x float> [[TMP28]])
; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP25]], i32 [[TMP19]])
; CHECK-NEXT: [[TMP27:%.*]] = sext i32 [[TMP26]] to i64
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP18]], i64 [[TMP27]]
; CHECK-NEXT: [[TMP29:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP28]], align 32, !invariant.load !24
; CHECK-NEXT: [[TMP30:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 [[TMP25]], <4 x i32> [[TMP23]])
; CHECK-NEXT: [[TMP31:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP29]], <4 x i32> [[TMP30]], i1 false, i32 0, i32 0)
; CHECK-NEXT: [[TMP32:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP25]], <4 x float> [[TMP31]])
; CHECK-NEXT: ret void
;
Loading

0 comments on commit aaa28a5

Please sign in to comment.