From a5df20e570cafbe91be3d4df84764c04229728ab Mon Sep 17 00:00:00 2001
From: Konstantina Mitropoulou
Date: Thu, 9 Nov 2023 23:41:32 -0800
Subject: [PATCH] [WIP][LLPC] Scalarize non-uniform loads inside the waterfall
 loop

---
 include/vkgcDefs.h                            |   3 +-
 lgc/builder/BuilderImpl.cpp                   | 287 ++++++++++++++++--
 llpc/context/llpcPipelineContext.cpp          |  11 +-
 ...peSampledImage_TestWaterfallInsertion.frag |  34 ++-
 ...peSampledImage_TestWaterfallScalarize.frag |  70 +++--
 ...age_TestWaterfallScalarize_MultiBlock.frag | 155 ++++++++--
 ...age_TestWaterfallScalarize_SharedDesc.frag | 111 +++++--
 tool/dumper/vkgcPipelineDumper.cpp            |   3 +-
 8 files changed, 569 insertions(+), 105 deletions(-)

diff --git a/include/vkgcDefs.h b/include/vkgcDefs.h
index 2d8ff2159e..c88357a396 100644
--- a/include/vkgcDefs.h
+++ b/include/vkgcDefs.h
@@ -241,6 +241,7 @@ struct optional_bool : private std::optional<bool> {
   using std::optional<bool>::has_value;
   using std::optional<bool>::value;
   using std::optional<bool>::value_or;
+  using std::optional<bool>::operator*;
 };
 
 /// Enumerates result codes of LLPC operations.
@@ -873,7 +874,7 @@ struct PipelineShaderOptions {
   unsigned ldsSpillLimitDwords;
 
   /// Attempt to scalarize waterfall descriptor loads.
-  bool scalarizeWaterfallLoads;
+  optional_bool scalarizeWaterfallLoads;
   /// Force rearranges threadId within group into blocks of 8*8 or 8*4
   bool overrideForceThreadIdSwizzling;
 
diff --git a/lgc/builder/BuilderImpl.cpp b/lgc/builder/BuilderImpl.cpp
index 2a3197be06..ab892cd96a 100644
--- a/lgc/builder/BuilderImpl.cpp
+++ b/lgc/builder/BuilderImpl.cpp
@@ -33,6 +33,7 @@
 #include "lgc/LgcDialect.h"
 #include "lgc/state/PipelineState.h"
 #include "lgc/state/TargetInfo.h"
+#include "llvm/ADT/BitVector.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 
@@ -334,6 +335,75 @@ BranchInst *BuilderImpl::createIf(Value *condition, bool wantElse, const Twine &
   return branch;
 }
 
+// A simple memory-efficient container that holds up to 64 instructions in a bit vector. It needs two helper data
+// structures: 1. instrToIndex, which maps an instruction to its index in the bit vector, and 2. indexToInstr, which
+// maps an index back to an instruction.
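// The sketch below is an aside for the reader, not part of the patch: a minimal, self-contained illustration of the
// same idea (a fixed-size membership bit set addressed through an instruction-to-index map plus an index-to-
// instruction vector), using std::bitset and a plain struct in place of llvm::BitVector and llvm::Instruction.
// All names in it are illustrative only. The payoff of this layout is that each per-value dependency set is just a
// small bit mask once the two shared side tables exist.
#include <bitset>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

struct FakeInstr {
  std::string text; // stand-in for llvm::Instruction
};

int main() {
  std::vector<FakeInstr *> indexToInstr;                  // index -> instruction (shared)
  std::unordered_map<FakeInstr *, unsigned> instrToIndex; // instruction -> index (shared)

  FakeInstr a{"%a = add i32 ..."}, b{"%b = mul i32 ..."};
  for (FakeInstr *instr : {&a, &b}) {
    instrToIndex[instr] = static_cast<unsigned>(indexToInstr.size());
    indexToInstr.push_back(instr);
  }

  std::bitset<64> bits; // one membership bit per tracked instruction
  bits.set(instrToIndex.at(&b));

  for (unsigned index = 0; index < indexToInstr.size(); ++index)
    if (bits.test(index))
      std::cout << indexToInstr[index]->text << "\n"; // prints only the %b line
  return 0;
}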
+class TinyInstructionSet {
+public:
+  using IndexToInstructionVec = SmallVector<Instruction *>;
+  using InstrToIndexMap = DenseMap<Instruction *, unsigned>;
+
+private:
+  BitVector bits;
+  const InstrToIndexMap &instrToIndex;
+  const IndexToInstructionVec &indexToInstr;
+
+public:
+  TinyInstructionSet(const InstrToIndexMap &instrToIndex, const IndexToInstructionVec &indexToInstr)
+      : instrToIndex(instrToIndex), indexToInstr(indexToInstr) {
+    bits.resize(64);
+  }
+
+  class iterator {
+    BitVector::const_set_bits_iterator it;
+    const IndexToInstructionVec &indexToInstr;
+
+  public:
+    iterator(BitVector::const_set_bits_iterator it, const IndexToInstructionVec &indexToInstr)
+        : it(it), indexToInstr(indexToInstr) {}
+    iterator &operator++() {
+      ++it;
+      return *this;
+    }
+
+    Instruction *operator*() {
+      unsigned index = *it;
+      assert(index < indexToInstr.size() && "Index out of range.");
+      return indexToInstr[index];
+    }
+
+    bool operator!=(const iterator &otherIt) {
+      assert(&otherIt.indexToInstr == &indexToInstr && "Iterators of different objects.");
+      return otherIt.it != it;
+    }
+  };
+
+  iterator begin() const { return iterator(bits.set_bits_begin(), indexToInstr); }
+
+  iterator end() const { return iterator(bits.set_bits_end(), indexToInstr); }
+
+  void insert(Instruction *instr) {
+    auto it = instrToIndex.find(instr);
+    assert(it != instrToIndex.end() && "Expected to find instr in instrToIndex.");
+    unsigned index = it->second;
+    bits.set(index);
+  }
+
+  // Note: size() and empty() report the number of set bits, not the capacity of the bit vector.
+  unsigned size() const { return bits.count(); }
+
+  bool empty() const { return bits.none(); }
+
+  LLVM_DUMP_METHOD void dump();
+};
+
+void TinyInstructionSet::dump() {
+  errs() << "Dependencies:\n";
+  for (unsigned Index : make_range(bits.set_bits_begin(), bits.set_bits_end())) {
+    auto *I = indexToInstr[Index];
+    errs() << *I << "\n";
+  }
+}
+
 #if defined(LLVM_HAVE_BRANCH_AMD_GFX)
 // =====================================================================================================================
 // For a non-uniform input, try and trace back through a descriptor load to
@@ -345,17 +415,47 @@ BranchInst *BuilderImpl::createIf(Value *condition, bool wantElse, const Twine &
 // This uses a fairly simple heuristic that nevertheless allows temporary expansion of the search breadth to handle
 // the common case where a base pointer is assembled from separate high and low halves.
 //
+// In case of scalarization, it fills the insnDeps map by using insertNewValueInInsnDeps().
+//
 // @param nonUniformVal : Value representing non-uniform descriptor
+// @param insnDeps : Maps an instruction to its dependencies
+// @param instrToIndex : Maps an instruction to its index in the bit vector
+// @param indexToInstr : Maps a bit vector index to an instruction
+// @param scalarizeDescriptorLoads : Indicates whether scalarization is enabled
 // @return : Value representing the non-uniform index, or null if nonUniformVal could be proven to be uniform
-static Value *traceNonUniformIndex(Value *nonUniformVal) {
+static Value *traceNonUniformIndex(Value *nonUniformVal, DenseMap<Value *, TinyInstructionSet> &insnDeps,
+                                   TinyInstructionSet::InstrToIndexMap &instrToIndex,
+                                   TinyInstructionSet::IndexToInstructionVec &indexToInstr,
+                                   bool &scalarizeDescriptorLoads) {
+
+  auto insertNewValueInInsnDeps = [&insnDeps, &instrToIndex, &indexToInstr,
+                                   &scalarizeDescriptorLoads](Value *newValue, Instruction *currentVisitedInsn) {
+    if (!instrToIndex.contains(currentVisitedInsn)) {
+      // The instruction is either outside the 64-instruction window or in a different basic block, so we bail out of
+      // scalarization.
+ scalarizeDescriptorLoads = false; + return; + } + assert(insnDeps.contains(currentVisitedInsn)); + auto it = insnDeps.try_emplace(newValue, instrToIndex, indexToInstr).first; + auto &setOfInsns = it->second; + auto itc = insnDeps.find(currentVisitedInsn); + for (auto *instr : itc->second) + setOfInsns.insert(instr); + setOfInsns.insert(currentVisitedInsn); + }; + auto load = dyn_cast(nonUniformVal); - if (!load) { + if (scalarizeDescriptorLoads && load) { + insnDeps.try_emplace(load, instrToIndex, indexToInstr); + } else if (!load) { // Workarounds that modify image descriptor can be peeped through, i.e. // %baseValue = load <8 x i32>, <8 x i32> addrspace(4)* %..., align 16 // %rawElement = extractelement <8 x i32> %baseValue, i64 6 // %updatedElement = and i32 %rawElement, -1048577 // %nonUniform = insertelement <8 x i32> %baseValue, i32 %updatedElement, i64 6 auto insert = dyn_cast(nonUniformVal); + if (scalarizeDescriptorLoads) + insnDeps.try_emplace(insert, instrToIndex, indexToInstr); if (!insert) return nonUniformVal; @@ -366,9 +466,15 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) { // We found the load, but must verify the chain. // Consider updatedElement as a generic instruction or constant. if (auto updatedElement = dyn_cast(insert->getOperand(1))) { + if (scalarizeDescriptorLoads) + insertNewValueInInsnDeps(updatedElement, insert); for (Value *operand : updatedElement->operands()) { if (auto extract = dyn_cast(operand)) { // Only dynamic value must be ExtractElementInst based on load. + if (scalarizeDescriptorLoads) { + insertNewValueInInsnDeps(extract, updatedElement); + insertNewValueInInsnDeps(load, extract); + } if (dyn_cast(extract->getOperand(0)) != load) return nonUniformVal; } else if (!isa(operand)) { @@ -418,12 +524,22 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) { if (current->isCast() || current->isUnaryOp()) { if (!propagate(current->getOperand(0))) return nonUniformVal; + + if (scalarizeDescriptorLoads) + insertNewValueInInsnDeps(current->getOperand(0), current); + continue; } if (current->isBinaryOp()) { if (!propagate(current->getOperand(0)) || !propagate(current->getOperand(1))) return nonUniformVal; + + if (scalarizeDescriptorLoads) { + insertNewValueInInsnDeps(current->getOperand(0), current); + insertNewValueInInsnDeps(current->getOperand(1), current); + } + continue; } @@ -435,13 +551,22 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) { if (!propagate(ptr)) return nonUniformVal; + + if (scalarizeDescriptorLoads) + insertNewValueInInsnDeps(ptr, current); + continue; } if (auto gep = dyn_cast(current)) { if (gep->hasAllConstantIndices()) { + if (!propagate(gep->getPointerOperand())) return nonUniformVal; + + if (scalarizeDescriptorLoads) + insertNewValueInInsnDeps(gep->getPointerOperand(), current); + continue; } @@ -455,28 +580,57 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) { candidateIndex = *gep->idx_begin(); if (getSize(candidateIndex) > nonUniformValSize) return nonUniformVal; // propagating further is worthless + + if (scalarizeDescriptorLoads) { + insertNewValueInInsnDeps(gep->getPointerOperand(), current); + insertNewValueInInsnDeps(candidateIndex, current); + } + continue; } if (auto extract = dyn_cast(current)) { if (!propagate(extract->getAggregateOperand())) return nonUniformVal; + + if (scalarizeDescriptorLoads) + insertNewValueInInsnDeps(extract->getAggregateOperand(), current); + continue; } if (auto insert = dyn_cast(current)) { if (!propagate(insert->getAggregateOperand()) || 
!propagate(insert->getInsertedValueOperand())) return nonUniformVal; + + if (scalarizeDescriptorLoads) { + insertNewValueInInsnDeps(insert->getAggregateOperand(), current); + insertNewValueInInsnDeps(insert->getInsertedValueOperand(), current); + } + continue; } if (auto extract = dyn_cast(current)) { if (!isa(extract->getIndexOperand()) || !propagate(extract->getVectorOperand())) return nonUniformVal; + + if (scalarizeDescriptorLoads) { + insertNewValueInInsnDeps(extract->getIndexOperand(), current); + insertNewValueInInsnDeps(extract->getVectorOperand(), current); + } + continue; } if (auto insert = dyn_cast(current)) { if (!isa(insert->getOperand(2)) || !propagate(insert->getOperand(0)) || !propagate(insert->getOperand(1))) return nonUniformVal; + + if (scalarizeDescriptorLoads) { + insertNewValueInInsnDeps(insert->getOperand(0), current); + insertNewValueInInsnDeps(insert->getOperand(1), current); + insertNewValueInInsnDeps(insert->getOperand(2), current); + } + continue; } @@ -538,6 +692,12 @@ static bool instructionsEqual(Instruction *lhs, Instruction *rhs) { // Create a waterfall loop containing the specified instruction. // This does not use the current insert point; new code is inserted before and after nonUniformInst. // +// For scalarization we need to collect all the instructions that need to be moved inside the loop. This is done by +// traceNonUniformIndex() which traverses all use-def predecessors of nonUniformInst. At the same time it adds these +// instructions to insnDeps map. Once traceNonUniformIndex() completes, we use the returned value as a key to the +// insnDeps map to get the dependencies. These dependencies are the instructions that will be cloned and moved inside +// the waterfall loop. +// // @param nonUniformInst : The instruction to put in a waterfall loop // @param operandIdxs : The operand index/indices for non-uniform inputs that need to be uniform // @param scalarizeDescriptorLoads : Attempt to scalarize descriptor loads @@ -554,24 +714,59 @@ Instruction *BuilderImpl::createWaterfallLoop(Instruction *nonUniformInst, Array assert(operandIdxs.empty() == false); SmallVector nonUniformIndices; + // Maps the instruction to its index in the bit vector. + TinyInstructionSet::InstrToIndexMap instrToIndex; + // The instructions used as keys in instrToIndex in program order. It is used to map an index to an instruction. + TinyInstructionSet::IndexToInstructionVec indexToInstr; + // Maps an instruction to its dependencies. + DenseMap insnDeps; + // Maps the nonUniformIndex that is returned by traceNonUniformIndex() to the nonUniformInst. + DenseMap> nonUniformIndexImageCallOperand; + + // Initialization of instrToIndex and indexToInstr. 
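+  // Note: only the (up to) 64 instructions immediately preceding nonUniformInst, within the same basic block, get an
+  // index here; if traceNonUniformIndex() reaches an instruction outside this window, it switches scalarization off.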
+ if (scalarizeDescriptorLoads) { + unsigned cnt = 0; + for (Instruction *I = nonUniformInst->getPrevNode(); I != nullptr && cnt < 64; I = I->getPrevNode(), ++cnt) { + indexToInstr.push_back(I); + instrToIndex[I] = cnt; + } + } + for (unsigned operandIdx : operandIdxs) { - Value *nonUniformIndex = traceNonUniformIndex(nonUniformInst->getOperand(operandIdx)); - if (nonUniformIndex) + Value *nonUniformImageCallOperand = nonUniformInst->getOperand(operandIdx); + Value *nonUniformIndex = + traceNonUniformIndex(nonUniformImageCallOperand, insnDeps, instrToIndex, indexToInstr, scalarizeDescriptorLoads); + + if (nonUniformIndex) { nonUniformIndices.push_back(nonUniformIndex); + + if (scalarizeDescriptorLoads) + nonUniformIndexImageCallOperand[nonUniformIndex] = std::make_pair(nonUniformImageCallOperand, operandIdx); + } } + if (nonUniformIndices.empty()) return nonUniformInst; - // For any index that is 64 bit, change it back to 32 bit for comparison at the top of the - // waterfall loop. + // We do not apply scalarization for store instrinsics e.g. llvm.amdgcn.struct.buffer.store.format.v4f32(). + if (nonUniformInst->getType()->isVoidTy()) + scalarizeDescriptorLoads = false; + + // For any index that is 64 bit, change it back to 32 bit for comparison at the top of the waterfall loop. + // At this point the nonUniformVal of nonUniformIndices might change. We also need the original non uniform values for + // the scalarization of the descriptor loads. + DenseMap newOrigNonUniformVal; for (Value *&nonUniformVal : nonUniformIndices) { if (nonUniformVal->getType()->isIntegerTy(64)) { auto sExt = dyn_cast(nonUniformVal); + Value *origNonUniformVal = nonUniformVal; // 64-bit index may already be formed from extension of 32-bit value. if (sExt && sExt->getOperand(0)->getType()->isIntegerTy(32)) { nonUniformVal = sExt->getOperand(0); + newOrigNonUniformVal[nonUniformVal] = origNonUniformVal; } else { nonUniformVal = CreateTrunc(nonUniformVal, getInt32Ty()); + newOrigNonUniformVal[nonUniformVal] = origNonUniformVal; } } } @@ -607,34 +802,72 @@ Instruction *BuilderImpl::createWaterfallLoop(Instruction *nonUniformInst, Array Value *waterfallBegin; if (scalarizeDescriptorLoads) { - // Attempt to scalarize descriptor loads. - assert(firstIndexInst); - CallInst *firstCallInst = dyn_cast(firstIndexInst); - if (firstCallInst && firstCallInst->getIntrinsicID() == Intrinsic::amdgcn_waterfall_readfirstlane) { - // Descriptor loads are already inside a waterfall. - waterfallBegin = firstCallInst->getArgOperand(0); - } else { - // Begin waterfall loop just after shared index is computed. - // This places all dependent instructions within the waterfall loop, including descriptor loads. - auto descTy = firstIndexInst->getType(); - SetInsertPoint(firstIndexInst->getNextNonDebugInstruction(false)); - waterfallBegin = ConstantInt::get(getInt32Ty(), 0); - waterfallBegin = CreateIntrinsic(Intrinsic::amdgcn_waterfall_begin, descTy, {waterfallBegin, firstIndexInst}, - nullptr, instName); - - // Scalarize shared index. - Value *desc = CreateIntrinsic(Intrinsic::amdgcn_waterfall_readfirstlane, {descTy, descTy}, - {waterfallBegin, firstIndexInst}, nullptr, instName); + SetInsertPoint(nonUniformInst); + auto descTy = firstIndexInst->getType(); + // Create waterfall.begin and waterfall.readfirstlane intrinsics. + waterfallBegin = ConstantInt::get(getInt32Ty(), 0); + waterfallBegin = + CreateIntrinsic(Intrinsic::amdgcn_waterfall_begin, descTy, {waterfallBegin, firstIndexInst}, nullptr, instName); + + // Scalarize shared index. 
+ Value *readFirstLane = CreateIntrinsic(Intrinsic::amdgcn_waterfall_readfirstlane, {descTy, descTy}, + {waterfallBegin, firstIndexInst}, nullptr, instName); + for (auto *nonUniformVal : nonUniformIndices) { + // Get the first non uniform instruction of the chain. + auto it1 = newOrigNonUniformVal.find(nonUniformVal); + Value *origNonUniformVal = nonUniformVal; + if (it1 != newOrigNonUniformVal.end()) + origNonUniformVal = it1->second; + + // Get the instruction chain of the first non uniform instruction. + auto it2 = insnDeps.find(origNonUniformVal); + assert(it2 != insnDeps.end() && "The non-uniform index should be in insnDep map."); + auto &insnsToClone = it2->second; + assert(!insnsToClone.empty() && "There are not any instructions to clone."); + auto [nonUniformImageCallOperand, operandIdx] = nonUniformIndexImageCallOperand[origNonUniformVal]; + + // Clone and emit the instructions that we want to push inside the waterfall loop. + std::map origClonedValuesMap; + Instruction *prevInst = nonUniformInst; + for (auto *origInst : insnsToClone) { + auto *newInst = origInst->clone(); + // Update the non-uniform operand of the image call with the new non-uniform operand. + if (nonUniformImageCallOperand == origInst) + nonUniformInst->setOperand(operandIdx, newInst); + newInst->insertBefore(prevInst); + origClonedValuesMap[origInst] = newInst; + prevInst = newInst; + } + // Finally, clone the first non uniform instruction. + auto *origInst = cast(origNonUniformVal); + auto *newInst = origInst->clone(); + newInst->insertBefore(prevInst); + origClonedValuesMap[origInst] = newInst; + + // Update the operands of the cloned instructions. + for (auto [origInst, newInst] : origClonedValuesMap) { + for (Use &use : newInst->operands()) { + Value *op = use.get(); + if (auto *opI = dyn_cast(op)) { + auto it = origClonedValuesMap.find(opI); + if (it == origClonedValuesMap.end()) + continue; + Instruction *clonedI = it->second; + use.set(clonedI); + } + } + } // Replace all references to shared index within the waterfall loop with scalarized index. // (Note: this includes the non-uniform instruction itself.) // Loads using scalarized index will become scalar loads. 
for (Value *otherNonUniformVal : nonUniformIndices) { - otherNonUniformVal->replaceUsesWithIf(desc, [desc, waterfallBegin, nonUniformInst](Use &U) { + otherNonUniformVal->replaceUsesWithIf(readFirstLane, [readFirstLane, waterfallBegin, nonUniformInst](Use &U) { Instruction *userInst = cast(U.getUser()); - return U.getUser() != waterfallBegin && U.getUser() != desc && + return userInst != waterfallBegin && userInst != readFirstLane && userInst->getParent() == nonUniformInst->getParent() && - (userInst == nonUniformInst || userInst->comesBefore(nonUniformInst)); + (userInst == nonUniformInst || userInst->comesBefore(nonUniformInst)) && + !userInst->comesBefore(cast(waterfallBegin)); }); } } diff --git a/llpc/context/llpcPipelineContext.cpp b/llpc/context/llpcPipelineContext.cpp index cf1105a45c..c21589412d 100644 --- a/llpc/context/llpcPipelineContext.cpp +++ b/llpc/context/llpcPipelineContext.cpp @@ -612,13 +612,12 @@ ShaderOptions PipelineContext::computeShaderOptions(const PipelineShaderInfo &sh } } - if (ScalarizeWaterfallDescriptorLoads.getNumOccurrences() > 0) { + if (ScalarizeWaterfallDescriptorLoads.getNumOccurrences() > 0) shaderOptions.scalarizeWaterfallLoads = ScalarizeWaterfallDescriptorLoads; - } else { - shaderOptions.scalarizeWaterfallLoads = shaderInfo.options.scalarizeWaterfallLoads; - // Enable waterfall load scalarization when vgpr limit is set. - if (shaderOptions.vgprLimit != 0 && shaderOptions.vgprLimit != UINT_MAX) - shaderOptions.scalarizeWaterfallLoads = true; + else { + shaderOptions.scalarizeWaterfallLoads = true; + if (shaderInfo.options.scalarizeWaterfallLoads.has_value()) + shaderOptions.scalarizeWaterfallLoads = *shaderInfo.options.scalarizeWaterfallLoads; } shaderOptions.sgprLimit = shaderInfo.options.sgprLimit; diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag index 6845f3f011..74975a3767 100644 --- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag +++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag @@ -18,16 +18,24 @@ void main() _3 = texture(_11[nonuniformEXT(_12)], vec2(0.0)); } -// BEGIN_SHADERTEST -/* -; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s -; Make sure that the begin indices chosen are the non-uniform offsets rather than the whole resource desc -; Make sure that there's a waterfall.readfirstlane for both the image resource desc and sample desc -; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -; SHADERTEST-DAG: call i32 @llvm.amdgcn.waterfall.begin.i32 -; SHADERTEST-DAG: call i32 @llvm.amdgcn.waterfall.begin.i32 -; SHADERTEST-DAG: call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32 -; SHADERTEST-DAG: call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32 -; SHADERTEST: AMDLLPC SUCCESS -*/ -// END_SHADERTEST +// RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +// Make sure that the begin indices chosen are the non-uniform offsets rather than the whole resource desc +// Make sure that there's a waterfall.readfirstlane for both the image resource desc and sample desc +// SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST: %[[mul:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-NEXT: %[[begin:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul]]) +// SHADERTEST-NEXT: %[[readfirstlane:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin]], i32 %[[mul]]) +// 
SHADERTEST-NEXT: %[[sext:[0-9]+]] = sext i32 %[[readfirstlane]] to i64 +// SHADERTEST-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 3 +// SHADERTEST-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], 268435455 +// SHADERTEST-NEXT: %[[cmp:[0-9]+]] = icmp slt i32 %[[extract]], 0 +// SHADERTEST-NEXT: %[[select:[0-9]+]] = select i1 %[[cmp]], i32 %[[extract]], i32 %[[and]] +// SHADERTEST-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[select]], i64 3 +// SHADERTEST-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x i32> +// SHADERTEST-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half 0xH0000, half 0xH0000, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]]) +// SHADERTEST: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag index fbf9c25c0f..e51942ba95 100644 --- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag +++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag @@ -1,6 +1,3 @@ -// Make sure that there is a single begin index -// Make sure that there is a single waterfall.readfirstlane for the offset - #version 450 #extension GL_EXT_nonuniform_qualifier : require @@ -16,18 +13,57 @@ void main() _3 = texture(_11[nonuniformEXT(_12)], _6); } -// BEGIN_SHADERTEST -// -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=GFX %s // Explicitly check GFX10.3 ASIC variants: -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST %s -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST %s -// SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST: AMDLLPC SUCCESS -// -// END_SHADERTEST +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=GFX_10_3_0 %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=GFX_10_3_2 %s + +// GFX-LABEL: {{^// LLPC}} pipeline patching results +// GFX: %[[mul:[0-9]+]] = mul i32 %{{.*}}, 48 +// GFX-NEXT: %[[begin:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul]]) +// GFX-NEXT: %[[readfirstlane:[0-9]+]] = call i32 
@llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin]], i32 %[[mul]]) +// GFX-NEXT: %[[sext:[0-9]+]] = sext i32 %[[readfirstlane]] to i64 +// GFX-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// GFX-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// GFX-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 3 +// GFX-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], 268435455 +// GFX-NEXT: %[[cmp:[0-9]+]] = icmp slt i32 %[[extract]], 0 +// GFX-NEXT: %[[select:[0-9]+]] = select i1 %[[cmp]], i32 %[[extract]], i32 %[[and]] +// GFX-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[select]], i64 3 +// GFX-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x i32> +// GFX-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// GFX-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// GFX-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// GFX-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]]) +// GFX: AMDLLPC SUCCESS + +// GFX_10_3_0-LABEL: {{^// LLPC}} pipeline patching results +// GFX_10_3_0: %[[mul:[0-9]+]] = mul i32 %{{.*}}, 48 +// GFX_10_3_0-NEXT: %[[begin:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul]]) +// GFX_10_3_0-NEXT: %[[readfirstlane:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin]], i32 %[[mul]]) +// GFX_10_3_0-NEXT: %[[sext:[0-9]+]] = sext i32 %[[readfirstlane]] to i64 +// GFX_10_3_0-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// GFX_10_3_0-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// GFX_10_3_0-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// GFX_10_3_0-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// GFX_10_3_0-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// GFX_10_3_0-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]]) +// GFX_10_3_0: AMDLLPC SUCCESS + + +// GFX_10_3_2-LABEL: {{^// LLPC}} pipeline patching results +// GFX_10_3_2: %[[mul:[0-9]+]] = mul i32 %{{.*}}, 48 +// GFX_10_3_2-NEXT: %[[begin:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul]]) +// GFX_10_3_2-NEXT: %[[readfirstlane:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin]], i32 %[[mul]]) +// GFX_10_3_2-NEXT: %[[sext:[0-9]+]] = sext i32 %[[readfirstlane]] to i64 +// GFX_10_3_2-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// GFX_10_3_2-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// GFX_10_3_2-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 6 +// GFX_10_3_2-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], -1048577 +// GFX_10_3_2-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], 
i32 %[[and]], i64 6 +// GFX_10_3_2-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x i32> +// GFX_10_3_2-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// GFX_10_3_2-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// GFX_10_3_2-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// GFX_10_3_2-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]]) +// GFX_10_3_2: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag index 82cd87a930..8cb70707e4 100644 --- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag +++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag @@ -1,5 +1,5 @@ // Make sure that there are two non-overlapping waterfall loops -// First is scalarized and second is vector type +// The first two loops are scalarized and the last one is vector type #version 450 #extension GL_EXT_nonuniform_qualifier : require @@ -25,24 +25,139 @@ void main() _3 = samp0 + samp1; } -// BEGIN_SHADERTEST -// -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=GFX %s // Explicitly check GFX10.3 ASIC variants: -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST %s -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST %s -// SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32 -// SHADERTEST: call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST: AMDLLPC SUCCESS +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=GFX_10_3_0 %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=GFX_10_3_2 %s + +// GFX-LABEL: {{^// LLPC}} pipeline patching results +// GFX: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// GFX-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[mul1]] to i64 +// GFX-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// GFX-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// GFX-NEXT: %[[load1:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// GFX-NEXT: %[[load2:[0-9]+]] = load <8 x 
i32>, ptr addrspace(4) %[[gep1]], align 32 +// GFX-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// GFX-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// GFX-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// GFX-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// GFX-NEXT: %[[extract1:[.a-z0-9]+]] = extractelement <8 x i32> %[[load3]], i64 3 +// GFX-NEXT: %[[and1:[0-9]+]] = and i32 %[[extract1]], 268435455 +// GFX-NEXT: %[[cmp1:[0-9]+]] = icmp slt i32 %[[extract1]], 0 +// GFX-NEXT: %[[select1:[0-9]+]] = select i1 %[[cmp1]], i32 %[[extract1]], i32 %[[and1]] +// GFX-NEXT: %[[insert1:[.a-z0-9]+]] = insertelement <8 x i32> %[[load3]], i32 %[[select1]], i64 3 +// GFX-NEXT: %[[shufflevector1:[0-9]+]] = shufflevector <8 x i32> %[[insert1]], <8 x i32> %[[load3]], <8 x i32> +// GFX-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// GFX-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// GFX-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector1]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// GFX-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) +// +// GFX-NEXT: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// GFX-NEXT: %[[sext3:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// GFX-NEXT: %[[gep5:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// GFX-NEXT: %[[load5:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep5]], align 32 +// GFX-NEXT: %[[extract2:[.a-z0-9]+]] = extractelement <8 x i32> %[[load5]], i64 3 +// GFX-NEXT: %[[and2:[0-9]+]] = and i32 %[[extract2]], 268435455 +// GFX-NEXT: %[[cmp2:[0-9]+]] = icmp slt i32 %[[extract2]], 0 +// GFX-NEXT: %[[select2:[0-9]+]] = select i1 %[[cmp2]], i32 %[[extract2]], i32 %[[and2]] +// GFX-NEXT: %[[insert2:[.a-z0-9]+]] = insertelement <8 x i32> %[[load5]], i32 %[[select2]], i64 3 +// GFX-NEXT: %[[shufflevector2:[0-9]+]] = shufflevector <8 x i32> %[[insert2]], <8 x i32> %[[load5]], <8 x i32> +// GFX-NEXT: %[[gep6:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// GFX-NEXT: %[[load6:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep6]], align 16 +// GFX-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector2]], <4 x i32> %[[load6]], i1 false, i32 0, i32 0) +// GFX-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) + +// GFX: %[[begin3:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX-NEXT: %[[readfirstlane3:[0-9]+]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 %[[begin3]], <8 x i32> %[[load2]]) +// GFX-NEXT: %[[extract3:[.a-z0-9]+]] = extractelement <8 x i32> %[[readfirstlane3]], i64 3 +// 
GFX-NEXT: %[[and3:[0-9]+]] = and i32 %[[extract3]], 268435455 +// GFX-NEXT: %[[cmp3:[0-9]+]] = icmp slt i32 %[[extract3]], 0 +// GFX-NEXT: %[[select3:[0-9]+]] = select i1 %[[cmp3]], i32 %[[extract3]], i32 %[[and3]] +// GFX-NEXT: %[[insert3:[.a-z0-9]+]] = insertelement <8 x i32> %[[readfirstlane3]], i32 %[[select3]], i64 3 +// GFX-NEXT: %[[shufflevector3:[0-9]+]] = shufflevector <8 x i32> %[[insert3]], <8 x i32> %[[readfirstlane3]], <8 x i32> +// GFX-NEXT: %[[readfirstlane4:[0-9]+]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 %[[begin3]], <4 x i32> %[[load1]]) +// GFX-NEXT: [[image_call3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector3]], <4 x i32> %[[readfirstlane4]], i1 false, i32 0, i32 0) +// GFX-NEXT: %[[end3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin3]], <4 x float> %[[image_call3]]) +// GFX: AMDLLPC SUCCESS + +// GFX_10_3_0-LABEL: {{^// LLPC}} pipeline patching results +// GFX_10_3_0: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// GFX_10_3_0-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[mul1]] to i64 +// GFX_10_3_0-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// GFX_10_3_0-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// GFX_10_3_0-NEXT: %[[load1:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// GFX_10_3_0-NEXT: %[[load2:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// GFX_10_3_0-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX_10_3_0-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// GFX_10_3_0-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// GFX_10_3_0-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// GFX_10_3_0-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// GFX_10_3_0-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// GFX_10_3_0-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// GFX_10_3_0-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load3]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// GFX_10_3_0-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) // -// END_SHADERTEST +// GFX_10_3_0-NEXT: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX_10_3_0-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// GFX_10_3_0-NEXT: %[[sext3:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// GFX_10_3_0-NEXT: %[[gep5:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// GFX_10_3_0-NEXT: %[[load5:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep5]], align 32 +// GFX_10_3_0-NEXT: %[[gep6:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// GFX_10_3_0-NEXT: %[[load6:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep6]], align 16 +// GFX_10_3_0-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> 
@llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load5]], <4 x i32> %[[load6]], i1 false, i32 0, i32 0) +// GFX_10_3_0-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) + +// GFX_10_3_0: %[[begin3:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX_10_3_0-NEXT: %[[readfirstlane3:[0-9]+]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 %[[begin3]], <8 x i32> %[[load2]]) +// GFX_10_3_0-NEXT: %[[readfirstlane4:[0-9]+]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 %[[begin3]], <4 x i32> %[[load1]]) +// GFX_10_3_0-NEXT: [[image_call3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[readfirstlane3]], <4 x i32> %[[readfirstlane4]], i1 false, i32 0, i32 0) +// GFX_10_3_0-NEXT: %[[end3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin3]], <4 x float> %[[image_call3]]) +// GFX_10_3_0: AMDLLPC SUCCESS + +// GFX_10_3_2-LABEL: {{^// LLPC}} pipeline patching results +// GFX_10_3_2: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// GFX_10_3_2-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[mul1]] to i64 +// GFX_10_3_2-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// GFX_10_3_2-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// GFX_10_3_2-NEXT: %[[load1:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// GFX_10_3_2-NEXT: %[[load2:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// GFX_10_3_2-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX_10_3_2-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// GFX_10_3_2-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// GFX_10_3_2-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// GFX_10_3_2-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// GFX_10_3_2-NEXT: %[[extract1:[.a-z0-9]+]] = extractelement <8 x i32> %[[load3]], i64 6 +// GFX_10_3_2-NEXT: %[[and1:[0-9]+]] = and i32 %[[extract1]], -1048577 +// GFX_10_3_2-NEXT: %[[insert1:[.a-z0-9]+]] = insertelement <8 x i32> %[[load3]], i32 %[[and1]], i64 6 +// GFX_10_3_2-NEXT: %[[shufflevector1:[0-9]+]] = shufflevector <8 x i32> %[[insert1]], <8 x i32> %[[load3]], <8 x i32> +// GFX_10_3_2-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// GFX_10_3_2-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// GFX_10_3_2-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector1]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// GFX_10_3_2-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) +// +// GFX_10_3_2-NEXT: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX_10_3_2-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// GFX_10_3_2-NEXT: 
%[[sext3:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// GFX_10_3_2-NEXT: %[[gep5:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// GFX_10_3_2-NEXT: %[[load5:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep5]], align 32 +// GFX_10_3_2-NEXT: %[[extract1:[.a-z0-9]+]] = extractelement <8 x i32> %[[load5]], i64 6 +// GFX_10_3_2-NEXT: %[[and1:[0-9]+]] = and i32 %[[extract1]], -1048577 +// GFX_10_3_2-NEXT: %[[insert1:[.a-z0-9]+]] = insertelement <8 x i32> %[[load5]], i32 %[[and1]], i64 6 +// GFX_10_3_2-NEXT: %[[shufflevector1:[0-9]+]] = shufflevector <8 x i32> %[[insert1]], <8 x i32> %[[load5]], <8 x i32> +// GFX_10_3_2-NEXT: %[[gep6:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// GFX_10_3_2-NEXT: %[[load6:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep6]], align 16 +// GFX_10_3_2-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector1]], <4 x i32> %[[load6]], i1 false, i32 0, i32 0) +// GFX_10_3_2-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) + +// GFX_10_3_2: %[[extract2:[.a-z0-9]+]] = extractelement <8 x i32> %[[load2]], i64 6 +// GFX_10_3_2: %[[and2:[0-9]+]] = and i32 %[[extract2]], -1048577 +// GFX_10_3_2-NEXT: %[[insert2:[.a-z0-9]+]] = insertelement <8 x i32> %[[load2]], i32 %[[and2]], i64 6 +// GFX_10_3_2-NEXT: %[[shufflevector2:[0-9]+]] = shufflevector <8 x i32> %[[insert2]], <8 x i32> %[[load2]], <8 x i32> +// GFX_10_3_2: %[[begin3:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX_10_3_2-NEXT: %[[readfirstlane3:[0-9]+]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 %[[begin3]], <8 x i32> %[[shufflevector2]]) +// GFX_10_3_2-NEXT: %[[readfirstlane4:[0-9]+]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 %[[begin3]], <4 x i32> %[[load1]]) +// GFX_10_3_2-NEXT: [[image_call3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[readfirstlane3]], <4 x i32> %[[readfirstlane4]], i1 false, i32 0, i32 0) +// GFX_10_3_2-NEXT: %[[end3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin3]], <4 x float> %[[image_call3]]) +// GFX_10_3_2: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag index 123a2bc917..88e6a371cc 100644 --- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag +++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag @@ -1,7 +1,3 @@ -// Make sure that there is a single begin index -// Make sure that there is a single waterfall.readfirstlane for the offset -// Make sure that there are two waterfall.end operations for the samples - #version 450 #extension GL_EXT_nonuniform_qualifier : require @@ -20,21 +16,96 @@ void main() _3 = samp0 + samp1; } -// BEGIN_SHADERTEST -// -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=GFX %s // Explicitly check GFX10.3 ASIC variants: -// RUN: amdllpc 
-scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST %s -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST %s -// SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST: AMDLLPC SUCCESS +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=GFX_10_3_0 %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=GFX_10_3_2 %s + +// GFX-LABEL: {{^// LLPC}} pipeline patching results +// GFX: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// GFX-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// GFX-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// GFX-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// GFX-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// GFX-NEXT: %[[extract1:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 3 +// GFX-NEXT: %[[and1:[0-9]+]] = and i32 %[[extract1]], 268435455 +// GFX-NEXT: %[[cmp1:[0-9]+]] = icmp slt i32 %[[extract1]], 0 +// GFX-NEXT: %[[select1:[0-9]+]] = select i1 %[[cmp1]], i32 %[[extract1]], i32 %[[and1]] +// GFX-NEXT: %[[insert1:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[select1]], i64 3 +// GFX-NEXT: %[[shufflevector1:[0-9]+]] = shufflevector <8 x i32> %[[insert1]], <8 x i32> %[[load1]], <8 x i32> +// GFX-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// GFX-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// GFX-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// GFX-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) +// +// GFX: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// GFX-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// GFX-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// GFX-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// GFX-NEXT: %[[extract2:[.a-z0-9]+]] = extractelement <8 x i32> %[[load3]], i64 3 +// GFX-NEXT: %[[and2:[0-9]+]] = and i32 %[[extract2]], 268435455 +// GFX-NEXT: %[[cmp2:[0-9]+]] = icmp slt i32 %[[extract2]], 0 +// GFX-NEXT: %[[select2:[0-9]+]] = select i1 %[[cmp2]], i32 %[[extract2]], i32 %[[and2]] 
+// GFX-NEXT: %[[insert2:[.a-z0-9]+]] = insertelement <8 x i32> %[[load3]], i32 %[[select2]], i64 3 +// GFX-NEXT: %[[shufflevector2:[0-9]+]] = shufflevector <8 x i32> %[[insert2]], <8 x i32> %[[load3]], <8 x i32> +// GFX-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// GFX-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// GFX-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector2]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// GFX-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// GFX: AMDLLPC SUCCESS + +// GFX_10_3_0-LABEL: {{^// LLPC}} pipeline patching results +// GFX_10_3_0: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// GFX_10_3_0-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX_10_3_0-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// GFX_10_3_0-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// GFX_10_3_0-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// GFX_10_3_0-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// GFX_10_3_0-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// GFX_10_3_0-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// GFX_10_3_0-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// GFX_10_3_0-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) +// +// GFX_10_3_0: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX_10_3_0-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// GFX_10_3_0-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// GFX_10_3_0-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// GFX_10_3_0-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// GFX_10_3_0-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// GFX_10_3_0-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// GFX_10_3_0-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load3]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// GFX_10_3_0-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// GFX_10_3_0: AMDLLPC SUCCESS + +// GFX_10_3_2-LABEL: {{^// LLPC}} pipeline patching results +// GFX_10_3_2: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// GFX_10_3_2-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX_10_3_2-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 
%[[begin1]], i32 %[[mul1]]) +// GFX_10_3_2-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// GFX_10_3_2-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// GFX_10_3_2-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// GFX_10_3_2-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 6 +// GFX_10_3_2-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], -1048577 +// GFX_10_3_2-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[and]], i64 6 +// GFX_10_3_2-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x i32> +// GFX_10_3_2-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// GFX_10_3_2-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// GFX_10_3_2-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// GFX_10_3_2-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) // -// END_SHADERTEST +// GFX_10_3_2: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// GFX_10_3_2-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// GFX_10_3_2-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// GFX_10_3_2-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// GFX_10_3_2-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// GFX_10_3_2-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load3]], i64 6 +// GFX_10_3_2-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], -1048577 +// GFX_10_3_2-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load3]], i32 %[[and]], i64 6 +// GFX_10_3_2-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load3]], <8 x i32> +// GFX_10_3_2-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// GFX_10_3_2-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// GFX_10_3_2-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// GFX_10_3_2-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// GFX_10_3_2: AMDLLPC SUCCESS diff --git a/tool/dumper/vkgcPipelineDumper.cpp b/tool/dumper/vkgcPipelineDumper.cpp index 76686cf557..b799441cc2 100644 --- a/tool/dumper/vkgcPipelineDumper.cpp +++ b/tool/dumper/vkgcPipelineDumper.cpp @@ -647,7 +647,8 @@ void PipelineDumper::dumpPipelineShaderInfo(const PipelineShaderInfo *shaderInfo dumpFile << "options.fastMathFlags = " << shaderInfo->options.fastMathFlags << "\n"; dumpFile << "options.disableFastMathFlags = " << shaderInfo->options.disableFastMathFlags << "\n"; dumpFile << "options.ldsSpillLimitDwords = " << shaderInfo->options.ldsSpillLimitDwords << "\n"; - dumpFile << "options.scalarizeWaterfallLoads = " << shaderInfo->options.scalarizeWaterfallLoads << "\n"; + if 
(shaderInfo->options.scalarizeWaterfallLoads.has_value())
+    dumpFile << "options.scalarizeWaterfallLoads = " << *shaderInfo->options.scalarizeWaterfallLoads << "\n";
   dumpFile << "options.overrideShaderThreadGroupSizeX = " << shaderInfo->options.overrideShaderThreadGroupSizeX << "\n";
   dumpFile << "options.overrideShaderThreadGroupSizeY = " << shaderInfo->options.overrideShaderThreadGroupSizeY << "\n";
   dumpFile << "options.overrideShaderThreadGroupSizeZ = " << shaderInfo->options.overrideShaderThreadGroupSizeZ << "\n";
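As a reading aid, and not part of the patch itself: a minimal sketch of the tri-state resolution that the llpcPipelineContext.cpp hunk above implements: an explicit -scalarize-waterfall-descriptor-loads on the amdllpc command line wins; otherwise the per-shader optional_bool setting applies if the client set one; otherwise the new default is enabled. The function and parameter names below are placeholders rather than LLPC symbols.

#include <iostream>
#include <optional>

// Placeholder for the resolution done in PipelineContext::computeShaderOptions().
static bool resolveScalarizeWaterfallLoads(unsigned cmdLineOccurrences, bool cmdLineValue,
                                           std::optional<bool> perShaderSetting) {
  if (cmdLineOccurrences > 0)
    return cmdLineValue;        // explicit command-line option wins
  bool result = true;           // new default: scalarization enabled
  if (perShaderSetting.has_value())
    result = *perShaderSetting; // per-shader PipelineShaderOptions override, if any
  return result;
}

int main() {
  std::cout << resolveScalarizeWaterfallLoads(0, false, std::nullopt) << "\n";               // 1: default on
  std::cout << resolveScalarizeWaterfallLoads(0, false, std::optional<bool>(false)) << "\n"; // 0: client turned it off
  std::cout << resolveScalarizeWaterfallLoads(1, false, std::optional<bool>(true)) << "\n";  // 0: command line wins
  return 0;
}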