diff --git a/CMakeLists.txt b/CMakeLists.txt index 790eb47185..b5518522f6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -127,10 +127,9 @@ endif(LLPC_BUILD_TOOLS) if(ICD_BUILD_LLPC) # Generate Strings for LLPC standalone tool and vkgc_gpurtshim add_subdirectory(util ${PROJECT_BINARY_DIR}/util) + add_subdirectory(gfxruntime ${PROJECT_BINARY_DIR}/gfxruntime) endif() -add_subdirectory(gfxruntime ${PROJECT_BINARY_DIR}/gfxruntime) - ### VKGC build LLPC ################################################################ if(ICD_BUILD_LLPC) include("cmake/compilerutils.cmake") diff --git a/compilerutils/include/compilerutils/TypeLowering.h b/compilerutils/include/compilerutils/TypeLowering.h index f697681db0..b3c1b42d2a 100644 --- a/compilerutils/include/compilerutils/TypeLowering.h +++ b/compilerutils/include/compilerutils/TypeLowering.h @@ -144,6 +144,7 @@ class TypeLowering { llvm::SmallVector getValue(llvm::Value *); llvm::SmallVector getValueOptional(llvm::Value *); + void replaceValue(llvm::Value *toReplace, llvm::Value *with) { replaceMappingWith(toReplace, with); } void replaceInstruction(llvm::Instruction *, llvm::ArrayRef); void eraseInstruction(llvm::Instruction *); diff --git a/gfxruntime/CMakeLists.txt b/gfxruntime/CMakeLists.txt index d18bce873c..382ff8478c 100644 --- a/gfxruntime/CMakeLists.txt +++ b/gfxruntime/CMakeLists.txt @@ -37,18 +37,18 @@ find_package(Python3 ) # Locate dxc binary. -if (CMAKE_HOST_SYSTEM_NAME MATCHES "Linux") - find_program(DXC_PATH dxc) - if ("${DXC_PATH}" STREQUAL "DXC_PATH-NOTFOUND") - message(FATAL_ERROR "Could not find shader compiler tool dxc.") - endif() #if _WIN32 -elseif(WIN32) - set(DXC_PATH "$ENV{DK_ROOT}/DirectXShaderCompiler/8c9d92b/bin") +if(WIN32) if (NOT EXISTS "${DXC_PATH}") message(FATAL_ERROR "Unable to find DirectXShaderCompiler directory: ${DXC_PATH}") endif() +endif() #endif +if (NOT DXC_PATH) + find_program(DXC_PATH dxc) + if ("${DXC_PATH}" STREQUAL "DXC_PATH-NOTFOUND") + message(FATAL_ERROR "Could not find shader compiler tool dxc.") + endif() endif() set(CMAKE_CURRENT_SOURCE_DIR ${PROJECT_SOURCE_DIR}/gfxruntime) diff --git a/include/vkgcDefs.h b/include/vkgcDefs.h index 2ffe7fdec5..60b5621565 100644 --- a/include/vkgcDefs.h +++ b/include/vkgcDefs.h @@ -106,6 +106,7 @@ struct optional_bool : private std::optional { using std::optional::has_value; using std::optional::value; using std::optional::value_or; + using std::optional::operator*; }; /// Enumerates result codes of LLPC operations. @@ -576,7 +577,6 @@ struct ShaderModuleUsage { unsigned localSizeX; ///< Compute shader work-group size in the X dimension unsigned localSizeY; ///< Compute shader work-group size in the Y dimension unsigned localSizeZ; ///< Compute shader work-group size in the Z dimension - bool useBarycentric; ///< Whether to use gl_BarycentricXX or pervertexEXT decoration bool disableDualSource; ///< Whether disable dualSource blend uint32_t clipDistanceArraySize; ///< Count of output clip distance }; @@ -768,7 +768,7 @@ struct PipelineShaderOptions { unsigned ldsSpillLimitDwords; /// Attempt to scalarize waterfall descriptor loads. - bool scalarizeWaterfallLoads; + optional_bool scalarizeWaterfallLoads; /// Force rearranges threadId within group into blocks of 8*8 or 8*4 bool overrideForceThreadIdSwizzling; @@ -1141,6 +1141,12 @@ struct RtState { bool rtIpOverride; }; +/// GPURT option +struct GpurtOption { + uint64_t nameHash; ///< A hash value that is used as name. 
+ uint64_t value; ///< Value of the setting +}; + struct UniformConstantMapEntry { unsigned location; ///< Starting location of the uniform constant variable unsigned offset; ///< Offset of the uniform constant variable in the final buffer @@ -1264,6 +1270,7 @@ struct GraphicsPipelineBuildInfo { /// return extra meta data. bool enableEarlyCompile; ///< Whether enable early compile bool useSoftwareVertexBufferDescriptors; ///< Use software vertex buffer descriptors to structure SRD. + bool dynamicTopology; ///< Whether primitive topology is dynamic. #if LLPC_CLIENT_INTERFACE_MAJOR_VERSION < 62 BinaryData shaderLibrary; ///< SPIR-V library binary data #endif @@ -1290,6 +1297,8 @@ struct GraphicsPipelineBuildInfo { uint8_t vbAddressLowBits[MaxVertexBindings]; ///< Lowest two bits of vertex buffer addresses float pixelTransferScale[4]; ///< Scale apply to render color target float pixelTransferBias[4]; ///< Bias apply to render color target + bool enableColorClampVs; ///< Enable clamp vertex output color + bool enableColorClampFs; ///< Enable clamp fragment output color } glState; const auto &getGlState() const { return glState; } #endif @@ -1363,6 +1372,8 @@ struct RayTracingPipelineBuildInfo { /// stored inside the ELF size_t clientMetadataSize; ///< Size (in bytes) of the client-defined data unsigned cpsFlags; ///< Cps feature flags + GpurtOption *pGpurtOptions; ///< Array of GPURT options + unsigned gpurtOptionCount; ///< Number of GPURT options }; /// Ray tracing max shader name length diff --git a/lgc/builder/ArithBuilder.cpp b/lgc/builder/ArithBuilder.cpp index 722c224231..57ba3e3f8c 100644 --- a/lgc/builder/ArithBuilder.cpp +++ b/lgc/builder/ArithBuilder.cpp @@ -1267,9 +1267,20 @@ Value *BuilderImpl::CreateFDot2(Value *a, Value *b, Value *scalar, Value *clamp, assert(scalar->getType()->isFloatTy()); assert(clamp->getType()->isIntegerTy() && clamp->getType()->getIntegerBitWidth() == 1); - Value *result = CreateIntrinsic(scalar->getType(), Intrinsic::amdgcn_fdot2, {a, b, scalar, clamp}); - result->setName(instName); - return result; + // GFX10.1 doesn't support v_dot2_f32_f16. + if (m_pipelineState->getTargetInfo().getGfxIpVersion() >= GfxIpVersion({10, 3})) { + Value *result = CreateIntrinsic(scalar->getType(), Intrinsic::amdgcn_fdot2, {a, b, scalar, clamp}); + result->setName(instName); + return result; + } + + // The half dot product result cannot be +/-inf if it exceeds the range of half. Two v_fma_mix_f32 can do this but + // it is currently unavailable. + Type *floatVecTy = FixedVectorType::get(scalar->getType(), 2); + Value *fa = CreateFPExt(a, floatVecTy); + Value *fb = CreateFPExt(b, floatVecTy); + Value *dot = CreateDotProduct(fa, fb); + return CreateFAdd(dot, scalar, instName); } // ===================================================================================================================== diff --git a/lgc/builder/BuilderBase.cpp b/lgc/builder/BuilderBase.cpp index 92a993b347..951c0687e7 100644 --- a/lgc/builder/BuilderBase.cpp +++ b/lgc/builder/BuilderBase.cpp @@ -293,3 +293,13 @@ Instruction *BuilderBase::CreateWaterfallEnd(Value *nonUniform, Value *waterfall return resultValue; } + +// ===================================================================================================================== +// Create code to build a vector out of a number of scalar elements of the same type. 
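+//
+// @param elements : The scalar elements (all of the same type) used to build the vector
+// @param instName : Name to give instruction(s)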
+Value *BuilderCommon::CreateBuildVector(llvm::ArrayRef elements, const llvm::Twine &instName) { + Value *vector = PoisonValue::get(FixedVectorType::get(elements[0]->getType(), elements.size())); + for (unsigned idx = 0; idx != elements.size() - 1; ++idx) + vector = CreateInsertElement(vector, elements[idx], idx); + vector = CreateInsertElement(vector, elements.back(), elements.size() - 1, instName); + return vector; +} diff --git a/lgc/builder/BuilderImpl.cpp b/lgc/builder/BuilderImpl.cpp index 3e88482491..ad69faf624 100644 --- a/lgc/builder/BuilderImpl.cpp +++ b/lgc/builder/BuilderImpl.cpp @@ -33,6 +33,7 @@ #include "lgc/LgcDialect.h" #include "lgc/state/PipelineState.h" #include "lgc/state/TargetInfo.h" +#include "llvm/ADT/BitVector.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsAMDGPU.h" @@ -73,6 +74,7 @@ Type *BuilderBase::getConditionallyVectorizedTy(Type *elementTy, Type *maybeVecT // @param vector2 : The float vector 2 // @param instName : Name to give instruction(s) Value *BuilderImpl::CreateDotProduct(Value *const vector1, Value *const vector2, const Twine &instName) { + Value *product = CreateFMul(vector1, vector2); if (!isa(product->getType())) return product; @@ -246,7 +248,7 @@ Value *BuilderImpl::CreateIntegerDotProduct(Value *vector1, Value *vector2, Valu } // ===================================================================================================================== -// Get whether the context we are building in support the bpermute operation. +// Get whether the context we are building in supports ds_bpermute or v_bpermute across all lanes in the wave bool BuilderImpl::supportWaveWideBPermute() const { auto gfxIp = getPipelineState()->getTargetInfo().getGfxIpVersion().major; auto supportBPermute = gfxIp == 8 || gfxIp == 9; @@ -276,7 +278,7 @@ bool BuilderImpl::supportPermLane64Dpp() const { // @param condition : The "if" condition // @param wantElse : Whether to generate an "else" block // @param instName : Base of name for new basic blocks -BranchInst *BuilderImpl::createIf(Value *condition, bool wantElse, const Twine &instName) { +BranchInst *BuilderCommon::CreateIf(Value *condition, bool wantElse, const Twine &instName) { // Create "if" block and move instructions in current block to it. BasicBlock *endIfBlock = GetInsertBlock(); BasicBlock *ifBlock = BasicBlock::Create(getContext(), "", endIfBlock->getParent(), endIfBlock); @@ -320,20 +322,186 @@ BranchInst *BuilderImpl::createIf(Value *condition, bool wantElse, const Twine & #if defined(LLVM_HAVE_BRANCH_AMD_GFX) // ===================================================================================================================== -// For a non-uniform input, try and trace back through a descriptor load to -// find the non-uniform index used in it. If that fails, we just use the -// operand value as the index. +// Track a small number of instructions, giving each of them an index by which they can easily be identified. +class TinyInstructionTracker { + unsigned m_indexCounter = 0; + // List of all instructions we've encountered. + SmallVector m_instructions; + // Index we've assigned to instructions. 
+ DenseMap m_indexForInstruction; + +public: + TinyInstructionTracker() {} + size_t size() const { return m_instructions.size(); } + unsigned indexForInstruction(Instruction *inst); + Instruction *instructionForIndex(unsigned idx) const { return m_instructions[idx]; } + + bool contains(Instruction *instr) const { return m_indexForInstruction.contains(instr); } + + void init(Instruction *instr) { + m_instructions.push_back(instr); + m_indexForInstruction[instr] = m_indexCounter; + m_indexCounter++; + } +}; + +// Return the index for the given instruction, adding it to the tracker if necessary. +unsigned TinyInstructionTracker::indexForInstruction(Instruction *inst) { + auto [it, inserted] = m_indexForInstruction.try_emplace(inst, m_instructions.size()); + if (inserted) + m_instructions.push_back(inst); + return it->second; +} + +// ===================================================================================================================== +// A simple memory efficient container that holds the dependencies of the instructions. +class TinyInstructionSet { + BitVector m_bits; + +public: + class const_iterator { + BitVector::const_set_bits_iterator m_it; + const TinyInstructionTracker &m_tracker; + + public: + const_iterator(BitVector::const_set_bits_iterator it, const TinyInstructionTracker &tracker) + : m_it(it), m_tracker(tracker) {} + const_iterator &operator++() { + ++m_it; + return *this; + } + + Instruction *operator*() { + unsigned index = *m_it; + return m_tracker.instructionForIndex(index); + } + + bool operator!=(const const_iterator &otherIt) { + assert(&otherIt.m_tracker == &m_tracker && "Iterators of different objects."); + return otherIt.m_it != m_it; + } + }; + + const_iterator begin(const TinyInstructionTracker &tracker) const { + return const_iterator(m_bits.set_bits_begin(), tracker); + } + + const_iterator end(const TinyInstructionTracker &tracker) const { + return const_iterator(m_bits.set_bits_end(), tracker); + } + + void insert(unsigned index) { + if (index >= m_bits.size()) + m_bits.resize(index + 1); + m_bits.set(index); + } + + void insert(Instruction *instr, TinyInstructionTracker &tracker) { insert(tracker.indexForInstruction(instr)); } + + bool contains(unsigned index) const { return index < m_bits.size() && m_bits[index]; } + + bool contains(Instruction *instr, TinyInstructionTracker &tracker) const { + return contains(tracker.indexForInstruction(instr)); + } + + unsigned size() const { return m_bits.count(); } + + bool empty() const { return !m_bits.any(); } + + TinyInstructionSet &operator|=(const TinyInstructionSet &rhs) { + m_bits |= rhs.m_bits; + return *this; + } +}; + +// ===================================================================================================================== +// Traverse the instructions to find the non-uniform index. In case of scalarization of descriptor loads, we also +// collect the dependencies of the instructions. +class TraceNonUniformIndex { + bool m_scalarizeDescriptorLoads; + TinyInstructionTracker m_tracker; + // For each value, the set of instructions that depend on it. + DenseMap m_dependentInstructions; + SmallVector> nonUniformIndexOperandIdx; + void init(Instruction *instr) { + if (!m_scalarizeDescriptorLoads) + return; + m_tracker.init(instr); + } + +public: + TraceNonUniformIndex(bool scalarizeDescriptorLoads = false) + : m_scalarizeDescriptorLoads(scalarizeDescriptorLoads), m_tracker() {} + + // Non-uniform index calculation. + Value *run(Value *); + + // Helper functions for non-uniform index. 
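+  // Record a non-uniform index together with the operand index it was traced from.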
+ void setNonUniformIndex(Value *nonUniformIndex, unsigned operandIdx) { + nonUniformIndexOperandIdx.push_back(std::make_pair(nonUniformIndex, operandIdx)); + } + + auto getNonUniformIndexes() { + return llvm::make_range(nonUniformIndexOperandIdx.begin(), nonUniformIndexOperandIdx.end()); + } + + unsigned getNumOfNonUniformIndexes() { return nonUniformIndexOperandIdx.size(); } + + // Return true if there are not any non-uniform indexes. + bool empty() { return nonUniformIndexOperandIdx.empty(); } + + // Helper functions for reading/writing the instruction dependencies. + auto getDependentInstructions(Value *value) { + TinyInstructionSet &dependents = m_dependentInstructions[value]; + return llvm::make_range(dependents.begin(m_tracker), dependents.end(m_tracker)); + } + + bool hasDependentInstructions(Value *value) { + return !m_dependentInstructions[value].empty() && m_scalarizeDescriptorLoads; + } + + void addDependents(Value *newValue, Instruction *dependent) { + if (!m_scalarizeDescriptorLoads) + return; + + init(cast(newValue)); + TinyInstructionSet &dst = m_dependentInstructions[newValue]; + for (Instruction *dep : getDependentInstructions(dependent)) { + dst.insert(dep, m_tracker); + auto it = m_dependentInstructions.find(dep); + if (it != m_dependentInstructions.end()) + dst |= it->second; + } + if (dependent) + dst.insert(dependent, m_tracker); + } +}; + +// For a non-uniform input, try and trace back through a descriptor load to find the non-uniform index used in it. If +// that fails, we just use the operand value as the index. // // Note that this function may return null, which means that the given value has been shown to be uniform. // // This uses a fairly simple heuristic that nevertheless allows temporary expansion of the search breadth to handle // the common case where a base pointer is assembled from separate high and low halves. // +// In case of scalarization, while it traverses all use-def predecessors of the nonUniformVal, it adds the instructions +// to instrDeps map (addDependents()). These dependencies are the instructions that will be cloned and moved +// inside the waterfall loop. +// // @param nonUniformVal : Value representing non-uniform descriptor // @return : Value representing the non-uniform index, or null if nonUniformVal could be proven to be uniform -static Value *traceNonUniformIndex(Value *nonUniformVal) { +Value *TraceNonUniformIndex::run(Value *nonUniformVal) { + auto inst = dyn_cast(nonUniformVal); + if (!inst) { + // Could plausibly be a constant or a function argument. Either way, we don't have to search any further. + return isa(nonUniformVal) ? nullptr : nonUniformVal; + } + auto load = dyn_cast(nonUniformVal); - if (!load) { + if (load) + init(load); + else { // Workarounds that modify image descriptor can be peeped through, i.e. // %baseValue = load <8 x i32>, <8 x i32> addrspace(4)* %..., align 16 // %rawElement = extractelement <8 x i32> %baseValue, i64 6 @@ -347,14 +515,21 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) { if (!load) return nonUniformVal; + init(insert); + addDependents(load, insert); + // We found the load, but must verify the chain. // Consider updatedElement as a generic instruction or constant. if (auto updatedElement = dyn_cast(insert->getOperand(1))) { + addDependents(updatedElement, insert); for (Value *operand : updatedElement->operands()) { if (auto extract = dyn_cast(operand)) { // Only dynamic value must be ExtractElementInst based on load. 
if (dyn_cast(extract->getOperand(0)) != load) return nonUniformVal; + + addDependents(extract, updatedElement); + addDependents(load, extract); } else if (!isa(operand)) { return nonUniformVal; } @@ -376,14 +551,15 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) { SmallVector nonUniforms; nonUniforms.push_back(load); - auto propagate = [&](Value *value) -> bool { - if (auto inst = dyn_cast(value)) { + auto propagate = [&](Value *currentOp, Instruction *current) { + if (auto instOp = dyn_cast(currentOp)) { if (nonUniforms.size() >= 2) return false; - nonUniforms.push_back(inst); + nonUniforms.push_back(instOp); + addDependents(instOp, current); return true; } - return isa(value); + return isa(currentOp); }; do { @@ -400,13 +576,13 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) { // See if we can propagate the search further. if (current->isCast() || current->isUnaryOp()) { - if (!propagate(current->getOperand(0))) + if (!propagate(current->getOperand(0), current)) return nonUniformVal; continue; } if (current->isBinaryOp()) { - if (!propagate(current->getOperand(0)) || !propagate(current->getOperand(1))) + if (!propagate(current->getOperand(0), current) || !propagate(current->getOperand(1), current)) return nonUniformVal; continue; } @@ -417,14 +593,15 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) { if (as == ADDR_SPACE_FLAT || as == ADDR_SPACE_PRIVATE) return nonUniformVal; // load is a source of divergence, can't propagate - if (!propagate(ptr)) + if (!propagate(ptr, current)) return nonUniformVal; continue; } if (auto gep = dyn_cast(current)) { if (gep->hasAllConstantIndices()) { - if (!propagate(gep->getPointerOperand())) + + if (!propagate(gep->getPointerOperand(), current)) return nonUniformVal; continue; } @@ -433,33 +610,35 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) { if (candidateIndex || gep->getNumIndices() != 1) return nonUniformVal; - if (!propagate(gep->getPointerOperand())) + if (!propagate(gep->getPointerOperand(), current)) return nonUniformVal; candidateIndex = *gep->idx_begin(); if (getSize(candidateIndex) > nonUniformValSize) return nonUniformVal; // propagating further is worthless + + addDependents(candidateIndex, gep); continue; } if (auto extract = dyn_cast(current)) { - if (!propagate(extract->getAggregateOperand())) + if (!propagate(extract->getAggregateOperand(), current)) return nonUniformVal; continue; } if (auto insert = dyn_cast(current)) { - if (!propagate(insert->getAggregateOperand()) || !propagate(insert->getInsertedValueOperand())) + if (!propagate(insert->getAggregateOperand(), current) || !propagate(insert->getInsertedValueOperand(), current)) return nonUniformVal; continue; } if (auto extract = dyn_cast(current)) { - if (!isa(extract->getIndexOperand()) || !propagate(extract->getVectorOperand())) + if (!isa(extract->getIndexOperand()) || !propagate(extract->getVectorOperand(), current)) return nonUniformVal; continue; } if (auto insert = dyn_cast(current)) { - if (!isa(insert->getOperand(2)) || !propagate(insert->getOperand(0)) || - !propagate(insert->getOperand(1))) + if (!isa(insert->getOperand(2)) || !propagate(insert->getOperand(0), current) || + !propagate(insert->getOperand(1), current)) return nonUniformVal; continue; } @@ -489,8 +668,7 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) { } // ===================================================================================================================== -// Test whether two instructions are identical -// or are the same 
operation on identical operands. +// Test whether two instructions are identical or are the same operation on identical operands. // @param lhs : First instruction // @param rhs : Second instruction // @return Result of equally test @@ -516,12 +694,139 @@ static bool instructionsEqual(Instruction *lhs, Instruction *rhs) { return true; } + +// ===================================================================================================================== +// Check if the non-uniform indexes are identical. +// @param nonUniformInst : the non-uniform instruction +// @param traceNonUniformIndex : non-uniform index information +Instruction *getSharedIndex(Instruction *nonUniformInst, TraceNonUniformIndex &traceNonUniformIndex) { + // FIXME: these do not actually need to be identical if we introduce multiple waterfall + // begin and readfirstlane intrinsics for these. + Instruction *sharedIndex = nullptr; + bool identicalIndexes = false; + for (auto &P : traceNonUniformIndex.getNonUniformIndexes()) { + Value *nonUniformVal = P.first; + Instruction *nuInst = dyn_cast(nonUniformVal); + if (!nuInst) + return nullptr; + + identicalIndexes = sharedIndex && instructionsEqual(nuInst, sharedIndex); + if (sharedIndex && !identicalIndexes) + return nullptr; + + if (!sharedIndex) + sharedIndex = nuInst; + } + return sharedIndex; +} + +// ===================================================================================================================== +// For any index that is 64 bit, change it back to 32 bit for comparison at the top of the +// waterfall loop. +Value *get32BitVal(Value *nonUniformVal) { + Type *nonUniformValTy = nonUniformVal->getType(); + if (nonUniformValTy->isIntegerTy(32)) + return nonUniformVal; + if (nonUniformValTy->isIntegerTy(64)) { + auto sExt = dyn_cast(nonUniformVal); + // 64-bit index may already be formed from extension of 32-bit value. + if (sExt && sExt->getOperand(0)->getType()->isIntegerTy(32)) + return sExt->getOperand(0); + else + return IRBuilder<>(cast(nonUniformVal)->getNextNode()) + .CreateTrunc(nonUniformVal, Type::getInt32Ty(nonUniformVal->getContext())); + } + return nullptr; +} + +// ===================================================================================================================== +// Code generation for the scalarization of the descriptor loads. +// +// First, we get the dependencies of the non-uniform index from the instrDeps map. Next, we copy and emit the +// non-uniform index with its dependencies inside the waterfall loop (between the waterfall.readfirstlane intrinsic and +// the nonUniformInst). +// +// @param nonUniformInstOperand: the non-uniform operand of the nonUniformInst +// @param nonUniformIndex : the non-uniform index for the nonUniformInstOperand +// @param readFirstLane : the amdgcn.waterfall.readfirstlane intrinsic +// @param waterfallBegin : the amdgcn.waterfall.begin intrinsic +// @param nonUniformInst : the non-uniform instruction +// @param operandIdx : the operand number of the nonUniformInstOperand +// @param instName : the name for the new intrinsics +// @param traceNonUniformIndex : non-uniform index information +void implementScalarization(Value *nonUniformInstOperand, Value *nonUniformIndex, Value *readFirstLane, + Value *waterfallBegin, Instruction *nonUniformInst, unsigned operandIdx, + const Twine &instName, TraceNonUniformIndex &traceNonUniformIndex) { + + // Get the instruction chain of the non-uniform index. 
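+  // These dependencies were collected while tracing the non-uniform index; they are the instructions that will be
+  // cloned and moved inside the waterfall loop.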
+ auto instrsToClone = traceNonUniformIndex.getDependentInstructions(nonUniformIndex); + + // Clone and emit the instructions that we want to push inside the waterfall loop. + std::map origClonedValuesMap; + Instruction *prevInst = nonUniformInst; + + for (Instruction *origInst : instrsToClone) { + auto *newInst = origInst->clone(); + newInst->insertBefore(prevInst); + origClonedValuesMap[origInst] = newInst; + prevInst = newInst; + // Update the operand of the nonUniformInst (for which the waterfall is created) with the new load that we + // emitted inside the loop. + if (nonUniformInstOperand == origInst) { + if (nonUniformInst->getType()->isVoidTy()) + newInst = IRBuilder<>(nonUniformInst) + .CreateIntrinsic(Intrinsic::amdgcn_waterfall_last_use, newInst->getType(), + {waterfallBegin, newInst}, nullptr, instName); + nonUniformInst->setOperand(operandIdx, newInst); + } + } + + // Clone the first non-uniform index. + auto *origInst = cast(nonUniformIndex); + auto *newInst = origInst->clone(); + newInst->insertBefore(prevInst); + origClonedValuesMap[origInst] = newInst; + + // Update the operands of the cloned instructions. + for (auto [origInst, newInst] : origClonedValuesMap) { + for (Use &use : newInst->operands()) { + Value *op = use.get(); + if (auto *opI = dyn_cast(op)) { + auto it = origClonedValuesMap.find(opI); + if (it == origClonedValuesMap.end()) + continue; + Instruction *clonedI = it->second; + use.set(clonedI); + } + } + } + + Value *nonUniformIndex32Bit = get32BitVal(nonUniformIndex); + nonUniformIndex32Bit->replaceUsesWithIf(readFirstLane, [readFirstLane, waterfallBegin, nonUniformInst](Use &U) { + Instruction *userInst = cast(U.getUser()); + return userInst != waterfallBegin && userInst != readFirstLane && + userInst->getParent() == nonUniformInst->getParent() && + (userInst == nonUniformInst || userInst->comesBefore(nonUniformInst)) && + !userInst->comesBefore(cast(waterfallBegin)); + }); +} #endif // ===================================================================================================================== // Create a waterfall loop containing the specified instruction. -// This does not use the current insert point; new code is inserted before and after nonUniformInst. // +// This is done in three steps: +// 1. Calculate the non-uniform indexes : Collect the non-uniform indexes that correspond to the operands of +// the nonUniformInst. In addition, in case of scalarization, we need to collect all the instructions that need +// to be moved inside the loop. All these are done by traceNonUniformIndex. +// +// 2. Process the non-uniform indexes : Check if the non-uniform indexes are identical. +// +// 3. Generate the waterfall loop intrinisics and generate the code that is related to the scalarization if it is +// needed. + +// This does not use the current insert point; new code is inserted before and after nonUniformInst. 
+ // @param nonUniformInst : The instruction to put in a waterfall loop // @param operandIdxs : The operand index/indices for non-uniform inputs that need to be uniform // @param scalarizeDescriptorLoads : Attempt to scalarize descriptor loads @@ -537,129 +842,105 @@ Instruction *BuilderImpl::createWaterfallLoop(Instruction *nonUniformInst, Array #else assert(operandIdxs.empty() == false); - SmallVector nonUniformIndices; + // Non-uniform index calculation + TraceNonUniformIndex traceNonUniformIndex(scalarizeDescriptorLoads); for (unsigned operandIdx : operandIdxs) { - Value *nonUniformIndex = traceNonUniformIndex(nonUniformInst->getOperand(operandIdx)); - if (nonUniformIndex) - nonUniformIndices.push_back(nonUniformIndex); - } - if (nonUniformIndices.empty()) - return nonUniformInst; - - // For any index that is 64 bit, change it back to 32 bit for comparison at the top of the - // waterfall loop. - for (Value *&nonUniformVal : nonUniformIndices) { - if (nonUniformVal->getType()->isIntegerTy(64)) { - auto sExt = dyn_cast(nonUniformVal); - // 64-bit index may already be formed from extension of 32-bit value. - if (sExt && sExt->getOperand(0)->getType()->isIntegerTy(32)) { - nonUniformVal = sExt->getOperand(0); - } else { - nonUniformVal = CreateTrunc(nonUniformVal, getInt32Ty()); - } + Value *nonUniformInstOperand = nonUniformInst->getOperand(operandIdx); + Value *nonUniformIndex = traceNonUniformIndex.run(nonUniformInstOperand); + if (nonUniformIndex) { + traceNonUniformIndex.setNonUniformIndex(nonUniformIndex, operandIdx); } } - // Find first index instruction and check if index instructions are identical. - Instruction *firstIndexInst = nullptr; - if (scalarizeDescriptorLoads) { - // FIXME: these do not actually need to be identical if we introduce multiple waterfall - // begin and readlane intrinsics for these. - bool identicalIndexes = true; - for (Value *nonUniformVal : nonUniformIndices) { - Instruction *nuInst = dyn_cast(nonUniformVal); - // Note: parent check here guards use of comesBefore below - if (!nuInst || (firstIndexInst && !instructionsEqual(nuInst, firstIndexInst)) || - (firstIndexInst && nuInst->getParent() != firstIndexInst->getParent())) { - identicalIndexes = false; - break; - } - if (!firstIndexInst || nuInst->comesBefore(firstIndexInst)) - firstIndexInst = nuInst; - } + if (traceNonUniformIndex.empty()) + return nonUniformInst; - // Ensure we do not create a waterfall across blocks. - // FIXME: we could use dominator check to allow scalarizing descriptor loads on multi-block spans; - // however, this also requires backend support for multi-block waterfalls to be implemented. - if (!identicalIndexes || !firstIndexInst || - (firstIndexInst && firstIndexInst->getParent() != nonUniformInst->getParent())) - scalarizeDescriptorLoads = false; - } + // Check if the non-uniform indexes are identical. + Instruction *sharedIndex = scalarizeDescriptorLoads && (traceNonUniformIndex.getNumOfNonUniformIndexes() > 1) + ? getSharedIndex(nonUniformInst, traceNonUniformIndex) + : nullptr; + // Generate the waterfall loop intrinisics and implement the scalarization of the descriptor loads. // Save Builder's insert point IRBuilder<>::InsertPointGuard guard(*this); - - Value *waterfallBegin; - if (scalarizeDescriptorLoads) { - // Attempt to scalarize descriptor loads. 
- assert(firstIndexInst); - CallInst *firstCallInst = dyn_cast(firstIndexInst); - if (firstCallInst && firstCallInst->getIntrinsicID() == Intrinsic::amdgcn_waterfall_readfirstlane) { - // Descriptor loads are already inside a waterfall. - waterfallBegin = firstCallInst->getArgOperand(0); - } else { - // Begin waterfall loop just after shared index is computed. - // This places all dependent instructions within the waterfall loop, including descriptor loads. - auto descTy = firstIndexInst->getType(); - SetInsertPoint(firstIndexInst->getNextNonDebugInstruction(false)); - waterfallBegin = ConstantInt::get(getInt32Ty(), 0); - waterfallBegin = CreateIntrinsic(Intrinsic::amdgcn_waterfall_begin, descTy, {waterfallBegin, firstIndexInst}, - nullptr, instName); - - // Scalarize shared index. - Value *desc = CreateIntrinsic(Intrinsic::amdgcn_waterfall_readfirstlane, {descTy, descTy}, - {waterfallBegin, firstIndexInst}, nullptr, instName); - - // Replace all references to shared index within the waterfall loop with scalarized index. - // (Note: this includes the non-uniform instruction itself.) - // Loads using scalarized index will become scalar loads. - for (Value *otherNonUniformVal : nonUniformIndices) { - otherNonUniformVal->replaceUsesWithIf(desc, [desc, waterfallBegin, nonUniformInst](Use &U) { - Instruction *userInst = cast(U.getUser()); - return U.getUser() != waterfallBegin && U.getUser() != desc && - userInst->getParent() == nonUniformInst->getParent() && - (userInst == nonUniformInst || userInst->comesBefore(nonUniformInst)); - }); - } - } + // Insert new code just before nonUniformInst. + SetInsertPoint(nonUniformInst); + + // Emit waterfall.begin intrinsics. The first begin contains a null token for the previous token argument. + Value *readFirstLane = nullptr; + Value *waterfallBegin = ConstantInt::get(getInt32Ty(), 0); + if (sharedIndex) { + // Emit the waterfall.begin and the waterfall.readfirstlane intrinsics for the shared non-uniform index. + Value *sharedIndex32Bit = get32BitVal(sharedIndex); + assert(sharedIndex32Bit != nullptr); + auto sharedIndexTy = sharedIndex32Bit->getType(); + waterfallBegin = CreateIntrinsic(Intrinsic::amdgcn_waterfall_begin, sharedIndexTy, + {waterfallBegin, sharedIndex32Bit}, nullptr, instName); + readFirstLane = CreateIntrinsic(Intrinsic::amdgcn_waterfall_readfirstlane, {sharedIndexTy, sharedIndexTy}, + {waterfallBegin, sharedIndex32Bit}, nullptr, instName); } else { - // Insert new code just before nonUniformInst. - SetInsertPoint(nonUniformInst); - - // The first begin contains a null token for the previous token argument - waterfallBegin = ConstantInt::get(getInt32Ty(), 0); - for (auto nonUniformVal : nonUniformIndices) { - // Start the waterfall loop using the waterfall index. - waterfallBegin = CreateIntrinsic(Intrinsic::amdgcn_waterfall_begin, nonUniformVal->getType(), - {waterfallBegin, nonUniformVal}, nullptr, instName); + // Emit waterfall.begin intrinsics for every non-uniform index. + for (auto &P : traceNonUniformIndex.getNonUniformIndexes()) { + Value *nonUniformIndex32Bit = get32BitVal(P.first); + Value *nonUniformIndex = nonUniformIndex32Bit ? nonUniformIndex32Bit : P.first; + waterfallBegin = CreateIntrinsic(Intrinsic::amdgcn_waterfall_begin, nonUniformIndex->getType(), + {waterfallBegin, nonUniformIndex}, nullptr, instName); } + } - // Scalarize each non-uniform operand of the instruction. 
- for (unsigned operandIdx : operandIdxs) { - Value *desc = nonUniformInst->getOperand(operandIdx); - auto descTy = desc->getType(); + // For each non-uniform index, emit the waterfall.readfirstlane intrinsics (if there is not a shared non-uniform + // index) and the waterfall.last.use intrinsics. In case of scalarization, we also emit the instructions that should + // be moved inside the waterfall loop. + for (auto [nonUniformIndex, operandIdx] : traceNonUniformIndex.getNonUniformIndexes()) { + Value *nonUniformInstOperand = nonUniformInst->getOperand(operandIdx); + auto nonUniformInstOperandTy = nonUniformInstOperand->getType(); + + // The scalarization of descriptor loads cannot be done if the dependencies of the load instructions were not + // found or if the load is not invariant because such a load could produce different results if it were moved. In + // this case, we just emit the waterfall loop intrinsics without moving the non-uniform loads inside the waterfall + // loop. + if (scalarizeDescriptorLoads && + (!isa(nonUniformInstOperand) || + cast(nonUniformInstOperand)->hasMetadata(LLVMContext::MD_invariant_load)) && + traceNonUniformIndex.hasDependentInstructions(nonUniformIndex)) { + + // Emit read first lane intrinsics for each of the non-uniform operands. + if (!sharedIndex) { + Value *nonUniformIndex32Bit = get32BitVal(nonUniformIndex); + assert(nonUniformIndex32Bit != nullptr); + auto nonUniformIndexTy = nonUniformIndex32Bit->getType(); + readFirstLane = + CreateIntrinsic(Intrinsic::amdgcn_waterfall_readfirstlane, {nonUniformIndexTy, nonUniformIndexTy}, + {waterfallBegin, nonUniformIndex32Bit}, nullptr, instName); + } + implementScalarization(nonUniformInstOperand, nonUniformIndex, readFirstLane, waterfallBegin, nonUniformInst, + operandIdx, instName, traceNonUniformIndex); + } else { + Value *newIntrinsic = nonUniformInstOperand; #if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 463892 // Old version of the code #else // When the non-uniform use is in a VGPR, we can save a v_mov by not inserting the amdgcn_waterfall_readfirstlane if (!useVgprForOperands) #endif - desc = CreateIntrinsic(Intrinsic::amdgcn_waterfall_readfirstlane, {descTy, descTy}, {waterfallBegin, desc}, - nullptr, instName); + newIntrinsic = + CreateIntrinsic(Intrinsic::amdgcn_waterfall_readfirstlane, {nonUniformInstOperandTy, nonUniformInstOperandTy}, + {waterfallBegin, newIntrinsic}, nullptr, instName); if (nonUniformInst->getType()->isVoidTy()) { // The buffer/image operation we are waterfalling is a store with no return value. Use // llvm.amdgcn.waterfall.last.use on the descriptor. #if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 463892 // Old version of the code - desc = CreateIntrinsic(Intrinsic::amdgcn_waterfall_last_use, descTy, {waterfallBegin, desc}, nullptr, instName); + newIntrinsic = CreateIntrinsic(Intrinsic::amdgcn_waterfall_last_use, nonUniformInstOperandTy, + {waterfallBegin, newIntrinsic}, nullptr, instName); #else - desc = CreateIntrinsic(useVgprForOperands ? Intrinsic::amdgcn_waterfall_last_use_vgpr - : Intrinsic::amdgcn_waterfall_last_use, - descTy, {waterfallBegin, desc}, nullptr, instName); + newIntrinsic = CreateIntrinsic(useVgprForOperands ? Intrinsic::amdgcn_waterfall_last_use_vgpr + : Intrinsic::amdgcn_waterfall_last_use, + nonUniformInstOperandTy, {waterfallBegin, newIntrinsic}, nullptr, instName); #endif } // Replace the descriptor operand in the buffer/image operation. 
- nonUniformInst->setOperand(operandIdx, desc); + nonUniformInst->setOperand(operandIdx, newIntrinsic); } } diff --git a/lgc/builder/BuilderRecorder.cpp b/lgc/builder/BuilderRecorder.cpp index 92e78a02ab..972d7a1bdf 100644 --- a/lgc/builder/BuilderRecorder.cpp +++ b/lgc/builder/BuilderRecorder.cpp @@ -260,12 +260,6 @@ StringRef BuilderRecorder::getCallName(BuilderOpcode opcode) { return "get.wave.size"; case BuilderOpcode::GetSubgroupSize: return "get.subgroup.size"; - case BuilderOpcode::SubgroupAll: - return "subgroup.all"; - case BuilderOpcode::SubgroupAllEqual: - return "subgroup.all.equal"; - case BuilderOpcode::SubgroupRotate: - return "subgroup.rotate"; case BuilderOpcode::SubgroupBroadcast: return "subgroup.broadcast"; case BuilderOpcode::SubgroupBroadcastWaterfall: @@ -1622,24 +1616,6 @@ Value *Builder::CreateGetSubgroupSize(const Twine &instName) { return record(BuilderOpcode::GetSubgroupSize, getInt32Ty(), {}, instName); } -// ===================================================================================================================== -// Create a subgroup all. -// -// @param value : The value to compare -// @param instName : Name to give instruction(s) -Value *Builder::CreateSubgroupAll(Value *const value, const Twine &instName) { - return record(BuilderOpcode::SubgroupAll, getInt1Ty(), {value}, instName); -} - -// ===================================================================================================================== -// Create a subgroup all equal. -// -// @param value : The value to compare -// @param instName : Name to give instruction(s) -Value *Builder::CreateSubgroupAllEqual(Value *const value, const Twine &instName) { - return record(BuilderOpcode::SubgroupAllEqual, getInt1Ty(), {value}, instName); -} - // ===================================================================================================================== // Create a subgroup broadcast. // @@ -1792,18 +1768,6 @@ Value *Builder::CreateSubgroupShuffleDown(Value *const value, Value *const offse return record(BuilderOpcode::SubgroupShuffleDown, value->getType(), {value, offset}, instName); } -// ===================================================================================================================== -// Create a subgroup rotate call. -// -// @param value : The value to read from the chosen rotated lane to all active lanes. -// @param delta : The delta/offset added to lane id. -// @param clusterSize : The cluster size if exists. -// @param instName : Name to give final instruction. -Value *Builder::CreateSubgroupRotate(Value *const value, Value *const delta, Value *const clusterSize, - const Twine &instName) { - return record(BuilderOpcode::SubgroupRotate, value->getType(), {value, delta, clusterSize}, instName); -} - // ===================================================================================================================== // Create a subgroup clustered reduction. // @@ -2108,9 +2072,6 @@ Instruction *Builder::record(BuilderOpcode opcode, Type *resultTy, ArrayRefCreateGetSubgroupSize(); } - case BuilderOpcode::SubgroupAll: { - return m_builder->CreateSubgroupAll(args[0]); - } - case BuilderOpcode::SubgroupAllEqual: { - return m_builder->CreateSubgroupAllEqual(args[0]); - } - case BuilderOpcode::SubgroupRotate: { - return m_builder->CreateSubgroupRotate(args[0], args[1], isa(args[2]) ? 
nullptr : &*args[2]); - } case BuilderOpcode::SubgroupBroadcast: { return m_builder->CreateSubgroupBroadcast(args[0], args[1]); } diff --git a/lgc/builder/DescBuilder.cpp b/lgc/builder/DescBuilder.cpp index 57f45587a4..c837b2129a 100644 --- a/lgc/builder/DescBuilder.cpp +++ b/lgc/builder/DescBuilder.cpp @@ -148,20 +148,21 @@ Value *BuilderImpl::createBufferDesc(uint64_t descSet, unsigned binding, Value * // Load the descriptor. desc = CreateLoad(getDescTy(resType), descPtr); - // Force convert the buffer view to raw view. - if (flags & BufferFlagForceRawView) { - Value *desc1 = CreateExtractElement(desc, 1); - Value *desc2 = CreateExtractElement(desc, 2); - Value *desc3 = CreateExtractElement(desc, 3); - // stride is 14 bits in dword1[29:16] - Value *stride = CreateAnd(CreateLShr(desc1, getInt32(16)), getInt32(0x3fff)); - stride = CreateBinaryIntrinsic(Intrinsic::smax, stride, getInt32(1)); - // set srd with new stride = 0 and new num_record = stride * num_record, num_record is dword2[31:0] - desc = CreateInsertElement(desc, CreateAnd(desc1, getInt32(0xc000ffff)), 1); - desc = CreateInsertElement(desc, CreateMul(stride, desc2), 2); - // gfx10 and gfx11 have oob fields with 2 bits in dword3[29:28] here force to set to 3 as OOB_COMPLETE mode. - if (getPipelineState()->getTargetInfo().getGfxIpVersion().major >= 10) + { + // Force convert the buffer view to raw view. + if (flags & BufferFlagForceRawView) { + Value *desc1 = CreateExtractElement(desc, 1); + Value *desc2 = CreateExtractElement(desc, 2); + Value *desc3 = CreateExtractElement(desc, 3); + // stride is 14 bits in dword1[29:16] + Value *stride = CreateAnd(CreateLShr(desc1, getInt32(16)), getInt32(0x3fff)); + stride = CreateBinaryIntrinsic(Intrinsic::smax, stride, getInt32(1)); + // set srd with new stride = 0 and new num_record = stride * num_record, num_record is dword2[31:0] + desc = CreateInsertElement(desc, CreateAnd(desc1, getInt32(0xc000ffff)), 1); + desc = CreateInsertElement(desc, CreateMul(stride, desc2), 2); + // gfx10 and gfx11 have oob fields with 2 bits in dword3[29:28] here force to set to 3 as OOB_COMPLETE mode. desc = CreateInsertElement(desc, CreateOr(desc3, getInt32(0x30000000)), 3); + } } } diff --git a/lgc/builder/ImageBuilder.cpp b/lgc/builder/ImageBuilder.cpp index f0da76d2e8..9b2fdb6d0e 100644 --- a/lgc/builder/ImageBuilder.cpp +++ b/lgc/builder/ImageBuilder.cpp @@ -439,6 +439,9 @@ Value *BuilderImpl::CreateImageLoad(Type *resultTy, unsigned dim, unsigned flags if (origTexelTy->isIntOrIntVectorTy(64)) { // Only load the first component for 64-bit texel, casted to <2 x i32> texelTy = FixedVectorType::get(getInt32Ty(), 2); + } else if (origTexelTy->isIntOrIntVectorTy(16)) { + // Treat i16 load as f16 load and cast it back later. + texelTy = FixedVectorType::get(getHalfTy(), 4); } if (auto vectorResultTy = dyn_cast(texelTy)) @@ -512,40 +515,45 @@ Value *BuilderImpl::CreateImageLoad(Type *resultTy, unsigned dim, unsigned flags else if (flags & ImageFlagEnforceReadFirstLaneImage) enforceReadFirstLane(imageInst, imageDescArgIndex); - // For 64-bit texel, only the first component is loaded, other components are filled in with (0, 0, 1). This - // operation could be viewed as supplement of the intrinsic call. 
- if (origTexelTy->isIntOrIntVectorTy(64)) { + if (texelTy != origTexelTy) { Value *texel = result; - if (isa(resultTy)) - texel = CreateExtractValue(result, uint64_t(0)); - texel = CreateBitCast(texel, getInt64Ty()); // Casted to i64 - - if (origTexelTy->isVectorTy()) { - texel = CreateInsertElement(PoisonValue::get(origTexelTy), texel, uint64_t(0)); - - SmallVector defaults = {getInt64(0), getInt64(0), getInt64(1)}; - // The default of W channel is set to 0 if allowNullDescriptor is on and image descriptor is a null descriptor - if (m_pipelineState->getOptions().allowNullDescriptor) { - // Check dword3 against 0 for a null descriptor - Value *descWord3 = CreateExtractElement(imageDesc, 3); - if (m_pipelineState->getOptions().maskOffNullDescriptorTypeField) { - GfxIpVersion gfxIp = getPipelineState()->getTargetInfo().getGfxIpVersion(); - SqImgRsrcRegHandler proxySqRsrcRegHelper(this, imageDesc, &gfxIp); - unsigned typeMask = proxySqRsrcRegHelper.getRegMask(SqRsrcRegs::Type); - // Mask off the type bits for the null descriptor - descWord3 = CreateAnd(descWord3, getInt32(~typeMask)); + bool tfe = isa(resultTy); + if (tfe) + texel = CreateExtractValue(result, 0); + + // For 64-bit texel, only the first component is loaded, other components are filled in with (0, 0, 1). This + // operation could be viewed as supplement of the intrinsic call. + if (origTexelTy->isIntOrIntVectorTy(64)) { + texel = CreateBitCast(texel, getInt64Ty()); // Casted to i64 + + if (origTexelTy->isVectorTy()) { + texel = CreateInsertElement(PoisonValue::get(origTexelTy), texel, uint64_t(0)); + + SmallVector defaults = {getInt64(0), getInt64(0), getInt64(1)}; + // The default of W channel is set to 0 if allowNullDescriptor is on and image descriptor is a null descriptor + if (m_pipelineState->getOptions().allowNullDescriptor) { + // Check dword3 against 0 for a null descriptor + Value *descWord3 = CreateExtractElement(imageDesc, 3); + if (m_pipelineState->getOptions().maskOffNullDescriptorTypeField) { + GfxIpVersion gfxIp = getPipelineState()->getTargetInfo().getGfxIpVersion(); + SqImgRsrcRegHandler proxySqRsrcRegHelper(this, imageDesc, &gfxIp); + unsigned typeMask = proxySqRsrcRegHelper.getRegMask(SqRsrcRegs::Type); + // Mask off the type bits for the null descriptor + descWord3 = CreateAnd(descWord3, getInt32(~typeMask)); + } + Value *isNullDesc = CreateICmpEQ(descWord3, getInt32(0)); + defaults[2] = CreateSelect(isNullDesc, getInt64(0), getInt64(1)); } - Value *isNullDesc = CreateICmpEQ(descWord3, getInt32(0)); - defaults[2] = CreateSelect(isNullDesc, getInt64(0), getInt64(1)); + for (unsigned i = 1; i < cast(origTexelTy)->getNumElements(); ++i) + texel = CreateInsertElement(texel, defaults[i - 1], i); } - for (unsigned i = 1; i < cast(origTexelTy)->getNumElements(); ++i) - texel = CreateInsertElement(texel, defaults[i - 1], i); + } else if (origTexelTy->isIntOrIntVectorTy(16)) { + texel = CreateBitCast(texel, origTexelTy); } - if (isa(resultTy)) { - // TFE + if (tfe) { intrinsicDataTy = StructType::get(origTexelTy->getContext(), {origTexelTy, getInt32Ty()}); - result = CreateInsertValue(CreateInsertValue(PoisonValue::get(intrinsicDataTy), texel, uint64_t(0)), + result = CreateInsertValue(CreateInsertValue(PoisonValue::get(intrinsicDataTy), texel, 0), CreateExtractValue(result, 1), 1); } else { result = texel; diff --git a/lgc/builder/InOutBuilder.cpp b/lgc/builder/InOutBuilder.cpp index 0eacb6ca44..4172a0aa6b 100644 --- a/lgc/builder/InOutBuilder.cpp +++ b/lgc/builder/InOutBuilder.cpp @@ -38,6 +38,7 @@ #include 
"lgc/util/Internal.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #define DEBUG_TYPE "lgc-builder-impl-inout" @@ -109,6 +110,49 @@ Value *BuilderImpl::CreateReadPerVertexInput(Type *resultTy, unsigned location, unsigned vertexIndexInt = cast(vertexIndex)->getZExtValue(); Value *result = nullptr; + if (m_pipelineState->isUnlinked() || m_pipelineState->getOptions().dynamicTopology) { + auto resUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::Fragment); + resUsage->builtInUsage.fs.useDynamicToplogy = true; + auto numVertices = ShaderInputs::getSpecialUserData(UserDataMapping::CompositeData, BuilderBase::get(*this)); + numVertices = CreateIntrinsic(Intrinsic::amdgcn_ubfe, getInt32Ty(), {numVertices, getInt32(0), getInt32(2)}); + auto isTriangle = CreateICmpEQ(numVertices, getInt32(3)); + Instruction *InsertI = &*GetInsertPoint(); + Instruction *thenInst = nullptr; + Instruction *elseInst = nullptr; + SplitBlockAndInsertIfThenElse(isTriangle, InsertI, &thenInst, &elseInst); + + BasicBlock *thenBB = thenInst->getParent(); + BasicBlock *elseBB = elseInst->getParent(); + BasicBlock *tailBB = InsertI->getParent(); + + Value *triValue = nullptr; + { + SetInsertPoint(thenInst); + Value *isOne = nullptr; + Value *isTwo = nullptr; + getProvokingVertexInfo(&isOne, &isTwo); + + auto V0 = readInput(getInt32((vertexIndexInt + 0) % 3)); + auto V1 = readInput(getInt32((vertexIndexInt + 1) % 3)); + auto V2 = readInput(getInt32((vertexIndexInt + 2) % 3)); + triValue = CreateSelect(isOne, V1, CreateSelect(isTwo, V2, V0)); + } + + Value *pointOrLineValue = nullptr; + { + SetInsertPoint(elseInst); + pointOrLineValue = readInput(vertexIndex); + } + + { + SetInsertPoint(&*tailBB->getFirstInsertionPt()); + auto phiInst = CreatePHI(resultTy, 2); + phiInst->addIncoming(triValue, thenBB); + phiInst->addIncoming(pointOrLineValue, elseBB); + return phiInst; + } + } + auto vertexCount = m_pipelineState->getVerticesPerPrimitive(); switch (vertexCount) { case 1: @@ -513,31 +557,6 @@ void BuilderImpl::markGenericInputOutputUsage(bool isOutput, unsigned location, // Mark usage for interpolation info. markInterpolationInfo(inOutInfo); } - - if (isOutput && m_shaderStage == ShaderStage::Mesh) { - // Record number of components for mesh shader outputs - for (unsigned i = 0; i < locationCount; ++i) { - unsigned numComponents = 0; - if (inOutInfo.getNumComponents() > 4) { - assert(locationCount % 2 == 0); // Must have even number of locations for 64-bit data type - assert(inOutInfo.getComponent() == 0); // Start component must be 0 in this case - // NOTE: For 64-bit vec3/vec4 data types, they will occupy two consecutive locations, we only record the number - // of components to the former one and skip the latter one. 
- if (i % 2 != 0) - continue; - numComponents = inOutInfo.getNumComponents(); - } else { - numComponents = inOutInfo.getComponent() + inOutInfo.getNumComponents(); - } - - if (inOutInfo.isPerPrimitive()) - resUsage->inOutUsage.mesh.primitiveOutputComponents[location + i] = {numComponents, - static_cast(InvalidValue)}; - else - resUsage->inOutUsage.mesh.vertexOutputComponents[location + i] = {numComponents, - static_cast(InvalidValue)}; - } - } } // ===================================================================================================================== @@ -1143,21 +1162,78 @@ Value *BuilderImpl::normalizeBaryCoord(InOutInfo inputInfo, Value *iJCoord) { hwCoord[1] = CreateExtractElement(iJCoord, 1); hwCoord[2] = CreateFSub(CreateFSub(one, hwCoord[0]), hwCoord[1]); - Value *normalized[3] = {zero, zero, zero}; + if (m_pipelineState->isUnlinked() || m_pipelineState->getOptions().dynamicTopology) { + auto resUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::Fragment); + resUsage->builtInUsage.fs.useDynamicToplogy = true; + auto numVertices = ShaderInputs::getSpecialUserData(UserDataMapping::CompositeData, BuilderBase::get(*this)); + numVertices = CreateIntrinsic(Intrinsic::amdgcn_ubfe, getInt32Ty(), {numVertices, getInt32(0), getInt32(2)}); + auto currentBlock = GetInsertBlock(); + auto endBlock = currentBlock->splitBasicBlock(numVertices->getNextNode()); + currentBlock->getTerminator()->eraseFromParent(); + SetInsertPoint(currentBlock); + auto switchInst = CreateSwitch(numVertices, endBlock, 3); + BasicBlock *case0 = BasicBlock::Create(getContext(), "case0", currentBlock->getParent(), endBlock); + BasicBlock *case1 = BasicBlock::Create(getContext(), "case1", currentBlock->getParent(), endBlock); + BasicBlock *case2 = BasicBlock::Create(getContext(), "case2", currentBlock->getParent(), endBlock); + switchInst->addCase(getInt32(1), case0); + switchInst->addCase(getInt32(2), case1); + switchInst->addCase(getInt32(3), case2); + + Value *pointCoord = ConstantVector::get({one, zero, zero}); + { + SetInsertPoint(case0); + CreateBr(endBlock); + } + + Value *lineCoord = ConstantVector::get({zero, zero, zero}); + { + SetInsertPoint(case1); + auto yCoord = CreateFAdd(hwCoord[0], hwCoord[1]); + lineCoord = CreateInsertElement(lineCoord, hwCoord[2], uint64_t(0)); + lineCoord = CreateInsertElement(lineCoord, yCoord, 1); + CreateBr(endBlock); + } + + Value *triCoord = PoisonValue::get(baryType); + { + SetInsertPoint(case2); + Value *isOne = nullptr; + Value *isTwo = nullptr; + getProvokingVertexInfo(&isOne, &isTwo); + + Value *barycoord1 = CreateInsertElement(PoisonValue::get(baryType), hwCoord[0], uint64_t(0)); + barycoord1 = CreateInsertElement(barycoord1, hwCoord[1], 1); + barycoord1 = CreateInsertElement(barycoord1, hwCoord[2], 2); + + Value *barycoord0 = CreateShuffleVector(barycoord1, ArrayRef({2, 0, 1})); + Value *barycoord2 = CreateShuffleVector(barycoord1, ArrayRef({1, 2, 0})); + triCoord = CreateSelect(isOne, barycoord1, CreateSelect(isTwo, barycoord2, barycoord0)); + CreateBr(endBlock); + } + + { + SetInsertPoint(&*endBlock->getFirstInsertionPt()); + auto phiInst = CreatePHI(baryType, 4); + phiInst->addIncoming(pointCoord, case0); + phiInst->addIncoming(lineCoord, case1); + phiInst->addIncoming(triCoord, case2); + phiInst->addIncoming(PoisonValue::get(baryType), currentBlock); + return phiInst; + } + } auto vertexCount = m_pipelineState->getVerticesPerPrimitive(); switch (vertexCount) { case 1: // Points - normalized[0] = one; - break; + return ConstantVector::get({one, 
zero, zero}); case 2: { // Lines // The weight of vertex0 is (1 - i - j), the weight of vertex1 is (i + j). auto yCoord = CreateFAdd(hwCoord[0], hwCoord[1]); - normalized[0] = hwCoord[2]; - normalized[1] = yCoord; - break; + Value *barycoord = CreateInsertElement(ConstantVector::get({zero, zero, zero}), hwCoord[2], uint64_t(0)); + barycoord = CreateInsertElement(barycoord, yCoord, 1); + return barycoord; } case 3: { Value *isOne = nullptr; @@ -1182,11 +1258,7 @@ Value *BuilderImpl::normalizeBaryCoord(InOutInfo inputInfo, Value *iJCoord) { break; } } - - Value *barycoord = PoisonValue::get(baryType); - for (unsigned i = 0; i < 3; ++i) - barycoord = CreateInsertElement(barycoord, normalized[i], i); - return barycoord; + return PoisonValue::get(baryType); } // ===================================================================================================================== diff --git a/lgc/builder/MatrixBuilder.cpp b/lgc/builder/MatrixBuilder.cpp index dc4a3ff04f..a21161c04b 100644 --- a/lgc/builder/MatrixBuilder.cpp +++ b/lgc/builder/MatrixBuilder.cpp @@ -391,343 +391,3 @@ Type *BuilderCommon::getCooperativeMatrixTy(CooperativeMatrixElementType elemTyp llvm_unreachable("Type is not supported!"); } } - -// ===================================================================================================================== -// Determine the "length" of a cooperative matrix for purposes of extract/insert operations. -// -// @param elemType : the matrix element type -// @param layout : the matrix layout -// @param instName : name to give instruction(s) -Value *BuilderCommon::CreateCooperativeMatrixLength(CooperativeMatrixElementType elemType, - CooperativeMatrixLayout layout, const Twine &instName) { - Type *resultTy = getInt32Ty(); - Value *args[] = {getInt32(static_cast(elemType)), getInt32(static_cast(layout))}; - std::string callName(lgcName::CooperativeMatrixLength); - addTypeMangling(resultTy, args, callName); - - Value *result = - CreateNamedCall(callName, resultTy, args, {Attribute::ReadNone, Attribute::Speculatable, Attribute::WillReturn}); - result->setName(instName); - return result; -} - -// ===================================================================================================================== -// Create an "extractelement"-equivalent operation for a cooperative matrix value. -// -// @param matrix : the matrix from which to extract an element -// @param index : the index from which to extract -// @param elemType : the matrix element type -// @param layout : the matrix layout -// @param instName : name to give instruction(s) -Value *BuilderCommon::CreateCooperativeMatrixExtract(Value *matrix, Value *index, CooperativeMatrixElementType elemType, - CooperativeMatrixLayout layout, const Twine &instName) { - assert(matrix->getType() == getCooperativeMatrixTy(elemType, layout)); - - Type *resultTy = transCooperativeMatrixElementType(elemType); - Value *args[] = {matrix, index, getInt32(static_cast(elemType)), getInt32(static_cast(layout))}; - std::string callName(lgcName::CooperativeMatrixExtract); - addTypeMangling(resultTy, args, callName); - Value *result = - CreateNamedCall(callName, resultTy, args, - {Attribute::ReadNone, Attribute::Convergent, Attribute::Speculatable, Attribute::WillReturn}); - result->setName(instName); - return result; -} - -// ===================================================================================================================== -// Create an "insertelement"-equivalent operation for a cooperative matrix value. 
-// -// @param matrix : the matrix from which to extract an element -// @param index : the index from which to extract -// @param elemType : the matrix element type -// @param layout : the matrix layout -// @param instName : name to give instruction(s) -Value *BuilderCommon::CreateCooperativeMatrixInsert(Value *matrix, Value *value, Value *index, - CooperativeMatrixElementType elemType, - CooperativeMatrixLayout layout, const Twine &instName) { - assert(matrix->getType() == getCooperativeMatrixTy(elemType, layout)); - assert(value->getType() == transCooperativeMatrixElementType(elemType)); - assert(index->getType() == getInt32Ty()); - - Type *resultTy = matrix->getType(); - Value *args[] = {matrix, value, index, getInt32(static_cast(elemType)), - getInt32(static_cast(layout))}; - std::string callName(lgcName::CooperativeMatrixInsert); - addTypeMangling(resultTy, args, callName); - Value *result = - CreateNamedCall(callName, resultTy, args, - {Attribute::ReadNone, Attribute::Convergent, Attribute::Speculatable, Attribute::WillReturn}); - result->setName(instName); - return result; -} - -// ===================================================================================================================== -// Create an "fill"-equivalent operation for a cooperative matrix value. -// -// @param value : the value to fill the cooperative matrix -// @param elemType : the matrix element type -// @param layout : the matrix layout -// @param instName : name to give instruction(s) -Value *BuilderCommon::CreateCooperativeMatrixFill(Value *value, CooperativeMatrixElementType elemType, - CooperativeMatrixLayout layout, const Twine &instName) { - Type *resultTy = getCooperativeMatrixTy(elemType, layout); - Value *args[] = {value, getInt32(static_cast(elemType)), getInt32(static_cast(layout))}; - std::string callName(lgcName::CooperativeMatrixFill); - addTypeMangling(resultTy, args, callName); - Value *result = - CreateNamedCall(callName, resultTy, args, - {Attribute::ReadNone, Attribute::Convergent, Attribute::Speculatable, Attribute::WillReturn}); - result->setName(instName); - return result; -} - -// ===================================================================================================================== -// Create cooperative matrix load. -// We only allow the size 16x16 size for a cooperative matrix. So 16 lanes are responsible for reading all data from -// memory. The layout of a cooperative matrix A in the VGPR under wave32 mode is that . Each lane reads a contiguous -// data from memory as a row (or column) of matrix A into the VGPR (implemented as a vector), where A0_0 in one VGPR if -// the data format is f32/i32, A0_0/A0_1 would be in the same VGPR if the data format is f16, A0_0/A0_1/A0_2/A0_3 would -// be in the same VGPR if the data format is i8. -// -// @param pointer : The pointer to a data array. -// @param stride : The number of bytes in memory between the first component of consecutive rows (or columns) in the -// source data. Must be a multiple of the matrix element size. -// @param colMaj : Whether the values loaded from memory are arrayed in column-major or row-major. -// @param elemType : Element type for the matrix. -// @param layout : Identify whether it's A/B or C/D -// @param memoryAccess : Parsed from memory operation. -// @param alignment : Alignment for memory operation. -// @param instName : Name to give instruction(s). 
-Value *BuilderCommon::CreateCooperativeMatrixLoad(Value *pointer, Value *stride, bool colMajor, - CooperativeMatrixElementType elemType, CooperativeMatrixLayout layout, - unsigned memoryAccess, Align alignment, const Twine &instName) { - Type *resultTy = getCooperativeMatrixTy(elemType, layout); - std::string callName(lgcName::CooperativeMatrixLoad); - Value *args[] = {pointer, - stride, - getInt1(colMajor), - getInt32(static_cast(elemType)), - getInt32(static_cast(layout)), - getInt32(memoryAccess), - getInt32(alignment.value())}; - addTypeMangling(resultTy, args, callName); - Value *loadVal = CreateNamedCall(callName, resultTy, args, {Attribute::ReadOnly, Attribute::Convergent}); - loadVal->setName(instName); - return loadVal; -} - -// ===================================================================================================================== -// Create cooperative matrix store. -// We only allow the size 16x16 size for a cooperative matrix. So 16 lanes are responsible for writing matrix elements -// to memory. The layout of a cooperative matrix A in the VGPR under wave32 mode is that each lane writes a row (or -// column) of matrix A from the VGPRs (implemented as a vector) to the memory, where the value of one VGPR is written -// into a memory location if the data format is f32/i32, the value of one VGPR is split into two values to store if -// the data format is f16, the value of one VGPR is split into four values to store if the data format is i8. -// -// @param pointer : The pointer to a data array. -// @param matrix : The row of cooperative matrix to store. -// @param stride : The number of bytes in memory between the first components of consecutive rows (or columns) in the -// destination. Must be a multiple of the element size. -// @param colMaj : Whether the values loaded from memory are arrayed in column-major or row-major. -// @param elemType : Element type for the matrix. -// @param layout : Identify the matrix type(A/B or C). -// @param memoryAccess : Memoray operands -// @param alignment : Alignment for memory operation. -// @param instName : Name to give instruction(s). -Value *BuilderCommon::CreateCooperativeMatrixStore(Value *pointer, Value *matrix, Value *stride, bool colMajor, - CooperativeMatrixElementType elemType, - CooperativeMatrixLayout layout, unsigned memoryAccess, - Align alignment, const Twine &instName) { - assert(matrix->getType() == getCooperativeMatrixTy(elemType, layout)); - - std::string callName(lgcName::CooperativeMatrixStore); - Value *args[] = {pointer, - stride, - getInt1(colMajor), - getInt32(static_cast(elemType)), - getInt32(static_cast(layout)), - getInt32(memoryAccess), - getInt32(alignment.value()), - matrix}; - addTypeMangling(Type::getVoidTy(getContext()), args, callName); - - Value *storeVal = CreateNamedCall(callName, Type::getVoidTy(getContext()), args, - {Attribute::WriteOnly, Attribute::Convergent, Attribute::WillReturn}); - storeVal->setName(instName); - return nullptr; -} - -// ===================================================================================================================== -// Create cooperative matrix conversion. -// Element-wise-conversion -// @param castOp : The cast Opcode. -// @param source : The source cooperative matrix. -// @param srcElemTy : Source matrix's element type. -// @param dstElemTy : Destination matrix's element type. -// @param srcLayout : Layout for source matrix -// @param dstLayout : Layout for target matrix -// @param instName : Name to give instruction(s). 
-CallInst *BuilderCommon::CreateCooperativeMatrixConvert(CastInst::CastOps castOp, Value *source, - CooperativeMatrixElementType srcElemTy, - CooperativeMatrixElementType dstElemTy, - CooperativeMatrixLayout srcLayout, - CooperativeMatrixLayout dstLayout, const Twine &instName) { - assert(source->getType() == getCooperativeMatrixTy(srcElemTy, srcLayout)); - - Value *args[] = {getInt32(static_cast(castOp)), source, - getInt32(static_cast(srcElemTy)), getInt32(static_cast(dstElemTy)), - getInt32(static_cast(srcLayout)), getInt32(static_cast(dstLayout))}; - Type *resultTy = getCooperativeMatrixTy(dstElemTy, dstLayout); - std::string callName(lgcName::CooperativeMatrixConvert); - addTypeMangling(resultTy, args, callName); - - CallInst *dstElems = - CreateNamedCall(callName, resultTy, args, {Attribute::ReadNone, Attribute::Convergent, Attribute::WillReturn}); - dstElems->setName(instName); - return dstElems; -} -// ===================================================================================================================== -// Create cooperative matrix binary operation -// -// @param coopMatArithOp : The cooperative matrix arithmetic operation to perform. -// @param lhs : The first operand and it can be a scalar or a cooperative matrix. -// @param rhs : The second operand and it should be a cooperative matrix. -// @param elemType : Element type for the matrix. -// @param layout : Layout for the matrix. -// @param instName : Name to give instruction(s). -Value *BuilderCommon::CreateCooperativeMatrixBinaryOp(CooperativeMatrixArithOp coopMatArithOp, Value *lhs, Value *rhs, - CooperativeMatrixElementType elemType, - CooperativeMatrixLayout layout, const Twine &instName) { - assert(lhs->getType() == getCooperativeMatrixTy(elemType, layout)); - assert(lhs->getType() == rhs->getType()); - - std::string callName(lgcName::CooperativeMatrixBinOp); - Value *args[] = {getInt32(static_cast(coopMatArithOp)), lhs, rhs, getInt32(static_cast(elemType)), - getInt32(static_cast(layout))}; - addTypeMangling(rhs->getType(), args, callName); - - Value *result = CreateNamedCall(callName, rhs->getType(), args, - {Attribute::ReadNone, Attribute::Convergent, Attribute::WillReturn}); - result->setName(instName); - return result; -} - -// ===================================================================================================================== -// Create cooperative matrix MatrixTimesScalar operation -// -// @param matrix : The first operand and it should be a cooperative matrix. -// @param scalar : The second operand and it should be a scalar. If the matrix is a packed accumulator matrix, the -// scalar has to be a <2 x half> vector. -// @param elemType : The component type of the matrix. -// @param layout : Identify whether it's A/B or C/D -// @param instName : Name to give instruction(s). -Value *BuilderCommon::CreateCoopMatrixTimesScalar(Value *matrix, Value *scalar, CooperativeMatrixElementType elemType, - CooperativeMatrixLayout layout, const Twine &instName) { - assert(matrix->getType() == getCooperativeMatrixTy(elemType, layout)); - assert(scalar->getType() == (elemType == CooperativeMatrixElementType::Float16Packed - ? 
FixedVectorType::get(getHalfTy(), 2) - : transCooperativeMatrixElementType(elemType))); - - std::string callName(lgcName::CooperativeMatrixTimesScalar); - Value *args[] = {matrix, scalar, getInt32(static_cast(elemType)), getInt32(static_cast(layout))}; - addTypeMangling(matrix->getType(), args, callName); - - Value *result = CreateNamedCall(callName, matrix->getType(), args, - {Attribute::ReadNone, Attribute::Convergent, Attribute::WillReturn}); - result->setName(instName); - return result; -} - -// ===================================================================================================================== -// Create cooperative matrix transpose operation -// -// @param matrix : The first operand and it should be a cooperative matrix. -// @param elemType : The component type of the matrix. -// @param layout : Identify whether it's A/B or C/D -// @param instName : Name to give instruction(s). -CallInst *BuilderCommon::CreateCooperativeMatrixTranspose(llvm::Value *matrix, CooperativeMatrixElementType elemType, - CooperativeMatrixLayout layout, const Twine &instName) { - assert(matrix->getType() == getCooperativeMatrixTy(elemType, layout)); - - std::string callName(lgcName::CooperativeMatrixTranspose); - Value *args[] = {matrix, getInt32(static_cast(elemType)), getInt32(static_cast(layout))}; - addTypeMangling(matrix->getType(), args, callName); - - CallInst *result = CreateNamedCall(callName, matrix->getType(), args, - {Attribute::ReadNone, Attribute::Convergent, Attribute::WillReturn}); - result->setName(instName); - return result; -} - -// ===================================================================================================================== -// Create cooperative matrix muladd operation -// -// @param matrixA : Factor cooperative matrix. -// @param matrixB : Factor cooperative matrix. -// @param matrixC : Accumulator cooperative matrix. -// @param isSignedA : Identify the signess for matrix A's element type -// @param isSignedB : Identify the signess for matrix B's element type -// @param isSatOrOpsel : SaturatingAccumulation for calculation. In the case of 16-bit floating point -// matrices, this bit acts as an opsel bit. If it is set to false, we store the result in the lower half of -// the registers. If it is true, we store it in the upper half. -// @param isTied : If true, the output matrix has to be the same as the input accumulator (i.e., D has to be C) -// @param accumElemType : The component type of the accumulator matrix. -// @param factorElemType : The component type of the factor matrix. 
-Value *BuilderCommon::CreateCooperativeMatrixMulAdd(llvm::Value *matrixA, llvm::Value *matrixB, llvm::Value *matrixC, - bool isSignedA, bool isSignedB, bool isSatOrOpsel, bool isTied, - CooperativeMatrixElementType accumElemType, - CooperativeMatrixElementType factorElemType, - const llvm::Twine &instName) { - std::string callName(lgcName::CooperativeMatrixMulAdd); - Value *args[] = {matrixA, - matrixB, - matrixC, - getInt1(isSignedA), - getInt1(isSignedB), - getInt1(isSatOrOpsel), - getInt1(isTied), - getInt32(static_cast(accumElemType)), - getInt32(static_cast(factorElemType))}; - addTypeMangling(matrixC->getType(), args, callName); - - Value *result = CreateNamedCall(callName, matrixC->getType(), args, - {Attribute::ReadNone, Attribute::Convergent, Attribute::WillReturn}); - result->setName(instName); - return result; -} - -// ===================================================================================================================== -// Create cooperative matrix pack operation -// -// @param matrixCLo : Lower Accumulator cooperative matrix. -// @param matrixCHi : Upper Accumulator cooperative matrix. -Value *BuilderCommon::CreateCooperativeMatrixPack(llvm::Value *matrixCLo, llvm::Value *matrixCHi, - const llvm::Twine &instName) { - - std::string callName(lgcName::CooperativeMatrixPack); - Value *args[] = {matrixCLo, matrixCHi}; - - Type *retTy = matrixCLo->getType(); - addTypeMangling(retTy, args, callName); - - Value *result = CreateNamedCall(callName, retTy, args, {Attribute::ReadNone, Attribute::WillReturn}); - result->setName(instName); - return result; -} - -// ===================================================================================================================== -// Create cooperative matrix unpack operation -// -// @param packedMatrix : Packed Accumulator cooperative matrix. -// @param high: Whether to get the matrix stored in the upper half of the registers. -Value *BuilderCommon::CreateCooperativeMatrixUnpack(llvm::Value *packedMatrix, bool high, const llvm::Twine &instName) { - - std::string callName(lgcName::CooperativeMatrixUnpack); - Value *args[] = {packedMatrix, getInt1(high)}; - Type *retTy = packedMatrix->getType(); - addTypeMangling(retTy, args, callName); - - Value *result = CreateNamedCall(callName, retTy, args, {Attribute::ReadNone, Attribute::WillReturn}); - result->setName(instName); - return result; -} diff --git a/lgc/builder/SubgroupBuilder.cpp b/lgc/builder/SubgroupBuilder.cpp index 6df4b1e5c0..3a74d5bc01 100644 --- a/lgc/builder/SubgroupBuilder.cpp +++ b/lgc/builder/SubgroupBuilder.cpp @@ -90,16 +90,27 @@ Value *SubgroupBuilder::CreateSubgroupElect(const Twine &instName) { // // @param value : The value to compare across the subgroup. Must be an integer type. // @param instName : Name to give final instruction. 
-Value *BuilderImpl::CreateSubgroupAll(Value *const value, const Twine &instName) { - Value *result = CreateICmpEQ(createGroupBallot(value), createGroupBallot(getTrue())); +Value *SubgroupBuilder::CreateSubgroupAll(Value *const value, const Twine &instName) { + bool ballotExcludeHelperLanes = false; + bool includeHelperLanes = false; + bool requireHelperLanes = false; + + if (getShaderStage(GetInsertBlock()->getParent()).value() == ShaderStage::Fragment) { + const auto &fragmentMode = m_pipelineState->getShaderModes()->getFragmentShaderMode(); + ballotExcludeHelperLanes = fragmentMode.waveOpsExcludeHelperLanes; + includeHelperLanes = !fragmentMode.waveOpsExcludeHelperLanes; + requireHelperLanes = fragmentMode.waveOpsRequireHelperLanes; + } + + Value *result = CreateICmpEQ(createGroupBallot(value, ballotExcludeHelperLanes), + createGroupBallot(getTrue(), ballotExcludeHelperLanes)); result = CreateSelect(CreateUnaryIntrinsic(Intrinsic::is_constant, value), value, result); // Helper invocations of whole quad mode should be included in the subgroup vote execution - const auto &fragmentMode = m_pipelineState->getShaderModes()->getFragmentShaderMode(); - if (m_shaderStage == ShaderStage::Fragment && !fragmentMode.waveOpsExcludeHelperLanes) { + if (includeHelperLanes) { result = CreateZExt(result, getInt32Ty()); - result = CreateIntrinsic(fragmentMode.waveOpsRequireHelperLanes ? Intrinsic::amdgcn_wqm : Intrinsic::amdgcn_softwqm, - {getInt32Ty()}, {result}); + result = CreateIntrinsic(requireHelperLanes ? Intrinsic::amdgcn_wqm : Intrinsic::amdgcn_softwqm, {getInt32Ty()}, + {result}); result = CreateTrunc(result, getInt1Ty()); } return result; @@ -140,7 +151,7 @@ Value *SubgroupBuilder::CreateSubgroupAny(Value *const value, const Twine &instN // // @param value : The value to compare across the subgroup. Must be an integer type. // @param instName : Name to give final instruction. -Value *BuilderImpl::CreateSubgroupAllEqual(Value *const value, const Twine &instName) { +Value *SubgroupBuilder::CreateSubgroupAllEqual(Value *const value, const Twine &instName) { Type *const type = value->getType(); Value *compare = CreateSubgroupBroadcastFirst(value, instName); @@ -170,8 +181,8 @@ Value *BuilderImpl::CreateSubgroupAllEqual(Value *const value, const Twine &inst // @param delta : The delta/offset added to lane id. // @param clusterSize : The cluster size if exists. // @param instName : Name to give final instruction. -Value *BuilderImpl::CreateSubgroupRotate(Value *const value, Value *const delta, Value *const clusterSize, - const Twine &instName) { +Value *SubgroupBuilder::CreateSubgroupRotate(Value *const value, Value *const delta, Value *const clusterSize, + const Twine &instName) { // LocalId = SubgroupLocalInvocationId // RotationGroupSize = hasClusterSIze? ClusterSize : SubgroupSize. // Invocation ID = ((LocalId + Delta) & (RotationGroupSize - 1)) + (LocalId & ~(RotationGroupSize - 1)) @@ -375,6 +386,7 @@ Value *BuilderImpl::CreateSubgroupBallotFindMsb(Value *const value, const Twine // @param index : The index to shuffle from. // @param instName : Name to give final instruction. Value *BuilderImpl::CreateSubgroupShuffle(Value *const value, Value *const index, const Twine &instName) { + if (supportWaveWideBPermute()) { auto mapFunc = [](BuilderBase &builder, ArrayRef mappedArgs, ArrayRef passthroughArgs) -> Value * { @@ -391,22 +403,23 @@ Value *BuilderImpl::CreateSubgroupShuffle(Value *const value, Value *const index // Start the WWM section by setting the inactive lanes. 
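A scalar model of the ballot comparison that CreateSubgroupAll above relies on: the vote passes exactly when the ballot of the value equals the ballot of an all-true value, i.e. every active lane contributed a set bit. The helper-lane handling (wqm/softwqm wrapping) is outside this sketch.

#include <cstdint>
#include <vector>

// Assumes a wave of at most 64 lanes, with activity given by a 64-bit exec mask.
bool subgroupAllModel(const std::vector<bool> &laneValues, uint64_t execMask) {
  uint64_t ballotValue = 0, ballotTrue = 0;
  for (unsigned lane = 0; lane < laneValues.size(); ++lane) {
    if (!((execMask >> lane) & 1))
      continue; // inactive lanes contribute to neither ballot
    ballotTrue |= 1ull << lane;
    if (laneValues[lane])
      ballotValue |= 1ull << lane;
  }
  return ballotValue == ballotTrue;
}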
Value *const poisonValue = PoisonValue::get(value->getType()); Value *const poisonIndex = PoisonValue::get(index->getType()); - Value *const scaledIndex = CreateMul(index, getInt32(4)); Value *wwmValue = BuilderBase::get(*this).CreateSetInactive(value, poisonValue); - Value *wwmIndex = BuilderBase::get(*this).CreateSetInactive(scaledIndex, poisonIndex); + Value *wwmIndex = nullptr; + BuilderBase::MapToSimpleTypeFunc bPermFunc = nullptr; + { + Value *const scaledIndex = CreateMul(index, getInt32(4)); + wwmIndex = BuilderBase::get(*this).CreateSetInactive(scaledIndex, poisonIndex); + bPermFunc = [](BuilderBase &builder, ArrayRef mappedArgs, ArrayRef passthroughArgs) -> Value * { + return builder.CreateIntrinsic(Intrinsic::amdgcn_ds_bpermute, {}, {passthroughArgs[0], mappedArgs[0]}); + }; + } auto permuteFunc = [](BuilderBase &builder, ArrayRef mappedArgs, ArrayRef passthroughArgs) -> Value * { return builder.CreateIntrinsic(builder.getInt32Ty(), Intrinsic::amdgcn_permlane64, {mappedArgs[0]}); }; - auto swapped = CreateMapToSimpleType(permuteFunc, wwmValue, {}); - auto bPermFunc = [](BuilderBase &builder, ArrayRef mappedArgs, - ArrayRef passthroughArgs) -> Value * { - return builder.CreateIntrinsic(Intrinsic::amdgcn_ds_bpermute, {}, {passthroughArgs[0], mappedArgs[0]}); - }; - auto bPermSameHalf = CreateMapToSimpleType(bPermFunc, wwmValue, wwmIndex); auto bPermOtherHalf = CreateMapToSimpleType(bPermFunc, swapped, wwmIndex); bPermOtherHalf = createWwm(bPermOtherHalf); @@ -449,9 +462,8 @@ Value *BuilderImpl::CreateSubgroupShuffleXor(Value *const value, Value *const ma // issue dpp_mov for some simple quad/row shuffle cases; // then issue ds_permlane_x16 if supported or ds_swizzle, if maskValue < 32 // default to call SubgroupShuffle, which may issue waterfallloops to handle complex cases. - if (isa(mask)) { - maskValue = cast(mask)->getZExtValue(); - + if (auto maskInt = dyn_cast(mask)) { + maskValue = maskInt->getZExtValue(); if (maskValue < 32) { canOptimize = true; switch (maskValue) { @@ -571,8 +583,11 @@ Value *BuilderImpl::CreateSubgroupShuffleDown(Value *const value, Value *const d // @param instName : Name to give final instruction. Value *BuilderImpl::CreateSubgroupClusteredReduction(GroupArithOp groupArithOp, Value *const value, Value *const inClusterSize, const Twine &instName) { - auto waveSize = getInt32(getShaderWaveSize()); - Value *clusterSize = CreateSelect(CreateICmpUGT(inClusterSize, waveSize), waveSize, inClusterSize); + assert(isa(inClusterSize)); + unsigned clusterSize = cast(inClusterSize)->getZExtValue(); + assert(isPowerOf2_32(clusterSize)); + const unsigned waveSize = getShaderWaveSize(); + clusterSize = std::min(clusterSize, waveSize); // Start the WWM section by setting the inactive lanes. Value *const identity = createGroupArithmeticIdentity(groupArithOp, value->getType()); @@ -585,53 +600,47 @@ Value *BuilderImpl::CreateSubgroupClusteredReduction(GroupArithOp groupArithOp, result = CreateSelect(isLive, result, identity); } - // Perform The group arithmetic operation between adjacent lanes in the subgroup, with all masks and rows enabled - // (0xF). - result = CreateSelect( - CreateICmpUGE(clusterSize, getInt32(2)), - createGroupArithmeticOperation(groupArithOp, result, - createDppUpdate(identity, result, DppCtrl::DppQuadPerm1032, 0xF, 0xF, true)), - result); - - // Perform The group arithmetic operation between N <-> N+2 lanes in the subgroup, with all masks and rows enabled - // (0xF). 
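A scalar model of the wave64 fallback in CreateSubgroupShuffle above, under the assumption (taken from the surrounding code) that ds_bpermute only routes data within a 32-lane half: the same half is read directly, the other half is read from a permlane64-swapped copy, and the per-lane choice depends on which half the source index lives in. The multiplication of the index by 4 in the real code is just ds_bpermute's byte addressing.

#include <cstdint>

uint32_t shuffleWave64Model(const uint32_t (&lanes)[64], unsigned lane, unsigned srcIndex) {
  uint32_t sameHalf = lanes[(lane & 32) | (srcIndex & 31)];         // bpermute within the lane's own half
  uint32_t otherHalf = lanes[((lane ^ 32) & 32) | (srcIndex & 31)]; // bpermute of the half-swapped copy
  bool srcInSameHalf = (srcIndex & 32) == (lane & 32);
  return srcInSameHalf ? sameHalf : otherHalf;
}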
- result = CreateSelect( - CreateICmpUGE(clusterSize, getInt32(4)), - createGroupArithmeticOperation(groupArithOp, result, - createDppUpdate(identity, result, DppCtrl::DppQuadPerm2301, 0xF, 0xF, true)), - result); - - // Use a row half mirror to make all values in a cluster of 8 the same, with all masks and rows enabled (0xF). - result = CreateSelect( - CreateICmpUGE(clusterSize, getInt32(8)), - createGroupArithmeticOperation(groupArithOp, result, - createDppUpdate(identity, result, DppCtrl::DppRowHalfMirror, 0xF, 0xF, true)), - result); - - // Use a row mirror to make all values in a cluster of 16 the same, with all masks and rows enabled (0xF). - result = - CreateSelect(CreateICmpUGE(clusterSize, getInt32(16)), - createGroupArithmeticOperation( - groupArithOp, result, createDppUpdate(identity, result, DppCtrl::DppRowMirror, 0xF, 0xF, true)), - result); - - // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2). - result = - CreateSelect(CreateICmpUGE(clusterSize, getInt32(32)), - createGroupArithmeticOperation( - groupArithOp, result, createPermLaneX16(result, result, UINT32_MAX, UINT32_MAX, true, false)), - result); + if (clusterSize >= 2) { + // Perform The group arithmetic operation between adjacent lanes in the subgroup, with all masks and rows enabled + // (0xF). + result = createGroupArithmeticOperation( + groupArithOp, result, createDppUpdate(identity, result, DppCtrl::DppQuadPerm1032, 0xF, 0xF, true)); + } + + if (clusterSize >= 4) { + // Perform The group arithmetic operation between N <-> N+2 lanes in the subgroup, with all masks and rows enabled + // (0xF). + result = createGroupArithmeticOperation( + groupArithOp, result, createDppUpdate(identity, result, DppCtrl::DppQuadPerm2301, 0xF, 0xF, true)); + } - if (supportPermLane64Dpp()) { - result = CreateSelect(CreateICmpEQ(clusterSize, getInt32(64)), - createGroupArithmeticOperation(groupArithOp, result, createPermLane64(result)), result); - } else { - Value *const broadcast31 = CreateSubgroupBroadcast(result, getInt32(31), instName); - Value *const broadcast63 = CreateSubgroupBroadcast(result, getInt32(63), instName); + if (clusterSize >= 8) { + // Use a row half mirror to make all values in a cluster of 8 the same, with all masks and rows enabled (0xF). + result = createGroupArithmeticOperation( + groupArithOp, result, createDppUpdate(identity, result, DppCtrl::DppRowHalfMirror, 0xF, 0xF, true)); + } + + if (clusterSize >= 16) { + // Use a row mirror to make all values in a cluster of 16 the same, with all masks and rows enabled (0xF). + result = createGroupArithmeticOperation(groupArithOp, result, + createDppUpdate(identity, result, DppCtrl::DppRowMirror, 0xF, 0xF, true)); + } + + if (clusterSize >= 32) { + // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2). + result = createGroupArithmeticOperation(groupArithOp, result, + createPermLaneX16(result, result, UINT32_MAX, UINT32_MAX, true, false)); + } - // Combine broadcast from the 31st and 63rd for the final result. 
- result = CreateSelect(CreateICmpEQ(clusterSize, getInt32(64)), - createGroupArithmeticOperation(groupArithOp, broadcast31, broadcast63), result); + if (clusterSize == 64) { + assert(waveSize == 64); + if (supportPermLane64Dpp()) { + result = createGroupArithmeticOperation(groupArithOp, result, createPermLane64(result)); + } else { + Value *const broadcast31 = CreateSubgroupBroadcast(result, getInt32(31), instName); + Value *const broadcast63 = CreateSubgroupBroadcast(result, getInt32(63), instName); + result = createGroupArithmeticOperation(groupArithOp, broadcast31, broadcast63); + } } // Finish the WWM section by calling the intrinsic. @@ -654,65 +663,62 @@ Value *BuilderImpl::CreateSubgroupClusteredReduction(GroupArithOp groupArithOp, // @param instName : Name to give final instruction. Value *BuilderImpl::CreateSubgroupClusteredInclusive(GroupArithOp groupArithOp, Value *const value, Value *const inClusterSize, const Twine &instName) { - auto waveSize = getInt32(getShaderWaveSize()); - Value *clusterSize = CreateSelect(CreateICmpUGT(inClusterSize, waveSize), waveSize, inClusterSize); + assert(isa(inClusterSize)); + unsigned clusterSize = cast(inClusterSize)->getZExtValue(); + assert(isPowerOf2_32(clusterSize)); + const unsigned waveSize = getShaderWaveSize(); + clusterSize = std::min(clusterSize, waveSize); Value *const identity = createGroupArithmeticIdentity(groupArithOp, value->getType()); // Start the WWM section by setting the inactive invocations. - Value *const setInactive = BuilderBase::get(*this).CreateSetInactive(value, identity); - - // The DPP operation has all rows active and all banks in the rows active (0xF). - Value *result = CreateSelect( - CreateICmpUGE(clusterSize, getInt32(2)), - createGroupArithmeticOperation(groupArithOp, setInactive, - createDppUpdate(identity, setInactive, DppCtrl::DppRowSr1, 0xF, 0xF, 0)), - setInactive); - - // The DPP operation has all rows active and all banks in the rows active (0xF). - result = - CreateSelect(CreateICmpUGE(clusterSize, getInt32(4)), - createGroupArithmeticOperation( - groupArithOp, result, createDppUpdate(identity, setInactive, DppCtrl::DppRowSr2, 0xF, 0xF, 0)), - result); - - // The DPP operation has all rows active and all banks in the rows active (0xF). - result = - CreateSelect(CreateICmpUGE(clusterSize, getInt32(4)), - createGroupArithmeticOperation( - groupArithOp, result, createDppUpdate(identity, setInactive, DppCtrl::DppRowSr3, 0xF, 0xF, 0)), - result); - - // The DPP operation has all rows active (0xF) and the top 3 banks active (0xe, 0b1110) to make sure that in - // each cluster of 16, only the top 12 lanes perform the operation. - result = CreateSelect(CreateICmpUGE(clusterSize, getInt32(8)), - createGroupArithmeticOperation( - groupArithOp, result, createDppUpdate(identity, result, DppCtrl::DppRowSr4, 0xF, 0xE, 0)), - result); - - // The DPP operation has all rows active (0xF) and the top 2 banks active (0xc, 0b1100) to make sure that in - // each cluster of 16, only the top 8 lanes perform the operation. - result = CreateSelect(CreateICmpUGE(clusterSize, getInt32(16)), - createGroupArithmeticOperation( - groupArithOp, result, createDppUpdate(identity, result, DppCtrl::DppRowSr8, 0xF, 0xC, 0)), - result); + Value *result = BuilderBase::get(*this).CreateSetInactive(value, identity); + + if (clusterSize >= 2) { + // The DPP operation has all rows active and all banks in the rows active (0xF). 
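Since inClusterSize is now required to be a constant power of two, the ladder above is emitted with plain compile-time checks instead of guarding every DPP stage with a runtime select. A scalar emulation of what the reduction computes, where every lane ends up holding the reduction of its whole cluster:

#include <functional>
#include <vector>

std::vector<int> clusteredReductionModel(const std::vector<int> &lanes, unsigned clusterSize,
                                         const std::function<int(int, int)> &op) {
  std::vector<int> result(lanes.size());
  for (size_t base = 0; base < lanes.size(); base += clusterSize) {
    int acc = lanes[base];
    for (unsigned i = 1; i < clusterSize && base + i < lanes.size(); ++i)
      acc = op(acc, lanes[base + i]); // reduce the whole cluster starting at 'base'
    for (unsigned i = 0; i < clusterSize && base + i < lanes.size(); ++i)
      result[base + i] = acc; // every lane of the cluster receives the reduced value
  }
  return result;
}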
+ result = createGroupArithmeticOperation(groupArithOp, result, + createDppUpdate(identity, result, DppCtrl::DppRowSr1, 0xF, 0xF, 0)); + } + + if (clusterSize >= 4) { + // The DPP operation has all rows active and all banks in the rows active (0xF). + result = createGroupArithmeticOperation(groupArithOp, result, + createDppUpdate(identity, result, DppCtrl::DppRowSr2, 0xF, 0xF, 0)); + } + + if (clusterSize >= 8) { + // The DPP operation has all rows active (0xF) and the top 3 banks active (0xe, 0b1110) to make sure that in + // each cluster of 16, only the top 12 lanes perform the operation. + result = createGroupArithmeticOperation(groupArithOp, result, + createDppUpdate(identity, result, DppCtrl::DppRowSr4, 0xF, 0xE, 0)); + } + + if (clusterSize >= 16) { + // The DPP operation has all rows active (0xF) and the top 2 banks active (0xc, 0b1100) to make sure that in + // each cluster of 16, only the top 8 lanes perform the operation. + result = createGroupArithmeticOperation(groupArithOp, result, + createDppUpdate(identity, result, DppCtrl::DppRowSr8, 0xF, 0xC, 0)); + } Value *const threadMask = createThreadMask(); - Value *const maskedPermLane = createThreadMaskedSelect( - threadMask, 0xFFFF0000FFFF0000, createPermLaneX16(result, result, UINT32_MAX, UINT32_MAX, true, false), identity); + if (clusterSize >= 32) { + Value *const maskedPermLane = + createThreadMaskedSelect(threadMask, 0xFFFF0000FFFF0000, + createPermLaneX16(result, result, UINT32_MAX, UINT32_MAX, true, false), identity); + // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2). + result = createGroupArithmeticOperation(groupArithOp, result, maskedPermLane); + } - // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2). - result = CreateSelect(CreateICmpUGE(clusterSize, getInt32(32)), - createGroupArithmeticOperation(groupArithOp, result, maskedPermLane), result); + if (clusterSize == 64) { - Value *const broadcast31 = CreateSubgroupBroadcast(result, getInt32(31), instName); + Value *const broadcast31 = CreateSubgroupBroadcast(result, getInt32(31), instName); - Value *const maskedBroadcast = createThreadMaskedSelect(threadMask, 0xFFFFFFFF00000000, broadcast31, identity); + Value *const maskedBroadcast = createThreadMaskedSelect(threadMask, 0xFFFFFFFF00000000, broadcast31, identity); - // Combine broadcast of 31 with the top two rows only. - result = CreateSelect(CreateICmpEQ(clusterSize, getInt32(64)), - createGroupArithmeticOperation(groupArithOp, result, maskedBroadcast), result); + // Combine broadcast of 31 with the top two rows only. + result = createGroupArithmeticOperation(groupArithOp, result, maskedBroadcast); + } // Finish the WWM section by calling the intrinsic. result = createWwm(result); @@ -734,19 +740,22 @@ Value *BuilderImpl::CreateSubgroupClusteredInclusive(GroupArithOp groupArithOp, // @param instName : Name to give final instruction. 
Value *BuilderImpl::CreateSubgroupClusteredExclusive(GroupArithOp groupArithOp, Value *const value, Value *const inClusterSize, const Twine &instName) { - auto waveSize = getInt32(getShaderWaveSize()); - Value *clusterSize = CreateSelect(CreateICmpUGT(inClusterSize, waveSize), waveSize, inClusterSize); + assert(isa(inClusterSize)); + unsigned clusterSize = cast(inClusterSize)->getZExtValue(); + assert(isPowerOf2_32(clusterSize)); + const unsigned waveSize = getShaderWaveSize(); + clusterSize = std::min(clusterSize, waveSize); Value *const identity = createGroupArithmeticIdentity(groupArithOp, value->getType()); // Start the WWM section by setting the inactive invocations. - Value *setInactive = BuilderBase::get(*this).CreateSetInactive(value, identity); + Value *result = BuilderBase::get(*this).CreateSetInactive(value, identity); // For waveOpsExcludeHelperLanes mode, we need mask away the helperlane. const auto &fragmentMode = m_pipelineState->getShaderModes()->getFragmentShaderMode(); if (m_shaderStage == ShaderStage::Fragment && fragmentMode.waveOpsExcludeHelperLanes) { auto isLive = CreateIntrinsic(Intrinsic::amdgcn_live_mask, {}, {}, nullptr, {}); - setInactive = CreateSelect(isLive, setInactive, identity); + result = CreateSelect(isLive, result, identity); } Value *shiftRight = nullptr; @@ -756,10 +765,10 @@ Value *BuilderImpl::CreateSubgroupClusteredExclusive(GroupArithOp groupArithOp, // Shift right within each row: // 0b0110,0101,0100,0011,0010,0001,0000,1111 = 0x6543210F // 0b1110,1101,1100,1011,1010,1001,1000,0111 = 0xEDCBA987 - shiftRight = createPermLane16(setInactive, setInactive, 0x6543210F, 0xEDCBA987, true, false); + shiftRight = createPermLane16(result, result, 0x6543210F, 0xEDCBA987, true, false); - // Only needed for wave size 64. - if (getShaderWaveSize() == 64) { + // Only needed for cluster size 64. + if (clusterSize == 64) { // Need to write the value from the 16th invocation into the 48th. shiftRight = CreateSubgroupWriteInvocation(shiftRight, CreateSubgroupBroadcast(shiftRight, getInt32(16), ""), getInt32(48), ""); @@ -773,55 +782,53 @@ Value *BuilderImpl::CreateSubgroupClusteredExclusive(GroupArithOp groupArithOp, createThreadMaskedSelect(threadMask, 0x0001000100010001, createPermLaneX16(shiftRight, shiftRight, 0, UINT32_MAX, true, false), shiftRight); - // The DPP operation has all rows active and all banks in the rows active (0xF). - Value *result = CreateSelect( - CreateICmpUGE(clusterSize, getInt32(2)), - createGroupArithmeticOperation(groupArithOp, shiftRight, - createDppUpdate(identity, shiftRight, DppCtrl::DppRowSr1, 0xF, 0xF, 0)), - shiftRight); - - // The DPP operation has all rows active and all banks in the rows active (0xF). - result = - CreateSelect(CreateICmpUGE(clusterSize, getInt32(4)), - createGroupArithmeticOperation( - groupArithOp, result, createDppUpdate(identity, shiftRight, DppCtrl::DppRowSr2, 0xF, 0xF, 0)), - result); - - // The DPP operation has all rows active and all banks in the rows active (0xF). - result = - CreateSelect(CreateICmpUGE(clusterSize, getInt32(4)), - createGroupArithmeticOperation( - groupArithOp, result, createDppUpdate(identity, shiftRight, DppCtrl::DppRowSr3, 0xF, 0xF, 0)), - result); - - // The DPP operation has all rows active (0xF) and the top 3 banks active (0xe, 0b1110) to make sure that in - // each cluster of 16, only the top 12 lanes perform the operation. 
- result = CreateSelect(CreateICmpUGE(clusterSize, getInt32(8)), - createGroupArithmeticOperation( - groupArithOp, result, createDppUpdate(identity, result, DppCtrl::DppRowSr4, 0xF, 0xE, 0)), - result); - - // The DPP operation has all rows active (0xF) and the top 2 banks active (0xc, 0b1100) to make sure that in - // each cluster of 16, only the top 8 lanes perform the operation. - result = CreateSelect(CreateICmpUGE(clusterSize, getInt32(16)), - createGroupArithmeticOperation( - groupArithOp, result, createDppUpdate(identity, result, DppCtrl::DppRowSr8, 0xF, 0xC, 0)), - result); - - Value *const maskedPermLane = createThreadMaskedSelect( - threadMask, 0xFFFF0000FFFF0000, createPermLaneX16(result, result, UINT32_MAX, UINT32_MAX, true, false), identity); - - // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2). - result = CreateSelect(CreateICmpUGE(clusterSize, getInt32(32)), - createGroupArithmeticOperation(groupArithOp, result, maskedPermLane), result); - - Value *const broadcast31 = CreateSubgroupBroadcast(result, getInt32(31), instName); - - Value *const maskedBroadcast = createThreadMaskedSelect(threadMask, 0xFFFFFFFF00000000, broadcast31, identity); - - // Combine broadcast of 31 with the top two rows only. - result = CreateSelect(CreateICmpEQ(clusterSize, getInt32(64)), - createGroupArithmeticOperation(groupArithOp, result, maskedBroadcast), result); + if (clusterSize >= 2) { + // The DPP operation has all rows active and all banks in the rows active (0xF). + result = createGroupArithmeticOperation(groupArithOp, shiftRight, + createDppUpdate(identity, shiftRight, DppCtrl::DppRowSr1, 0xF, 0xF, 0)); + } + + if (clusterSize >= 4) { + // The DPP operation has all rows active and all banks in the rows active (0xF). + result = createGroupArithmeticOperation(groupArithOp, result, + createDppUpdate(identity, shiftRight, DppCtrl::DppRowSr2, 0xF, 0xF, 0)); + + // The DPP operation has all rows active and all banks in the rows active (0xF). + result = createGroupArithmeticOperation(groupArithOp, result, + createDppUpdate(identity, shiftRight, DppCtrl::DppRowSr3, 0xF, 0xF, 0)); + } + + if (clusterSize >= 8) { + // The DPP operation has all rows active (0xF) and the top 3 banks active (0xe, 0b1110) to make sure that in + // each cluster of 16, only the top 12 lanes perform the operation. + result = createGroupArithmeticOperation(groupArithOp, result, + createDppUpdate(identity, result, DppCtrl::DppRowSr4, 0xF, 0xE, 0)); + } + + if (clusterSize >= 16) { + // The DPP operation has all rows active (0xF) and the top 2 banks active (0xc, 0b1100) to make sure that in + // each cluster of 16, only the top 8 lanes perform the operation. + result = createGroupArithmeticOperation(groupArithOp, result, + createDppUpdate(identity, result, DppCtrl::DppRowSr8, 0xF, 0xC, 0)); + } + + if (clusterSize >= 32) { + Value *const maskedPermLane = + createThreadMaskedSelect(threadMask, 0xFFFF0000FFFF0000, + createPermLaneX16(result, result, UINT32_MAX, UINT32_MAX, true, false), identity); + + // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2). + result = createGroupArithmeticOperation(groupArithOp, result, maskedPermLane); + } + + if (clusterSize >= 64) { + Value *const broadcast31 = CreateSubgroupBroadcast(result, getInt32(31), instName); + + Value *const maskedBroadcast = createThreadMaskedSelect(threadMask, 0xFFFFFFFF00000000, broadcast31, identity); + + // Combine broadcast of 31 with the top two rows only. 
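The exclusive variant follows the same pattern; conceptually, after the initial shift-right-by-one each lane accumulates only the lanes before it within its cluster. A scalar emulation of the end result:

#include <functional>
#include <vector>

std::vector<int> clusteredExclusiveScanModel(const std::vector<int> &lanes, unsigned clusterSize, int identity,
                                             const std::function<int(int, int)> &op) {
  std::vector<int> result(lanes.size());
  for (size_t lane = 0; lane < lanes.size(); ++lane) {
    int acc = identity; // the first lane of each cluster gets the identity
    for (size_t prev = lane - (lane % clusterSize); prev < lane; ++prev)
      acc = op(acc, lanes[prev]); // reduce everything before this lane within the cluster
    result[lane] = acc;
  }
  return result;
}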
+ result = createGroupArithmeticOperation(groupArithOp, result, maskedBroadcast); + } // Finish the WWM section by calling the intrinsic. result = createWwm(result); diff --git a/lgc/disassembler/Disassembler.cpp b/lgc/disassembler/Disassembler.cpp index 4c711565c5..8dc125c0a3 100644 --- a/lgc/disassembler/Disassembler.cpp +++ b/lgc/disassembler/Disassembler.cpp @@ -794,9 +794,26 @@ void ObjDisassembler::outputData(bool outputting, uint64_t offset, StringRef dat for (size = nl + 1; size != data.size() && data[size] == '\n'; ++size) ; } + } else { + // If not outputting ascii, only do 4 bytes, as m_streamer->emitBinaryData splits into 4 byte chunks anyway, + // and splitting it ourselves allows us to add a comment with offset and chars on each 4 byte chunk. + size = std::min(size, size_t(4)); } if (outputting) { + std::string comment; + raw_string_ostream commentStream(comment); + commentStream << format("%06x", offset); + if (!isAscii) { + commentStream << ": "; + for (size_t idx = 0; idx != size; ++idx) { + int ch = data[idx]; + if (ch < ' ' || ch > '~') + ch = '.'; + commentStream << char(ch); + } + } + m_streamer->AddComment(comment); if (isAscii) m_streamer->emitBytes(data.take_front(size)); else diff --git a/lgc/include/lgc/builder/BuilderImpl.h b/lgc/include/lgc/builder/BuilderImpl.h index d06aeb6a37..98057d0670 100644 --- a/lgc/include/lgc/builder/BuilderImpl.h +++ b/lgc/include/lgc/builder/BuilderImpl.h @@ -73,15 +73,12 @@ class BuilderImpl : public BuilderDefs { // Get the PipelineState object. PipelineState *getPipelineState() const { return m_pipelineState; } - // Get whether the context we are building in support the bpermute operation. + // Get whether the context we are building in supports ds_bpermute or v_bpermute across all lanes in the wave. bool supportWaveWideBPermute() const; // Get whether the context we are building in supports permute lane 64 DPP operations. bool supportPermLane64Dpp() const; - // Create an "if..endif" or "if..else..endif" structure. - llvm::BranchInst *createIf(llvm::Value *condition, bool wantElse, const llvm::Twine &instName = ""); - // Helper method to scalarize a possibly vector unary operation llvm::Value *scalarize(llvm::Value *value, const std::function &callback); @@ -315,6 +312,9 @@ class BuilderImpl : public BuilderDefs { // Check whether vertex buffer descriptors are in a descriptor array binding instead of the VertexBufferTable. bool useVertexBufferDescArray(); + // Build buffer compact descriptor + llvm::Value *buildBufferCompactDesc(llvm::Value *desc, unsigned stride); + private: // Get a struct containing the pointer and byte stride for a descriptor llvm::Value *getDescPtrAndStride(ResourceNodeType resType, uint64_t descSet, unsigned binding, @@ -329,9 +329,6 @@ class BuilderImpl : public BuilderDefs { llvm::Value *scalarizeIfUniform(llvm::Value *value, bool isNonUniform); - // Build buffer compact descriptor - llvm::Value *buildBufferCompactDesc(llvm::Value *desc, unsigned stride); - // Create a buffer descriptor. llvm::Value *createBufferDesc(uint64_t descSet, unsigned binding, llvm::Value *descIndex, unsigned flags, unsigned stride, const llvm::Twine &instName = ""); @@ -651,15 +648,6 @@ class BuilderImpl : public BuilderDefs { // Create a get subgroup size query. llvm::Value *CreateGetSubgroupSize(const llvm::Twine &instName = ""); - // Create a subgroup all. - llvm::Value *CreateSubgroupAll(llvm::Value *const value, const llvm::Twine &instName = ""); - - // Create a subgroup all equal. 
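A standalone sketch of the per-chunk comment string the disassembler change above builds for each 4-byte data chunk: the offset in hex and, for non-ASCII data, the chunk's bytes with non-printable characters replaced by '.'. The "%06llx" formatting here approximates the llvm::format call in the real code.

#include <cstdint>
#include <cstdio>
#include <string>

std::string chunkCommentSketch(uint64_t offset, const char *data, size_t size, bool isAscii) {
  char prefix[16];
  std::snprintf(prefix, sizeof(prefix), "%06llx", static_cast<unsigned long long>(offset));
  std::string comment = prefix;
  if (!isAscii) {
    comment += ": ";
    for (size_t idx = 0; idx != size; ++idx) {
      char ch = data[idx];
      comment += (ch < ' ' || ch > '~') ? '.' : ch; // mirror the '.' substitution for non-printables
    }
  }
  return comment;
}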
- llvm::Value *CreateSubgroupAllEqual(llvm::Value *const value, const llvm::Twine &instName = ""); - - // Create a subgroup rotate. - llvm::Value *CreateSubgroupRotate(llvm::Value *const value, llvm::Value *const delta, llvm::Value *const clusterSize, - const llvm::Twine &instName = ""); // Create a subgroup broadcast. llvm::Value *CreateSubgroupBroadcast(llvm::Value *const value, llvm::Value *const index, const llvm::Twine &instName = ""); diff --git a/lgc/include/lgc/builder/SubgroupBuilder.h b/lgc/include/lgc/builder/SubgroupBuilder.h index 89b4cbf163..afd0c82b0c 100644 --- a/lgc/include/lgc/builder/SubgroupBuilder.h +++ b/lgc/include/lgc/builder/SubgroupBuilder.h @@ -55,6 +55,27 @@ class SubgroupBuilder : public BuilderImpl { // @param instName : Name to give instruction(s) llvm::Value *CreateSubgroupAny(llvm::Value *const value, const llvm::Twine &instName = ""); + // Create a subgroup all. + // + // @param value : The value to compare + // @param instName : Name to give instruction(s) + llvm::Value *CreateSubgroupAll(llvm::Value *const value, const llvm::Twine &instName = ""); + + // Create a subgroup all equal. + // + // @param value : The value to compare + // @param instName : Name to give instruction(s) + llvm::Value *CreateSubgroupAllEqual(llvm::Value *const value, const llvm::Twine &instName = ""); + + // Create a subgroup rotate call. + // + // @param value : The value to read from the chosen rotated lane to all active lanes. + // @param delta : The delta/offset added to lane id. + // @param clusterSize : The cluster size if exists. + // @param instName : Name to give final instruction. + llvm::Value *CreateSubgroupRotate(llvm::Value *const value, llvm::Value *const delta, llvm::Value *const clusterSize, + const llvm::Twine &instName = ""); + private: SubgroupBuilder() = delete; SubgroupBuilder(const SubgroupBuilder &) = delete; diff --git a/lgc/include/lgc/patch/CombineCooperativeMatrix.h b/lgc/include/lgc/patch/CombineCooperativeMatrix.h index 8aebac6885..03fd9e2f14 100644 --- a/lgc/include/lgc/patch/CombineCooperativeMatrix.h +++ b/lgc/include/lgc/patch/CombineCooperativeMatrix.h @@ -33,6 +33,9 @@ #include "llvm/IR/PassManager.h" namespace lgc { +class CooperativeMatrixConvertOp; +class CooperativeMatrixTransposeOp; +class CooperativeMatrixMulAddOp; // ===================================================================================================================== // Pass to combine cooperative matrix operations. 
class CombineCooperativeMatrix : public Patch, public llvm::PassInfoMixin { diff --git a/lgc/include/lgc/patch/LowerCooperativeMatrix.h b/lgc/include/lgc/patch/LowerCooperativeMatrix.h index 915dc399f2..7deb5cbee9 100644 --- a/lgc/include/lgc/patch/LowerCooperativeMatrix.h +++ b/lgc/include/lgc/patch/LowerCooperativeMatrix.h @@ -48,6 +48,20 @@ class CooperativeRowAccExpandOp; class CooperativeRowAccSumAccumulateOp; class CooperativeRowAccScalarOp; +class CooperativeMatrixLoadOp; +class CooperativeMatrixStoreOp; +class CooperativeMatrixLengthOp; +class CooperativeMatrixFillOp; +class CooperativeMatrixExtractOp; +class CooperativeMatrixInsertOp; +class CooperativeMatrixConvertOp; +class CooperativeMatrixTransposeOp; +class CooperativeMatrixBinaryOp; +class CooperativeMatrixTimesScalarOp; +class CooperativeMatrixMulAddOp; +class CooperativeMatrixPackOp; +class CooperativeMatrixUnPackOp; + // ===================================================================================================================== // Pass to lower coopMatrix calls class LowerCooperativeMatrix : public Patch, public llvm::PassInfoMixin { @@ -59,7 +73,7 @@ class LowerCooperativeMatrix : public Patch, public llvm::PassInfoMixin coopMatrixCallees); + void processCoopMatrixFunction(llvm::Module &module); struct TypeProperties { // Number of (true) elements per lane. @@ -91,18 +105,26 @@ class LowerCooperativeMatrix : public Patch, public llvm::PassInfoMixinv8*coopMatrix_data as two 16bits elements packed. llvm::Value *convFlatVecToCoopMatrixVec(BuilderCommon &builder, llvm::Value *vecValue, @@ -112,32 +134,6 @@ class LowerCooperativeMatrix : public Patch, public llvm::PassInfoMixin { static llvm::StringRef name() { return "Lower buffer descriptor loads"; } private: - void visitLoadBufferAddr(LoadBufferAddrOp &op); + void visitExtendAddress(ExtendAddressOp &op); void visitLoadBufferDesc(LoadBufferDescOp &op); void visitLoadStridedBufferDesc(LoadStridedBufferDescOp &op); llvm::SmallVector m_toErase; diff --git a/lgc/include/lgc/patch/LowerGpuRt.h b/lgc/include/lgc/patch/LowerGpuRt.h index 232e4c1a16..7923c1d9b2 100644 --- a/lgc/include/lgc/patch/LowerGpuRt.h +++ b/lgc/include/lgc/patch/LowerGpuRt.h @@ -51,6 +51,7 @@ class GpurtGetFlattenedGroupThreadIdOp; class GpurtFloatWithRoundModeOp; class GpurtDispatchThreadIdFlatOp; class GpurtContinuationStackIsGlobalOp; +class GpurtWaveScanOp; class LowerGpuRt : public llvm::PassInfoMixin { public: @@ -77,6 +78,7 @@ class LowerGpuRt : public llvm::PassInfoMixin { void visitFloatWithRoundMode(lgc::GpurtFloatWithRoundModeOp &inst); void visitGpurtDispatchThreadIdFlatOp(lgc::GpurtDispatchThreadIdFlatOp &inst); void visitContinuationStackIsGlobalOp(lgc::GpurtContinuationStackIsGlobalOp &inst); + void visitWaveScanOp(lgc::GpurtWaveScanOp &inst); llvm::Value *m_stack = nullptr; // Stack array to hold stack value llvm::Type *m_stackTy = nullptr; // Stack type PipelineState *m_pipelineState = nullptr; // Pipeline state diff --git a/lgc/include/lgc/patch/LowerSubgroupOps.h b/lgc/include/lgc/patch/LowerSubgroupOps.h index d6ca33c49c..26045fffc0 100644 --- a/lgc/include/lgc/patch/LowerSubgroupOps.h +++ b/lgc/include/lgc/patch/LowerSubgroupOps.h @@ -59,6 +59,9 @@ class LowerSubgroupOps : public Patch, public llvm::PassInfoMixin fixedShaderArgTys, llvm::ArrayRef argNames); + + llvm::Value *takeLevel(llvm::Value *level, llvm::IRBuilder<> &builder, llvm::Type *waveMaskTy, + llvm::ArrayRef priority); + unsigned lowerCpsJump(llvm::Function *parent, cps::JumpOp *jumpOp, llvm::BasicBlock *tailBlock, 
llvm::SmallVectorImpl &exitInfos); void lowerAsCpsReference(cps::AsContinuationReferenceOp &asCpsReferenceOp); diff --git a/lgc/include/lgc/state/Abi.h b/lgc/include/lgc/state/Abi.h index 4926198194..7f8aeba066 100644 --- a/lgc/include/lgc/state/Abi.h +++ b/lgc/include/lgc/state/Abi.h @@ -133,6 +133,7 @@ struct PrimShaderCbLayout { }; /// Constant buffer used by SW stream-out processing (GFX11+). + struct StreamOutControlCb { unsigned bufOffsets[MaxTransformFeedbackBuffers]; }; diff --git a/lgc/include/lgc/state/AbiMetadata.h b/lgc/include/lgc/state/AbiMetadata.h index f35ab2b638..b687fa1576 100644 --- a/lgc/include/lgc/state/AbiMetadata.h +++ b/lgc/include/lgc/state/AbiMetadata.h @@ -126,6 +126,7 @@ static constexpr char String[] = ".string"; static constexpr char Name[] = ".name"; static constexpr char Type[] = ".type"; static constexpr char InternalPipelineHash[] = ".internal_pipeline_hash"; +static constexpr char ResourceHash[] = ".resource_hash"; static constexpr char XglCacheInfo[] = ".xgl_cache_info"; static constexpr char CacheHash128Bits[] = ".128_bit_cache_hash"; static constexpr char LlpcVersion[] = ".llpc_version"; @@ -187,6 +188,7 @@ static constexpr char ShaderSpillThreshold[] = ".shader_spill_threshold"; namespace ShaderMetadataKey { static constexpr char ApiShaderHash[] = ".api_shader_hash"; static constexpr char HardwareMapping[] = ".hardware_mapping"; +static constexpr char ShaderSubtype[] = ".shader_subtype"; }; // namespace ShaderMetadataKey namespace ComputeRegisterMetadataKey { @@ -590,6 +592,9 @@ enum class UserDataMapping : unsigned { DynamicDualSrcBlendInfo = 0x10000022, // Dual source blend dynamic info, dynamicStateHasChange + dsBlendDynamicEnable Pattern + CompositeData = 0x10000023, // sample info + DynamicDualSrcBlendInfo + topology, this will replace the two + // userdata above. + // Values used in a user data PAL metadata register to be resolved at link time. // This is part of the "unlinked" ABI, so should arguably be in AbiUnlinked.h. 
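The only piece of the new CompositeData layout visible in this patch is the primitive vertex count in the low two bits (read earlier with amdgcn_ubfe(compositeData, 0, 2)); how the sample info and dual-source-blend fields are packed above it is not shown, so the helper below is deliberately partial.

#include <cstdint>

unsigned getVerticesPerPrimitiveFromCompositeData(uint32_t compositeData) {
  // Low two bits: 1 = points, 2 = lines, 3 = triangles, per the switch in normalizeBaryCoord.
  return compositeData & 0x3;
}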
DescriptorSet0 = 0x80000000, // 32-bit pointer to the descriptor table for descriptor set 0: add N to this value diff --git a/lgc/include/lgc/state/Defs.h b/lgc/include/lgc/state/Defs.h index 526ab83c1a..18ae6090cf 100644 --- a/lgc/include/lgc/state/Defs.h +++ b/lgc/include/lgc/state/Defs.h @@ -73,21 +73,6 @@ const static char CopyShaderEntryPoint[] = "lgc.shader.COPY.main"; const static char NullFsEntryPoint[] = "lgc.shader.FS.null.main"; const static char TcsPassthroughEntryPoint[] = "lgc.shader.TCS.passthrough.main"; -const static char CooperativeMatrix[] = "lgc.cooperative.matrix"; -const static char CooperativeMatrixLength[] = "lgc.cooperative.matrix.length"; -const static char CooperativeMatrixExtract[] = "lgc.cooperative.matrix.extract"; -const static char CooperativeMatrixInsert[] = "lgc.cooperative.matrix.insert"; -const static char CooperativeMatrixFill[] = "lgc.cooperative.matrix.fill"; -const static char CooperativeMatrixLoad[] = "lgc.cooperative.matrix.load"; -const static char CooperativeMatrixStore[] = "lgc.cooperative.matrix.store"; -const static char CooperativeMatrixConvert[] = "lgc.cooperative.matrix.convert"; -const static char CooperativeMatrixBinOp[] = "lgc.cooperative.matrix.binop"; -const static char CooperativeMatrixTimesScalar[] = "lgc.cooperative.matrix.times.scalar"; -const static char CooperativeMatrixTranspose[] = "lgc.cooperative.matrix.transpose"; -const static char CooperativeMatrixMulAdd[] = "lgc.cooperative.matrix.muladd"; -const static char CooperativeMatrixPack[] = "lgc.cooperative.matrix.pack"; -const static char CooperativeMatrixUnpack[] = "lgc.cooperative.matrix.unpack"; - } // namespace lgcName // Value for high half of address that means "use PC". diff --git a/lgc/include/lgc/state/PipelineState.h b/lgc/include/lgc/state/PipelineState.h index 6437c15512..9a2ad7bebd 100644 --- a/lgc/include/lgc/state/PipelineState.h +++ b/lgc/include/lgc/state/PipelineState.h @@ -156,7 +156,7 @@ class PipelineState final : public Pipeline { void set128BitCacheHash(const Hash128 &finalizedCacheHash, const llvm::VersionTuple &version) override final; // Find the shader entry-point from shader module, and set pipeline stage. - void attachModule(llvm::Module *modules) override final; + void attachModule(llvm::Module *modules, PipelineLink pipelineLink) override final; // Record pipeline state into IR metadata of specified module. 
void record(llvm::Module *module) override final; @@ -431,6 +431,9 @@ class PipelineState final : public Pipeline { // Get the activeness for a vertex stream bool isVertexStreamActive(unsigned streamId) { + if (!hasShaderStage(ShaderStage::Geometry)) + return streamId == 0; // The active stream is always 0 when GS is not present + if (getRasterizerState().rasterStream == streamId) return true; // Rasterization stream is always active return m_xfbStateMetadata.streamActive[streamId]; diff --git a/lgc/include/lgc/state/ResourceUsage.h b/lgc/include/lgc/state/ResourceUsage.h index b2668cb272..6b17320cd5 100644 --- a/lgc/include/lgc/state/ResourceUsage.h +++ b/lgc/include/lgc/state/ResourceUsage.h @@ -335,8 +335,9 @@ struct ResourceUsage { unsigned sampleMask : 1; // Whether gl_SampleMask[] is used unsigned fragStencilRef : 1; // Whether gl_FragStencilRef is used // Statements - unsigned discard : 1; // Whether "discard" statement is used - unsigned runAtSampleRate : 1; // Whether fragment shader run at sample rate + unsigned discard : 1; // Whether "discard" statement is used + unsigned runAtSampleRate : 1; // Whether fragment shader run at sample rate + unsigned useDynamicToplogy : 1; // Whether to use dynamic topology. } fs; // Compute shader @@ -462,10 +463,9 @@ struct ResourceUsage { std::map vertexBuiltInExportSlots; std::map primitiveBuiltInExportSlots; - // Map from output locations to their number of components: > (including - // those special outputs to which built-ins are mapped) - std::map> vertexOutputComponents; - std::map> primitiveOutputComponents; + // Export count for generic outputs (excluding those special outputs to which the built-ins are mapped) + unsigned vertexGenericOutputExportCount = 0; + unsigned primitiveGenericOutputExportCount = 0; } mesh; struct { @@ -540,6 +540,7 @@ struct InterfaceData { unsigned viewId; // View ID unsigned vbTablePtr; // Pointer of vertex buffer table unsigned esGsOffset; // ES-GS ring buffer offset + unsigned compositeData; // CompositeData StreamOutData streamOutData; // Stream-out Data } vs; @@ -572,6 +573,7 @@ struct InterfaceData { unsigned primitiveId; // Primitive ID unsigned invocationId; // Invocation ID unsigned viewId; // View ID + unsigned compositeData; // CompositeData StreamOutData streamOutData; // Stream-out Data } gs; @@ -588,10 +590,9 @@ struct InterfaceData { // Fragment shader struct { - unsigned viewId; // View ID - unsigned primMask; // Primitive mask - unsigned sampleInfo; // Sample Info: numSample + samplePattern - unsigned dynamicDualSrcBlendInfo; // dualSrcBlendDynamicValue + unsigned viewId; // View ID + unsigned primMask; // Primitive mask + unsigned compositeData; // CompositeData // Perspective interpolation (I/J) struct { diff --git a/lgc/include/lgc/state/ShaderStage.h b/lgc/include/lgc/state/ShaderStage.h index 2884e860cb..9636106b72 100644 --- a/lgc/include/lgc/state/ShaderStage.h +++ b/lgc/include/lgc/state/ShaderStage.h @@ -35,6 +35,7 @@ namespace llvm { class Function; +class GlobalObject; class Module; class Type; } // namespace llvm @@ -44,11 +45,24 @@ namespace lgc { // Set shader stage metadata on every defined function in a module void setShaderStage(llvm::Module *module, std::optional stage); -// Set shader stage metadata on a function -void setShaderStage(llvm::Function *func, std::optional stage); +// Set shader stage metadata on a function. 
+// This can instead be a GlobalVariable; that functionality is not used by LGC, +// but can be used by a front-end that uses a GlobalVariable to represent a part-pipeline retrieved +// from the cache, and wants to mark it with a shader stage +void setShaderStage(llvm::GlobalObject *func, std::optional stage); // Gets the shader stage from the specified LLVM function. -std::optional getShaderStage(const llvm::Function *func); +// This can instead be a GlobalVariable; that functionality is not used by LGC, +// but can be used by a front-end that uses a GlobalVariable to represent a part-pipeline retrieved +// from the cache, and wants to mark it with a shader stage +std::optional getShaderStage(const llvm::GlobalObject *func); + +// Set a function's shader subtype. Only has an effect on a compute shader or non-shader export function, +// where it causes the .shader_subtype PAL metadata item to be set to the arbitrary string given here. +void setShaderSubtype(llvm::GlobalObject *func, llvm::StringRef subtype); + +// Get a function's shader subtype, or "" if none. +llvm::StringRef getShaderSubtype(llvm::GlobalObject *func); // Determine whether the function is a shader entry-point. bool isShaderEntryPoint(const llvm::Function *func); diff --git a/lgc/include/lgc/state/TargetInfo.h b/lgc/include/lgc/state/TargetInfo.h index 6a79787186..c0bbe768c9 100644 --- a/lgc/include/lgc/state/TargetInfo.h +++ b/lgc/include/lgc/state/TargetInfo.h @@ -61,6 +61,7 @@ struct GpuProperty { unsigned gsOnChipDefaultPrimsPerSubgroup; // Default target number of primitives per subgroup for GS on-chip mode. unsigned gsOnChipDefaultLdsSizePerSubgroup; // Default value for the maximum LDS size per subgroup for unsigned gsOnChipMaxLdsSize; // Max LDS size used by GS on-chip mode (in dwords) + unsigned maxMsaaRasterizerSamples; // Maximum number of MSAA samples supported by the rasterizer // TODO: Setup gsPrimBufferDepth from hardware config option, will be done in another change. unsigned gsPrimBufferDepth; // Comes from the hardware GPU__GC__GSPRIM_BUFF_DEPTH configuration option diff --git a/lgc/interface/lgc/Builder.h b/lgc/interface/lgc/Builder.h index f0df5c6a1f..f345b7010c 100644 --- a/lgc/interface/lgc/Builder.h +++ b/lgc/interface/lgc/Builder.h @@ -155,6 +155,7 @@ class InOutInfo { class BuilderDefs : public BuilderCommon { public: BuilderDefs(llvm::LLVMContext &context) : BuilderCommon(context) {} + BuilderDefs(llvm::Instruction *insertPoint) : BuilderCommon(insertPoint) {} // Bit settings for integer dot product enum : unsigned { @@ -164,7 +165,7 @@ class BuilderDefs : public BuilderCommon { // The group arithmetic operations the builder can consume. // NOTE : We rely on casting this implicitly to an integer, so we cannot use an enum class. - enum GroupArithOp { IAdd = 0, FAdd, IMul, FMul, SMin, UMin, FMin, SMax, UMax, FMax, And, Or, Xor }; + enum GroupArithOp { Nop = -1, IAdd = 0, FAdd, IMul, FMul, SMin, UMin, FMin, SMax, UMax, FMax, And, Or, Xor }; // Bit settings for flags argument in CreateLoadBufferDesc. 
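A minimal usage sketch of the new shader-subtype helpers; the ShaderStage::Compute enumerator and the "Traversal" string are assumed examples, the patch only specifies that the string becomes the .shader_subtype PAL metadata item for a compute shader or non-shader export function.

#include "lgc/state/ShaderStage.h"
#include "llvm/IR/Function.h"
#include <cassert>

void tagComputeEntry(llvm::Function *func) {
  lgc::setShaderStage(func, lgc::ShaderStage::Compute); // assumed enumerator, matching ShaderStage::Fragment usage elsewhere
  lgc::setShaderSubtype(func, "Traversal");             // arbitrary example string
  assert(lgc::getShaderSubtype(func) == "Traversal");
}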
enum { @@ -384,6 +385,7 @@ class BuilderDefs : public BuilderCommon { class Builder : public BuilderDefs { public: Builder(llvm::LLVMContext &context) : BuilderDefs(context) {} + Builder(llvm::Instruction *insertPoint) : BuilderDefs(insertPoint) {} // ----------------------------------------------------------------------------------------------------------------- // Base class operations @@ -1425,27 +1427,6 @@ class Builder : public BuilderDefs { // @param instName : Name to give instruction(s) llvm::Value *CreateGetSubgroupSize(const llvm::Twine &instName = ""); - // Create a subgroup all. - // - // @param value : The value to compare - // @param instName : Name to give instruction(s) - llvm::Value *CreateSubgroupAll(llvm::Value *const value, const llvm::Twine &instName = ""); - - // Create a subgroup all equal. - // - // @param value : The value to compare - // @param instName : Name to give instruction(s) - llvm::Value *CreateSubgroupAllEqual(llvm::Value *const value, const llvm::Twine &instName = ""); - - // Create a subgroup rotate call. - // - // @param value : The value to read from the chosen rotated lane to all active lanes. - // @param delta : The delta/offset added to lane id. - // @param clusterSize : The cluster size if exists. - // @param instName : Name to give final instruction. - llvm::Value *CreateSubgroupRotate(llvm::Value *const value, llvm::Value *const delta, llvm::Value *const clusterSize, - const llvm::Twine &instName = ""); - // Create a subgroup broadcast. // // @param value : The value to broadcast diff --git a/lgc/interface/lgc/BuilderCommon.h b/lgc/interface/lgc/BuilderCommon.h index 8fb34c6628..519758946d 100644 --- a/lgc/interface/lgc/BuilderCommon.h +++ b/lgc/interface/lgc/BuilderCommon.h @@ -88,6 +88,12 @@ class BuilderCommon : public llvm_dialects::Builder { llvm::CallInst *CreateNamedCall(llvm::StringRef funcName, llvm::Type *retTy, llvm::ArrayRef args, llvm::ArrayRef attribs, const llvm::Twine &instName = ""); + // Create code to build a vector out of a number of scalar elements of the same type. + llvm::Value *CreateBuildVector(llvm::ArrayRef elements, const llvm::Twine &instName = ""); + + // Create an "if..endif" or "if..else..endif" structure. + llvm::BranchInst *CreateIf(llvm::Value *condition, bool wantElse, const llvm::Twine &instName = ""); + // ===================================================================================================================== // Create alloca for given input type. // @@ -107,131 +113,6 @@ class BuilderCommon : public llvm_dialects::Builder { // Get the LGC type of a cooperative matrix with the given element type and layout. llvm::Type *getCooperativeMatrixTy(CooperativeMatrixElementType elemType, CooperativeMatrixLayout layout); - - // Determine the "length" of a cooperative matrix for purposes of extract/insert operations. - llvm::Value *CreateCooperativeMatrixLength(CooperativeMatrixElementType elemType, CooperativeMatrixLayout layout, - const llvm::Twine &instName = ""); - - // Create an "extractelement"-equivalent operation for a cooperative matrix value. - llvm::Value *CreateCooperativeMatrixExtract(llvm::Value *matrix, llvm::Value *index, - CooperativeMatrixElementType elemType, CooperativeMatrixLayout layout, - const llvm::Twine &instName = ""); - - // Create an "insertelement"-equivalent operation for a cooperative matrix value. 
- llvm::Value *CreateCooperativeMatrixInsert(llvm::Value *matrix, llvm::Value *value, llvm::Value *index, - CooperativeMatrixElementType elemType, CooperativeMatrixLayout layout, - const llvm::Twine &instName = ""); - - // Create an "fill"-equaivalent operation for a cooperative matrix value. - llvm::Value *CreateCooperativeMatrixFill(llvm::Value *value, CooperativeMatrixElementType elemType, - CooperativeMatrixLayout layout, const llvm::Twine &instName = ""); - - // Create cooperative matrix load. - // - // @param pointer : The pointer to a data array. - // @param stride : The number of bytes in memory between the first component of consecutive rows (or - // columns) in the result. - // @param colMaj : Whether the values loaded from memory are arrayed in column-major or row-major. - // @param layout : Identify it's factor or accumulator - // @param memoryAccess : Parsed from Memory operands in SPIRV-reader - // @param alignment : Alignment for memory operation. - // @param instName : Name to give instruction(s) - llvm::Value *CreateCooperativeMatrixLoad(llvm::Value *pointer, llvm::Value *stride, bool colMajor, - CooperativeMatrixElementType elemType, CooperativeMatrixLayout layout, - unsigned memoryAccess, llvm::Align alignment, - const llvm::Twine &instName = ""); - - // Create cooperative matrix store. - // - // @param pointer : The pointer to a data array. - // @param matrix : The cooperative matrix to store. - // @param stride : The number of bytes in memory between the first component of consecutive rows (or - // columns) in the result. - // @param colMaj : Whether the values loaded from memory are arrayed in column-major or row-major. - // @param layout : Identify it's factor or accumulator - // @param memoryAccess : Parsed from Memory operands in SPIRV-reader - // @param alignment : Alignment for memory operation. - // @param instName : Name to give instruction(s). - llvm::Value *CreateCooperativeMatrixStore(llvm::Value *pointer, llvm::Value *matrix, llvm::Value *stride, - bool colMajor, CooperativeMatrixElementType elemType, - CooperativeMatrixLayout layout, unsigned memoryAccess, - llvm::Align alignment, const llvm::Twine &instName = ""); - - // Create cooperative matrix conversion. - // @param opCode : The convert opCode. - // @param source : The source cooperative matrix. - // @param dest : The conversion target. - // @param instName : Name to give instruction(s). - llvm::CallInst *CreateCooperativeMatrixConvert(llvm::CastInst::CastOps opCode, llvm::Value *source, - CooperativeMatrixElementType srcElemType, - CooperativeMatrixElementType dstElemType, - CooperativeMatrixLayout srcLayout, CooperativeMatrixLayout dstLayout, - const llvm::Twine &instName = ""); - - // Create cooperative matrix binary operation - // - // @param coopMatArithOp : The cooperative matrix arithmetic operation to perform. - // @param operand1 : The first operand. - // @param operand2 : The second operand. - // @param instName : Name to give instruction(s). - llvm::Value *CreateCooperativeMatrixBinaryOp(CooperativeMatrixArithOp coopMatArithOp, llvm::Value *lhs, - llvm::Value *rhs, CooperativeMatrixElementType elemType, - CooperativeMatrixLayout layout, const llvm::Twine &instName = ""); - - // Create cooperative MatrixTimesScalar binary operation - // - // @param matrix : It should be cooperative matrix. - // @param scalar : It should be scalar type. If the matrix is a packed - // accumulator matrix, the scalar has to be a <2 x half> vector. - // @param elemType : Name to give instruction(s). 
- // @param layout : Identify A/B matrices or C/D matrices. - llvm::Value *CreateCoopMatrixTimesScalar(llvm::Value *matrix, llvm::Value *scalar, - CooperativeMatrixElementType elemType, CooperativeMatrixLayout layout, - const llvm::Twine &instName = ""); - - // ===================================================================================================================== - // Create cooperative matrix transpose operation - // - // @param matrix : The first operand and it should be a cooperative matrix. - // @param elemType : The component type of the matrix. - // @param srcLayout : Identify whether it's A/B or C/D - llvm::CallInst *CreateCooperativeMatrixTranspose(llvm::Value *matrix, CooperativeMatrixElementType elemType, - CooperativeMatrixLayout srcLayout, const llvm::Twine &instName = ""); - - // Create cooperative matrix muladd operation - // @param coopMatrixa : Factor cooperative matrix. - // @param coopMatrixb : Factor cooperative matrix. - // @param coopMatrixc : Accumulator cooperative matrix. - // @param isSignedA : Identify the signess for matrix A's element type - // @param isSignedB : Identify the signess for matrix B's element type - // @param isSatOrOpsel : SaturatingAccumulation for calculation. In the - // case of 16-bit floating point matrices, this bit acts as an opsel bit. If - // it is set to false, we store the result in the lower half of the registers. - // If it is true, we store it in the upper half. - // @param isTied : If true, the output matrix has to be the same as the - // input accumulator (i.e., D has to be C) - // @param accumElemType : The component type of the matrix c - // @param factorElemType : The component type of the matrix a - llvm::Value *CreateCooperativeMatrixMulAdd(llvm::Value *coopMatrixa, llvm::Value *coopMatrixb, - llvm::Value *coopMatrixc, bool isSignedA, bool isSignedB, - bool isSatOrOpsel, bool isTied, CooperativeMatrixElementType accumElemType, - CooperativeMatrixElementType factorElemType, - const llvm::Twine &instName = ""); - - // ===================================================================================================================== - // Create cooperative matrix pack operation - // - // @param matrixCLo : Lower Accumulator cooperative matrix. - // @param matrixCHi : Upper Accumulator cooperative matrix. - llvm::Value *CreateCooperativeMatrixPack(llvm::Value *matrixCLo, llvm::Value *matrixCHi, - const llvm::Twine &instName = ""); - - // ===================================================================================================================== - // Create cooperative matrix unpack operation - // - // @param packedMatrix : Packed Accumulator cooperative matrices. - // @param high: Whether to get the matrix stored in the upper half of the registers. - llvm::Value *CreateCooperativeMatrixUnpack(llvm::Value *packedMatrix, bool high, const llvm::Twine &instName = ""); }; } // namespace lgc diff --git a/lgc/interface/lgc/CommonDefs.h b/lgc/interface/lgc/CommonDefs.h index dd718b92d5..1cace83670 100644 --- a/lgc/interface/lgc/CommonDefs.h +++ b/lgc/interface/lgc/CommonDefs.h @@ -149,8 +149,11 @@ enum AddrSpace { // Max number of threads per subgroup in NGG mode. constexpr unsigned NggMaxThreadsPerSubgroup = 256; -// Max number of waves per subgroup in NGG mode. -constexpr unsigned NggMaxWavesPerSubgroup = NggMaxThreadsPerSubgroup / 32; +// Max number of GS primitive amplifier defined by GE_NGG_SUBGRP_CNTL.PRIM_AMP_FACTOR. 
+// NOTE: There are 9 bits that program the register field to launch 511 threads at most though it is not +// documented in HW spec. HW spec says the maximum value is 256 and this value might be limited by rasterization. +// In experiments, we find it is able to launch 511 threads. +constexpr unsigned NggMaxPrimitiveAmplifier = 511; constexpr unsigned EsVertsOffchipGsOrTess = 250; constexpr unsigned GsPrimsOffchipGsOrTess = 126; diff --git a/lgc/interface/lgc/LgcDialect.td b/lgc/interface/lgc/LgcDialect.td index e44fe74f5d..7dbfd25551 100644 --- a/lgc/interface/lgc/LgcDialect.td +++ b/lgc/interface/lgc/LgcDialect.td @@ -46,6 +46,19 @@ defm CooperativeMatrixArithOp : AttrEnum<"CooperativeMatrixArithOp">; class LgcOp traits_ = []> : Op; +def BufferAddrToPtrOp : LgcOp<"buffer.addr.to.ptr", [Memory<[]>, WillReturn]> { + let arguments = (ins I64:$addr); + let results = (outs BufferPointer:$result); + + let summary = "convert a buffer address into a buffer fat pointer"; + let description = [{ + Given a 64-bit buffer address, returns a fat buffer pointer to the start of the buffer. + + The descriptor must be 0 or a valid address for a storage buffer aka raw buffer, i.e. a buffer + for which the indexing feature of BUFFER_LOAD_* instructions is never used. + }]; +} + def BufferDescToPtrOp : LgcOp<"buffer.desc.to.ptr", [Memory<[]>, WillReturn]> { let arguments = (ins V4I32:$desc); let results = (outs BufferPointer:$result); @@ -120,17 +133,6 @@ def BufferPtrDiffOp : LgcOp<"buffer.ptr.diff", [Memory<[]>, WillReturn]> { }]; } -def LoadBufferAddrOp : LgcOp<"load.buffer.addr", [Memory<[]>, WillReturn]> { - let arguments = (ins AttrI64:$desc_set, AttrI32:$binding, I32:$desc_index, - AttrI32:$flags); - let results = (outs I64:$result); - - let summary = "create a load of a buffer descriptor and convert to 64-bit address"; - let description = [{ - Return the i64 address. This works whether the descriptor is compact or not. - }]; -} - def LoadBufferDescOp : LgcOp<"load.buffer.desc", [Memory<[]>, WillReturn]> { let arguments = (ins AttrI64:$desc_set, AttrI32:$binding, I32:$desc_index, AttrI32:$flags); @@ -254,19 +256,30 @@ def GetMeshBuiltinInputOp : LgcOp<"get.mesh.builtin.input", [Memory<[]>, WillRet }]; } -def WriteMeshOutputOp : LgcOp<"write.mesh.output", [Memory<[]>]> { - let arguments = (ins AttrI1:$is_primitive, AttrI32:$location, I32:$location_offset, I32:$component_index, I32:$prim_or_vertex_index, value:$output_value); +def WriteMeshVertexOutputOp : LgcOp<"write.mesh.vertex.output", [Memory<[]>]> { + let arguments = (ins I32:$output_offset, I32:$vertex_index, value:$output_value); let results = (outs); - let summary = "Write mesh shader primitive/vertex outputs"; + let summary = "Write mesh shader vertex outputs"; let description = [{ - In the mesh shader, write mesh shader primitive/vertex outputs to LDS. + In the mesh shader, write mesh shader vertex outputs to LDS. - `is_primitive` indicates if this write is for a primitive output or for a vertex output. - `location` is the start location of this output. - `location_offset` is the relative location offset of this output, used by arrayed outputs. - `component_index` is the component index of this output when component addressing is involved. - `prim_or_vertex_index` is the primitive/vertex index specifying which primitive/vertex to write. + `output_offset` is the relative offset of this output (in dwords) within all outputs of the indexed vertex. + `vertex_index` is the vertex index specifying which vertex to write. 
+ `output_value` is the output value to write. + }]; +} + +def WriteMeshPrimitiveOutputOp : LgcOp<"write.mesh.primitive.output", [Memory<[]>]> { + let arguments = (ins I32:$output_offset, I32:$primitive_index, value:$output_value); + let results = (outs); + + let summary = "Write mesh shader primitive outputs"; + let description = [{ + In the mesh shader, write mesh shader primitive outputs to LDS. + + `output_offset` is the relative offset of this output (in dwords) within all outputs of the indexed primitive. + `primitive_index` is the primitive index specifying which primitive to write. `output_value` is the output value to write. }]; } @@ -389,6 +402,17 @@ def UserDataOp : LgcOp<"user.data", [Memory<[]>, WillReturn]> { }]; } +def ExtendAddressOp : LgcOp<"extend.address", [Memory<[]>, WillReturn]> { + let arguments = (ins I32:$addr32); + let results = (outs I64:$addr64); + + let summary = "extend a 32-bit address to 64 bit"; + let description = [{ + The 32-bit address is extended to 64 bits by whatever method LGC is configured to use, which + could be using s_getpc to get the high half, or using a device-wide constant it was given. + }]; +} + def GroupMemcpyOp : LgcOp<"group.memcpy", [Memory<[]>]> { let arguments = (ins PointerType:$dst, PointerType:$src, AttrI32:$size, AttrI32:$scope); let results = (outs); @@ -436,6 +460,47 @@ def SubgroupAnyOp : LgcOp<"subgroup.any", [NoUnwind, Convergent]> { }]; } +def SubgroupAllOp : LgcOp<"subgroup.all", [NoUnwind, Convergent]> { + let arguments = (ins I1:$value); + let results = (outs I1:$result); + + let summary = "subgroupAll"; + let description = [{ + Evaluates `value` for all active invocations in the group. `result` will be + true if and only if it evaluates to true for all invocations in the group. + + This is used to implement OpGroupAll OpGroupNonUniformAll in the SPIR-V + reference, see there for details. + }]; +} + +def SubgroupAllEqualOp : LgcOp<"subgroup.all.equal", [NoUnwind, Convergent]> { + let arguments = (ins value:$value); + let results = (outs I1:$result); + + let summary = "subgroupAllEqual"; + let description = [{ + Evaluates `value` for all active invocations in the group. `result` will be + true if and only if it is equal for all active invocations in the group + + This is used to implement to OpGroupAllEqual and OpGroupNonUniformAllEqual + in the SPIR-V reference, see there for details. + }]; +} + +def SubgroupRotateOp : LgcOp<"subgroup.rotate", [NoUnwind, Convergent]> { + let arguments = (ins value:$value, I32:$delta, I32:$clusterSize); + let results = (outs (eq $value):$result); + + let summary = "subgroupRotate"; + let description = [{ + Reserved for SPIR-V OpGroupNonUniformRotateKHR. + + clusterSize is optional. If it is unset it will be represented by a + llvm::PoisonValue + }]; +} + def CooperativeRowAccLoadOp : LgcOp<"cooperative.rowacc.load", [Memory<[(read)]>, WillReturn]> { let arguments = (ins value:$pointer, I32:$stride, CooperativeMatrixElementType:$elem_type, CooperativeMatrixMemoryAccess:$memory_access); let results = (outs value:$result); @@ -451,8 +516,11 @@ def CooperativeRowAccLoadOp : LgcOp<"cooperative.rowacc.load", [Memory<[(read)]> 'pointer' is the pointer address to the data. 'stride' is the stride in bytes in memory between the first elements in the source data. 'elem_type' is the element type for the row acc. 
- 'memory_access' is the memory operands which provide:isVolatile/isTemporal/isCoherent additional operands, - maybe volatile/Aligned/Nontemporal/MakePointerAvailable + + 'memory_access' is a set of flags describing the memory. + - Bit 0 is set if the memory is volatile + - Bit 1 is set if the memory is coherent + - Bit 2 is set if the memory is temporal. }]; } @@ -468,8 +536,11 @@ def CooperativeRowAccStoreOp : LgcOp<"cooperative.rowacc.store", [Memory<[(write 'stride' is the stride in bytes in memory between the first elements in the source data. 'elem_type' is the element type for the row acc. 'data' is data of row acc, Must be in finalized mode. - 'memory_access' is the memory operands which provide:isVolatile/isTemporal/isCoherent additional operands, - maybe volatile/Aligned/Nontemporal/MakePointerAvailable + + 'memory_access' is a set of flags describing the memory. + - Bit 0 is set if the memory is volatile + - Bit 1 is set if the memory is coherent + - Bit 2 is set if the memory is temporal. }]; } @@ -533,7 +604,7 @@ def CooperativeRowAccSumAccumulateOp : LgcOp<"cooperative.rowacc.sum.accumulate" 'matrix_layout' is the layout for the cooperative matrix. 'row_acc' is the input cooperative row acc, must be in accumulate mode. 'row_acc_elem_type' is the element type for input cooperative row acc. - 'is_signed' indicate if row accumulator elememnt type need to be consider as signed or not. + 'is_signed' indicate if row accumulator element type is considered signed or not. }]; } @@ -583,3 +654,332 @@ def LoadDriverTableEntryOp : LgcOp<"load.driver.table.entry", [Memory<[]>, WillR `offset` is the offset into the driver table, in unit of dwords. }]; } + +def InvariantDecorationOp : LgcOp<"invariant.decoration", [WillReturn]> { + let arguments = (ins value:$invariant); + let results = (outs); + + let summary = "Indication that the value has been marked with invariant decoration."; + let description = [{ + Value marked with invariant decoration will have allowContract FMF set to false to prevent + FMA contraction in the backend. + }]; +} + +def CooperativeMatrixLengthOp : LgcOp<"cooperative.matrix.length", [Memory<[]>, WillReturn]> { + let arguments = (ins CooperativeMatrixLayout:$layout); + let results = (outs I32:$result); + + let summary = "get the length for the cooperative matrix"; + let description = [{ + Get the "length" of a matrix of the given layout, i.e. the number of matrix components stored per lane. + + 'layout' is layout of cooperative matrix. + }]; +} + +def CooperativeMatrixLoadOp : LgcOp<"cooperative.matrix.load", [Memory<[(read)]>, Convergent, WillReturn]> { + let arguments = (ins value:$pointer, value:$stride, AttrI1:$col_major, CooperativeMatrixElementType:$elem_type, + CooperativeMatrixLayout:$layout, AttrI32:$memory_access, AttrI32:$alignment); + let results = (outs value:$result); + + let defaultBuilderHasExplicitResultType = true; + + let summary = "load the cooperative matrix elements per lane"; + let description = [{ + Load the elements of cooperative matrix per lane through a pointer. + + Return or vector containing all the elements of the cooperative matrix per lane. + + 'pointer' is the pointer address of the first element of the cooperative matrix stored in memory. + 'stride' is the stride in bytes in memory between the first elements in the source data. + 'col_major' is the order of the data loaded from memory, col-major or row-major. + 'elem_type' is the element type of the cooperative matrix. + 'layout' is the layout of the input cooperative matrix. 
+
+    'memory_access' is a set of flags describing the memory.
+      - Bit 0 is set if the memory is volatile
+      - Bit 1 is set if the memory is coherent
+      - Bit 2 is set if the memory is temporal.
+
+    'alignment' is the alignment of this load operation.
+  }];
+}
+
+def CooperativeMatrixStoreOp : LgcOp<"cooperative.matrix.store", [Memory<[(write)]>, Convergent]> {
+  let arguments = (ins value:$pointer, value:$stride, AttrI1:$col_major, CooperativeMatrixElementType:$elem_type,
+                       CooperativeMatrixLayout:$layout, AttrI32:$memory_access, AttrI32:$alignment,
+                       value:$store_value);
+  let results = (outs);
+
+  let summary = "Store cooperative matrix elements per lane to the memory";
+  let description = [{
+    Store cooperative matrix elements per lane to the memory through the pointer. The elements should first be
+    converted to the per-lane vector type.
+
+    'pointer' is the pointer address of the data array in memory.
+    'stride' is the stride in bytes in memory between the first elements in the source data.
+    'col_major' is the order of the data stored into memory, col-major or row-major.
+    'elem_type' is the element type of the cooperative matrix.
+    'layout' is the layout of the input cooperative matrix.
+
+    'memory_access' is a set of flags describing the memory.
+      - Bit 0 is set if the memory is volatile
+      - Bit 1 is set if the memory is coherent
+      - Bit 2 is set if the memory is temporal.
+
+    'alignment' is the alignment of this store operation.
+    'store_value' is the per-lane vector of cooperative matrix elements to be stored in memory.
+  }];
+}
+
+def CooperativeMatrixFillOp : LgcOp<"cooperative.matrix.fill", [Memory<[]>, WillReturn]> {
+  let arguments = (ins value:$scalar, CooperativeMatrixElementType:$elem_type, CooperativeMatrixLayout:$layout);
+  let results = (outs value:$result);
+
+  let defaultBuilderHasExplicitResultType = true;
+
+  let summary = "Return a matrix filled with a scalar value";
+  let description = [{
+    Return a matrix whose elements are all equal to the given `scalar`.
+
+    'scalar' is the value to fill the cooperative matrix.
+    'elem_type' is the element type for the cooperative matrix.
+    'layout' is the layout of the input cooperative matrix.
+  }];
+}
+
+def CooperativeMatrixExtractOp : LgcOp<"cooperative.matrix.extract", [Memory<[]>, WillReturn]> {
+  let arguments = (ins value:$matrix, value:$index, CooperativeMatrixElementType:$elem_type, CooperativeMatrixLayout:$layout);
+  let results = (outs value:$result);
+
+  let defaultBuilderHasExplicitResultType = true;
+
+  let summary = "Return the element extracted from the cooperative matrix by index";
+  let description = [{
+    Returns the value at the given `index` in the input matrix.
+
+    'matrix' is the matrix from which to extract a component.
+    'index' is the index to be extracted.
+    'elem_type' is the element type for the cooperative matrix.
+    'layout' is the layout of the input cooperative matrix.
+  }];
+}
+
+def CooperativeMatrixInsertOp : LgcOp<"cooperative.matrix.insert", [Memory<[]>, WillReturn]> {
+  let arguments = (ins value:$matrix, value:$insert_value, value:$index, CooperativeMatrixElementType:$elem_type, CooperativeMatrixLayout:$layout);
+  let results = (outs value:$result);
+
+  let defaultBuilderHasExplicitResultType = true;
+
+  let summary = "Insert the element into the cooperative matrix";
+  let description = [{
+    Insert the given `insert_value` at the given `index` into the input matrix and return the matrix.
+
+    'matrix' is the matrix into which to insert a component.
+    'insert_value' is the value to be inserted.
+    'index' is the index at which to insert the value.
+    'elem_type' is the element type for the cooperative matrix.
+    'layout' is the layout of the input cooperative matrix.
+  }];
+}
+
+def CooperativeMatrixConvertOp : LgcOp<"cooperative.matrix.convert", [Memory<[(read)]>, Convergent, WillReturn]> {
+  let arguments = (ins AttrI32:$cast_op, value:$source, CooperativeMatrixElementType:$src_elem_type, CooperativeMatrixElementType:$dst_elem_type,
+                       CooperativeMatrixLayout:$src_layout, CooperativeMatrixLayout:$dst_layout);
+  let results = (outs value:$result);
+
+  let defaultBuilderHasExplicitResultType = true;
+
+  let summary = "Reshape the cooperative matrix layout or perform an element-wise conversion";
+  let description = [{
+    Convert the input matrix either to a different layout or to a different element type, and return the result.
+
+    'cast_op' is the conversion operation. 0 means a reshape of the cooperative matrix layout; other values are for element-wise conversions.
+    'source' is the source cooperative matrix.
+    'src_elem_type' is the source cooperative matrix's element type.
+    'dst_elem_type' is the destination cooperative matrix's element type.
+    'src_layout' is the layout of the source cooperative matrix.
+    'dst_layout' is the layout of the target cooperative matrix.
+  }];
+}
+
+def CooperativeMatrixTransposeOp : LgcOp<"cooperative.matrix.transpose", [Convergent, WillReturn]> {
+  let arguments = (ins value:$matrix, CooperativeMatrixElementType:$elem_type, CooperativeMatrixLayout:$layout);
+  let results = (outs value:$result);
+
+  let defaultBuilderHasExplicitResultType = true;
+
+  let summary = "Transpose the cooperative matrix at register level and change its layout";
+  let description = [{
+    This operation transposes the input matrix and returns the transposed matrix.
+
+    'matrix' is the original cooperative matrix to transpose.
+    'elem_type' is the element type for the cooperative matrix.
+    'layout' is the layout of the input cooperative matrix.
+  }];
+}
+
+def CooperativeMatrixBinaryOp : LgcOp<"cooperative.matrix.binary", [Convergent, WillReturn]> {
+  let arguments = (ins CooperativeMatrixArithOp:$arith_op, value:$lhs, value:$rhs, CooperativeMatrixElementType:$elem_type,
+                       CooperativeMatrixLayout:$layout);
+  let results = (outs value:$result);
+
+  let defaultBuilderHasExplicitResultType = true;
+
+  let summary = "Cooperative matrix binary operation";
+  let description = [{
+    Perform a binary operation on two matrices and return the resulting matrix.
+    The two input matrices need to have the same layout and element type.
+
+    'arith_op' is the arithmetic operation.
+    'lhs' is the first cooperative matrix operand.
+    'rhs' is the second cooperative matrix operand.
+    'elem_type' is the element type of the cooperative matrices.
+    'layout' is the layout of the input cooperative matrices.
+  }];
+}
+
+def CooperativeMatrixTimesScalarOp : LgcOp<"cooperative.matrix.times.scalar", [Convergent, WillReturn]> {
+  let arguments = (ins value:$matrix, value:$scalar, CooperativeMatrixElementType:$elem_type,
+                       CooperativeMatrixLayout:$layout);
+  let results = (outs value:$result);
+
+  let defaultBuilderHasExplicitResultType = true;
+
+  let summary = "Compute Matrix x Scalar and return the resulting cooperative matrix";
+  let description = [{
+    Multiply all matrix elements in the input matrix by the given `scalar`.
+
+    'matrix' is the matrix operand for the operation.
+    'scalar' is the scalar operand for the operation.
+    'elem_type' is the element type for the cooperative matrix operand.
+    'layout' is the layout for the cooperative matrix.
+  }];
+}
+
+def CooperativeMatrixMulAddOp : LgcOp<"cooperative.matrix.muladd", [Convergent, WillReturn]> {
+  let arguments = (ins value:$matrix_a, value:$matrix_b, value:$matrix_c, AttrI1:$is_signed_a, AttrI1:$is_signed_b,
+                       AttrI1:$is_sat_or_opsel, AttrI1:$is_tied, CooperativeMatrixElementType:$accu_elem_type,
+                       CooperativeMatrixElementType:$factor_elem_type);
+  let results = (outs value:$result);
+
+  let defaultBuilderHasExplicitResultType = true;
+
+  let summary = "Calculate `matrix_a` * `matrix_b` + `matrix_c`";
+  let description = [{
+    Multiply `matrix_a` by `matrix_b` and add `matrix_c`. The resulting matrix has the same type as `matrix_c`.
+
+    'matrix_a' is the factor cooperative matrix whose use is MatrixAKHR.
+    'matrix_b' is the factor cooperative matrix whose use is MatrixBKHR.
+    'matrix_c' is the accumulator cooperative matrix whose use is MatrixAccumulatorKHR.
+    'is_signed_a' is the signedness of matrix_a's element type.
+    'is_signed_b' is the signedness of matrix_b's element type.
+
+    'is_sat_or_opsel' is the saturating-accumulation flag for the calculation.
+    In the case of 16-bit floating-point matrices, this bit acts as an opsel bit:
+    if it is set to false, the result is stored in the lower half of
+    the registers; if it is true, it is stored in the upper half.
+
+    'is_tied' indicates that the output matrix has to be the same
+    as the input accumulator (i.e., D has to be C).
+
+    'accu_elem_type' is the component type of the accumulator matrix.
+    'factor_elem_type' is the component type of the factor matrix.
+  }];
+}
+
+def CooperativeMatrixPackOp : LgcOp<"cooperative.matrix.pack", [Memory<[(read)]>, WillReturn]> {
+  let arguments = (ins value:$matrix_c_lo, value:$matrix_c_hi);
+  let results = (outs value:$result);
+
+  let defaultBuilderHasExplicitResultType = true;
+
+  let summary = "Cooperative matrix pack operation";
+  let description = [{
+    Pack two accumulator cooperative matrices and store them in the same registers.
+
+    'matrix_c_lo' is the lower accumulator cooperative matrix to be packed.
+    'matrix_c_hi' is the upper accumulator cooperative matrix to be packed.
+  }];
+}
+
+def CooperativeMatrixUnPackOp : LgcOp<"cooperative.matrix.unpack", [Memory<[(read)]>, WillReturn]> {
+  let arguments = (ins value:$packed_matrix, AttrI1:$get_upper_half);
+  let results = (outs value:$result);
+
+  let defaultBuilderHasExplicitResultType = true;
+
+  let summary = "Restores an unpacked matrix from a packed accumulator";
+  let description = [{
+    Returns the unpacked matrix stored in either the upper or lower half of a packed accumulator.
+
+    'packed_matrix' is the packed accumulator cooperative matrix.
+
+    'get_upper_half' selects whether to take the upper or the lower half of the registers:
+      - if true, unpack the cooperative matrix stored in the upper half of the registers.
+      - if false, unpack the cooperative matrix stored in the lower half of the registers.
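As an illustrative sketch (not part of the patch itself): these pack/unpack ops would typically be created through the dialect builder. The b.create<Op>(resultType, args...) pattern and the <8 x float> packed accumulator type mirror their use later in this patch; the header paths and the exact generated builder signatures are assumptions.

    // Sketch only: pack two f16 accumulators, then read back the one in the upper half.
    #include "lgc/BuilderCommon.h"      // assumed path of lgc::BuilderCommon
    #include "lgc/LgcDialect.h"         // assumed path of the generated dialect header
    #include "llvm/IR/DerivedTypes.h"
    using namespace llvm;
    using namespace lgc;

    static Value *packThenUnpackHi(BuilderCommon &b, Value *matCLo, Value *matCHi) {
      // Packed accumulators occupy one set of registers, modeled as <8 x float>.
      Type *packedTy = FixedVectorType::get(b.getFloatTy(), 8);
      Value *packed = b.create<CooperativeMatrixPackOp>(packedTy, matCLo, matCHi);
      // get_upper_half = true returns the matrix held in the upper half of the registers.
      return b.create<CooperativeMatrixUnPackOp>(matCHi->getType(), packed, /*getUpperHalf=*/true);
    }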
+  }];
+}
+
+def SparsityIndexLoadOp : LgcOp<"sparsityindex.load", [Memory<[(read)]>, Convergent, WillReturn]> {
+  let arguments = (ins value:$pointer, value:$stride, AttrI1:$col_major, AttrI32:$memory_access);
+  let results = (outs value:$result);
+
+  let defaultBuilderHasExplicitResultType = true;
+
+  let summary = "load the sparsity index for the sparse cooperative matrix";
+  let description = [{
+    Load the sparsity index for sparse cooperative matrix A, which will be used in sparseA * denseB + denseC.
+    The return value is laid out as [unused_16bit | index_16bit] for wave32 or [unused_24bit | index_8bit] for wave64.
+
+    'pointer' is the pointer to the index data stored in memory.
+    'stride' qualifies how the index data is laid out in memory. It must be of scalar integer type.
+    'col_major' is a constant instruction with 32-bit integer type whose value corresponds to a Sparsity Index Memory Layout.
+
+    'memory_access' is a set of flags describing the memory.
+      - Bit 0 is set if the memory is volatile
+      - Bit 1 is set if the memory is coherent
+      - Bit 2 is set if the memory is temporal.
+  }];
+}
+
+def SparseCooperativeMatrixMulAddOp : LgcOp<"sparseCooperativeMatrix.muladd", [Convergent, WillReturn]> {
+  let arguments = (ins value:$matrix_a, value:$sparse_index, value:$matrix_b, value:$matrix_c, AttrI1:$is_signed_a, AttrI1:$is_signed_b,
+                       AttrI1:$is_sat, CooperativeMatrixElementType:$accu_elem_type,
+                       CooperativeMatrixElementType:$factor_elem_type);
+  let results = (outs value:$result);
+
+  let defaultBuilderHasExplicitResultType = true;
+
+  let summary = "The muladd operation supported as sparseA * denseB + denseC";
+  let description = [{
+    Sparse linear-algebraic matrix multiply of A and B with structural sparsity
+    information taken from Index, followed by component-wise addition of C.
+    The semantics of the multiplication are defined by the sparsity format of Index.
+
+    Only sparseA * denseB + denseC is supported for now.
+
+    'matrix_a' is the factor cooperative matrix whose use is MatrixAKHR.
+    'matrix_b' is the factor cooperative matrix whose use is MatrixBKHR.
+    'sparse_index' is the sparsity index.
+    'matrix_c' is the accumulator cooperative matrix whose use is MatrixCKHR.
+    'is_signed_a' is the signedness of matrixA's element type.
+    'is_signed_b' is the signedness of matrixB's element type.
+    'is_sat' is the saturating-accumulation flag for the calculation.
+    'accu_elem_type' is the component type of the accumulator matrix.
+    'factor_elem_type' is the component type of the factor matrix.
+  }];
+}
+
+def LoadTfeOp : LgcOp<"load.tfe", [Memory<[]>, WillReturn]> {
+  let arguments = (ins (or BufferPointer, BufferStridedPointer):$pointer);
+  let results = (outs value:$result);
+
+  let defaultBuilderHasExplicitResultType = true;
+
+  let summary = "load data and tfe value from the given buffer pointer";
+  let description = [{
+    Return the data stored in the structured or raw buffer and the tfe value, in terms of {T, i32}.
+  }];
+}
diff --git a/lgc/interface/lgc/ModuleBunch.h b/lgc/interface/lgc/ModuleBunch.h
index 39966a80ac..a7b80245ca 100644
--- a/lgc/interface/lgc/ModuleBunch.h
+++ b/lgc/interface/lgc/ModuleBunch.h
@@ -80,9 +80,18 @@ class ModuleBunch {
   // Dump the module to stderr (for debugging).
   void dump() const;
 
-  bool IsNewDbgInfoFormat = false;
+#if !defined(LLVM_MAIN_REVISION) || LLVM_MAIN_REVISION >= 494698
+  // API used by PassManager.h.
+ void setIsNewDbgInfoFormat(bool UseNewFormat) { + IsNewDbgInfoFormat = UseNewFormat; + assert(isNormalized()); + for (const std::unique_ptr &Entry : Modules) + Entry->setIsNewDbgInfoFormat(UseNewFormat); + } - void setIsNewDbgInfoFormat(bool UseNewFormat) { llvm_unreachable("Should never be called!"); } + // Public field used by PassManager.h. + bool IsNewDbgInfoFormat = false; +#endif private: SmallVector> Modules; diff --git a/lgc/interface/lgc/Pipeline.h b/lgc/interface/lgc/Pipeline.h index 666c639922..1c30ffe785 100644 --- a/lgc/interface/lgc/Pipeline.h +++ b/lgc/interface/lgc/Pipeline.h @@ -138,9 +138,8 @@ union Options { // Unused members, kept in place to keep LLVM IR metadata stable. unsigned unused0; - unsigned unused1; - unsigned unused2; + uint64_t resourceHash; // Resource hash to set in ELF PAL metadata unsigned includeIr; // If set, the IR for all compiled shaders will be included in the // pipeline ELF. unsigned nggFlags; // Flags to control NGG (NggFlag* values ored together) @@ -196,6 +195,7 @@ union Options { bool disableSampleCoverageAdjust; // Disable the adjustment of sample coverage bool forceFragColorDummyExport; // Force dummy export is added to fragment shader color export. unsigned reserved22; + bool dynamicTopology; // Whether primitive topology is dynamic. }; }; static_assert(sizeof(Options) == sizeof(Options::u32All)); @@ -873,20 +873,32 @@ class Pipeline { // with irLink(). This is a static method in Pipeline, as it does not need a Pipeline object, and can be used // in the front-end before a shader is associated with a pipeline. // - // @param func : Function to mark + // @param func : Function to mark. This can instead be a GlobalVariable; that functionality is not used by LGC, + // but can be used by a front-end that uses a GlobalVariable to represent a part-pipeline retrieved + // from the cache, and wants to mark it with a shader stage // @param stage : Shader stage, or ShaderStage::Invalid if none - static void markShaderEntryPoint(llvm::Function *func, ShaderStageEnum stage); + static void markShaderEntryPoint(llvm::GlobalObject *func, ShaderStageEnum stage); // Get a function's shader stage. // - // @param func : Function to check + // @param func : Function to check. This can instead be a GlobalVariable; that functionality is not used by LGC, + // but can be used by a front-end that uses a GlobalVariable to represent a part-pipeline retrieved + // from the cache, and wants to mark it with a shader stage // @returns stage : Shader stage, or nullopt if none - static std::optional getShaderStage(llvm::Function *func); + static std::optional getShaderStage(llvm::GlobalObject *func); + + // Set a function's shader subtype. Only has an effect on a compute shader or non-shader export function, + // where it causes the .shader_subtype PAL metadata item to be set to the arbitrary string given here. + // The PAL metadata item has no semantic role, but is used by tools, which expect the value to be one of: + // "Traversal", "RayGeneration", "Intersection", "AnyHit", "ClosestHit", "Miss", "Callable", "LaunchKernel", + // "FixedExpansionNode", "DynamicExpansionNode", "AggregationNode", "ThreadLaunchNode", "DrawNode" + static void setShaderSubtype(llvm::GlobalObject *func, llvm::StringRef subtype); // Find the shader entry-point from shader module, and set pipeline stage. 
// // @param module : Shader module to attach - virtual void attachModule(llvm::Module *modules) = 0; + // @param (optional) pipelineLink : Enum saying whether this is a pipeline, unlinked or part-pipeline compile. + virtual void attachModule(llvm::Module *modules, PipelineLink pipelineLink = PipelineLink::WholePipeline) = 0; // Record pipeline state into IR metadata of specified module. // diff --git a/lgc/patch/CombineCooperativeMatrix.cpp b/lgc/patch/CombineCooperativeMatrix.cpp index 658d70a2e7..1b32e9753b 100644 --- a/lgc/patch/CombineCooperativeMatrix.cpp +++ b/lgc/patch/CombineCooperativeMatrix.cpp @@ -38,6 +38,7 @@ #include "lgc/state/Defs.h" #include "lgc/state/PipelineState.h" #include "lgc/state/TargetInfo.h" +#include "llvm-dialects/Dialect/Visitor.h" #include "llvm/ADT/SmallVector.h" #include @@ -46,7 +47,11 @@ using namespace llvm; using namespace lgc; -namespace { +namespace lgc { + +class CooperativeMatrixConvertOp; +class CooperativeMatrixTransposeOp; +class CooperativeMatrixMulAddOp; struct Shape { CooperativeMatrixElementType elementType; @@ -81,22 +86,22 @@ class CooperativeMatrixCombiner { bool run(); private: - Shape getShapeOfTranspose(CallInst *transpose); + Shape getShapeOfTranspose(CooperativeMatrixTransposeOp &transpose); void foldTo(Value *from, Value *to); bool tryFold(CallInst *op); bool tryFoldComponentContaining(Value *start); Instruction *findFirstUser(Instruction *instruction); Value *tryFoldTimesScalar(CallInst *timesScalarLo, CallInst *timesScalarHi, Value *packedMatrix); - bool tryFoldMuladd(SmallVector muladds); + bool tryFoldMuladd(SmallVector muladds); Function &m_function; BuilderCommon b; GfxIpVersion m_gfxIpVersion; std::vector m_eraseList; + std::vector m_ops; + MapVector> m_muladds; }; -} // anonymous namespace - // ===================================================================================================================== // Run the combiner. 
// @@ -107,46 +112,24 @@ bool CooperativeMatrixCombiner::run() { bool changed = false; // Step 1: Collect transposes, converts and muladds - std::vector ops; - MapVector> muladds; - - for (Function &fn : m_function.getParent()->functions()) { - if (!fn.isDeclaration()) - continue; - - if (fn.getName().starts_with(lgcName::CooperativeMatrixTranspose)) { - for (User *user : fn.users()) { - if (auto *call = dyn_cast(user)) { - if (call->getFunction() == &m_function) - ops.push_back(call); - } - } - } else if (fn.getName().starts_with(lgcName::CooperativeMatrixConvert)) { - for (User *user : fn.users()) { - if (auto *call = dyn_cast(user)) { - if (call->getFunction() == &m_function) - ops.push_back(call); - } - } + static const auto visitor = llvm_dialects::VisitorBuilder() + .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) + .addSet( + [](auto &self, auto &op) { self.m_ops.push_back(&op); }) + .add([](auto &self, auto &op) { #if !defined(LLVM_MAIN_REVISION) || LLVM_MAIN_REVISION >= 479080 - // wmma packing on gfx11 only possible with new wmma_f16_tied intrinsic - } else if (m_gfxIpVersion.major == 11 && fn.getName().starts_with(lgcName::CooperativeMatrixMulAdd)) { - for (User *user : fn.users()) { - if (auto *call = dyn_cast(user)) { - auto accumElemType = - static_cast(cast(call->getOperand(7))->getZExtValue()); - bool isPackable = accumElemType == CooperativeMatrixElementType::Float16; - if (call->getFunction() == &m_function && isPackable) { - muladds[call->getParent()].push_back(call); - } - } - } + auto accumElemType = op.getAccuElemType(); + bool isPackable = accumElemType == CooperativeMatrixElementType::Float16; + if ((self.m_gfxIpVersion.major == 11) && isPackable) { + self.m_muladds[op.getParent()].push_back(&op); + } #endif - } - } + }) + .build(); + visitor.visit(*this, m_function); // Step 2: Attempt folds. 
- for (const WeakVH &handle : ops) { + for (const WeakVH &handle : m_ops) { auto *op = cast_or_null(handle); if (!op) continue; @@ -163,7 +146,7 @@ bool CooperativeMatrixCombiner::run() { } #if !defined(LLVM_MAIN_REVISION) || LLVM_MAIN_REVISION >= 479080 // wmma packing on gfx11 only possible with new wmma_f16_tied intrinsic - for (auto muladdsPerBB : muladds) { + for (auto muladdsPerBB : m_muladds) { changed |= tryFoldMuladd(std::move(muladdsPerBB.second)); for (Instruction *inst : llvm::reverse(m_eraseList)) { @@ -173,10 +156,10 @@ bool CooperativeMatrixCombiner::run() { m_eraseList.clear(); } - muladds.clear(); + m_muladds.clear(); #endif - ops.clear(); + m_ops.clear(); return changed; } @@ -186,9 +169,9 @@ bool CooperativeMatrixCombiner::run() { // // @param [in] transpose : the transpose operation // @returns : the cooperative matrix shape -Shape CooperativeMatrixCombiner::getShapeOfTranspose(CallInst *transpose) { - unsigned elemType = cast(transpose->getArgOperand(1))->getZExtValue(); - unsigned layout = cast(transpose->getArgOperand(2))->getZExtValue(); +Shape CooperativeMatrixCombiner::getShapeOfTranspose(CooperativeMatrixTransposeOp &transpose) { + auto elemType = transpose.getElemType(); + auto layout = transpose.getLayout(); return {(CooperativeMatrixElementType)elemType, (CooperativeMatrixLayout)layout}; } @@ -219,12 +202,13 @@ void CooperativeMatrixCombiner::foldTo(Value *from, Value *to) { bool CooperativeMatrixCombiner::tryFold(CallInst *op) { Value *src; bool isConvert = false; - if (op->getCalledFunction()->getName().starts_with(lgcName::CooperativeMatrixConvert)) { - src = op->getArgOperand(1); + if (auto *convertOp = dyn_cast(op)) { + src = convertOp->getSource(); isConvert = true; + } else if (auto *transposeOp = dyn_cast(op)) { + src = transposeOp->getMatrix(); } else { - assert(op->getCalledFunction()->getName().starts_with(lgcName::CooperativeMatrixTranspose)); - src = op->getArgOperand(0); + llvm_unreachable("the operation is not supported here."); } if (auto *constant = dyn_cast(src)) { @@ -237,8 +221,8 @@ bool CooperativeMatrixCombiner::tryFold(CallInst *op) { // transpose/convert(undef) -> undef, if legal bool isFoldable = true; if (isConvert) { - auto srcElementType = (CooperativeMatrixElementType)cast(op->getArgOperand(2))->getZExtValue(); - auto dstElementType = (CooperativeMatrixElementType)cast(op->getArgOperand(3))->getZExtValue(); + auto srcElementType = cast(op)->getSrcElemType(); + auto dstElementType = cast(op)->getDstElemType(); if (srcElementType != dstElementType) { // This is slightly conservative, but the point here is that e.g. `zext undef(i16) to i32` can't be folded // to undef because the result can't truly take all possible bit patterns. 
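The hunk above replaces string-prefix matching on lgc.cooperative.matrix.* call names with typed dialect ops. A rough sketch of that dyn_cast-based pattern, using only accessors that appear in this patch (getMatrix, getSource); the header path is an assumption and the helper name is hypothetical:

    #include "lgc/LgcDialect.h"            // assumed path of the generated dialect header
    #include "llvm/IR/Instruction.h"
    #include "llvm/Support/Casting.h"
    using namespace llvm;
    using namespace lgc;

    // Returns the matrix value feeding a transpose or convert, or nullptr for anything else.
    static Value *getFoldableSource(Instruction *inst) {
      if (auto *transposeOp = dyn_cast<CooperativeMatrixTransposeOp>(inst))
        return transposeOp->getMatrix();   // typed accessor instead of getArgOperand(0)
      if (auto *convertOp = dyn_cast<CooperativeMatrixConvertOp>(inst))
        return convertOp->getSource();     // typed accessor instead of getArgOperand(1)
      return nullptr;                      // not one of the ops handled by tryFold()
    }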
@@ -294,30 +278,27 @@ bool CooperativeMatrixCombiner::tryFoldComponentContaining(Value *start) { } return true; } - if (auto *call = dyn_cast(val)) { - if (auto *callee = call->getCalledFunction()) { - if (callee->getName().starts_with(lgcName::CooperativeMatrixTimesScalar)) { - if (is_contained(component.inner.timesScalars, call)) - return true; - - component.inner.timesScalars.push_back(call); - worklistForward.push_back(call); - worklistBackward.push_back(call->getArgOperand(0)); - return true; - } - if (callee->getName().starts_with(lgcName::CooperativeMatrixBinOp)) { - if (is_contained(component.inner.binOps, call)) - return true; - - component.inner.binOps.push_back(call); - worklistForward.push_back(call); - worklistBackward.push_back(call->getArgOperand(1)); - worklistBackward.push_back(call->getArgOperand(2)); - return true; - } - return false; - } + + if (auto *timesScalarOp = dyn_cast(val)) { + if (is_contained(component.inner.timesScalars, val)) + return true; + + component.inner.timesScalars.push_back(cast(val)); + worklistForward.push_back(val); + worklistBackward.push_back(timesScalarOp->getMatrix()); + return true; + } + if (auto *binOp = dyn_cast(val)) { + if (is_contained(component.inner.binOps, val)) + return true; + + component.inner.binOps.push_back(cast(val)); + worklistForward.push_back(val); + worklistBackward.push_back(binOp->getLhs()); + worklistBackward.push_back(binOp->getRhs()); + return true; } + return false; }; @@ -387,34 +368,28 @@ bool CooperativeMatrixCombiner::tryFoldComponentContaining(Value *start) { continue; } - if (auto *call = dyn_cast(input)) { - if (auto *callee = call->getCalledFunction()) { - if (callee->getName().starts_with(lgcName::CooperativeMatrixLoad)) - continue; // loads can be adjusted at zero cost - if (callee->getName().starts_with(lgcName::CooperativeMatrixTranspose)) { - foundComponentShape(getShapeOfTranspose(call)); - ++numTransposeInputs; - continue; - } - if (callee->getName().starts_with(lgcName::CooperativeMatrixConvert)) { - auto srcElemType = (CooperativeMatrixElementType)cast(call->getArgOperand(2))->getZExtValue(); - auto dstElemType = (CooperativeMatrixElementType)cast(call->getArgOperand(3))->getZExtValue(); - if (srcElemType != dstElemType) { - LLVM_DEBUG(dbgs() << " unhandled element type input conversion: " << *call << '\n'); - ++numUnhandledInputs; - continue; - } + if (isa(input)) + continue; // loads can be adjusted at zero cost + if (auto *transposeOp = dyn_cast(input)) { + foundComponentShape(getShapeOfTranspose(*transposeOp)); + ++numTransposeInputs; + continue; + } + if (auto *convertOp = dyn_cast(input)) { + auto srcElemType = convertOp->getSrcElemType(); + auto dstElemType = convertOp->getDstElemType(); + if (srcElemType != dstElemType) { + LLVM_DEBUG(dbgs() << " unhandled element type input conversion: " << *input << '\n'); + ++numUnhandledInputs; + continue; + } - auto srcLayout = (CooperativeMatrixLayout)cast(call->getArgOperand(4))->getZExtValue(); - auto dstLayout = (CooperativeMatrixLayout)cast(call->getArgOperand(5))->getZExtValue(); - foundComponentShape({dstElemType, dstLayout}); - foundOtherLayout(srcLayout, call->getArgOperand(1)->getType()); + auto srcLayout = convertOp->getSrcLayout(); + auto dstLayout = convertOp->getDstLayout(); + foundComponentShape({dstElemType, dstLayout}); + foundOtherLayout(srcLayout, convertOp->getSource()->getType()); - ++numRelayoutInputs; - continue; - } - } - ++numUnhandledInputs; + ++numRelayoutInputs; continue; } @@ -422,33 +397,29 @@ bool 
CooperativeMatrixCombiner::tryFoldComponentContaining(Value *start) { } for (Use *use : component.outputs) { - if (auto *call = dyn_cast(use->getUser())) { - if (auto *callee = call->getCalledFunction()) { - if (callee->getName().starts_with(lgcName::CooperativeMatrixStore)) - continue; // stores can be adapted at zero cost - if (callee->getName().starts_with(lgcName::CooperativeMatrixTranspose)) { - foundComponentShape(getShapeOfTranspose(call)); - transposeOutputs.insert(use->get()); - continue; - } - if (callee->getName().starts_with(lgcName::CooperativeMatrixConvert)) { - auto srcElemType = (CooperativeMatrixElementType)cast(call->getArgOperand(2))->getZExtValue(); - auto dstElemType = (CooperativeMatrixElementType)cast(call->getArgOperand(3))->getZExtValue(); - if (srcElemType != dstElemType) { - LLVM_DEBUG(dbgs() << " unhandled element type output conversion: " << *call << '\n'); - ++numUnhandledInputs; - continue; - } + if (dyn_cast(use->getUser())) + continue; // stores can be adapted at zero cost + if (auto *transposeOp = dyn_cast(use->getUser())) { + foundComponentShape(getShapeOfTranspose(*transposeOp)); + transposeOutputs.insert(use->get()); + continue; + } + if (auto *convertOp = dyn_cast(use->getUser())) { + auto srcElemType = convertOp->getSrcElemType(); + auto dstElemType = convertOp->getDstElemType(); + if (srcElemType != dstElemType) { + LLVM_DEBUG(dbgs() << " unhandled element type output conversion: " << *use->getUser() << '\n'); + ++numUnhandledInputs; + continue; + } - auto srcLayout = (CooperativeMatrixLayout)cast(call->getArgOperand(4))->getZExtValue(); - auto dstLayout = (CooperativeMatrixLayout)cast(call->getArgOperand(5))->getZExtValue(); - foundComponentShape({srcElemType, srcLayout}); - foundOtherLayout(dstLayout, call->getType()); + auto srcLayout = convertOp->getSrcLayout(); + auto dstLayout = convertOp->getDstLayout(); + foundComponentShape({srcElemType, srcLayout}); + foundOtherLayout(dstLayout, use->getUser()->getType()); - relayoutOutputs.insert(use->get()); - continue; - } - } + relayoutOutputs.insert(use->get()); + continue; } unhandledOutputs.insert(use->get()); @@ -466,22 +437,18 @@ bool CooperativeMatrixCombiner::tryFoldComponentContaining(Value *start) { for (Value *input : component.inputs) { // Handle inputs that can be folded away / absorbed. - if (auto *call = dyn_cast(input)) { - if (auto *callee = call->getCalledFunction()) { - if (callee->getName().starts_with(lgcName::CooperativeMatrixTranspose)) { - Value *src = call->getArgOperand(0); - foldTo(input, src); - - // Prepopulate the transpose cache to re-use the old transpose operation instead of creating a new one. - outTransposed.try_emplace(src, input); - continue; - } - if (callee->getName().starts_with(lgcName::CooperativeMatrixLoad)) { - bool colMajor = cast(call->getArgOperand(2))->getZExtValue(); - call->setArgOperand(2, b.getInt1(!colMajor)); - continue; - } - } + if (auto *transposeOp = dyn_cast(input)) { + Value *src = transposeOp->getMatrix(); + foldTo(input, src); + + // Prepopulate the transpose cache to re-use the old transpose operation instead of creating a new one. + outTransposed.try_emplace(src, input); + continue; + } + if (auto *loadOp = dyn_cast(input)) { + bool colMajor = loadOp->getColMajor(); + loadOp->setColMajor(!colMajor); + continue; } // Handle generic inputs that need to be transposed explicitly. 
@@ -492,26 +459,23 @@ bool CooperativeMatrixCombiner::tryFoldComponentContaining(Value *start) { b.SetInsertPointPastAllocas(&m_function); } - auto *transposed = b.CreateCooperativeMatrixTranspose(PoisonValue::get(input->getType()), - component.shape->elementType, component.shape->layout); + Type *resultMatrixTy = b.getCooperativeMatrixTy(component.shape->elementType, component.shape->layout); + auto *transposed = b.create(resultMatrixTy, PoisonValue::get(input->getType()), + component.shape->elementType, component.shape->layout); foldTo(input, transposed); - transposed->setArgOperand(0, input); + transposed->setMatrix(input); } for (Use *use : component.outputs) { // Handle outputs that can be folded away / absorbed. - if (auto *call = dyn_cast(use->getUser())) { - if (auto *callee = call->getCalledFunction()) { - if (callee->getName().starts_with(lgcName::CooperativeMatrixTranspose)) { - foldTo(call, use->get()); - continue; - } - if (callee->getName().starts_with(lgcName::CooperativeMatrixStore)) { - bool colMajor = cast(call->getArgOperand(2))->getZExtValue(); - call->setArgOperand(2, b.getInt1(!colMajor)); - continue; - } - } + if (isa(use->getUser())) { + foldTo(use->getUser(), use->get()); + continue; + } + if (auto *storeOp = dyn_cast(use->getUser())) { + bool colMajor = storeOp->getColMajor(); + storeOp->setColMajor(!colMajor); + continue; } // Handle generic outputs that need to be transposed explicitly. @@ -524,8 +488,9 @@ bool CooperativeMatrixCombiner::tryFoldComponentContaining(Value *start) { b.SetInsertPoint(def->getNextNode()); } - transposed = - b.CreateCooperativeMatrixTranspose(use->get(), component.shape->elementType, component.shape->layout); + Type *resultMatrixTy = b.getCooperativeMatrixTy(component.shape->elementType, component.shape->layout); + transposed = b.create(resultMatrixTy, use->get(), component.shape->elementType, + component.shape->layout); } use->set(transposed); @@ -568,46 +533,40 @@ bool CooperativeMatrixCombiner::tryFoldComponentContaining(Value *start) { for (CallInst *timesScalar : component.inner.timesScalars) { timesScalar->mutateType(otherType); - timesScalar->setArgOperand(3, b.getInt32((unsigned)*otherLayout)); + cast(timesScalar)->setLayout(*otherLayout); continue; } for (CallInst *binOp : component.inner.binOps) { binOp->mutateType(otherType); - binOp->setArgOperand(4, b.getInt32((unsigned)*otherLayout)); + cast(binOp)->setLayout(*otherLayout); continue; } for (Value *input : component.inputs) { // Handle inputs for which the relayout can be folded or absorbed. - if (auto *call = dyn_cast(input)) { - if (auto *callee = call->getCalledFunction()) { - if (callee->getName().starts_with(lgcName::CooperativeMatrixConvert)) { - unsigned srcElemType = cast(call->getArgOperand(2))->getZExtValue(); - unsigned dstElemType = cast(call->getArgOperand(3))->getZExtValue(); - - if (srcElemType == dstElemType) { - auto srcLayout = (CooperativeMatrixLayout)cast(call->getArgOperand(4))->getZExtValue(); - assert(srcLayout == *otherLayout); - (void(srcLayout)); // unused - - Value *src = call->getArgOperand(1); - foldTo(input, src); - - // Pre-populate the cache to re-use the relayout operation instead of creating a new one. - outRelayouted.try_emplace(src, input); - continue; - } + if (auto *convertOp = dyn_cast(input)) { + auto srcElemType = convertOp->getSrcElemType(); + auto dstElemType = convertOp->getDstElemType(); - // Integrate the relayouting into a merged conversion op. 
- call->setArgOperand(5, b.getInt32((unsigned)*otherLayout)); - continue; - } - if (callee->getName().starts_with(lgcName::CooperativeMatrixLoad)) { - call->setArgOperand(4, b.getInt32((unsigned)*otherLayout)); - continue; - } + if (srcElemType == dstElemType) { + assert(convertOp->getSrcLayout() == *otherLayout); + + Value *src = convertOp->getSource(); + foldTo(input, src); + + // Pre-populate the cache to re-use the relayout operation instead of creating a new one. + outRelayouted.try_emplace(src, input); + continue; } + + // Integrate the relayouting into a merged conversion op. + convertOp->setDstLayout(*otherLayout); + continue; + } + if (auto *loadOp = dyn_cast(input)) { + loadOp->setLayout(*otherLayout); + continue; } // Handle generic inputs that need a new convert operation inserted. @@ -618,36 +577,31 @@ bool CooperativeMatrixCombiner::tryFoldComponentContaining(Value *start) { b.SetInsertPointPastAllocas(&m_function); } - CallInst *convert = b.CreateCooperativeMatrixConvert((CastInst::CastOps)0, PoisonValue::get(input->getType()), - component.shape->elementType, component.shape->elementType, - component.shape->layout, *otherLayout); + Type *resultMatrixTy = b.getCooperativeMatrixTy(component.shape->elementType, *otherLayout); + CooperativeMatrixConvertOp *convert = b.create( + resultMatrixTy, (CastInst::CastOps)0, PoisonValue::get(input->getType()), component.shape->elementType, + component.shape->elementType, component.shape->layout, *otherLayout); foldTo(input, convert); - convert->setArgOperand(1, input); + convert->setSource(input); } for (Use *use : component.outputs) { // Handle outputs for which the relayout can be folded or absorbed. - if (auto *call = dyn_cast(use->getUser())) { - if (auto *callee = call->getCalledFunction()) { - if (callee->getName().starts_with(lgcName::CooperativeMatrixConvert)) { - unsigned srcElemType = cast(call->getArgOperand(2))->getZExtValue(); - unsigned dstElemType = cast(call->getArgOperand(3))->getZExtValue(); - - if (srcElemType == dstElemType) { - auto dstLayout = (CooperativeMatrixLayout)cast(call->getArgOperand(5))->getZExtValue(); - assert(dstLayout == *otherLayout); - (void(dstLayout)); // unused - - foldTo(call, use->get()); - continue; - } - } - if (callee->getName().starts_with(lgcName::CooperativeMatrixStore)) { - call->setArgOperand(4, b.getInt32((unsigned)*otherLayout)); - continue; - } + if (auto *convertOp = dyn_cast(use->getUser())) { + auto srcElemType = convertOp->getSrcElemType(); + auto dstElemType = convertOp->getDstElemType(); + + if (srcElemType == dstElemType) { + assert(convertOp->getDstLayout() == *otherLayout); + + foldTo(use->getUser(), use->get()); + continue; } } + if (auto *storeOp = dyn_cast(use->getUser())) { + storeOp->setLayout(*otherLayout); + continue; + } // Handle generic outputs that need a new convert operation inserted. 
Value *&relayouted = outRelayouted[use->get()]; @@ -659,9 +613,10 @@ bool CooperativeMatrixCombiner::tryFoldComponentContaining(Value *start) { b.SetInsertPoint(def->getNextNode()); } - relayouted = - b.CreateCooperativeMatrixConvert((CastInst::CastOps)0, use->get(), component.shape->elementType, - component.shape->elementType, *otherLayout, component.shape->layout); + Type *resultMatrixTy = b.getCooperativeMatrixTy(component.shape->elementType, component.shape->layout); + relayouted = b.create(resultMatrixTy, (CastInst::CastOps)0, use->get(), + component.shape->elementType, component.shape->elementType, + *otherLayout, component.shape->layout); } use->set(relayouted); @@ -682,7 +637,7 @@ Instruction *CooperativeMatrixCombiner::findFirstUser(Instruction *instruction) if (instruction->getParent() != userInst->getParent()) continue; - if (dyn_cast(userInst)) + if (isa(userInst)) continue; if (!earliestUser || userInst->comesBefore(earliestUser)) @@ -691,7 +646,7 @@ Instruction *CooperativeMatrixCombiner::findFirstUser(Instruction *instruction) return earliestUser; } -bool CooperativeMatrixCombiner::tryFoldMuladd(SmallVector muladds) { +bool CooperativeMatrixCombiner::tryFoldMuladd(SmallVector muladds) { bool changed = false; auto cmp = [](CallInst *a, CallInst *b) { return b->comesBefore(a); }; @@ -708,19 +663,18 @@ bool CooperativeMatrixCombiner::tryFoldMuladd(SmallVector muladds) { }; SmallVector worklist; SmallVector> unpackedUses; - SmallVector muladdChain; + SmallVector muladdChain; - auto *matCLo = muladdLo->getArgOperand(2); + auto *matCLo = muladdLo->getMatrixC(); muladdChain.push_back(muladdLo); - muladdLo->setArgOperand(5, b.getInt1(false)); + muladdLo->setIsSatOrOpsel(false); while (muladdLo->hasOneUse()) { - auto *next = dyn_cast(*muladdLo->users().begin()); + auto *next = dyn_cast(*muladdLo->users().begin()); if (!is_contained(muladds, next)) break; - - next->setArgOperand(5, b.getInt1(false)); + next->setIsSatOrOpsel(false); muladdChain.push_back(next); muladdLo = next; #if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 478769 @@ -732,12 +686,12 @@ bool CooperativeMatrixCombiner::tryFoldMuladd(SmallVector muladds) { Instruction *firstLoUser = findFirstUser(muladdLo); - CallInst *muladdHi = nullptr; + CooperativeMatrixMulAddOp *muladdHi = nullptr; for (auto *candidate : llvm::reverse(muladds)) { if (firstLoUser && firstLoUser->comesBefore(candidate)) continue; - if (auto *matCHi = dyn_cast(candidate->getArgOperand(2))) { + if (auto *matCHi = dyn_cast(candidate->getMatrixC())) { if (matCHi->getParent() == muladdLo->getParent() && packInsertPoint->comesBefore(matCHi)) { continue; } @@ -750,17 +704,17 @@ bool CooperativeMatrixCombiner::tryFoldMuladd(SmallVector muladds) { if (!muladdHi) continue; - auto *matCHi = muladdHi->getArgOperand(2); + auto *matCHi = muladdHi->getMatrixC(); muladdChain.push_back(muladdHi); - muladdHi->setArgOperand(5, b.getInt1(true)); + muladdHi->setIsSatOrOpsel(true); #if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 478769 llvm::erase_value(muladds, muladdLo); #else llvm::erase(muladds, muladdHi); #endif while (muladdHi->hasOneUse()) { - auto *next = dyn_cast(*muladdHi->users().begin()); + auto *next = dyn_cast(*muladdHi->users().begin()); if (!is_contained(muladds, next)) { break; } @@ -768,7 +722,7 @@ bool CooperativeMatrixCombiner::tryFoldMuladd(SmallVector muladds) { if (firstLoUser && firstLoUser->comesBefore(next)) { break; } - next->setArgOperand(5, b.getInt1(true)); + next->setIsSatOrOpsel(true); muladdChain.push_back(next); muladdHi = next; #if 
LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 478769 @@ -786,13 +740,14 @@ bool CooperativeMatrixCombiner::tryFoldMuladd(SmallVector muladds) { // incoming values are packed accumulators. PHINode *const phiLo = dyn_cast(matCLo); PHINode *const phiHi = dyn_cast(matCHi); + Type *packedTy = FixedVectorType::get(b.getFloatTy(), 8); Value *curAccum = nullptr; if (phiLo && phiHi && phiLo->getParent() == phiHi->getParent()) { for (BasicBlock *incoming : phiLo->blocks()) { b.SetInsertPoint(incoming->getTerminator()); auto *matCLo = phiLo->getIncomingValueForBlock(incoming); auto *matCHi = phiHi->getIncomingValueForBlock(incoming); - auto *packed = b.CreateCooperativeMatrixPack(matCLo, matCHi); + auto *packed = b.create(packedTy, matCLo, matCHi); phiLo->setIncomingValueForBlock(incoming, packed); phiHi->setIncomingValueForBlock(incoming, packed); } @@ -801,12 +756,12 @@ bool CooperativeMatrixCombiner::tryFoldMuladd(SmallVector muladds) { } else { // otherwise, we pack just before the first muladd b.SetInsertPoint(packInsertPoint); - curAccum = b.CreateCooperativeMatrixPack(matCLo, matCHi); + curAccum = b.create(packedTy, matCLo, matCHi); } for (auto *next : muladdChain) { - next->setArgOperand(2, curAccum); - next->setArgOperand(6, b.getInt1(true)); + next->setMatrixC(curAccum); + next->setIsTied(true); curAccum = next; } @@ -824,35 +779,28 @@ bool CooperativeMatrixCombiner::tryFoldMuladd(SmallVector muladds) { if (is_contained(muladdChain, use.getUser())) continue; - if (auto *call = dyn_cast(use.getUser())) { - if (auto *callee = call->getCalledFunction()) { - if (callee->getName().starts_with(lgcName::CooperativeMatrixTimesScalar)) { - auto *candidate = llvm::find_if(unpackedUses, [&](auto pair) { - if (auto *call = dyn_cast(pair.first.getUser())) { - if (auto *callee = call->getCalledFunction()) { - if (callee->getName().starts_with(lgcName::CooperativeMatrixTimesScalar) && - call->getArgOperand(0) == current.matrixLo) { - return true; - } - } - } - return false; - }); - - if (candidate == unpackedUses.end()) { - unpackedUses.push_back({use, true}); - continue; + if (auto *call = dyn_cast(use.getUser())) { + auto *candidate = llvm::find_if(unpackedUses, [&](auto pair) { + if (auto *timesScalarOp = dyn_cast(pair.first.getUser())) { + if (timesScalarOp->getMatrix() == current.matrixLo) { + return true; } + } + return false; + }); - auto *timesScalarLo = cast(candidate->first.getUser()); - auto *timesScalarHi = call; - auto *timesScalarPacked = tryFoldTimesScalar(timesScalarLo, timesScalarHi, current.packedAccum); + if (candidate == unpackedUses.end()) { + unpackedUses.push_back({use, true}); + continue; + } - if (timesScalarPacked) { - worklist.push_back({timesScalarLo, timesScalarHi, timesScalarPacked}); - continue; - } - } + auto *timesScalarLo = cast(candidate->first.getUser()); + auto *timesScalarHi = call; + auto *timesScalarPacked = tryFoldTimesScalar(timesScalarLo, timesScalarHi, current.packedAccum); + + if (timesScalarPacked) { + worklist.push_back({timesScalarLo, timesScalarHi, timesScalarPacked}); + continue; } } @@ -863,10 +811,9 @@ bool CooperativeMatrixCombiner::tryFoldMuladd(SmallVector muladds) { if (is_contained(m_eraseList, use.first.getUser())) continue; - if (auto *call = dyn_cast(use.first.getUser())) { - if (call->getCalledFunction()->getName().starts_with(lgcName::CooperativeMatrixPack) && - call->getArgOperand(0) == current.matrixLo && call->getArgOperand(1) == current.matrixHi) { - foldTo(call, current.packedAccum); + if (auto *packOp = dyn_cast(use.first.getUser())) { + if 
(packOp->getMatrixCLo() == current.matrixLo && packOp->getMatrixCHi() == current.matrixHi) { + foldTo(use.first.getUser(), current.packedAccum); continue; } } @@ -877,7 +824,8 @@ bool CooperativeMatrixCombiner::tryFoldMuladd(SmallVector muladds) { } else { b.SetInsertPoint(cast(use.first.getUser())); } - auto unpacked = b.CreateCooperativeMatrixUnpack(current.packedAccum, use.second); + Type *unpackedTy = FixedVectorType::get(b.getFloatTy(), 8); + auto unpacked = b.create(unpackedTy, current.packedAccum, use.second); use.first.set(unpacked); } unpackedUses.clear(); @@ -908,11 +856,13 @@ Value *CooperativeMatrixCombiner::tryFoldTimesScalar(CallInst *timesScalarLo, Ca b.SetInsertPoint(laterInst); auto *scalarVec = b.CreateVectorSplat(2, PoisonValue::get(b.getHalfTy())); - scalarVec = b.CreateInsertElement(scalarVec, timesScalarLo->getArgOperand(1), b.getInt32(0)); - scalarVec = b.CreateInsertElement(scalarVec, timesScalarHi->getArgOperand(1), b.getInt32(1)); - auto *timesScalarPacked = - b.CreateCoopMatrixTimesScalar(packedMatrix, scalarVec, CooperativeMatrixElementType::Float16Packed, - CooperativeMatrixLayout::AccumulatorMatrixLayout); + auto *loScalar = cast(*timesScalarLo).getScalar(); + auto *hiScalar = cast(*timesScalarHi).getScalar(); + scalarVec = b.CreateInsertElement(scalarVec, loScalar, b.getInt32(0)); + scalarVec = b.CreateInsertElement(scalarVec, hiScalar, b.getInt32(1)); + auto *timesScalarPacked = b.create(packedMatrix->getType(), packedMatrix, scalarVec, + CooperativeMatrixElementType::Float16Packed, + CooperativeMatrixLayout::AccumulatorMatrixLayout); m_eraseList.push_back(timesScalarLo); m_eraseList.push_back(timesScalarHi); return timesScalarPacked; @@ -937,3 +887,4 @@ PreservedAnalyses CombineCooperativeMatrix::run(Function &function, FunctionAnal } return PreservedAnalyses::all(); } +} // namespace lgc diff --git a/lgc/patch/ConfigBuilderBase.cpp b/lgc/patch/ConfigBuilderBase.cpp index f2c4bb9c2e..2d2d57147b 100644 --- a/lgc/patch/ConfigBuilderBase.cpp +++ b/lgc/patch/ConfigBuilderBase.cpp @@ -81,16 +81,30 @@ ConfigBuilderBase::~ConfigBuilderBase() { // ===================================================================================================================== /// Adds the .shaders.$(apiStage).hardware_mapping node to the PAL metadata. +/// Also add .shader_subtype if it is a compute shader. /// /// @param [in] apiStage : The API shader stage /// @param [in] hwStages : The HW stage(s) that the API shader is mapped to, as a combination of /// @ref Util::Abi::HardwareStageFlagBits. void ConfigBuilderBase::addApiHwShaderMapping(ShaderStageEnum apiStage, unsigned hwStages) { - auto hwMappingNode = getApiShaderNode(apiStage)[Util::Abi::ShaderMetadataKey::HardwareMapping].getArray(true); + msgpack::MapDocNode apiShaderNode = getApiShaderNode(apiStage); + auto hwMappingNode = apiShaderNode[Util::Abi::ShaderMetadataKey::HardwareMapping].getArray(true); for (unsigned hwStage = 0; hwStage < unsigned(Util::Abi::HardwareStage::Count); ++hwStage) { if (hwStages & (1 << hwStage)) hwMappingNode.push_back(m_document->getNode(HwStageNames[hwStage])); } + if (apiStage == ShaderStage::Compute) { + // Find the only export function in the module, get the subtype from it, and add the .shader_subtype metadata + // item. 
+ for (Function &func : *m_module) { + if (!func.isDeclaration() && func.getLinkage() != GlobalValue::InternalLinkage) { + StringRef subtype = getShaderSubtype(&func); + if (!subtype.empty()) + apiShaderNode[Util::Abi::ShaderMetadataKey::ShaderSubtype] = subtype; + break; + } + } + } } // ===================================================================================================================== diff --git a/lgc/patch/Continufy.cpp b/lgc/patch/Continufy.cpp index 6f0162f307..cc9ef99f7b 100644 --- a/lgc/patch/Continufy.cpp +++ b/lgc/patch/Continufy.cpp @@ -167,9 +167,7 @@ PreservedAnalyses Continufy::run(Module &module, ModuleAnalysisManager &analysis auto *continuationRef = builder.CreatePtrToInt(called, IntegerType::get(context, 32)); CpsLevel calleeLevel = getCpsLevelFromRtStage(mdconst::extract(calleeStage->getOperand(0))->getSExtValue()); - // RayGen level is zero, so it does not need a logic OR here. - if (calleeLevel != CpsLevel::RayGen) - continuationRef = builder.CreateOr(continuationRef, builder.getInt32((uint32_t)calleeLevel)); + continuationRef = builder.CreateOr(continuationRef, builder.getInt32((uint32_t)calleeLevel)); // Always put a shader-index. SmallVector tailArgs = {PoisonValue::get(builder.getInt32Ty())}; diff --git a/lgc/patch/FragColorExport.cpp b/lgc/patch/FragColorExport.cpp index 36ac78c7c1..b7dd372a75 100644 --- a/lgc/patch/FragColorExport.cpp +++ b/lgc/patch/FragColorExport.cpp @@ -458,7 +458,9 @@ PreservedAnalyses LowerFragColorExport::run(Module &module, ModuleAnalysisManage // Just according to the dualSourceBlendEnable flag. Value *dynamicIsDualSource = builder.getInt32(0); if (m_pipelineState->getTargetInfo().getGfxIpVersion().major >= 11) { - dynamicIsDualSource = ShaderInputs::getSpecialUserData(UserDataMapping::DynamicDualSrcBlendInfo, builder); + dynamicIsDualSource = ShaderInputs::getSpecialUserData(UserDataMapping::CompositeData, builder); + dynamicIsDualSource = builder.CreateIntrinsic(Intrinsic::amdgcn_ubfe, builder.getInt32Ty(), + {dynamicIsDualSource, builder.getInt32(7), builder.getInt32(1)}); } bool willGenerateColorExportShader = m_pipelineState->isUnlinked() && !m_pipelineState->hasColorExportFormats(); diff --git a/lgc/patch/LowerCooperativeMatrix.cpp b/lgc/patch/LowerCooperativeMatrix.cpp index 5e6c615ddf..5dc9181dcb 100644 --- a/lgc/patch/LowerCooperativeMatrix.cpp +++ b/lgc/patch/LowerCooperativeMatrix.cpp @@ -66,23 +66,7 @@ PreservedAnalyses LowerCooperativeMatrix::run(Module &module, ModuleAnalysisMana m_gfxIp = m_pipelineState->getTargetInfo().getGfxIpVersion(); processCoopRowAccFunction(module); - - SmallVector lowerCoopMatrixCallees; - for (auto &func : module) { - auto name = func.getName(); - if (name.starts_with(lgcName::CooperativeMatrix)) - lowerCoopMatrixCallees.push_back(&func); - } - if (lowerCoopMatrixCallees.empty()) - return PreservedAnalyses::all(); - - processCoopMatrixFunction(lowerCoopMatrixCallees); - - for (auto callInst : m_coopMatrixCalls) { - callInst->dropAllReferences(); - callInst->eraseFromParent(); - } - m_coopMatrixCalls.clear(); + processCoopMatrixFunction(module); PreservedAnalyses PA; PA.preserveSet(); @@ -90,195 +74,34 @@ PreservedAnalyses LowerCooperativeMatrix::run(Module &module, ModuleAnalysisMana } // ===================================================================================================================== -// Run the on a module -// -// @param coopMatrixCallees : Function array for the cooperativeMatrix -void LowerCooperativeMatrix::processCoopMatrixFunction(ArrayRef 
coopMatrixCallees) { - for (auto callee : coopMatrixCallees) { - for (auto user : callee->users()) { - if (CallInst *callInst = dyn_cast(user)) { - visitCallInst(*callInst); - } - } - } -} - -// ===================================================================================================================== -// Visits "call" instruction. +// Visit the cooperative matrix ops on module // -// @param callInst : "Call" instruction -void LowerCooperativeMatrix::visitCallInst(CallInst &callInst) { - auto callee = callInst.getCalledFunction(); - if (!callee) - return; - - m_coopMatrixCalls.push_back(&callInst); - - BuilderCommon builder(*m_context); - builder.SetInsertPoint(&callInst); - - auto mangledName = callee->getName(); - if (mangledName.starts_with(lgcName::CooperativeMatrixLength)) { - auto layout = static_cast(cast(callInst.getOperand(1))->getZExtValue()); - callInst.replaceAllUsesWith(builder.getInt32(getLength(layout))); - } else if (mangledName.starts_with(lgcName::CooperativeMatrixExtract)) { - Value *matrix = callInst.getOperand(0); - Value *index = callInst.getOperand(1); - auto elemType = - static_cast(cast(callInst.getOperand(2))->getZExtValue()); - auto layout = static_cast(cast(callInst.getOperand(3))->getZExtValue()); - Value *result = cooperativeMatrixExtract(builder, matrix, index, elemType, layout); - result->takeName(&callInst); - callInst.replaceAllUsesWith(result); - } else if (mangledName.starts_with(lgcName::CooperativeMatrixInsert)) { - Value *matrix = callInst.getOperand(0); - Value *value = callInst.getOperand(1); - Value *index = callInst.getOperand(2); - auto elemType = - static_cast(cast(callInst.getOperand(3))->getZExtValue()); - auto layout = static_cast(cast(callInst.getOperand(4))->getZExtValue()); - Value *result = cooperativeMatrixInsert(builder, matrix, value, index, elemType, layout); - result->takeName(&callInst); - callInst.replaceAllUsesWith(result); - } else if (mangledName.starts_with(lgcName::CooperativeMatrixFill)) { - Value *value = callInst.getOperand(0); - auto elemType = - static_cast(cast(callInst.getOperand(1))->getZExtValue()); - auto layout = static_cast(cast(callInst.getOperand(2))->getZExtValue()); - Value *result = cooperativeMatrixFill(builder, value, elemType, layout); - result->takeName(&callInst); - callInst.replaceAllUsesWith(result); - } else if (mangledName.starts_with(lgcName::CooperativeMatrixLoad)) { - Value *dataPtr = callInst.getOperand(0); - Value *stride = callInst.getOperand(1); - bool colMajor = cast(callInst.getOperand(2))->getZExtValue(); - auto elemType = - static_cast(cast(callInst.getOperand(3))->getZExtValue()); - auto layout = static_cast(cast(callInst.getOperand(4))->getZExtValue()); - unsigned memoryAccess = cast(callInst.getOperand(5))->getZExtValue(); - unsigned alignment = cast(callInst.getOperand(6))->getZExtValue(); - - Value *loadVal = cooperativeMatrixLoadInternal(dataPtr, stride, colMajor, elemType, layout, memoryAccess, alignment, - callInst.getName(), &callInst); - callInst.replaceAllUsesWith(loadVal); - - } else if (mangledName.starts_with(lgcName::CooperativeMatrixStore)) { - Value *dataPtr = callInst.getOperand(0); - Value *stride = callInst.getOperand(1); - bool colMajor = cast(callInst.getOperand(2))->getZExtValue(); - auto elemType = - static_cast(cast(callInst.getOperand(3))->getZExtValue()); - auto layout = static_cast(cast(callInst.getOperand(4))->getZExtValue()); - unsigned memoryAccess = cast(callInst.getOperand(5))->getZExtValue(); - unsigned alignment = 
cast(callInst.getOperand(6))->getZExtValue(); - Value *vecVal = callInst.getOperand(7); - - cooperativeMatrixStoreInternal(dataPtr, stride, colMajor, elemType, layout, memoryAccess, alignment, vecVal, - callInst.getName(), &callInst); - - } else if (mangledName.starts_with(lgcName::CooperativeMatrixConvert)) { - CastInst::CastOps castOp = - static_cast(cast(callInst.getOperand(0))->getZExtValue()); - Value *source = callInst.getOperand(1); - auto srcElemType = - static_cast(cast(callInst.getOperand(2))->getZExtValue()); - auto dstElemType = - static_cast(cast(callInst.getOperand(3))->getZExtValue()); - auto srcLayout = static_cast(cast(callInst.getOperand(4))->getZExtValue()); - auto dstLayout = static_cast(cast(callInst.getOperand(5))->getZExtValue()); - Value *resultVal = cooperativeMatrixConvert(castOp, source, srcElemType, dstElemType, srcLayout, dstLayout, - callInst.getName(), &callInst); - if ((cast(resultVal->getType())->getNumElements() == 4) && - (dstLayout == CooperativeMatrixLayout::AccumulatorMatrixLayout || - dstLayout == CooperativeMatrixLayout::Gfx10Accumulator16bitMatrixLayout || - dstLayout == CooperativeMatrixLayout::Gfx10AccumulatorMatrixLayout)) { - // for wave64 needs shuffleVector from V4 to V8 as frontend will always recognize V8 not care wave32 or wave64 - resultVal = builder.CreateShuffleVector(resultVal, PoisonValue::get(resultVal->getType()), - ArrayRef{0, 1, 2, 3, 4, 5, 6, 7}); - } - callInst.replaceAllUsesWith(resultVal); - - } else if (mangledName.starts_with(lgcName::CooperativeMatrixTranspose)) { - Value *matrix = callInst.getOperand(0); - auto elemType = - static_cast(cast(callInst.getOperand(1))->getZExtValue()); - auto srcLayout = static_cast(cast(callInst.getOperand(2))->getZExtValue()); - - Value *resultVal = cooperativeMatrixTranspose(matrix, elemType, srcLayout, callInst.getName(), &callInst); - callInst.replaceAllUsesWith(resultVal); - - } else if (mangledName.starts_with(lgcName::CooperativeMatrixBinOp)) { - CooperativeMatrixArithOp coopMatArithOp = - static_cast(cast(callInst.getOperand(0))->getZExtValue()); - Value *lhs = callInst.getOperand(1); - Value *rhs = callInst.getOperand(2); - auto elemType = - static_cast(cast(callInst.getOperand(3))->getZExtValue()); - auto srcLayout = static_cast(cast(callInst.getOperand(4))->getZExtValue()); - - Value *resultVal = - cooperativeMatrixBinaryOp(coopMatArithOp, lhs, rhs, elemType, srcLayout, callInst.getName(), &callInst); - callInst.replaceAllUsesWith(resultVal); - - } else if (mangledName.starts_with(lgcName::CooperativeMatrixTimesScalar)) { - Value *matrix = callInst.getOperand(0); - Value *scalar = callInst.getOperand(1); - auto elemType = - static_cast(cast(callInst.getOperand(2))->getZExtValue()); - auto srcLayout = static_cast(cast(callInst.getOperand(3))->getZExtValue()); - - Value *resultVal = coopMatrixTimesScalar(matrix, scalar, elemType, srcLayout, callInst.getName(), &callInst); - callInst.replaceAllUsesWith(resultVal); - - } else if (mangledName.starts_with(lgcName::CooperativeMatrixMulAdd)) { - Value *matrixA = callInst.getOperand(0); - Value *matrixB = callInst.getOperand(1); - Value *matrixC = callInst.getOperand(2); - bool isSignedA = cast(callInst.getOperand(3))->getZExtValue(); - bool isSignedB = cast(callInst.getOperand(4))->getZExtValue(); - bool isSatOrOpsel = cast(callInst.getOperand(5))->getZExtValue(); - bool isTied = cast(callInst.getOperand(6))->getZExtValue(); - auto accumElemType = - static_cast(cast(callInst.getOperand(7))->getZExtValue()); - auto factorElemType = - 
static_cast(cast(callInst.getOperand(8))->getZExtValue()); - Value *resultVal = cooperativeMatrixMulAdd(matrixA, matrixB, matrixC, isSignedA, isSignedB, isSatOrOpsel, isTied, - accumElemType, factorElemType, callInst.getName(), &callInst); - callInst.replaceAllUsesWith(resultVal); - } else if (mangledName.starts_with(lgcName::CooperativeMatrixPack)) { - Value *matrixA = callInst.getOperand(0); - Value *matrixB = callInst.getOperand(1); - Value *resultVal = cooperativeMatrixPack(matrixA, matrixB, callInst.getName(), &callInst); - callInst.replaceAllUsesWith(resultVal); - } else if (mangledName.starts_with(lgcName::CooperativeMatrixUnpack)) { - Value *packedMatrix = callInst.getOperand(0); - bool high = cast(callInst.getOperand(1))->getZExtValue(); - Value *resultVal = cooperativeMatrixUnpack(packedMatrix, high, callInst.getName(), &callInst); - callInst.replaceAllUsesWith(resultVal); +// @param [in] module : LLVM module to be run on +void LowerCooperativeMatrix::processCoopMatrixFunction(Module &module) { + static auto visitor = llvm_dialects::VisitorBuilder() + .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) + .add(&LowerCooperativeMatrix::visitCooperativeMatrixLengthOp) + .add(&LowerCooperativeMatrix::visitCooperativeMatrixExtractOp) + .add(&LowerCooperativeMatrix::visitCooperativeMatrixInsertOp) + .add(&LowerCooperativeMatrix::visitCooperativeMatrixFillOp) + .add(&LowerCooperativeMatrix::visitCooperativeMatrixLoadOp) + .add(&LowerCooperativeMatrix::visitCooperativeMatrixStoreOp) + .add(&LowerCooperativeMatrix::visitCooperativeMatrixConvertOp) + .add(&LowerCooperativeMatrix::visitCooperativeMatrixTransposeOp) + .add(&LowerCooperativeMatrix::visitCooperativeMatrixBinaryOp) + .add(&LowerCooperativeMatrix::visitCooperativeMatrixTimesScalarOp) + .add(&LowerCooperativeMatrix::visitCooperativeMatrixMulAddOp) + .add(&LowerCooperativeMatrix::visitCooperativeMatrixPackOp) + .add(&LowerCooperativeMatrix::visitCooperativeMatrixUnPackOp) + .build(); - } else { - llvm_unreachable("Should never be called!"); - } -} + visitor.visit(*this, module); -// ===================================================================================================================== -// Get the "length" of a matrix of the given layout, i.e. the number of matrix components stored per lane. -// -// @param layout : the matrix layout -unsigned LowerCooperativeMatrix::getLength(CooperativeMatrixLayout layout) const { - auto waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage); - switch (layout) { - case CooperativeMatrixLayout::FactorMatrixLayout: - return 16; - case CooperativeMatrixLayout::AccumulatorMatrixLayout: { - return waveSize == 32 ? 8 : 4; - } - case CooperativeMatrixLayout::Gfx10AccumulatorMatrixLayout: - case CooperativeMatrixLayout::Gfx10Accumulator16bitMatrixLayout: - return 8; - default: - llvm_unreachable("unhandled matrix layout"); + for (auto callInst : m_coopMatrixCalls) { + callInst->dropAllReferences(); + callInst->eraseFromParent(); } + m_coopMatrixCalls.clear(); } // ===================================================================================================================== @@ -443,31 +266,54 @@ LowerCooperativeMatrix::computeAddressing(CooperativeMatrixLayout layout, Cooper } // ===================================================================================================================== -// Load contiguous elements from the specified location of the memory. -// @param dataPtr : The pointer to a data array. 
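// A rough standalone shape of the typed-visitor dispatch used in processCoopMatrixFunction above, kept here as a
// reference sketch: the VisitorBuilder/visit calls mirror that hunk, while the payload template argument and the
// MyLowering/visitLengthOp names are illustrative stand-ins rather than code from this change.
#include "lgc/LgcDialect.h"
#include "llvm-dialects/Dialect/Visitor.h"
#include "llvm/IR/Module.h"

class MyLowering {
public:
  void run(llvm::Module &module) {
    // Build the dispatch table once; ByFunctionDeclaration matches ops by their declared dialect function.
    static const auto visitor = llvm_dialects::VisitorBuilder<MyLowering>()
                                    .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration)
                                    .add(&MyLowering::visitLengthOp) // one .add per dialect op to handle
                                    .build();
    visitor.visit(*this, module); // calls visitLengthOp for every CooperativeMatrixLengthOp in the module
  }
  void visitLengthOp(lgc::CooperativeMatrixLengthOp &op);
};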
-// @param stride : The stride in bytes in memory between the first elements of consecutive rows (orcolumns) in the -// source data. Guaranteed to be a multiple of the matrix element size. -// @param isColMajor : Identify the order for the data stored in memory, col-major/row-major -// @param elemType : The element type for the matrix -// @param layout : This is identify for factor(A/B) or accumulator(C) for 16 bit element matrix. -// @param memoryAccess : The memory operands which provide:isVolatile/isTemporal/isCoherent -// @param alignment: Alignment for the memory access operations. -// additional operands, maybe volatile/Aligned/Nontemporal/MakePointerAvailable -// /MakePointerVisible/NonPrivatePointer usded by CooperativeMatrix Load/Store. -// @param instName : Name to give instruction(s). -// @param insertPos : Where to insert the instruction -Value *LowerCooperativeMatrix::cooperativeMatrixLoadInternal(Value *dataPtr, Value *stride, bool isColMajor, - CooperativeMatrixElementType elemType, - CooperativeMatrixLayout layout, unsigned memoryAccess, - unsigned alignment, const Twine &instName, - Instruction *insertPos) { +// Visit "CooperativeMatrixLengthOp" instruction +// +// @param matrixlength : The dialect instruction to process +void LowerCooperativeMatrix::visitCooperativeMatrixLengthOp(CooperativeMatrixLengthOp &matrixlength) { BuilderBase builder(*m_context); - builder.SetInsertPoint(insertPos); + builder.SetInsertPoint(&matrixlength); + auto waveSize = m_pipelineState->getShaderWaveSize(m_shaderStage); + auto layout = matrixlength.getLayout(); + unsigned length = 0; + switch (layout) { + case CooperativeMatrixLayout::FactorMatrixLayout: + length = 16; + break; + case CooperativeMatrixLayout::AccumulatorMatrixLayout: { + length = (waveSize == 32) ? 
8 : 4; + break; + } + case CooperativeMatrixLayout::Gfx10AccumulatorMatrixLayout: + case CooperativeMatrixLayout::Gfx10Accumulator16bitMatrixLayout: + length = 8; + break; + default: + llvm_unreachable("unhandled matrix layout"); + } + m_coopMatrixCalls.push_back(&matrixlength); + matrixlength.replaceAllUsesWith(builder.getInt32(length)); +} + +// ===================================================================================================================== +// Visit "CooperativeMatrixLoadOp" instruction +// +// @param load : The dialect instruction to process +void LowerCooperativeMatrix::visitCooperativeMatrixLoadOp(CooperativeMatrixLoadOp &load) { + BuilderBase builder(*m_context); + builder.SetInsertPoint(&load); auto shaderStage = getShaderStage(builder.GetInsertBlock()->getParent()); auto waveSize = m_pipelineState->getShaderWaveSize(shaderStage.value()); assert(waveSize == 32 || waveSize == 64); + auto elemType = load.getElemType(); + Value *dataPtr = load.getPointer(); + Value *stride = load.getStride(); + auto memoryAccess = load.getMemoryAccess(); + auto layout = load.getLayout(); + auto isColMajor = load.getColMajor(); + auto alignment = load.getAlignment(); + // Calc element offset in memory Type *elemTy = builder.transCooperativeMatrixElementType(elemType); const unsigned dataBitwidth = elemTy->getScalarSizeInBits(); @@ -483,7 +329,7 @@ Value *LowerCooperativeMatrix::cooperativeMatrixLoadInternal(Value *dataPtr, Val auto props = getTypeProperties(elemType, layout); - auto addrInfo = computeAddressing(layout, elemType, waveSize, stride, isColMajor, insertPos); + auto addrInfo = computeAddressing(layout, elemType, waveSize, stride, isColMajor, &load); Value *vecVal = PoisonValue::get(FixedVectorType::get(elemTy, props.numFlatElements)); for (unsigned idx = 0; idx < props.numFlatElements; ++idx) { Value *macroOffset = builder.CreateMul(addrInfo.macroStep, builder.getInt32(idx / addrInfo.microCount)); @@ -497,10 +343,10 @@ Value *LowerCooperativeMatrix::cooperativeMatrixLoadInternal(Value *dataPtr, Val // merging load/store instructions on backend later. unsigned constantOffsetInRowCol = cast(offsetInRowCol)->getZExtValue(); Align compAlignment = commonAlignment(Align(alignment), constantOffsetInRowCol); - eleVal = builder.CreateAlignedLoad(elemTy, elePtr, compAlignment, isVolatile, instName); + eleVal = builder.CreateAlignedLoad(elemTy, elePtr, compAlignment, isVolatile); } else { // For rowMajor@B/C and colMajor@A, as the elements of one lane aren't continuous, no alignments needed. - eleVal = builder.CreateLoad(elemTy, elePtr, isVolatile, instName); + eleVal = builder.CreateLoad(elemTy, elePtr, isVolatile); } if (isCoherent && !(addrSpace == ADDR_SPACE_LOCAL && dataBitwidth < 32)) cast(eleVal)->setAtomic(AtomicOrdering::Unordered); @@ -510,32 +356,26 @@ Value *LowerCooperativeMatrix::cooperativeMatrixLoadInternal(Value *dataPtr, Val } Value *coMatrix = convFlatVecToCoopMatrixVec(builder, vecVal, elemType, layout); - return coMatrix; + m_coopMatrixCalls.push_back(&load); + load.replaceAllUsesWith(coMatrix); } // ===================================================================================================================== -// Store a contiguous elements from the specified location of the memory. +// Visit "CooperativeMatrixStoreOp" instruction // -// @param dataPtr : The pointer to a data array. -// @param stride : The stride in bytes between the first elements of consecutive rows (or columns) in the destination. -// Guaranteed to be a multiple of the element size. 
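// The length lowering in visitCooperativeMatrixLengthOp above boils down to a pure function of layout and wave
// size; restated standalone for reference, with a stand-in enum instead of the lgc CooperativeMatrixLayout values.
enum class MatrixLayout { Factor, Accumulator, Gfx10Accumulator, Gfx10Accumulator16bit };

// Number of matrix components each lane holds, as encoded in the switch above.
constexpr unsigned perLaneLength(MatrixLayout layout, unsigned waveSize) {
  switch (layout) {
  case MatrixLayout::Factor:
    return 16; // A/B factor matrices keep 16 components per lane
  case MatrixLayout::Accumulator:
    return waveSize == 32 ? 8 : 4; // C/D accumulators hold half as many components on wave64
  default:
    return 8; // both gfx10 accumulator layouts
  }
}

static_assert(perLaneLength(MatrixLayout::Accumulator, 64) == 4, "wave64 accumulator: 4 components per lane");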
-// @param colMajor : Identify the order for the data stored in memory, col-major/row-major -// @param elemType : The type for the element. -// @param layout : This is identify for factor(A/B) or accumulator(C) for 16 bit element matrix. -// @param memoryAccess : The memory operands which provide -// additional operands, maybe volatile/Aligned/Nontemporal/MakePointerAvailable -// /MakePointerVisible/NonPrivatePointer used by CooperativeMatrix Load/Store. -// @param vecVal : The contiguous elements made up of a vector to be loaded or stored. -// @param instName : Name to give instruction(s). -// @param insertPos : Where to insert the instruction -void LowerCooperativeMatrix::cooperativeMatrixStoreInternal(Value *dataPtr, Value *stride, bool isColMajor, - CooperativeMatrixElementType elemType, - CooperativeMatrixLayout layout, unsigned memoryAccess, - unsigned alignment, Value *&vecVal, const Twine &instName, - Instruction *insertPos) { +// @param store : The dialect instruction to process +void LowerCooperativeMatrix::visitCooperativeMatrixStoreOp(CooperativeMatrixStoreOp &store) { BuilderBase builder(*m_context); - builder.SetInsertPoint(insertPos); + builder.SetInsertPoint(&store); + auto elemType = store.getElemType(); + Value *dataPtr = store.getPointer(); + Value *stride = store.getStride(); + auto memoryAccess = store.getMemoryAccess(); + auto layout = store.getLayout(); + auto isColMajor = store.getColMajor(); + auto alignment = store.getAlignment(); + Value *vecVal = store.getStoreValue(); auto shaderStage = getShaderStage(builder.GetInsertBlock()->getParent()); auto waveSize = m_pipelineState->getShaderWaveSize(shaderStage.value()); assert(waveSize == 32 || waveSize == 64); @@ -554,7 +394,7 @@ void LowerCooperativeMatrix::cooperativeMatrixStoreInternal(Value *dataPtr, Valu bool isTemporal = memoryAccess & (unsigned)(CooperativeMatrixMemoryAccess::MemoryAccessTemporalMask); auto props = getTypeProperties(elemType, layout); - auto addrInfo = computeAddressing(layout, elemType, waveSize, stride, isColMajor, insertPos); + auto addrInfo = computeAddressing(layout, elemType, waveSize, stride, isColMajor, &store); vecVal = convCoopMatrixVecToFlatVec(builder, vecVal, elemType, layout); @@ -579,19 +419,46 @@ void LowerCooperativeMatrix::cooperativeMatrixStoreInternal(Value *dataPtr, Valu if (isTemporal) st->setMetadata(LLVMContext::MD_nontemporal, MDNode::get(builder.getContext(), {})); } + + m_coopMatrixCalls.push_back(&store); } // ===================================================================================================================== -// Open-code cooperative matrix extract operation +// Visit "CooperativeMatrixFillOp" instruction // -// @param builder : builder to use -// @param matrix : the matrix from which to extract a component -// @param index : the index to be extracted -// @param elemType : the matrix element type -// @param layout : the matrix layout type -Value *LowerCooperativeMatrix::cooperativeMatrixExtract(BuilderCommon &builder, Value *matrix, Value *index, - CooperativeMatrixElementType elemType, - CooperativeMatrixLayout layout) { +// @param fill : The dialect instruction to process +void LowerCooperativeMatrix::visitCooperativeMatrixFillOp(CooperativeMatrixFillOp &fill) { + BuilderBase builder(*m_context); + builder.SetInsertPoint(&fill); + + auto elemType = fill.getElemType(); + auto layout = fill.getLayout(); + Value *value = fill.getScalar(); + auto props = getTypeProperties(elemType, layout); + Type *flatType = 
FixedVectorType::get(builder.transCooperativeMatrixElementType(elemType), props.numMatrixElements); + + Value *vec = PoisonValue::get(flatType); + for (unsigned idx = 0; idx < props.numMatrixElements; idx++) + vec = builder.CreateInsertElement(vec, value, idx); + + Value *fillValue = convFlatVecToCoopMatrixVec(builder, vec, elemType, layout); + + m_coopMatrixCalls.push_back(&fill); + fill.replaceAllUsesWith(fillValue); +} + +// ===================================================================================================================== +// Visit "CooperativeMatrixExtractOp" instruction +// +// @param extract : The dialect instruction to process +void LowerCooperativeMatrix::visitCooperativeMatrixExtractOp(CooperativeMatrixExtractOp &extract) { + BuilderBase builder(*m_context); + builder.SetInsertPoint(&extract); + + auto matrix = extract.getMatrix(); + auto elemType = extract.getElemType(); + auto layout = extract.getLayout(); + auto index = extract.getIndex(); Value *vec = convCoopMatrixVecToFlatVec(builder, matrix, elemType, layout); // This is a hacky workaround to the fact that for SPV_NV_cooperative_matrix, we have to support matrix length as @@ -603,21 +470,24 @@ Value *LowerCooperativeMatrix::cooperativeMatrixExtract(BuilderCommon &builder, index = builder.CreateAnd(index, builder.getInt32(length - 1)); } - return builder.CreateExtractElement(vec, index); + Value *elementValue = builder.CreateExtractElement(vec, index); + m_coopMatrixCalls.push_back(&extract); + extract.replaceAllUsesWith(elementValue); } // ===================================================================================================================== -// Open-code cooperative matrix insert operation +// Visit "CooperativeMatrixInsertOp" instruction // -// @param builder : builder to use -// @param matrix : the matrix into which to insert a component -// @param value : the value to be inserted -// @param index : the index to be inserted -// @param elemType : the matrix element type -// @param layout : the matrix layout type -Value *LowerCooperativeMatrix::cooperativeMatrixInsert(BuilderCommon &builder, Value *matrix, Value *value, - Value *index, CooperativeMatrixElementType elemType, - CooperativeMatrixLayout layout) { +// @param insert : The dialect instruction to process +void LowerCooperativeMatrix::visitCooperativeMatrixInsertOp(CooperativeMatrixInsertOp &insert) { + BuilderBase builder(*m_context); + builder.SetInsertPoint(&insert); + + auto matrix = insert.getMatrix(); + auto elemType = insert.getElemType(); + auto layout = insert.getLayout(); + auto index = insert.getIndex(); + auto value = insert.getInsertValue(); Value *vec = convCoopMatrixVecToFlatVec(builder, matrix, elemType, layout); // This is a hacky workaround to the fact that for SPV_NV_cooperative_matrix, we have to support matrix length as @@ -634,27 +504,9 @@ Value *LowerCooperativeMatrix::cooperativeMatrixInsert(BuilderCommon &builder, V vec = builder.CreateInsertElement(vec, value, index); } - return convFlatVecToCoopMatrixVec(builder, vec, elemType, layout); -} - -// ===================================================================================================================== -// Open-code cooperative matrix fill operation -// -// @param builder : builder to use -// @param value : the value to fill the cooperative matrix -// @param elemType : the matrix element type -// @param layout : the matrix layout type -Value *LowerCooperativeMatrix::cooperativeMatrixFill(BuilderCommon &builder, Value *value, - 
CooperativeMatrixElementType elemType, - CooperativeMatrixLayout layout) { - auto props = getTypeProperties(elemType, layout); - Type *flatType = FixedVectorType::get(builder.transCooperativeMatrixElementType(elemType), props.numMatrixElements); - - Value *vec = PoisonValue::get(flatType); - for (unsigned idx = 0; idx < props.numMatrixElements; idx++) - vec = builder.CreateInsertElement(vec, value, idx); - - return convFlatVecToCoopMatrixVec(builder, vec, elemType, layout); + Value *out = convFlatVecToCoopMatrixVec(builder, vec, elemType, layout); + m_coopMatrixCalls.push_back(&insert); + insert.replaceAllUsesWith(out); } // ===================================================================================================================== @@ -694,27 +546,20 @@ Value *LowerCooperativeMatrix::cooperativeMatrixConvertInternal(CastInst::CastOp } // ===================================================================================================================== -// Create cooperative matrix conversion. -// Element-wise-conversion -// @param castOp : The cast Opcode. -// @param source : The source cooperative matrix. -// @param srcElemType : Source matrix's element type. -// @param dstElemType : Destination matrix's element type. -// @param srcLayout : Layout for source matrix -// @param dstLayout : Layout for destination matrix -// @param instName : Name to give instruction(s). -// @param insertPos : Where to insert the instruction -Value *LowerCooperativeMatrix::cooperativeMatrixConvert(CastInst::CastOps castOp, Value *source, - CooperativeMatrixElementType srcElemType, - CooperativeMatrixElementType dstElemType, - CooperativeMatrixLayout srcLayout, - CooperativeMatrixLayout dstLayout, const Twine &instName, - Instruction *insertPos) { - assert(source->getType()->isVectorTy()); +// Visit "CooperativeMatrixConvertOp" instruction +// +// @param convert : The dialect instruction to process +void LowerCooperativeMatrix::visitCooperativeMatrixConvertOp(CooperativeMatrixConvertOp &convert) { BuilderBase builder(*m_context); - builder.SetInsertPoint(insertPos); + builder.SetInsertPoint(&convert); Value *resultValue = nullptr; Value *threadId = getLaneNumber(builder); + CastInst::CastOps castOp = static_cast(convert.getCastOp()); + auto srcLayout = convert.getSrcLayout(); + auto dstLayout = convert.getDstLayout(); + auto source = convert.getSource(); + auto srcElemType = convert.getSrcElemType(); + auto dstElemType = convert.getDstElemType(); if (castOp == 0) { // Only reshape on 16bits, not do convert if ((srcLayout == CooperativeMatrixLayout::AccumulatorMatrixLayout) && @@ -724,7 +569,7 @@ Value *LowerCooperativeMatrix::cooperativeMatrixConvert(CastInst::CastOps castOp source = builder.CreateBitCast(source, FixedVectorType::get(builder.getInt32Ty(), vecNums)); } resultValue = cooperativeMatrixReshape16BitElementGfx1011(source, srcElemType, srcLayout, dstLayout, threadId, - instName, insertPos); + convert.getName(), &convert); } else { unsigned numSrcBit = builder.transCooperativeMatrixElementType(srcElemType)->getScalarSizeInBits(); unsigned numDstBit = builder.transCooperativeMatrixElementType(dstElemType)->getScalarSizeInBits(); @@ -733,47 +578,45 @@ Value *LowerCooperativeMatrix::cooperativeMatrixConvert(CastInst::CastOps castOp if ((numSrcBit < numDstBit) && (srcLayout != dstLayout)) { // Need Reshape from A/B layout to C/D layout // This interface will do cooperativeVecToflatVec internally except 8bit reshape. 
- source = cooperativeMatrixReshapeBeforeConvert(source, srcElemType, dstElemType, srcLayout, dstLayout, instName, - insertPos); + source = cooperativeMatrixReshapeBeforeConvert(source, srcElemType, dstElemType, srcLayout, dstLayout, + convert.getName(), &convert); } else { // For 16bit->32bit on Gfx11, no reshape needed as it will always in AccumulatorMatrixLayout source = convCoopMatrixVecToFlatVec(builder, source, srcElemType, srcLayout); } // Step 2: Just do flatElement conversion without any layout change. - resultValue = cooperativeMatrixConvertInternal(castOp, source, srcElemType, dstElemType, instName, insertPos); + resultValue = + cooperativeMatrixConvertInternal(castOp, source, srcElemType, dstElemType, convert.getName(), &convert); // Step 3: Some cases need change the layout due to different element types after conversion. if ((numSrcBit > numDstBit) && (srcLayout != dstLayout)) { // All these reshape interfaces will return N*packetTy. // Need Reshape from A/B layout to C/D layout resultValue = cooperativeMatrixReshapeAfterConvert(resultValue, srcElemType, dstElemType, srcLayout, dstLayout, - instName, insertPos); + convert.getName(), &convert); } else { resultValue = convFlatVecToCoopMatrixVec(builder, resultValue, dstElemType, dstLayout); } } - return resultValue; + m_coopMatrixCalls.push_back(&convert); + convert.replaceAllUsesWith(resultValue); } // ===================================================================================================================== -// Create cooperative matrix binary operation +// Visit "CooperativeMatrixBinaryOp" instruction // -// @param coopMatArithOp : The cooperative matrix arithmetic operation to perform. -// @param lhs : The first operand and it can be a scalar or a cooperative matrix. -// @param rhs : The second operand and it should be a cooperative matrix. -// @param elemType : Element type for the matrix. -// @param layout : Layout for the matrix. -// @param instName : Name to give instruction(s). 
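// The convert lowering above always follows the same reshape / element-wise cast / reshape recipe; this small
// helper only restates that decision logic for reference, with bit widths and a same-layout flag standing in for
// the lgc element types and layouts (it is a sketch, not code from this change).
struct ConvertPlan {
  bool reshapeBefore; // widening across layouts: move A/B-layout data into the C/D layout first
  bool castElements;  // the element-wise cast itself
  bool reshapeAfter;  // narrowing across layouts: fix the layout up after the cast
};

constexpr ConvertPlan planConvert(unsigned srcBits, unsigned dstBits, bool sameLayout, bool reshapeOnly) {
  if (reshapeOnly)
    return {true, false, false}; // castOp == 0: only the 16-bit reshape helper runs
  return {srcBits < dstBits && !sameLayout, true, srcBits > dstBits && !sameLayout};
}

static_assert(planConvert(16, 32, false, false).reshapeBefore, "16-bit to 32-bit across layouts reshapes first");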
-// @param insertPos : Where to insert the instruction -Value *LowerCooperativeMatrix::cooperativeMatrixBinaryOp(CooperativeMatrixArithOp coopMatArithOp, Value *lhs, - Value *rhs, CooperativeMatrixElementType elemType, - CooperativeMatrixLayout layout, const Twine &instName, - Instruction *insertPos) { +// @param binary : The dialect instruction to process +void LowerCooperativeMatrix::visitCooperativeMatrixBinaryOp(CooperativeMatrixBinaryOp &binary) { + Value *lhs = binary.getLhs(); + Value *rhs = binary.getRhs(); assert(lhs->getType()->isVectorTy() && lhs->getType() == rhs->getType() || rhs->getType()->isVectorTy()); + CooperativeMatrixArithOp coopMatArithOp = binary.getArithOp(); + auto elemType = binary.getElemType(); + auto layout = binary.getLayout(); Value *vcResult; BuilderBase builder(*m_context); - builder.SetInsertPoint(insertPos); + builder.SetInsertPoint(&binary); lhs = convCoopMatrixVecToFlatVec(builder, lhs, elemType, layout); rhs = convCoopMatrixVecToFlatVec(builder, rhs, elemType, layout); @@ -810,26 +653,23 @@ Value *LowerCooperativeMatrix::cooperativeMatrixBinaryOp(CooperativeMatrixArithO } Value *coopMatResult = convFlatVecToCoopMatrixVec(builder, vcResult, elemType, layout); - return coopMatResult; + m_coopMatrixCalls.push_back(&binary); + binary.replaceAllUsesWith(coopMatResult); } // ===================================================================================================================== -// Create cooperative matrix MatrixTimesScalar operation +// Visit "CooperativeMatrixTimesScalarOp" instruction // -// @param matrix : The first operand and it should be a cooperative matrix. -// @param scalar : The second operand and it should be a scalar. If the matrix -// is a packed accumulator matrix, the scalar has to be a <2 x half> vector. -// @param elemType : The component type of the matrix. -// @param layout : Identify whether it's A/B or C/D -// @param instName : Name to give instruction(s). 
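// visitCooperativeMatrixBinaryOp above always flattens both operands, applies the scalar operation per component,
// and re-wraps the result; the same pattern on a plain fixed-size vector, for reference:
#include <array>
#include <cstddef>

template <typename T, std::size_t N, typename Op>
std::array<T, N> elementwise(const std::array<T, N> &lhs, const std::array<T, N> &rhs, Op op) {
  std::array<T, N> result{};
  for (std::size_t i = 0; i != N; ++i)
    result[i] = op(lhs[i], rhs[i]); // one scalar add/sub/mul/div per component held by the lane
  return result;
}
// e.g. the integer-add case: elementwise(a, b, [](int x, int y) { return x + y; });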
-// @param insertPos : Where to insert the instruction -Value *LowerCooperativeMatrix::coopMatrixTimesScalar(Value *matrix, Value *scalar, - CooperativeMatrixElementType elemType, - CooperativeMatrixLayout layout, const Twine &instName, - Instruction *insertPos) { +// @param timesScalar : The dialect instruction to process +void LowerCooperativeMatrix::visitCooperativeMatrixTimesScalarOp(CooperativeMatrixTimesScalarOp ×Scalar) { + Value *matrix = timesScalar.getMatrix(); assert(matrix->getType()->getScalarType()->isIntegerTy() || matrix->getType()->getScalarType()->isFloatTy()); + auto elemType = timesScalar.getElemType(); + auto layout = timesScalar.getLayout(); + Value *scalar = timesScalar.getScalar(); + BuilderBase builder(*m_context); - builder.SetInsertPoint(insertPos); + builder.SetInsertPoint(×Scalar); Value *vcFlat = convCoopMatrixVecToFlatVec(builder, matrix, elemType, layout); const unsigned numElems = cast(vcFlat->getType())->getNumElements(); @@ -846,7 +686,8 @@ Value *LowerCooperativeMatrix::coopMatrixTimesScalar(Value *matrix, Value *scala vcFlatResult = builder.CreateMul(vcFlat, splat); } Value *coopMatResult = convFlatVecToCoopMatrixVec(builder, vcFlatResult, elemType, layout); - return coopMatResult; + m_coopMatrixCalls.push_back(×Scalar); + timesScalar.replaceAllUsesWith(coopMatResult); } // ===================================================================================================================== @@ -1350,18 +1191,15 @@ Value *LowerCooperativeMatrix::cooperativeMatrixReshapeAfterConvert(Value *sourc } // ===================================================================================================================== -// Create cooperative matrix transpose operation +// Visit "CooperativeMatrixTransposeOp" instruction // -// @param matrix : The first operand and it should be a cooperative matrix. -// @param elemType : The component type of the matrix. -// @param srcLayout: Identify whether it's A/B or C/D -// @param instName : Name to give instruction(s). 
-// @param insertPos : Where to insert the instruction -Value *LowerCooperativeMatrix::cooperativeMatrixTranspose(llvm::Value *matrix, CooperativeMatrixElementType elemType, - CooperativeMatrixLayout srcLayout, const Twine &instName, - llvm::Instruction *insertPos) { +// @param transpose : The dialect instruction to process +void LowerCooperativeMatrix::visitCooperativeMatrixTransposeOp(CooperativeMatrixTransposeOp &transpose) { BuilderBase builder(*m_context); - builder.SetInsertPoint(insertPos); + builder.SetInsertPoint(&transpose); + + Value *matrix = transpose.getMatrix(); + auto elemType = transpose.getElemType(); Value *threadId = getLaneNumber(builder); Value *isEvenThread = builder.CreateICmpEQ(builder.CreateAnd(threadId, builder.getInt32(1)), builder.getInt32(0)); @@ -1436,7 +1274,8 @@ Value *LowerCooperativeMatrix::cooperativeMatrixTranspose(llvm::Value *matrix, C // lane0/V0: {0_0,0_1}; V1: {2_0,2_1} lane2/V0:{0_2,0_3} V1:{2_2,2_3} ==> // lane0/V0: {0_0,0_1}; V1: {0_2,0_3} lane2/V0:{2_0,2_1} V1:{2_2,2_3} Value *resultValue = transposeCooperativeMatrixRecursively(matrix, vecStride, laneStride, threadId, builder); - return resultValue; + m_coopMatrixCalls.push_back(&transpose); + transpose.replaceAllUsesWith(resultValue); } // ===================================================================================================================== @@ -1513,26 +1352,22 @@ Value *LowerCooperativeMatrix::transposeCooperativeMatrixRecursively(llvm::Value } // ===================================================================================================================== -// Create cooperative matrix muladd operation +// Visit "CooperativeMatrixMulAddOp" instruction // -// @param matrixA : Factor cooperative matrix. -// @param matrixB : Factor cooperative matrix. -// @param matrixC : Accumulator cooperative matrix. -// @param isSignedA : Identify the signess for matrix A's element type -// @param isSignedB : Identify the signess for matrix B's element type -// @param isSat : SaturatingAccumulation for calculation -// @param accumElemType : The component type of the accumulator matrix. -// @param factorElemType : The component type of the factor matrix. -// @param matrixCLayout: The layout for matrix C/D. -// @param instName : Name to give instruction(s). 
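// For reference alongside the mul-add lowering below: on NAVI2X the op is emulated as a chain of dot products,
// where each accumulator component takes one row of A (fetched from a neighbouring lane via readlane), dots it
// against this lane's column of B, and adds the previous accumulator value. A scalar restatement of that inner
// step, with plain arrays standing in for the lane-held vectors:
#include <array>
#include <cstddef>

template <std::size_t K>
float dotRowColAccumulate(const std::array<float, K> &rowA, const std::array<float, K> &colB, float accum) {
  for (std::size_t i = 0; i != K; ++i)
    accum += rowA[i] * colB[i]; // the createDotProduct* helpers perform this per accumulator component
  return accum;
}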
-// @param insertPos : Where to insert the instruction -Value *LowerCooperativeMatrix::cooperativeMatrixMulAdd(llvm::Value *matrixA, llvm::Value *matrixB, llvm::Value *matrixC, - bool isSignedA, bool isSignedB, bool isSatOrOpsel, bool isTied, - CooperativeMatrixElementType accumElemType, - CooperativeMatrixElementType factorElemType, - const Twine &instName, Instruction *insertPos) { +// @param muladd : The dialect instruction to process +void LowerCooperativeMatrix::visitCooperativeMatrixMulAddOp(CooperativeMatrixMulAddOp &muladd) { BuilderBase builder(*m_context); - builder.SetInsertPoint(insertPos); + builder.SetInsertPoint(&muladd); + + Value *matrixA = muladd.getMatrixA(); + Value *matrixB = muladd.getMatrixB(); + Value *matrixC = muladd.getMatrixC(); + auto factorElemType = muladd.getFactorElemType(); + auto accumElemType = muladd.getAccuElemType(); + bool isSignedA = muladd.getIsSignedA(); + bool isSignedB = muladd.getIsSignedB(); + bool isSatOrOpsel = muladd.getIsSatOrOpsel(); + StringRef instName = muladd.getName(); if (m_gfxIp.major >= 11) { // Gfx11: @@ -1601,6 +1436,7 @@ Value *LowerCooperativeMatrix::cooperativeMatrixMulAdd(llvm::Value *matrixA, llv accumElemType == CooperativeMatrixElementType::Float16) { // Matrix convert to match intrinsic arguments: Wave32: float32*v8->half*v16 // Wave64: float32*v4->half*v8 + bool isTied = muladd.getIsTied(); auto intrinsic = Intrinsic::amdgcn_wmma_f16_16x16x16_f16; if (isTied) #if defined(LLVM_MAIN_REVISION) && LLVM_MAIN_REVISION < 479080 @@ -1631,98 +1467,95 @@ Value *LowerCooperativeMatrix::cooperativeMatrixMulAdd(llvm::Value *matrixA, llv ArrayRef{0, 1, 2, 3, 4, 5, 6, 7}) : matrixD; } - return matrixD; - } else { // Emulator on NAVI2X + m_coopMatrixCalls.push_back(&muladd); + muladd.replaceAllUsesWith(matrixD); + return; + } - Type *packedTy = - (factorElemType == CooperativeMatrixElementType::Float16) ? builder.getFloatTy() : builder.getInt32Ty(); - Value *dotProductValue; + // Emulator on NAVI2X + Type *packedTy = + (factorElemType == CooperativeMatrixElementType::Float16) ? 
builder.getFloatTy() : builder.getInt32Ty(); + Value *dotProductValue; - Value *threadId = getLaneNumber(builder); - Value *laneGroupIdx = builder.CreateUDiv(threadId, builder.getInt32(16)); - Value *isEvenGroup = - builder.CreateICmpEQ(builder.CreateAnd(laneGroupIdx, builder.getInt32(1)), builder.getInt32(0)); + Value *threadId = getLaneNumber(builder); + Value *laneGroupIdx = builder.CreateUDiv(threadId, builder.getInt32(16)); + Value *isEvenGroup = builder.CreateICmpEQ(builder.CreateAnd(laneGroupIdx, builder.getInt32(1)), builder.getInt32(0)); - unsigned flags = (isSignedB << 1) | isSignedA; - auto mapFuncReadLane = [](BuilderBase &builder, ArrayRef mappedArgs, - ArrayRef passthroughArgs) -> Value * { - Type *const int32Ty = builder.getInt32Ty(); + unsigned flags = (isSignedB << 1) | isSignedA; + auto mapFuncReadLane = [](BuilderBase &builder, ArrayRef mappedArgs, + ArrayRef passthroughArgs) -> Value * { + Type *const int32Ty = builder.getInt32Ty(); - return builder.CreateIntrinsic(int32Ty, Intrinsic::amdgcn_readlane, {mappedArgs[0], passthroughArgs[0]}); - }; + return builder.CreateIntrinsic(int32Ty, Intrinsic::amdgcn_readlane, {mappedArgs[0], passthroughArgs[0]}); + }; - // matrixC is not reshaped for gfx10 - if (accumElemType == CooperativeMatrixElementType::Float32 || - accumElemType == CooperativeMatrixElementType::Int32) { - dotProductValue = PoisonValue::get(FixedVectorType::get(packedTy, 8)); - for (unsigned idxc = 0; idxc < 8; ++idxc) { - Value *rowlowgroup = builder.CreateMapToSimpleType(mapFuncReadLane, matrixA, builder.getInt32(idxc * 2)); - Value *rowhighgroup = builder.CreateMapToSimpleType(mapFuncReadLane, matrixA, builder.getInt32(idxc * 2 + 1)); - Value *rowData = builder.CreateSelect(isEvenGroup, rowlowgroup, rowhighgroup); - Value *mulAB; - Value *initAccumulator = builder.CreateExtractElement(matrixC, idxc); - if (factorElemType == CooperativeMatrixElementType::Float16) { - mulAB = createDotProductFp16Fp32(rowData, matrixB, initAccumulator, isSatOrOpsel, instName, insertPos); - } else if (factorElemType == CooperativeMatrixElementType::Int16) { - mulAB = - createDotProductInt16Int32(rowData, matrixB, initAccumulator, flags, isSatOrOpsel, instName, insertPos); - } else if (factorElemType == CooperativeMatrixElementType::Int8) { - mulAB = - createDotProductInt8Int32(rowData, matrixB, initAccumulator, flags, isSatOrOpsel, instName, insertPos); - } else { - llvm_unreachable("Unsupported element type!"); - } - dotProductValue = builder.CreateInsertElement(dotProductValue, mulAB, idxc); + // matrixC is not reshaped for gfx10 + if (accumElemType == CooperativeMatrixElementType::Float32 || accumElemType == CooperativeMatrixElementType::Int32) { + dotProductValue = PoisonValue::get(FixedVectorType::get(packedTy, 8)); + for (unsigned idxc = 0; idxc < 8; ++idxc) { + Value *rowlowgroup = builder.CreateMapToSimpleType(mapFuncReadLane, matrixA, builder.getInt32(idxc * 2)); + Value *rowhighgroup = builder.CreateMapToSimpleType(mapFuncReadLane, matrixA, builder.getInt32(idxc * 2 + 1)); + Value *rowData = builder.CreateSelect(isEvenGroup, rowlowgroup, rowhighgroup); + Value *mulAB; + Value *initAccumulator = builder.CreateExtractElement(matrixC, idxc); + if (factorElemType == CooperativeMatrixElementType::Float16) { + mulAB = createDotProductFp16Fp32(rowData, matrixB, initAccumulator, isSatOrOpsel, instName, &muladd); + } else if (factorElemType == CooperativeMatrixElementType::Int16) { + mulAB = createDotProductInt16Int32(rowData, matrixB, initAccumulator, flags, isSatOrOpsel, 
instName, &muladd); + } else if (factorElemType == CooperativeMatrixElementType::Int8) { + mulAB = createDotProductInt8Int32(rowData, matrixB, initAccumulator, flags, isSatOrOpsel, instName, &muladd); + } else { + llvm_unreachable("Unsupported element type!"); } - } else if (accumElemType == CooperativeMatrixElementType::Int16 || - accumElemType == CooperativeMatrixElementType::Float16) { - dotProductValue = - PoisonValue::get(FixedVectorType::get(builder.transCooperativeMatrixElementType(accumElemType), 8)); - // For gfx10, A*B:8*float32->16*half C: no reshape for 16bit, still 16*half - Value *colData = - convCoopMatrixVecToFlatVec(builder, matrixB, factorElemType, CooperativeMatrixLayout::FactorMatrixLayout); - matrixC = convCoopMatrixVecToFlatVec(builder, matrixC, accumElemType, - CooperativeMatrixLayout::Gfx10Accumulator16bitMatrixLayout); - - for (unsigned idxc = 0, accIdx = 0; idxc < 16; idxc += 4, accIdx += 2) { - Value *rowData1Low = builder.CreateMapToSimpleType(mapFuncReadLane, matrixA, builder.getInt32(idxc)); - Value *rowData2Low = builder.CreateMapToSimpleType(mapFuncReadLane, matrixA, builder.getInt32(idxc + 1)); - Value *rowData1High = builder.CreateMapToSimpleType(mapFuncReadLane, matrixA, builder.getInt32(idxc + 2)); - Value *rowData2High = builder.CreateMapToSimpleType(mapFuncReadLane, matrixA, builder.getInt32(idxc + 3)); - - Value *rowData1 = builder.CreateSelect(isEvenGroup, rowData1Low, rowData1High); - Value *rowData2 = builder.CreateSelect(isEvenGroup, rowData2Low, rowData2High); - - rowData1 = - convCoopMatrixVecToFlatVec(builder, rowData1, factorElemType, CooperativeMatrixLayout::FactorMatrixLayout); - rowData2 = - convCoopMatrixVecToFlatVec(builder, rowData2, factorElemType, CooperativeMatrixLayout::FactorMatrixLayout); - - Value *mulAB1; - Value *mulAB2; - Value *accumulator1 = builder.CreateExtractElement(matrixC, accIdx); - Value *accumulator2 = builder.CreateExtractElement(matrixC, accIdx + 1); - - if (accumElemType == CooperativeMatrixElementType::Float16) { - mulAB1 = createDotProductFp16Fp16(rowData1, colData, accumulator1, isSatOrOpsel, instName, insertPos); - mulAB2 = createDotProductFp16Fp16(rowData2, colData, accumulator2, isSatOrOpsel, instName, insertPos); - } else { - mulAB1 = - createDotProductInt16Int16(rowData1, colData, accumulator1, flags, isSatOrOpsel, instName, insertPos); - mulAB2 = - createDotProductInt16Int16(rowData2, colData, accumulator2, flags, isSatOrOpsel, instName, insertPos); - } - dotProductValue = builder.CreateInsertElement(dotProductValue, mulAB1, accIdx); - dotProductValue = builder.CreateInsertElement(dotProductValue, mulAB2, accIdx + 1); + dotProductValue = builder.CreateInsertElement(dotProductValue, mulAB, idxc); + } + } else if (accumElemType == CooperativeMatrixElementType::Int16 || + accumElemType == CooperativeMatrixElementType::Float16) { + dotProductValue = + PoisonValue::get(FixedVectorType::get(builder.transCooperativeMatrixElementType(accumElemType), 8)); + // For gfx10, A*B:8*float32->16*half C: no reshape for 16bit, still 16*half + Value *colData = + convCoopMatrixVecToFlatVec(builder, matrixB, factorElemType, CooperativeMatrixLayout::FactorMatrixLayout); + matrixC = convCoopMatrixVecToFlatVec(builder, matrixC, accumElemType, + CooperativeMatrixLayout::Gfx10Accumulator16bitMatrixLayout); + + for (unsigned idxc = 0, accIdx = 0; idxc < 16; idxc += 4, accIdx += 2) { + Value *rowData1Low = builder.CreateMapToSimpleType(mapFuncReadLane, matrixA, builder.getInt32(idxc)); + Value *rowData2Low = 
builder.CreateMapToSimpleType(mapFuncReadLane, matrixA, builder.getInt32(idxc + 1)); + Value *rowData1High = builder.CreateMapToSimpleType(mapFuncReadLane, matrixA, builder.getInt32(idxc + 2)); + Value *rowData2High = builder.CreateMapToSimpleType(mapFuncReadLane, matrixA, builder.getInt32(idxc + 3)); + + Value *rowData1 = builder.CreateSelect(isEvenGroup, rowData1Low, rowData1High); + Value *rowData2 = builder.CreateSelect(isEvenGroup, rowData2Low, rowData2High); + + rowData1 = + convCoopMatrixVecToFlatVec(builder, rowData1, factorElemType, CooperativeMatrixLayout::FactorMatrixLayout); + rowData2 = + convCoopMatrixVecToFlatVec(builder, rowData2, factorElemType, CooperativeMatrixLayout::FactorMatrixLayout); + + Value *mulAB1; + Value *mulAB2; + Value *accumulator1 = builder.CreateExtractElement(matrixC, accIdx); + Value *accumulator2 = builder.CreateExtractElement(matrixC, accIdx + 1); + + if (accumElemType == CooperativeMatrixElementType::Float16) { + mulAB1 = createDotProductFp16Fp16(rowData1, colData, accumulator1, isSatOrOpsel, instName, &muladd); + mulAB2 = createDotProductFp16Fp16(rowData2, colData, accumulator2, isSatOrOpsel, instName, &muladd); + } else { + mulAB1 = createDotProductInt16Int16(rowData1, colData, accumulator1, flags, isSatOrOpsel, instName, &muladd); + mulAB2 = createDotProductInt16Int16(rowData2, colData, accumulator2, flags, isSatOrOpsel, instName, &muladd); } - - dotProductValue = convFlatVecToCoopMatrixVec(builder, dotProductValue, accumElemType, - CooperativeMatrixLayout::Gfx10Accumulator16bitMatrixLayout); - } else { - llvm_unreachable("The accumulator type is not supported."); + dotProductValue = builder.CreateInsertElement(dotProductValue, mulAB1, accIdx); + dotProductValue = builder.CreateInsertElement(dotProductValue, mulAB2, accIdx + 1); } - return dotProductValue; + + dotProductValue = convFlatVecToCoopMatrixVec(builder, dotProductValue, accumElemType, + CooperativeMatrixLayout::Gfx10Accumulator16bitMatrixLayout); + } else { + llvm_unreachable("The accumulator type is not supported."); } + m_coopMatrixCalls.push_back(&muladd); + muladd.replaceAllUsesWith(dotProductValue); } // ===================================================================================================================== @@ -1933,16 +1766,15 @@ Value *LowerCooperativeMatrix::createDotProductInt16Int16(Value *vector1, Value } // ===================================================================================================================== -// Create code to pack two accumulator matrices into one set of registers +// Visit "CooperativeMatrixPackOp" instruction // -// @param matrixCLo : The lower accumulator -// @param matrixCHi : The higher accumulator -// @param instName : Name to give instruction(s) -// @param insertPos : Where to insert the instruction -Value *LowerCooperativeMatrix::cooperativeMatrixPack(llvm::Value *matrixCLo, llvm::Value *matrixCHi, - const Twine &instName, Instruction *insertPos) { +// @param pack: The dialect instruction to process +void LowerCooperativeMatrix::visitCooperativeMatrixPackOp(CooperativeMatrixPackOp &pack) { + Value *matrixCLo = pack.getMatrixCLo(); + Value *matrixCHi = pack.getMatrixCHi(); + BuilderBase builder(*m_context); - builder.SetInsertPoint(insertPos); + builder.SetInsertPoint(&pack); static const int shuffleIndices[] = {0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30}; @@ -1952,21 +1784,21 @@ Value *LowerCooperativeMatrix::cooperativeMatrixPack(llvm::Value *matrixCLo, llv auto result = 
builder.CreateShuffleVector(matrixCLo, matrixCHi, shuffleIndices); - return builder.CreateBitCast(result, FixedVectorType::get(builder.getFloatTy(), 8)); + Value *packValue = builder.CreateBitCast(result, FixedVectorType::get(builder.getFloatTy(), 8)); + m_coopMatrixCalls.push_back(&pack); + pack.replaceAllUsesWith(packValue); } // ===================================================================================================================== -// Create code to unpack one packed accumulator matrix into two separate set of -// registers +// Visit "CooperativeMatrixUnPackOp" instruction // -// @param packedMatrix : The packed accumulator matrix -// @param: high: Whether to get the matrix in the upper half of the registers -// @param instName : Name to give instruction(s) -// @param insertPos : Where to insert the instruction -Value *LowerCooperativeMatrix::cooperativeMatrixUnpack(llvm::Value *packedMatrix, bool high, const Twine &instName, - Instruction *insertPos) { +// @param unpack: The dialect instruction to process +void LowerCooperativeMatrix::visitCooperativeMatrixUnPackOp(CooperativeMatrixUnPackOp &unpack) { + Value *packedMatrix = unpack.getPackedMatrix(); + bool high = unpack.getGetUpperHalf(); + BuilderBase builder(*m_context); - builder.SetInsertPoint(insertPos); + builder.SetInsertPoint(&unpack); static const int shuffleIndicesLo[] = {0, -1, 2, -1, 4, -1, 6, -1, 8, -1, 10, -1, 12, -1, 14, -1}; static const int shuffleIndicesHi[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1}; @@ -1975,7 +1807,9 @@ Value *LowerCooperativeMatrix::cooperativeMatrixUnpack(llvm::Value *packedMatrix auto matrixPackedCast = builder.CreateBitCast(packedMatrix, halfVecTy); auto matrixUnpacked = builder.CreateShuffleVector(matrixPackedCast, high ? shuffleIndicesHi : shuffleIndicesLo); - return builder.CreateBitCast(matrixUnpacked, FixedVectorType::get(builder.getFloatTy(), 8)); + Value *unpackValue = builder.CreateBitCast(matrixUnpacked, FixedVectorType::get(builder.getFloatTy(), 8)); + m_coopMatrixCalls.push_back(&unpack); + unpack.replaceAllUsesWith(unpackValue); } // ===================================================================================================================== diff --git a/lgc/patch/LowerDesc.cpp b/lgc/patch/LowerDesc.cpp index 9a428cbd00..198964c6d8 100644 --- a/lgc/patch/LowerDesc.cpp +++ b/lgc/patch/LowerDesc.cpp @@ -31,6 +31,7 @@ #include "lgc/patch/LowerDesc.h" #include "lgc/LgcDialect.h" #include "lgc/builder/BuilderImpl.h" +#include "lgc/util/AddressExtender.h" #include "llvm-dialects/Dialect/Visitor.h" #include "llvm/Support/Debug.h" @@ -53,7 +54,7 @@ PreservedAnalyses LowerDesc::run(Module &module, ModuleAnalysisManager &analysis m_pipelineState = pipelineState; static const auto visitor = llvm_dialects::VisitorBuilder() - .add(&LowerDesc::visitLoadBufferAddr) + .add(&LowerDesc::visitExtendAddress) .add(&LowerDesc::visitLoadBufferDesc) .add(&LowerDesc::visitLoadStridedBufferDesc) .build(); @@ -69,25 +70,13 @@ PreservedAnalyses LowerDesc::run(Module &module, ModuleAnalysisManager &analysis } // ===================================================================================================================== -// Lower a load.buffer.addr operation. The result is an i64. 
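Reviewer note on the CooperativeMatrixPackOp/CooperativeMatrixUnPackOp lowering just above: the shuffle-plus-bitcast pair is just a lane interleave. The sketch below paraphrases that data movement on plain arrays; it is illustrative only (not code from this patch) and assumes the <16 x half> to <8 x float> bitcast simply reinterprets each adjacent pair of 16-bit lanes as one 32-bit lane.

#include <array>
#include <cstdint>

// Scalar paraphrase of the pack shuffle {0, 16, 2, 18, ...}: even elements of the
// low accumulator land in the low 16 bits of each packed dword, even elements of
// the high accumulator land in the high 16 bits.
std::array<uint32_t, 8> packAccumulators(const std::array<uint16_t, 16> &lo,
                                         const std::array<uint16_t, 16> &hi) {
  std::array<uint32_t, 8> packed{};
  for (unsigned i = 0; i < 8; ++i)
    packed[i] = uint32_t(lo[2 * i]) | (uint32_t(hi[2 * i]) << 16);
  return packed;
}

// Scalar paraphrase of the unpack shuffles {0, -1, 2, -1, ...} / {1, -1, 3, -1, ...}:
// take either the low or the high 16 bits of each packed dword and put the result
// back in the even half-lanes (the odd half-lanes are don't-care).
std::array<uint16_t, 16> unpackAccumulator(const std::array<uint32_t, 8> &packed, bool upperHalf) {
  std::array<uint16_t, 16> out{};
  for (unsigned i = 0; i < 8; ++i)
    out[2 * i] = uint16_t(upperHalf ? (packed[i] >> 16) : (packed[i] & 0xffff));
  return out;
}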
-// -// @param op : the operation -void LowerDesc::visitLoadBufferAddr(LoadBufferAddrOp &op) { - BuilderImpl builder(m_pipelineState); - builder.setShaderStage(getShaderStage(op.getFunction())); - builder.SetInsertPoint(&op); - - // BufferFlagAddress only supports the case where the descriptor is a compact descriptor. This op supports - // normal descriptors, extracting the 48-bit address out of the descriptor. - unsigned flags = op.getFlags() & ~Builder::BufferFlagAddress; - Value *desc = builder.CreateBufferDesc(op.getDescSet(), op.getBinding(), op.getDescIndex(), flags); - m_toErase.push_back(&op); - - // Extract 48-bit address out of <4 x i32> descriptor, resulting in an i64. - Value *addr = builder.CreateShuffleVector(desc, desc, {0, 1}); - addr = builder.CreateBitCast(addr, builder.getInt64Ty()); - addr = builder.CreateAnd(addr, builder.getInt64(0x0000ffffffffffffULL)); - op.replaceAllUsesWith(addr); +// Lower an extend.address op, to extend a 32-bit address to i64. +void LowerDesc::visitExtendAddress(ExtendAddressOp &op) { + IRBuilder<> builder(&op); + AddressExtender extender(op.getFunction()); + Value *addr64 = extender.extendWithPc(op.getAddr32(), /*ptrTy=*/nullptr, builder); + op.replaceAllUsesWith(addr64); + op.eraseFromParent(); } // ===================================================================================================================== diff --git a/lgc/patch/LowerGpuRt.cpp b/lgc/patch/LowerGpuRt.cpp index d60766e6ff..916101b8e9 100644 --- a/lgc/patch/LowerGpuRt.cpp +++ b/lgc/patch/LowerGpuRt.cpp @@ -78,6 +78,7 @@ PreservedAnalyses LowerGpuRt::run(Module &module, ModuleAnalysisManager &analysi .add(&LowerGpuRt::visitFloatWithRoundMode) .add(&LowerGpuRt::visitGpurtDispatchThreadIdFlatOp) .add(&LowerGpuRt::visitContinuationStackIsGlobalOp) + .add(&LowerGpuRt::visitWaveScanOp) .build(); visitor.visit(*this, module); @@ -149,7 +150,10 @@ void LowerGpuRt::createGlobalStack(Module &module) { payload.needGlobalStack = true; payload.needExtraStack |= op.getUseExtraStack(); }) - .add([](auto &payload, auto &op) { payload.needGlobalStack = true; }) + .add([](auto &payload, auto &op) { + payload.needGlobalStack = true; + payload.needExtraStack |= op.getUseExtraStack(); + }) .build(); visitor.visit(payload, module); @@ -272,6 +276,11 @@ void LowerGpuRt::visitLdsStackInit(GpurtLdsStackInitOp &inst) { stackBasePerThread = m_builder->CreateAdd(localThreadId, groupOf32ThreadSize); } + if (inst.getUseExtraStack()) { + auto ldsStackSize = m_builder->getInt32(getWorkgroupSize() * MaxLdsStackEntries); + stackBasePerThread = m_builder->CreateAdd(stackBasePerThread, ldsStackSize); + } + Value *stackBaseAsInt = m_builder->CreatePtrToInt( m_builder->CreateGEP(m_stackTy, m_stack, {m_builder->getInt32(0), stackBasePerThread}), m_builder->getInt32Ty()); @@ -337,15 +346,50 @@ void LowerGpuRt::visitFloatWithRoundMode(lgc::GpurtFloatWithRoundModeOp &inst) { m_funcsToLower.insert(func); } +// ===================================================================================================================== +// Visit "GpurtWaveScanOp" instruction +// +// @param inst : The dialect instruction to process +void LowerGpuRt::visitWaveScanOp(lgc::GpurtWaveScanOp &inst) { + m_builder->SetInsertPoint(&inst); + + constexpr unsigned int Inclusive = 0x1; + constexpr unsigned int Exclusive = 0x2; + + const BuilderDefs::GroupArithOp WaveScanOpTable[] = { + BuilderDefs::Nop, BuilderDefs::FAdd, BuilderDefs::IAdd, BuilderDefs::IAdd, BuilderDefs::FMul, BuilderDefs::IMul, + BuilderDefs::IMul, BuilderDefs::FMin, 
BuilderDefs::SMin, BuilderDefs::UMin, BuilderDefs::FMax, BuilderDefs::SMax,
+      BuilderDefs::UMax, BuilderDefs::Nop, BuilderDefs::Nop, BuilderDefs::Nop,
+  };
+
+  auto waveOpCode = cast<ConstantInt>(inst.getOperation())->getZExtValue();
+  auto waveOpFlags = cast<ConstantInt>(inst.getFlags())->getZExtValue();
+  Value *src0 = inst.getSrc0();
+
+  BuilderDefs::GroupArithOp opCode = WaveScanOpTable[waveOpCode];
+
+  assert((waveOpFlags == Inclusive) || (waveOpFlags == Exclusive));
+  assert(opCode != BuilderDefs::Nop);
+
+  Value *result = nullptr;
+  if (waveOpFlags == Inclusive)
+    result = m_builder->CreateSubgroupClusteredInclusive(opCode, src0, m_builder->CreateGetWaveSize());
+  else if (waveOpFlags == Exclusive)
+    result = m_builder->CreateSubgroupClusteredExclusive(opCode, src0, m_builder->CreateGetWaveSize());
+
+  inst.replaceAllUsesWith(result);
+  m_callsToLower.push_back(&inst);
+  m_funcsToLower.insert(inst.getCalledFunction());
+}
+
 // =====================================================================================================================
 // Visit "GpurtLdsStackStoreOp" instruction
 //
 // @param inst : The dialect instruction to process
 void LowerGpuRt::visitLdsStackStore(GpurtLdsStackStoreOp &inst) {
   m_builder->SetInsertPoint(&inst);
-  Value *stackAddr = inst.getNewPos();
-  Value *stackAddrVal = m_builder->CreateLoad(m_builder->getInt32Ty(), stackAddr);
-  Value *lastVisited = inst.getOldPos();
+  Value *stackAddrVal = inst.getOldPos();
+  Value *lastVisited = inst.getLastNode();
   Value *data = inst.getData();
   // OFFSET = {OFFSET1, OFFSET0}
   // stack_size[1:0] = OFFSET1[5:4]
@@ -356,13 +400,11 @@ void LowerGpuRt::visitLdsStackStore(GpurtLdsStackStoreOp &inst) {
   // 64 -> {0x30, 0x00}
   assert(MaxLdsStackEntries == 16);
   Value *offset = m_builder->getInt32((Log2_32(MaxLdsStackEntries) - 3) << 12);
-
+  // return struct {newNode, newStackAddr}
   Value *result = m_builder->CreateIntrinsic(Intrinsic::amdgcn_ds_bvh_stack_rtn, {},
                                              {stackAddrVal, lastVisited, data, offset});
-  m_builder->CreateStore(m_builder->CreateExtractValue(result, 1), stackAddr);
-  Value *ret = m_builder->CreateExtractValue(result, 0);
-  inst.replaceAllUsesWith(ret);
+  inst.replaceAllUsesWith(result);
   m_callsToLower.push_back(&inst);
   m_funcsToLower.insert(inst.getCalledFunction());
 }

diff --git a/lgc/patch/LowerSubgroupOps.cpp b/lgc/patch/LowerSubgroupOps.cpp
index dcbca0fc48..621a7f968d 100644
--- a/lgc/patch/LowerSubgroupOps.cpp
+++ b/lgc/patch/LowerSubgroupOps.cpp
@@ -63,6 +63,9 @@ PreservedAnalyses LowerSubgroupOps::run(Module &module, ModuleAnalysisManager &a
           .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration)
           .add(&LowerSubgroupOps::visitElect)
           .add(&LowerSubgroupOps::visitAny)
+          .add(&LowerSubgroupOps::visitAll)
+          .add(&LowerSubgroupOps::visitAllEqual)
+          .add(&LowerSubgroupOps::visitRotate)
           .build();
   visitor.visit(*this, module);
   m_builder = nullptr;
@@ -86,4 +89,20 @@ void LowerSubgroupOps::visitAny(SubgroupAnyOp &op) {
   replace(op, m_builder->CreateSubgroupAny(op.getValue()));
 }
 
+void LowerSubgroupOps::visitAll(SubgroupAllOp &op) {
+  m_builder->SetInsertPoint(&op);
+  replace(op, m_builder->CreateSubgroupAll(op.getValue()));
+}
+
+void LowerSubgroupOps::visitAllEqual(SubgroupAllEqualOp &op) {
+  m_builder->SetInsertPoint(&op);
+  replace(op, m_builder->CreateSubgroupAllEqual(op.getValue()));
+}
+
+void LowerSubgroupOps::visitRotate(SubgroupRotateOp &op) {
+  m_builder->SetInsertPoint(&op);
+  Value *cs = op.getClusterSize();
+  replace(op, m_builder->CreateSubgroupRotate(op.getValue(), op.getDelta(), isa(cs) ? 
nullptr : cs)); +} + } // namespace lgc diff --git a/lgc/patch/MeshTaskShader.cpp b/lgc/patch/MeshTaskShader.cpp index 8666090720..c1b259d425 100644 --- a/lgc/patch/MeshTaskShader.cpp +++ b/lgc/patch/MeshTaskShader.cpp @@ -69,14 +69,16 @@ MeshTaskShader::~MeshTaskShader() { // @param pipelineState : Pipeline state // @param entryPoint : Entry-point of mesh shader // @param ldsLayout : Mesh shader LDS layout (could be null) -// @param outputsLayout : Mesh shader outputs layout (could be null) unsigned MeshTaskShader::layoutMeshShaderLds(PipelineState *pipelineState, Function *entryPoint, - MeshLdsLayout *ldsLayout, MeshOutputsLayout *outputsLayout) { + MeshLdsLayout *ldsLayout) { if (!pipelineState->hasShaderStage(ShaderStage::Mesh)) return 0; // Mesh shader absent (standalone compiler tries to compile a single task shader) - assert(getShaderStage(entryPoint) == ShaderStage::Mesh); // Must be mesh shader - assert(pipelineState->getTargetInfo().getGfxIpVersion() >= GfxIpVersion({10, 3})); // Must be GFX10.3+ + assert(getShaderStage(entryPoint) == ShaderStage::Mesh); // Must be mesh shader + + auto gfxIp = pipelineState->getTargetInfo().getGfxIpVersion(); + assert(gfxIp >= GfxIpVersion({10, 3})); // Must be GFX10.3+ + (void(gfxIp)); // Unused // // The LDS layout of mesh shader is something as follow (consists of two main parts): @@ -114,21 +116,6 @@ unsigned MeshTaskShader::layoutMeshShaderLds(PipelineState *pipelineState, Funct LLPC_OUTS("\n"); }; - auto printOutputLayoutInfo = [=](unsigned location, unsigned numComponents, unsigned relativeOffset, - BuiltInKind forBuiltIn) { - if (numComponents > 4) { - LLPC_OUTS(format("-- location = %u-%u, components = %u, offset = %u", location, location + 1, numComponents, - relativeOffset)); - } else { - LLPC_OUTS(format("-- location = %u, components = %u, offset = %u", location, numComponents, relativeOffset)); - } - - if (forBuiltIn != InvalidValue) - LLPC_OUTS(" (builtin = " << PipelineState::getBuiltInName(forBuiltIn) << ")"); - - LLPC_OUTS("\n"); - }; - if (ldsLayout) { LLPC_OUTS("===============================================================================\n"); LLPC_OUTS("// LLPC mesh shader LDS region info (in dwords) and general info\n\n"); @@ -182,86 +169,21 @@ unsigned MeshTaskShader::layoutMeshShaderLds(PipelineState *pipelineState, Funct meshLdsSizeInDwords += ldsRegionSize; // Per-vertex outputs - auto &vertexOutputComponents = resUsage->inOutUsage.mesh.vertexOutputComponents; - unsigned vertexStride = 0; - for (auto &vertexOutput : vertexOutputComponents) { - const auto numComponents = vertexOutput.second.first; - vertexStride += numComponents; // Calculate total number of components of vertex outputs - } - + const unsigned vertexStride = 4 * resUsage->inOutUsage.outputMapLocCount; // Corresponds to vec4 output ldsRegionSize = vertexStride * meshMode.outputVertices; if (ldsLayout) { printLdsRegionInfo("Per-vertex Output", ldsOffsetInDwords, ldsRegionSize); (*ldsLayout)[MeshLdsRegion::VertexOutput] = std::make_pair(ldsOffsetInDwords, ldsRegionSize); - - assert(outputsLayout); - outputsLayout->vertexStride = vertexStride; - - unsigned offsetInVertex = 0; - unsigned vertexExportCount = 0; - - for (auto &vertexOutput : vertexOutputComponents) { - const auto location = vertexOutput.first; - const auto &[numComponents, forBuiltIn] = vertexOutput.second; - - outputsLayout->offsetsInVertex[location] = offsetInVertex; // Map output locations to relative offsets in vertex - offsetInVertex += numComponents; - - // Skip those special outputs 
mapped from vertex built-ins, don't count them in at present - if (forBuiltIn == InvalidValue) - vertexExportCount += (numComponents > 4 ? 2 : 1); - } - - // Consider those special outputs mapped from vertex built-ins - for (auto &vertexExport : resUsage->inOutUsage.mesh.vertexBuiltInExportSlots) { - const unsigned exportSlot = vertexExport.second; - vertexExportCount = std::max(vertexExportCount, exportSlot + 1); - } - outputsLayout->vertexExportCount = vertexExportCount; - ldsOffsetInDwords += ldsRegionSize; } meshLdsSizeInDwords += ldsRegionSize; // Per-primitive outputs - auto &primitiveOutputComponents = resUsage->inOutUsage.mesh.primitiveOutputComponents; - unsigned primitiveStride = 0; - for (auto &primitiveOutput : primitiveOutputComponents) { - const auto numComponents = primitiveOutput.second.first; - primitiveStride += numComponents; // Calculate total number of components of primitive outputs - } - + const unsigned primitiveStride = 4 * resUsage->inOutUsage.perPrimitiveOutputMapLocCount; // Corresponds to vec4 output ldsRegionSize = primitiveStride * meshMode.outputPrimitives; if (ldsLayout) { printLdsRegionInfo("Per-primitive Output", ldsOffsetInDwords, ldsRegionSize); (*ldsLayout)[MeshLdsRegion::PrimitiveOutput] = std::make_pair(ldsOffsetInDwords, ldsRegionSize); - - assert(outputsLayout); - outputsLayout->primitiveStride = primitiveStride; - - unsigned offsetInPrimitive = 0; - unsigned primitiveExportCount = 0; - - for (auto &primitiveOutput : primitiveOutputComponents) { - const auto location = primitiveOutput.first; - const auto &[numComponents, forBuiltIn] = primitiveOutput.second; - - outputsLayout->offsetsInPrimitive[location] = - offsetInPrimitive; // Map output locations to relative offsets in primitive - offsetInPrimitive += numComponents; - - // Skip those special outputs mapped from primitive built-ins, don't count them in at present - if (forBuiltIn == InvalidValue) - primitiveExportCount += (numComponents > 4 ? 
2 : 1); - } - - // Consider those special outputs mapped from primitive built-ins - for (auto &primitiveExport : resUsage->inOutUsage.mesh.primitiveBuiltInExportSlots) { - const unsigned exportSlot = primitiveExport.second; - primitiveExportCount = std::max(primitiveExportCount, exportSlot + 1); - } - outputsLayout->primitiveExportCount = primitiveExportCount; - ldsOffsetInDwords += ldsRegionSize; } meshLdsSizeInDwords += ldsRegionSize; @@ -312,27 +234,6 @@ unsigned MeshTaskShader::layoutMeshShaderLds(PipelineState *pipelineState, Funct printLdsRegionInfo("Internal Mesh LDS", 0, meshLdsSizeInDwords); printLdsRegionInfo("Shared Variable LDS", 0, sharedVarLdsSizeInDwords); printLdsRegionInfo("Total LDS", 0, meshLdsSizeInDwords + sharedVarLdsSizeInDwords); - - if (!outputsLayout->offsetsInVertex.empty()) { - LLPC_OUTS("\nVertex Outputs Layout (stride = " << outputsLayout->vertexStride - << ", exports = " << outputsLayout->vertexExportCount << "):\n"); - for (auto &vertexOutput : outputsLayout->offsetsInVertex) { - const auto &[location, offsetInVertex] = vertexOutput; - const auto &[numComponents, forBuiltIn] = vertexOutputComponents[location]; - printOutputLayoutInfo(location, numComponents, offsetInVertex, forBuiltIn); - } - } - - if (!outputsLayout->offsetsInPrimitive.empty()) { - LLPC_OUTS("\nPrimitive outputs layout (stride = " << outputsLayout->primitiveStride << ", exports = " - << outputsLayout->primitiveExportCount << "):\n"); - for (auto &primitiveOutput : outputsLayout->offsetsInPrimitive) { - const auto &[location, offsetInPrimitive] = primitiveOutput; - const auto &[numComponents, forBuiltIn] = primitiveOutputComponents[location]; - printOutputLayoutInfo(location, numComponents, offsetInPrimitive, forBuiltIn); - } - } - LLPC_OUTS("\n"); LLPC_OUTS("Workgroup Size (X, Y, Z) = (" << meshMode.workgroupSizeX << ", " << meshMode.workgroupSizeY << ", " << meshMode.workgroupSizeZ << ")\n"); @@ -357,7 +258,7 @@ unsigned MeshTaskShader::layoutMeshShaderLds(PipelineState *pipelineState, Funct LLPC_OUTS("Max Vertices = " << meshMode.outputVertices << ", Max Primitives = " << meshMode.outputPrimitives << "\n"); if (!meshSharedVars.empty()) { - LLPC_OUTS("Shared Variables:\n"); + LLPC_OUTS("Shared variables:\n"); for (auto meshSharedVar : meshSharedVars) { assert(meshSharedVar->getAlignment() == 4); // Must be 1 dword const auto sizeInBytes = @@ -365,8 +266,8 @@ unsigned MeshTaskShader::layoutMeshShaderLds(PipelineState *pipelineState, Funct assert(sizeInBytes % 4 == 0); // Must be multiple of 4 const auto sizeInDwords = sizeInBytes / 4; - LLPC_OUTS("-- name = " << meshSharedVar->getName() << ", type = " << getTypeName(meshSharedVar->getValueType()) - << ", size (in dwords) = " << sizeInDwords << "\n"); + LLPC_OUTS("Name = " << meshSharedVar->getName() << ", Type = " << getTypeName(meshSharedVar->getValueType()) + << ", Size (in dwords) = " << sizeInDwords << "\n"); } } LLPC_OUTS("\n"); @@ -502,7 +403,7 @@ void MeshTaskShader::processMeshShader(Function *entryPoint) { // - SetMeshPrimitiveCulled -> Write null primitive flag to LDS // - GetMeshBuiltinInput -> Lower mesh built-in input // - TaskPayloadPtr -> Transform task payload descriptor - // - WriteMeshOutput -> Write output data to LDS + // - WriteMeshVertexOutput/WriteMeshPrimitiveOutput -> Write output data to LDS // } // // Barrier (if needBarrierFlag) @@ -562,7 +463,7 @@ void MeshTaskShader::processMeshShader(Function *entryPoint) { const unsigned waveSize = m_pipelineState->getShaderWaveSize(ShaderStage::Mesh); // Setup LDS layout - 
layoutMeshShaderLds(m_pipelineState, entryPoint, &m_ldsLayout, &m_outputsLayout); + layoutMeshShaderLds(m_pipelineState, entryPoint, &m_ldsLayout); m_lds = getOrCreateMeshLds(entryPoint->getParent()); // Mutate mesh shader entry-point @@ -1440,60 +1341,55 @@ void MeshTaskShader::lowerSetMeshPrimitiveCulled(SetMeshPrimitiveCulledOp &setMe } // ===================================================================================================================== -// Lower write mesh vertex/primitive output. Write mesh shader vertex/primitive outputs to LDS. +// Lower write mesh vertex output. Write mesh shader vertex outputs to LDS. // -// @param WriteMeshOutputOp : Call instruction op to write vertex/primitive output for mesh shader -void MeshTaskShader::lowerWriteMeshOutput(WriteMeshOutputOp &writeMeshOutputOp) { - m_builder.SetInsertPoint(&writeMeshOutputOp); - - assert(getShaderStage(writeMeshOutputOp.getFunction()) == ShaderStage::Mesh); - - auto isPrimitive = writeMeshOutputOp.getIsPrimitive(); - auto location = writeMeshOutputOp.getLocation(); - auto locationOffset = writeMeshOutputOp.getLocationOffset(); - auto componentIndex = writeMeshOutputOp.getComponentIndex(); - auto primOrVertexIndex = writeMeshOutputOp.getPrimOrVertexIndex(); - auto outputValue = writeMeshOutputOp.getOutputValue(); - - auto &outputComponents = - isPrimitive - ? m_pipelineState->getShaderResourceUsage(ShaderStage::Mesh)->inOutUsage.mesh.primitiveOutputComponents - : m_pipelineState->getShaderResourceUsage(ShaderStage::Mesh)->inOutUsage.mesh.vertexOutputComponents; - - // ldsOffset = ldsStart + primOrVertexIndex * primOrVertexStride + - // offsetInPrimOrVertex + locationIndex * numComponents + componentIndex - Value *ldsStart = m_builder.getInt32( - getMeshShaderLdsRegionStart(isPrimitive ? MeshLdsRegion::PrimitiveOutput : MeshLdsRegion::VertexOutput)); - const unsigned primOrVertexStride = isPrimitive ? m_outputsLayout.primitiveStride : m_outputsLayout.vertexStride; - Value *primOrVertexOffset = m_builder.CreateMul(primOrVertexIndex, m_builder.getInt32(primOrVertexStride)); - - Value *offsetInPrimOrVertex = m_builder.getInt32(getOutputOffsetInPrimOrVertex(location, isPrimitive)); - if (locationOffset != m_builder.getInt32(0)) { - auto locationIndex = locationOffset; - - assert(outputComponents.count(location) > 0); // Must exist - unsigned numComponents = outputComponents[location].first; - - if (numComponents > 4) { - // NOTE: Here we encounter 64-bit vec3/vec4 data types. Such types will occupy two consecutive locations and the - // provided location offset must be divided by 2 to get real location index. 
- locationIndex = m_builder.CreateLShr(locationOffset, 2); - } +// @param writeMeshVertexOutputOp : Call instruction op to write vertex output for mesh shader +void MeshTaskShader::lowerWriteMeshVertexOutput(WriteMeshVertexOutputOp &writeMeshVertexOutputOp) { + m_builder.SetInsertPoint(&writeMeshVertexOutputOp); - offsetInPrimOrVertex = m_builder.CreateAdd(offsetInPrimOrVertex, - m_builder.CreateMul(locationIndex, m_builder.getInt32(numComponents))); - } + assert(getShaderStage(writeMeshVertexOutputOp.getFunction()) == ShaderStage::Mesh); - if (componentIndex != m_builder.getInt32(0)) - offsetInPrimOrVertex = m_builder.CreateAdd(offsetInPrimOrVertex, componentIndex); + auto outputOffset = writeMeshVertexOutputOp.getOutputOffset(); + auto vertexIndex = writeMeshVertexOutputOp.getVertexIndex(); + auto outputValue = writeMeshVertexOutputOp.getOutputValue(); - auto ldsOffset = ldsStart; - ldsOffset = m_builder.CreateAdd(ldsOffset, primOrVertexOffset); - ldsOffset = m_builder.CreateAdd(ldsOffset, offsetInPrimOrVertex); + const auto resUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::Mesh); + const unsigned vertexStride = 4 * resUsage->inOutUsage.outputMapLocCount; // Corresponds to vec4 output + + Value *ldsStart = m_builder.getInt32(getMeshShaderLdsRegionStart(MeshLdsRegion::VertexOutput)); + Value *ldsOffset = m_builder.CreateMul(vertexIndex, m_builder.getInt32(vertexStride)); + ldsOffset = m_builder.CreateAdd(ldsOffset, outputOffset); + ldsOffset = m_builder.CreateAdd(ldsStart, ldsOffset); writeValueToLds(outputValue, ldsOffset); - m_callsToRemove.push_back(&writeMeshOutputOp); + m_callsToRemove.push_back(&writeMeshVertexOutputOp); +} + +// ===================================================================================================================== +// Lower write mesh primitive output. Write mesh shader primitive outputs to LDS. 
+// +// @param writeMeshPrimitiveOutputOp : Call instruction op to write primitive output for mesh shader +void MeshTaskShader::lowerWriteMeshPrimitiveOutput(WriteMeshPrimitiveOutputOp &writeMeshPrimitiveOutputOp) { + m_builder.SetInsertPoint(&writeMeshPrimitiveOutputOp); + + assert(getShaderStage(writeMeshPrimitiveOutputOp.getFunction()) == ShaderStage::Mesh); + + auto outputOffset = writeMeshPrimitiveOutputOp.getOutputOffset(); + auto primitiveIndex = writeMeshPrimitiveOutputOp.getPrimitiveIndex(); + auto outputValue = writeMeshPrimitiveOutputOp.getOutputValue(); + + const auto resUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::Mesh); + const unsigned primitiveStride = 4 * resUsage->inOutUsage.perPrimitiveOutputMapLocCount; // Corresponds to vec4 output + + Value *ldsStart = m_builder.getInt32(getMeshShaderLdsRegionStart(MeshLdsRegion::PrimitiveOutput)); + Value *ldsOffset = m_builder.CreateMul(primitiveIndex, m_builder.getInt32(primitiveStride)); + ldsOffset = m_builder.CreateAdd(ldsOffset, outputOffset); + ldsOffset = m_builder.CreateAdd(ldsStart, ldsOffset); + + writeValueToLds(outputValue, ldsOffset); + + m_callsToRemove.push_back(&writeMeshPrimitiveOutputOp); } // ===================================================================================================================== @@ -1806,7 +1702,8 @@ void MeshTaskShader::lowerMeshShaderBody(BasicBlock *apiMeshEntryBlock, BasicBlo .add(&MeshTaskShader::lowerSetMeshPrimitiveIndices) .add(&MeshTaskShader::lowerSetMeshPrimitiveCulled) .add(&MeshTaskShader::lowerGetMeshBuiltinInput) - .add(&MeshTaskShader::lowerWriteMeshOutput) + .add(&MeshTaskShader::lowerWriteMeshVertexOutput) + .add(&MeshTaskShader::lowerWriteMeshPrimitiveOutput) .build(); visitor.visit(*this, *entryPoint); @@ -1937,52 +1834,28 @@ void MeshTaskShader::exportPrimitive() { // Primitive attribute export follows vertex attribute export SmallVector primAttrExports; - unsigned startSlot = m_outputsLayout.vertexExportCount; + unsigned startSlot = inOutUsage.mesh.vertexGenericOutputExportCount; + for (auto &vertexBuiltIn : inOutUsage.mesh.vertexBuiltInExportSlots) { + const unsigned exportSlot = vertexBuiltIn.second; + startSlot = std::max(startSlot, exportSlot + 1); + } // Export primitive attributes (from generic outputs) ldsStart = m_builder.getInt32(getMeshShaderLdsRegionStart(MeshLdsRegion::PrimitiveOutput)); - auto primitiveOffset = - m_builder.CreateMul(m_waveThreadInfo.primOrVertexIndex, m_builder.getInt32(m_outputsLayout.primitiveStride)); - - auto &primitiveOutputComponents = - m_pipelineState->getShaderResourceUsage(ShaderStage::Mesh)->inOutUsage.mesh.primitiveOutputComponents; - unsigned exportSlot = startSlot; - for (auto &primitiveOutput : primitiveOutputComponents) { - const auto location = primitiveOutput.first; - const auto &[numComponents, forBuiltIn] = primitiveOutput.second; - assert(numComponents > 0); - - if (forBuiltIn != InvalidValue) - continue; // Skip those special outputs mapped from primitive built-ins. They will be handled later on. 
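A note on the addressing generated by the new lowerWriteMeshVertexOutput/lowerWriteMeshPrimitiveOutput above: every mapped output location now occupies a full vec4 (4 dwords) in LDS, so the offset arithmetic collapses to regionStart + index * 4 * locCount + outputOffset. The helper below is only an illustration of that formula with made-up numbers, not code from the patch; outputOffset stands for the dword offset carried by the write op.

#include <cassert>

// Dword offset of one output component in the mesh-shader LDS region, assuming a
// fixed vec4 (4-dword) slot per mapped output location.
unsigned meshOutputLdsOffset(unsigned regionStart,    // start of the Vertex/Primitive Output region
                             unsigned mappedLocCount, // outputMapLocCount or perPrimitiveOutputMapLocCount
                             unsigned index,          // vertex or primitive index
                             unsigned outputOffset) { // 4 * location + component
  const unsigned stride = 4 * mappedLocCount; // dwords per vertex/primitive
  assert(outputOffset < stride);
  return regionStart + index * stride + outputOffset;
}

// Example (invented numbers): 3 mapped locations, vertex 5, location 2, component 1:
// stride = 12, offset = regionStart + 5 * 12 + (4 * 2 + 1) = regionStart + 69.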
+ auto primitiveStride = 4 * inOutUsage.perPrimitiveOutputMapLocCount; + auto ldsOffsetBase = m_builder.CreateMul(m_waveThreadInfo.primOrVertexIndex, m_builder.getInt32(primitiveStride)); + ldsOffsetBase = m_builder.CreateAdd(ldsStart, ldsOffsetBase); - auto offsetInPrimitive = m_builder.getInt32(getOutputOffsetInPrimOrVertex(location, true)); + for (unsigned exportSlot = 0; exportSlot < inOutUsage.mesh.primitiveGenericOutputExportCount; ++exportSlot) { + auto ldsOffset = m_builder.CreateAdd(ldsOffsetBase, m_builder.getInt32(4 * exportSlot)); + auto exportValue = readValueFromLds(FixedVectorType::get(m_builder.getFloatTy(), 4), ldsOffset); - auto ldsOffset = ldsStart; - ldsOffset = m_builder.CreateAdd(ldsOffset, primitiveOffset); - ldsOffset = m_builder.CreateAdd(ldsOffset, offsetInPrimitive); + std::array exportValues; + for (unsigned j = 0; j < 4; ++j) + exportValues[j] = m_builder.CreateExtractElement(exportValue, j); - auto exportValue = readValueFromLds(FixedVectorType::get(m_builder.getFloatTy(), numComponents), ldsOffset); - - SmallVector exporteValues; - for (unsigned i = 0; i < numComponents; ++i) - exporteValues.push_back(m_builder.CreateExtractElement(exportValue, i)); - - // Do array padding - if (numComponents <= 4) { - while (exporteValues.size() < 4) // <4 x float> - exporteValues.push_back(nullptr); - } else { - while (exporteValues.size() < 8) // <8 x float> - exporteValues.push_back(nullptr); - } - - primAttrExports.push_back({exportSlot++, exporteValues[0], exporteValues[1], exporteValues[2], exporteValues[3]}); + primAttrExports.push_back({startSlot + exportSlot, exportValues}); ++inOutUsage.primExpCount; - - if (numComponents > 4) { - primAttrExports.push_back({exportSlot++, exporteValues[4], exporteValues[5], exporteValues[6], exporteValues[7]}); - ++inOutUsage.primExpCount; - } } // Export primitive attributes (from built-ins as generic ones) @@ -2128,48 +2001,21 @@ void MeshTaskShader::exportVertex() { // Export vertex attributes (from generic outputs) Value *ldsStart = m_builder.getInt32(getMeshShaderLdsRegionStart(MeshLdsRegion::VertexOutput)); - auto vertexOffset = - m_builder.CreateMul(m_waveThreadInfo.primOrVertexIndex, m_builder.getInt32(m_outputsLayout.vertexStride)); - - auto &vertexOutputComponents = - m_pipelineState->getShaderResourceUsage(ShaderStage::Mesh)->inOutUsage.mesh.vertexOutputComponents; - unsigned exportSlot = 0; - for (auto &vertexOutput : vertexOutputComponents) { - const auto location = vertexOutput.first; - const auto &[numComponents, forBuiltIn] = vertexOutput.second; - assert(numComponents > 0); + auto vertexStride = 4 * inOutUsage.outputMapLocCount; + auto ldsOffsetBase = m_builder.CreateMul(m_waveThreadInfo.primOrVertexIndex, m_builder.getInt32(vertexStride)); + ldsOffsetBase = m_builder.CreateAdd(ldsStart, ldsOffsetBase); - if (forBuiltIn != InvalidValue) - continue; // Skip those special outputs mapped from vertex built-ins. They will be handled later on. 
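The slot bookkeeping in exportPrimitive above is easy to lose in the diff: primitive attributes are exported after all vertex attributes, whether generic or remapped from built-ins. A rough sketch of the startSlot computation, with invented counts, follows; it is not code from the patch.

#include <algorithm>
#include <map>

// Where generic primitive attributes start: after the generic vertex outputs and
// after any vertex built-ins that were remapped to export slots.
unsigned primAttrStartSlot(unsigned vertexGenericOutputExportCount,
                           const std::map<unsigned, unsigned> &vertexBuiltInExportSlots) {
  unsigned startSlot = vertexGenericOutputExportCount;
  for (const auto &builtIn : vertexBuiltInExportSlots)
    startSlot = std::max(startSlot, builtIn.second + 1);
  return startSlot;
}

// Example: 2 generic vertex attributes (slots 0, 1) and one built-in remapped to
// slot 2 -> startSlot = 3, so generic primitive attributes occupy slots 3, 4, ...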
+ for (unsigned exportSlot = 0; exportSlot < inOutUsage.mesh.vertexGenericOutputExportCount; ++exportSlot) { + auto ldsOffset = m_builder.CreateAdd(ldsOffsetBase, m_builder.getInt32(4 * exportSlot)); + auto exportValue = readValueFromLds(FixedVectorType::get(m_builder.getFloatTy(), 4), ldsOffset); - auto offsetInVertex = m_builder.getInt32(getOutputOffsetInPrimOrVertex(location, false)); - - auto ldsOffset = ldsStart; - ldsOffset = m_builder.CreateAdd(ldsOffset, vertexOffset); - ldsOffset = m_builder.CreateAdd(ldsOffset, offsetInVertex); - - auto exportValue = readValueFromLds(FixedVectorType::get(m_builder.getFloatTy(), numComponents), ldsOffset); - - SmallVector exporteValues; - for (unsigned i = 0; i < numComponents; ++i) - exporteValues.push_back(m_builder.CreateExtractElement(exportValue, i)); - - // Do array padding - if (numComponents <= 4) { - while (exporteValues.size() < 4) // <4 x float> - exporteValues.push_back(nullptr); - } else { - while (exporteValues.size() < 8) // <8 x float> - exporteValues.push_back(nullptr); - } + std::array exportValues = {m_builder.CreateExtractElement(exportValue, static_cast(0)), + m_builder.CreateExtractElement(exportValue, 1), + m_builder.CreateExtractElement(exportValue, 2), + m_builder.CreateExtractElement(exportValue, 3)}; - vertAttrExports.push_back({exportSlot++, exporteValues[0], exporteValues[1], exporteValues[2], exporteValues[3]}); + vertAttrExports.push_back({exportSlot, exportValues}); ++inOutUsage.expCount; - - if (numComponents > 4) { - vertAttrExports.push_back({exportSlot++, exporteValues[4], exporteValues[5], exporteValues[6], exporteValues[7]}); - ++inOutUsage.expCount; - } } // Export vertex attributes (from built-ins as generic ones) @@ -2412,14 +2258,32 @@ void MeshTaskShader::doExport(ExportKind kind, ArrayRef exports) { void MeshTaskShader::prepareAttribRingAccess() { assert(m_gfxIp.major >= 11); // Must be GFX11+ - unsigned attribCount = m_outputsLayout.vertexExportCount + m_outputsLayout.primitiveExportCount; + // The allocated numbers of vertex/primitive attributes are something as follow: + // 1. Generic vertex attributes + // 2. Vertex attributes mapped from vertex builtins + // 3. Generic primitive attributes + // 4. Primitive attributes mapped from primitive builtins + const auto &inOutUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::Mesh)->inOutUsage.mesh; + unsigned vertAttribCount = inOutUsage.vertexGenericOutputExportCount; + for (auto &vertexBuiltIn : inOutUsage.vertexBuiltInExportSlots) { + const unsigned exportSlot = vertexBuiltIn.second; + vertAttribCount = std::max(vertAttribCount, exportSlot + 1); + } + + unsigned primAttribCount = inOutUsage.primitiveGenericOutputExportCount; + for (auto &primitiveBuiltIn : inOutUsage.primitiveBuiltInExportSlots) { + const unsigned exportSlot = primitiveBuiltIn.second; + primAttribCount = std::max(primAttribCount, exportSlot + 1); + } + + unsigned attribCount = vertAttribCount + primAttribCount; if (attribCount == 0) return; // No attribute export // NOTE: HW allocates and manages attribute ring based on the register fields: VS_EXPORT_COUNT and PRIM_EXPORT_COUNT. // When VS_EXPORT_COUNT = 0, HW assumes there is still a vertex attribute exported even though this is not what we // want. Hence, we should reserve param0 as a dummy vertex attribute. 
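Attribute-ring sizing on GFX11 follows the same counting, condensed below with invented numbers; the only wrinkle is the dummy param0 reserved when no vertex attribute is exported (the hardware still assumes one, per the VS_EXPORT_COUNT note above). This is an illustration of the logic in prepareAttribRingAccess, not a verbatim copy.

#include <algorithm>

struct AttribCounts {
  unsigned vert;  // vertex attributes (generic + built-ins mapped to slots)
  unsigned prim;  // primitive attributes (generic + built-ins mapped to slots)
  unsigned total; // rows allocated in the attribute ring
};

AttribCounts countAttributes(unsigned vertGeneric, unsigned vertMaxBuiltInSlotPlus1,
                             unsigned primGeneric, unsigned primMaxBuiltInSlotPlus1) {
  AttribCounts c{};
  c.vert = std::max(vertGeneric, vertMaxBuiltInSlotPlus1);
  c.prim = std::max(primGeneric, primMaxBuiltInSlotPlus1);
  c.total = c.vert + c.prim;
  if (c.total != 0 && c.vert == 0)
    ++c.total; // reserve the dummy param0 vertex attribute
  return c;
}

// Example: no vertex attributes, 2 generic primitive attributes -> vert = 0,
// prim = 2, total = 3 (one row is the dummy vertex attribute).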
- if (m_outputsLayout.vertexExportCount == 0) { + if (vertAttribCount == 0) { m_hasNoVertexAttrib = true; ++attribCount; // Count in this dummy vertex attribute } @@ -2703,25 +2567,19 @@ Value *MeshTaskShader::readMeshBuiltInFromLds(BuiltInKind builtIn) { break; } - // ldsOffset = ldsStart + primOrVertexIndex * primOrVertexStride + offsetInPrimOrVertex - Value *primOrVertexOffset = nullptr; + Value *ldsOffset = nullptr; if (region == MeshLdsRegion::VertexOutput) { - primOrVertexOffset = - m_builder.CreateMul(m_waveThreadInfo.primOrVertexIndex, m_builder.getInt32(m_outputsLayout.vertexStride)); + auto vertexStride = 4 * inOutUsage.outputMapLocCount; + ldsOffset = m_builder.CreateMul(m_waveThreadInfo.primOrVertexIndex, m_builder.getInt32(vertexStride)); } else { assert(region == MeshLdsRegion::PrimitiveOutput); - primOrVertexOffset = - m_builder.CreateMul(m_waveThreadInfo.primOrVertexIndex, m_builder.getInt32(m_outputsLayout.primitiveStride)); + auto primitiveStride = 4 * inOutUsage.perPrimitiveOutputMapLocCount; + ldsOffset = m_builder.CreateMul(m_waveThreadInfo.primOrVertexIndex, m_builder.getInt32(primitiveStride)); } + ldsOffset = m_builder.CreateAdd(ldsOffset, m_builder.getInt32(4 * location)); - Value *ldsStart = m_builder.getInt32(getMeshShaderLdsRegionStart( - region == MeshLdsRegion::PrimitiveOutput ? MeshLdsRegion::PrimitiveOutput : MeshLdsRegion::VertexOutput)); - Value *offsetInPrimOrVertex = - m_builder.getInt32(getOutputOffsetInPrimOrVertex(location, region == MeshLdsRegion::PrimitiveOutput)); - - auto ldsOffset = ldsStart; - ldsOffset = m_builder.CreateAdd(ldsOffset, primOrVertexOffset); - ldsOffset = m_builder.CreateAdd(ldsOffset, offsetInPrimOrVertex); + Value *ldsStart = m_builder.getInt32(getMeshShaderLdsRegionStart(region)); + ldsOffset = m_builder.CreateAdd(ldsStart, ldsOffset); return readValueFromLds(readTy, ldsOffset); } diff --git a/lgc/patch/MeshTaskShader.h b/lgc/patch/MeshTaskShader.h index e224fd7100..853a783c99 100644 --- a/lgc/patch/MeshTaskShader.h +++ b/lgc/patch/MeshTaskShader.h @@ -61,17 +61,6 @@ enum class MeshLdsRegion : unsigned { // Map: LDS Region -> typedef std::unordered_map> MeshLdsLayout; -// Mesh shader outputs layout -struct MeshOutputsLayout { - unsigned vertexStride; // Vertex stride (in dwords) - unsigned vertexExportCount; // Vertex export count - unsigned primitiveStride; // Primitive stride (in dwords) - unsigned primitiveExportCount; // Primitive export count - std::map offsetsInVertex; // Map from output location to output offset within a vertex (in dwords) - std::map - offsetsInPrimitive; // Map from output location to output offset within a primitive (in dwords) -}; - // ===================================================================================================================== // Represents the handler of mesh/task shader. 
class MeshTaskShader { @@ -80,7 +69,7 @@ class MeshTaskShader { ~MeshTaskShader(); static unsigned layoutMeshShaderLds(PipelineState *pipelineState, llvm::Function *entryPoint, - MeshLdsLayout *ldsLayout = nullptr, MeshOutputsLayout *outputsLayout = nullptr); + MeshLdsLayout *ldsLayout = nullptr); void process(llvm::Function *taskEntryPoint, llvm::Function *meshEntryPoint); @@ -97,7 +86,8 @@ class MeshTaskShader { void lowerSetMeshPrimitiveIndices(SetMeshPrimitiveIndicesOp &setMeshPrimitiveIndicesOp); void lowerSetMeshPrimitiveCulled(SetMeshPrimitiveCulledOp &setMeshPrimitiveCulledOp); void lowerGetMeshBuiltinInput(GetMeshBuiltinInputOp &getMeshBuiltinInputOp); - void lowerWriteMeshOutput(WriteMeshOutputOp &writeMeshOutputOp); + void lowerWriteMeshVertexOutput(WriteMeshVertexOutputOp &writeMeshVertexOutputOp); + void lowerWriteMeshPrimitiveOutput(WriteMeshPrimitiveOutputOp &writeMeshPrimitiveOutputOp); void initWaveThreadInfo(llvm::Function *entryPoint); llvm::Value *getShaderRingEntryIndex(llvm::Function *entryPoint); @@ -148,16 +138,6 @@ class MeshTaskShader { return m_ldsLayout[region].first; } - unsigned getOutputOffsetInPrimOrVertex(unsigned location, bool inPrimitive) { - if (inPrimitive) { - assert(m_outputsLayout.offsetsInPrimitive.count(location) > 0); // Must exist - return m_outputsLayout.offsetsInPrimitive[location]; - } - - assert(m_outputsLayout.offsetsInVertex.count(location) > 0); // Must exist - return m_outputsLayout.offsetsInVertex[location]; - } - llvm::Value *readValueFromLds(llvm::Type *readTy, llvm::Value *ldsOffset); void writeValueToLds(llvm::Value *writeValue, llvm::Value *ldsOffset); void atomicOpWithLds(llvm::AtomicRMWInst::BinOp atomicOp, llvm::Value *atomicValue, llvm::Value *ldsOffset); @@ -205,8 +185,7 @@ class MeshTaskShader { GfxIpVersion m_gfxIp; // Graphics IP version info - MeshLdsLayout m_ldsLayout; // Mesh shader LDS layout - MeshOutputsLayout m_outputsLayout; // Mesh shader outputs layout + MeshLdsLayout m_ldsLayout; // Mesh shader LDS layout }; } // namespace lgc diff --git a/lgc/patch/NggPrimShader.cpp b/lgc/patch/NggPrimShader.cpp index a5a5943ed5..bca8196fef 100644 --- a/lgc/patch/NggPrimShader.cpp +++ b/lgc/patch/NggPrimShader.cpp @@ -102,6 +102,22 @@ NggPrimShader::NggPrimShader(PipelineState *pipelineState) m_hasGs(pipelineState->hasShaderStage(ShaderStage::Geometry)), m_builder(pipelineState->getContext()) { assert(m_nggControl->enableNgg); + m_maxThreadsPerSubgroup = NggMaxThreadsPerSubgroup; + if (m_hasGs) { + // NOTE: Normally, the maximum value of GS output vertices is restricted to 256 by HW rasterization. However, we + // encounter a special DX case where it emits >256 vertices and just do stream-out operations without + // rasterization. Stream-out on GFX11+ is pure SW emulation and we can support such case. In experiments, we + // find our HW can support GE_NGG_SUBGRP_CNTL.PRIM_AMP_FACTOR > 256 though it is not documented. There are 9 bits + // that program the register field to launch 511 threads at most. With sufficient threads, this case could be + // handled by our current design. + const auto &geometryMode = pipelineState->getShaderModes()->getGeometryShaderMode(); + m_maxThreadsPerSubgroup = std::max(NggMaxThreadsPerSubgroup, geometryMode.outputVertices); + assert(m_maxThreadsPerSubgroup <= NggMaxPrimitiveAmplifier); + } + const unsigned waveSize = pipelineState->getShaderWaveSize( + m_hasGs ? ShaderStage::Geometry : (m_hasTes ? 
ShaderStage::TessEval : ShaderStage::Vertex)); + m_maxWavesPerSubgroup = alignTo(m_maxThreadsPerSubgroup, waveSize) / waveSize; + // Always allow approximation, to change fdiv(1.0, x) to rcp(x) FastMathFlags fastMathFlags; fastMathFlags.setApproxFunc(); @@ -204,6 +220,20 @@ PrimShaderLdsUsageInfo NggPrimShader::layoutPrimShaderLds(PipelineState *pipelin PrimShaderLdsUsageInfo ldsUsageInfo = {}; ldsUsageInfo.needsLds = true; + // NOTE: Normally, the maximum value of GS output vertices is restricted to 256 by HW rasterization. However, we + // encounter a special DX case where it emits >256 vertices and just do stream-out operations without + // rasterization. Stream-out on GFX11+ is pure SW emulation and we can support such case. In experiments, we + // find our HW can support GE_NGG_SUBGRP_CNTL.PRIM_AMP_FACTOR > 256 though it is not documented. There are 9 bits + // that program the register field to launch 511 threads at most. With sufficient threads, this case could be + // handled by our current design. + const auto &geometryMode = pipelineState->getShaderModes()->getGeometryShaderMode(); + unsigned maxThreadsPerSubgroup = std::max(NggMaxThreadsPerSubgroup, geometryMode.outputVertices); + assert(maxThreadsPerSubgroup <= NggMaxPrimitiveAmplifier); + + const unsigned waveSize = pipelineState->getShaderWaveSize(ShaderStage::Geometry); + assert(waveSize == 32 || waveSize == 64); + unsigned maxWavesPerSubgroup = alignTo(maxThreadsPerSubgroup, waveSize) / waveSize; + // // The LDS layout is something like this: // @@ -226,7 +256,7 @@ PrimShaderLdsUsageInfo NggPrimShader::layoutPrimShaderLds(PipelineState *pipelin } // Primitive data - ldsRegionSize = NggMaxThreadsPerSubgroup * MaxGsStreams; // 1 dword per primitive thread, 4 GS streams + ldsRegionSize = maxThreadsPerSubgroup * MaxGsStreams; // 1 dword per primitive thread, 4 GS streams if (ldsLayout) { printLdsRegionInfo("Primitive Connectivity Data", ldsOffset, ldsRegionSize); (*ldsLayout)[PrimShaderLdsRegion::PrimitiveData] = std::make_pair(ldsOffset, ldsRegionSize); @@ -237,7 +267,7 @@ PrimShaderLdsUsageInfo NggPrimShader::layoutPrimShaderLds(PipelineState *pipelin // Primitive counts if (pipelineState->enableSwXfb() || pipelineState->enablePrimStats()) { ldsRegionSize = - (NggMaxWavesPerSubgroup + 1) * MaxGsStreams; // 1 dword per wave and 1 dword per subgroup, 4 GS streams + (maxWavesPerSubgroup + 1) * MaxGsStreams; // 1 dword per wave and 1 dword per subgroup, 4 GS streams if (ldsLayout) { printLdsRegionInfo("Primitive Counts", ldsOffset, ldsRegionSize); (*ldsLayout)[PrimShaderLdsRegion::PrimitiveCounts] = std::make_pair(ldsOffset, ldsRegionSize); @@ -248,7 +278,7 @@ PrimShaderLdsUsageInfo NggPrimShader::layoutPrimShaderLds(PipelineState *pipelin // Primitive index map (compacted -> uncompacted) if (pipelineState->enableSwXfb()) { - ldsRegionSize = NggMaxThreadsPerSubgroup * MaxGsStreams; // 1 dword per primitive thread, 4 GS streams + ldsRegionSize = maxThreadsPerSubgroup * MaxGsStreams; // 1 dword per primitive thread, 4 GS streams if (ldsLayout) { printLdsRegionInfo("Primitive Index Map (To Uncompacted)", ldsOffset, ldsRegionSize); (*ldsLayout)[PrimShaderLdsRegion::PrimitiveIndexMap] = std::make_pair(ldsOffset, ldsRegionSize); @@ -268,7 +298,7 @@ PrimShaderLdsUsageInfo NggPrimShader::layoutPrimShaderLds(PipelineState *pipelin } } else { ldsRegionSize = - (NggMaxWavesPerSubgroup + 1) * MaxGsStreams; // 1 dword per wave and 1 dword per subgroup, 4 GS streams + (maxWavesPerSubgroup + 1) * MaxGsStreams; // 1 dword per wave and 1 dword 
per subgroup, 4 GS streams if (ldsLayout) { printLdsRegionInfo("Vertex Counts", ldsOffset, ldsRegionSize); (*ldsLayout)[PrimShaderLdsRegion::VertexCounts] = std::make_pair(ldsOffset, ldsRegionSize); @@ -288,7 +318,7 @@ PrimShaderLdsUsageInfo NggPrimShader::layoutPrimShaderLds(PipelineState *pipelin (*ldsLayout)[PrimShaderLdsRegion::VertexIndexMap].second); } } else { - ldsRegionSize = NggMaxThreadsPerSubgroup * MaxGsStreams; // 1 dword per vertex thread, 4 GS streams + ldsRegionSize = maxThreadsPerSubgroup * MaxGsStreams; // 1 dword per vertex thread, 4 GS streams if (ldsLayout) { printLdsRegionInfo("Vertex Index Map (To Uncompacted)", ldsOffset, ldsRegionSize); (*ldsLayout)[PrimShaderLdsRegion::VertexIndexMap] = std::make_pair(ldsOffset, ldsRegionSize); @@ -325,8 +355,11 @@ PrimShaderLdsUsageInfo NggPrimShader::layoutPrimShaderLds(PipelineState *pipelin printLdsRegionInfo("Total LDS", 0, ldsOffset); LLPC_OUTS("\n"); LLPC_OUTS("Needs LDS = " << (ldsUsageInfo.needsLds ? "true" : "false") << "\n"); - LLPC_OUTS("ES Extra LDS Size (in Dwords) = " << format("0x%04" PRIX32, ldsUsageInfo.esExtraLdsSize) << "\n"); - LLPC_OUTS("GS Extra LDS Size (in Dwords) = " << format("0x%04" PRIX32, ldsUsageInfo.gsExtraLdsSize) << "\n"); + LLPC_OUTS("ES Extra LDS Size (in dwords) = " << format("0x%04" PRIX32, ldsUsageInfo.esExtraLdsSize) << "\n"); + LLPC_OUTS("GS Extra LDS Size (in dwords) = " << format("0x%04" PRIX32, ldsUsageInfo.gsExtraLdsSize) << "\n"); + LLPC_OUTS("\n"); + LLPC_OUTS("Max Launched Threads = " << maxThreadsPerSubgroup << "\n"); + LLPC_OUTS("Max Launched Waves (Wave" << std::to_string(waveSize) << ") = " << maxWavesPerSubgroup << "\n"); LLPC_OUTS("\n"); } @@ -337,6 +370,11 @@ PrimShaderLdsUsageInfo NggPrimShader::layoutPrimShaderLds(PipelineState *pipelin const bool distributePrimitiveId = !hasTes && pipelineState->getShaderResourceUsage(ShaderStage::Vertex)->builtInUsage.vs.primitiveId; + const unsigned waveSize = pipelineState->getShaderWaveSize(hasTes ? ShaderStage::TessEval : ShaderStage::Vertex); + assert(waveSize == 32 || waveSize == 64); + unsigned maxThreadsPerSubgroup = NggMaxThreadsPerSubgroup; + unsigned maxWavesPerSubgroup = NggMaxThreadsPerSubgroup / waveSize; + // // Passthrough mode is enabled (API GS is not present) // @@ -395,8 +433,11 @@ PrimShaderLdsUsageInfo NggPrimShader::layoutPrimShaderLds(PipelineState *pipelin printLdsRegionInfo("Total LDS", 0, ldsOffset); LLPC_OUTS("\n"); LLPC_OUTS("Needs LDS = " << (ldsUsageInfo.needsLds ? 
"true" : "false") << "\n"); - LLPC_OUTS("ES Extra LDS Size (in Dwords) = " << format("0x%04" PRIX32, ldsUsageInfo.esExtraLdsSize) << "\n"); - LLPC_OUTS("GS Extra LDS Size (in Dwords) = " << format("0x%04" PRIX32, ldsUsageInfo.gsExtraLdsSize) << "\n"); + LLPC_OUTS("ES Extra LDS Size (in dwords) = " << format("0x%04" PRIX32, ldsUsageInfo.esExtraLdsSize) << "\n"); + LLPC_OUTS("GS Extra LDS Size (in dwords) = " << format("0x%04" PRIX32, ldsUsageInfo.gsExtraLdsSize) << "\n"); + LLPC_OUTS("\n"); + LLPC_OUTS("Max Launched Threads = " << maxThreadsPerSubgroup << "\n"); + LLPC_OUTS("Max Launched Waves (Wave" << std::to_string(waveSize) << ") = " << maxWavesPerSubgroup << "\n"); LLPC_OUTS("\n"); } @@ -433,7 +474,7 @@ PrimShaderLdsUsageInfo NggPrimShader::layoutPrimShaderLds(PipelineState *pipelin ldsOffset = 0; // DistributedPrimitiveId is always the first region and is overlapped with VertexPosition // Vertex position - ldsRegionSize = 4 * NggMaxThreadsPerSubgroup; // 4 dwords per vertex thread + ldsRegionSize = 4 * maxThreadsPerSubgroup; // 4 dwords per vertex thread if (ldsLayout) { printLdsRegionInfo("Vertex Position", ldsOffset, ldsRegionSize); (*ldsLayout)[PrimShaderLdsRegion::VertexPosition] = std::make_pair(ldsOffset, ldsRegionSize); @@ -464,7 +505,7 @@ PrimShaderLdsUsageInfo NggPrimShader::layoutPrimShaderLds(PipelineState *pipelin } // Vertex counts - ldsRegionSize = NggMaxWavesPerSubgroup + 1; // 1 dword per wave and 1 dword per subgroup + ldsRegionSize = maxWavesPerSubgroup + 1; // 1 dword per wave and 1 dword per subgroup if (ldsLayout) { printLdsRegionInfo("Vertex Counts", ldsOffset, ldsRegionSize); (*ldsLayout)[PrimShaderLdsRegion::VertexCounts] = std::make_pair(ldsOffset, ldsRegionSize); @@ -474,7 +515,7 @@ PrimShaderLdsUsageInfo NggPrimShader::layoutPrimShaderLds(PipelineState *pipelin // Vertex index map if (pipelineState->getNggControl()->compactVertex) { - ldsRegionSize = NggMaxThreadsPerSubgroup; // 1 dword per wave and 1 dword per subgroup + ldsRegionSize = maxThreadsPerSubgroup; // 1 dword per wave and 1 dword per subgroup if (ldsLayout) { printLdsRegionInfo("Vertex Index Map (To Uncompacted)", ldsOffset, ldsRegionSize); (*ldsLayout)[PrimShaderLdsRegion::VertexIndexMap] = std::make_pair(ldsOffset, ldsRegionSize); @@ -487,8 +528,11 @@ PrimShaderLdsUsageInfo NggPrimShader::layoutPrimShaderLds(PipelineState *pipelin printLdsRegionInfo("Total LDS", 0, ldsOffset); LLPC_OUTS("\n"); LLPC_OUTS("Needs LDS = " << (ldsUsageInfo.needsLds ? 
"true" : "false") << "\n"); - LLPC_OUTS("ES Extra LDS Size (in Dwords) = " << format("0x%04" PRIX32, ldsUsageInfo.esExtraLdsSize) << "\n"); - LLPC_OUTS("GS Extra LDS Size (in Dwords) = " << format("0x%04" PRIX32, ldsUsageInfo.gsExtraLdsSize) << "\n"); + LLPC_OUTS("ES Extra LDS Size (in dwords) = " << format("0x%04" PRIX32, ldsUsageInfo.esExtraLdsSize) << "\n"); + LLPC_OUTS("GS Extra LDS Size (in dwords) = " << format("0x%04" PRIX32, ldsUsageInfo.gsExtraLdsSize) << "\n"); + LLPC_OUTS("\n"); + LLPC_OUTS("Max Launched Threads = " << maxThreadsPerSubgroup << "\n"); + LLPC_OUTS("Max Launched Waves (Wave" << std::to_string(waveSize) << ") = " << maxWavesPerSubgroup << "\n"); LLPC_OUTS("\n"); } @@ -1101,8 +1145,6 @@ void NggPrimShader::buildPrimShader(Function *primShader) { const unsigned waveSize = m_pipelineState->getShaderWaveSize(ShaderStage::Geometry); assert(waveSize == 32 || waveSize == 64); - const unsigned waveCountInSubgroup = NggMaxThreadsPerSubgroup / waveSize; - SmallVector args; for (auto &arg : primShader->args()) args.push_back(&arg); @@ -1186,7 +1228,7 @@ void NggPrimShader::buildPrimShader(Function *primShader) { // if (Not runtime passthrough) { // if (threadIdInSubgroup < vertCountInSubgroup) // Initialize vertex draw flag - // if (threadIdInSubgroup < waveCount + 1) + // if (threadIdInSubgroup < maxWaves + 1) // Initialize per-wave and per-subgroup count of output vertices // // if (threadIdInWave < vertCountInWave) @@ -1203,7 +1245,7 @@ void NggPrimShader::buildPrimShader(Function *primShader) { // if (threadIdInSubgroup < vertCountInSubgroup) // Check draw flags of vertices and compute draw mask // - // if (threadIdInWave < waveCount - waveId) + // if (threadIdInWave < maxWaves - waveId) // Accumulate per-wave and per-subgroup count of output vertices // Barrier // @@ -1399,7 +1441,7 @@ void NggPrimShader::buildPrimShader(Function *primShader) { m_builder.SetInsertPoint(endInitVertexDrawFlagBlock); auto validWave = - m_builder.CreateICmpULT(m_nggInputs.threadIdInSubgroup, m_builder.getInt32(waveCountInSubgroup + 1)); + m_builder.CreateICmpULT(m_nggInputs.threadIdInSubgroup, m_builder.getInt32(m_maxWavesPerSubgroup + 1)); m_builder.CreateCondBr(validWave, initVertexCountsBlock, endInitVertexCountsBlock); } @@ -1521,7 +1563,7 @@ void NggPrimShader::buildPrimShader(Function *primShader) { vertCountInWave = m_builder.CreateIntrinsic(Intrinsic::ctpop, m_builder.getInt64Ty(), drawMask); vertCountInWave = m_builder.CreateTrunc(vertCountInWave, m_builder.getInt32Ty()); - auto threadIdUpbound = m_builder.CreateSub(m_builder.getInt32(waveCountInSubgroup), m_nggInputs.waveIdInSubgroup); + auto threadIdUpbound = m_builder.CreateSub(m_builder.getInt32(m_maxWavesPerSubgroup), m_nggInputs.waveIdInSubgroup); auto validThread = m_builder.CreateICmpULT(m_nggInputs.threadIdInWave, threadIdUpbound); m_builder.CreateCondBr(validThread, accumVertexCountsBlock, endAccumVertexCountsBlock); } @@ -1536,7 +1578,7 @@ void NggPrimShader::buildPrimShader(Function *primShader) { unsigned regionStart = getLdsRegionStart(PrimShaderLdsRegion::VertexCounts); ldsOffset = m_builder.CreateAdd(ldsOffset, m_builder.getInt32(regionStart)); - atomicAdd(vertCountInWave, ldsOffset); + atomicOp(AtomicRMWInst::Add, vertCountInWave, ldsOffset); m_builder.CreateBr(endAccumVertexCountsBlock); } @@ -1556,7 +1598,7 @@ void NggPrimShader::buildPrimShader(Function *primShader) { // The last dword following dwords for all waves (each wave has one dword) stores vertex count of the // entire subgroup vertCountInSubgroup = 
m_builder.CreateIntrinsic(m_builder.getInt32Ty(), Intrinsic::amdgcn_readlane, - {vertCountInWaves, m_builder.getInt32(waveCountInSubgroup)}); + {vertCountInWaves, m_builder.getInt32(m_maxWavesPerSubgroup)}); if (m_nggControl->compactVertex) { // Get vertex count for all waves prior to this wave @@ -1863,7 +1905,6 @@ void NggPrimShader::buildPrimShaderWithGs(Function *primShader) { if (!m_nggControl->compactVertex) assert(m_gfxIp >= GfxIpVersion({10, 3})); // Must be GFX10.3+ - const unsigned waveCountInSubgroup = NggMaxThreadsPerSubgroup / waveSize; const bool cullingMode = !m_nggControl->passthroughMode; const auto rasterStream = m_pipelineState->getRasterizerState().rasterStream; @@ -1917,7 +1958,7 @@ void NggPrimShader::buildPrimShaderWithGs(Function *primShader) { // else if (Enable primitive statistics counting) // Collect primitive statistics // - // if (threadIdInSubgroup < waveCount + 1) + // if (threadIdInSubgroup < maxWaves + 1) // Initialize per-wave and per-subgroup count of output vertices // Barrier // @@ -1931,7 +1972,7 @@ void NggPrimShader::buildPrimShaderWithGs(Function *primShader) { // if (threadIdInSubgroup < vertCountInSubgroup) // Check draw flags of output vertices and compute draw mask // - // if (threadIdInWave < waveCount - waveId) + // if (threadIdInWave < maxWaves - waveId) // Accumulate per-wave and per-subgroup count of output vertices // Barrier // Update vertCountInSubgroup @@ -2041,7 +2082,7 @@ void NggPrimShader::buildPrimShaderWithGs(Function *primShader) { for (unsigned i = 0; i < MaxGsStreams; ++i) { if (m_pipelineState->isVertexStreamActive(i)) { // Initialize primitive connectivity data if the stream is active writePerThreadDataToLds(m_builder.getInt32(NullPrim), m_nggInputs.threadIdInSubgroup, - PrimShaderLdsRegion::PrimitiveData, NggMaxThreadsPerSubgroup * i); + PrimShaderLdsRegion::PrimitiveData, m_maxThreadsPerSubgroup * i); } } @@ -2078,7 +2119,7 @@ void NggPrimShader::buildPrimShaderWithGs(Function *primShader) { collectPrimitiveStats(); auto validWave = - m_builder.CreateICmpULT(m_nggInputs.threadIdInSubgroup, m_builder.getInt32(waveCountInSubgroup + 1)); + m_builder.CreateICmpULT(m_nggInputs.threadIdInSubgroup, m_builder.getInt32(m_maxWavesPerSubgroup + 1)); m_builder.CreateCondBr(validWave, initVertexCountsBlock, endInitVertexCountsBlock); } @@ -2087,7 +2128,7 @@ void NggPrimShader::buildPrimShaderWithGs(Function *primShader) { m_builder.SetInsertPoint(initVertexCountsBlock); writePerThreadDataToLds(m_builder.getInt32(0), m_nggInputs.threadIdInSubgroup, PrimShaderLdsRegion::VertexCounts, - (NggMaxWavesPerSubgroup + 1) * rasterStream); + (m_maxWavesPerSubgroup + 1) * rasterStream); m_builder.CreateBr(endInitVertexCountsBlock); } @@ -2101,7 +2142,7 @@ void NggPrimShader::buildPrimShaderWithGs(Function *primShader) { if (cullingMode) { primData = readPerThreadDataFromLds(m_builder.getInt32Ty(), m_nggInputs.threadIdInSubgroup, - PrimShaderLdsRegion::PrimitiveData, NggMaxThreadsPerSubgroup * rasterStream); + PrimShaderLdsRegion::PrimitiveData, m_maxThreadsPerSubgroup * rasterStream); auto tryCullPrimitive = m_builder.CreateICmpNE(primData, m_builder.getInt32(NullPrim)); auto validPrimitive = m_builder.CreateICmpULT(m_nggInputs.threadIdInSubgroup, m_nggInputs.primCountInSubgroup); tryCullPrimitive = m_builder.CreateAnd(tryCullPrimitive, validPrimitive); @@ -2153,7 +2194,7 @@ void NggPrimShader::buildPrimShaderWithGs(Function *primShader) { m_builder.SetInsertPoint(nullifyPrimitiveDataBlock); writePerThreadDataToLds(m_builder.getInt32(NullPrim), 
m_nggInputs.threadIdInSubgroup, - PrimShaderLdsRegion::PrimitiveData, NggMaxThreadsPerSubgroup * rasterStream); + PrimShaderLdsRegion::PrimitiveData, m_maxThreadsPerSubgroup * rasterStream); m_builder.CreateBr(endCullPrimitiveBlock); } @@ -2192,7 +2233,7 @@ void NggPrimShader::buildPrimShaderWithGs(Function *primShader) { // drawFlag = primData[N] != NullPrim auto primData0 = readPerThreadDataFromLds(m_builder.getInt32Ty(), m_nggInputs.threadIdInSubgroup, - PrimShaderLdsRegion::PrimitiveData, NggMaxThreadsPerSubgroup * rasterStream); + PrimShaderLdsRegion::PrimitiveData, m_maxThreadsPerSubgroup * rasterStream); auto drawFlag0 = m_builder.CreateICmpNE(primData0, m_builder.getInt32(NullPrim)); drawFlag = drawFlag0; @@ -2200,7 +2241,7 @@ void NggPrimShader::buildPrimShaderWithGs(Function *primShader) { // drawFlag |= N >= 1 ? (primData[N-1] != NullPrim) : false auto primData1 = readPerThreadDataFromLds( m_builder.getInt32Ty(), m_builder.CreateSub(m_nggInputs.threadIdInSubgroup, m_builder.getInt32(1)), - PrimShaderLdsRegion::PrimitiveData, NggMaxThreadsPerSubgroup * rasterStream); + PrimShaderLdsRegion::PrimitiveData, m_maxThreadsPerSubgroup * rasterStream); auto drawFlag1 = m_builder.CreateSelect(m_builder.CreateICmpUGE(m_nggInputs.threadIdInSubgroup, m_builder.getInt32(1)), m_builder.CreateICmpNE(primData1, m_builder.getInt32(NullPrim)), m_builder.getFalse()); @@ -2211,7 +2252,7 @@ void NggPrimShader::buildPrimShaderWithGs(Function *primShader) { // drawFlag |= N >= 2 ? (primData[N-2] != NullPrim) : false auto primData2 = readPerThreadDataFromLds( m_builder.getInt32Ty(), m_builder.CreateSub(m_nggInputs.threadIdInSubgroup, m_builder.getInt32(2)), - PrimShaderLdsRegion::PrimitiveData, NggMaxThreadsPerSubgroup * rasterStream); + PrimShaderLdsRegion::PrimitiveData, m_maxThreadsPerSubgroup * rasterStream); auto drawFlag2 = m_builder.CreateSelect(m_builder.CreateICmpUGE(m_nggInputs.threadIdInSubgroup, m_builder.getInt32(2)), m_builder.CreateICmpNE(primData2, m_builder.getInt32(NullPrim)), m_builder.getFalse()); @@ -2235,7 +2276,7 @@ void NggPrimShader::buildPrimShaderWithGs(Function *primShader) { vertCountInWave = m_builder.CreateIntrinsic(Intrinsic::ctpop, m_builder.getInt64Ty(), drawMask); vertCountInWave = m_builder.CreateTrunc(vertCountInWave, m_builder.getInt32Ty()); - auto threadIdUpbound = m_builder.CreateSub(m_builder.getInt32(waveCountInSubgroup), m_nggInputs.waveIdInSubgroup); + auto threadIdUpbound = m_builder.CreateSub(m_builder.getInt32(m_maxWavesPerSubgroup), m_nggInputs.waveIdInSubgroup); auto validThread = m_builder.CreateICmpULT(m_nggInputs.threadIdInWave, threadIdUpbound); m_builder.CreateCondBr(validThread, accumVertexCountsBlock, endAccumVertexCountsBlock); @@ -2251,8 +2292,8 @@ void NggPrimShader::buildPrimShaderWithGs(Function *primShader) { unsigned regionStart = getLdsRegionStart(PrimShaderLdsRegion::VertexCounts); ldsOffset = - m_builder.CreateAdd(ldsOffset, m_builder.getInt32(regionStart + (NggMaxWavesPerSubgroup + 1) * rasterStream)); - atomicAdd(vertCountInWave, ldsOffset); + m_builder.CreateAdd(ldsOffset, m_builder.getInt32(regionStart + (m_maxWavesPerSubgroup + 1) * rasterStream)); + atomicOp(AtomicRMWInst::Add, vertCountInWave, ldsOffset); m_builder.CreateBr(endAccumVertexCountsBlock); } @@ -2267,12 +2308,13 @@ void NggPrimShader::buildPrimShaderWithGs(Function *primShader) { if (m_nggControl->compactVertex) { auto vertCountInWaves = readPerThreadDataFromLds(m_builder.getInt32Ty(), m_nggInputs.threadIdInWave, - PrimShaderLdsRegion::VertexCounts, 
(NggMaxWavesPerSubgroup + 1) * rasterStream); + PrimShaderLdsRegion::VertexCounts, (m_maxWavesPerSubgroup + 1) * rasterStream); // The last dword following dwords for all waves (each wave has one dword) stores GS output vertex count of the // entire subgroup - auto vertCountInSubgroup = m_builder.CreateIntrinsic(m_builder.getInt32Ty(), Intrinsic::amdgcn_readlane, - {vertCountInWaves, m_builder.getInt32(waveCountInSubgroup)}); + auto vertCountInSubgroup = + m_builder.CreateIntrinsic(m_builder.getInt32Ty(), Intrinsic::amdgcn_readlane, + {vertCountInWaves, m_builder.getInt32(m_maxWavesPerSubgroup)}); // Get output vertex count for all waves prior to this wave vertCountInPrevWaves = m_builder.CreateIntrinsic(m_builder.getInt32Ty(), Intrinsic::amdgcn_readlane, @@ -2542,6 +2584,16 @@ void NggPrimShader::loadStreamOutBufferInfo(Value *userData) { else streamOutData = m_pipelineState->getShaderInterfaceData(ShaderStage::Vertex)->entryArgIdxs.vs.streamOutData; + unsigned compositeData = m_pipelineState->getShaderInterfaceData(ShaderStage::Vertex)->entryArgIdxs.vs.compositeData; + if (compositeData != 0) { + // Use dynamic topology + m_verticesPerPrimitive = + createUBfe(m_builder.CreateExtractElement(userData, getUserDataIndex(gsOrEsMain, compositeData)), 0, 2); + } else { + // Use static topology + m_verticesPerPrimitive = m_builder.getInt32(m_pipelineState->getVerticesPerPrimitive()); + } + assert(userData->getType()->isVectorTy()); const auto constBufferPtrTy = PointerType::get(m_builder.getContext(), ADDR_SPACE_CONST); @@ -2891,7 +2943,7 @@ void NggPrimShader::exportPrimitiveWithGs(Value *startingVertexIndex) { const auto rasterStream = m_pipelineState->getRasterizerState().rasterStream; Value *primData = readPerThreadDataFromLds(m_builder.getInt32Ty(), m_nggInputs.threadIdInSubgroup, - PrimShaderLdsRegion::PrimitiveData, NggMaxThreadsPerSubgroup * rasterStream); + PrimShaderLdsRegion::PrimitiveData, m_maxThreadsPerSubgroup * rasterStream); auto validPrimitive = m_builder.CreateICmpNE(primData, m_builder.getInt32(NullPrim)); // Primitive connectivity data have such layout: @@ -4221,10 +4273,10 @@ Function *NggPrimShader::createGsEmitHandler() { // Write primitive data (just winding) const unsigned regionStart = getLdsRegionStart(PrimShaderLdsRegion::PrimitiveData); - // ldsOffset = regionStart + vertexIndex + NggMaxThreadsPerSubgroup * streamId + // ldsOffset = regionStart + vertexIndex + maxThreadsPerSubgroup * streamId auto ldsOffset = m_builder.CreateAdd(m_builder.getInt32(regionStart), vertexIndex); ldsOffset = - m_builder.CreateAdd(ldsOffset, m_builder.CreateMul(m_builder.getInt32(NggMaxThreadsPerSubgroup), streamId)); + m_builder.CreateAdd(ldsOffset, m_builder.CreateMul(m_builder.getInt32(m_maxThreadsPerSubgroup), streamId)); writeValueToLds(winding, ldsOffset); m_builder.CreateBr(endEmitPrimBlock); @@ -6222,18 +6274,8 @@ void NggPrimShader::processSwXfb(ArrayRef args) { const auto &xfbStrides = m_pipelineState->getXfbBufferStrides(); bool bufferActive[MaxTransformFeedbackBuffers] = {}; - unsigned firstActiveXfbBuffer = InvalidValue; - unsigned lastActiveXfbBuffer = InvalidValue; - - for (unsigned i = 0; i < MaxTransformFeedbackBuffers; ++i) { + for (unsigned i = 0; i < MaxTransformFeedbackBuffers; ++i) bufferActive[i] = xfbStrides[i] > 0; - if (!bufferActive[i]) - continue; // Transform feedback buffer is inactive - - if (firstActiveXfbBuffer == InvalidValue) - firstActiveXfbBuffer = i; - lastActiveXfbBuffer = i; - } // // The processing is something like this: @@ -6244,18 +6286,10 @@ 
void NggPrimShader::processSwXfb(ArrayRef args) { // Write XFB outputs to LDS region // } // - // if (threadIdInSubgroup == 0) { - // Acquire the control of GDS_STRMOUT_DWORDS_WRITTEN_X - // Calculate primsToWrite and dwordsToWrite - // Increment GDS_STRMOUT_DWORDS_WRITTEN_X and release the control - // Store XFB statistics info to LDS - // Increment GDS_STRMOUT_PRIMS_NEEDED_0 and GDS_STRMOUT_PRIMS_WRITTEN_0 - // } + // Prepare XFB to update its relevant counters // Barrier // - // if (threadIdInWave < MaxXfbBuffers + 1) - // Read XFB statistics info from LDS - // + // Read XFB statistics info from LDS // Read primsToWrite and dwordsWritten from XFB statistics info // // if (threadIdInSubgroup < primsToWrite) @@ -6269,20 +6303,20 @@ void NggPrimShader::processSwXfb(ArrayRef args) { BasicBlock *endFetchXfbOutputBlock = createBlock(xfbEntryBlock->getParent(), ".endFetchXfbOutput"); endFetchXfbOutputBlock->moveAfter(fetchXfbOutputBlock); - BasicBlock *prepareXfbExportBlock = createBlock(xfbEntryBlock->getParent(), ".prepareXfbExport"); - prepareXfbExportBlock->moveAfter(endFetchXfbOutputBlock); - BasicBlock *endPrepareXfbExportBlock = createBlock(xfbEntryBlock->getParent(), ".endPrepareXfbExport"); - endPrepareXfbExportBlock->moveAfter(prepareXfbExportBlock); + unsigned possibleVertsPerPrim = 3; + if (isa(m_verticesPerPrimitive)) + possibleVertsPerPrim = cast(m_verticesPerPrimitive)->getZExtValue(); - BasicBlock *readXfbStatInfoBlock = createBlock(xfbEntryBlock->getParent(), ".readXfbStatInfo"); - readXfbStatInfoBlock->moveAfter(endPrepareXfbExportBlock); - BasicBlock *endReadXfbStatInfoBlock = createBlock(xfbEntryBlock->getParent(), ".endReadXfbStatInfo"); - endReadXfbStatInfoBlock->moveAfter(readXfbStatInfoBlock); + BasicBlock *exportXfbOutputBlock[3] = {}; + auto insertPos = endFetchXfbOutputBlock; + for (unsigned i = 0; i < possibleVertsPerPrim; ++i) { + exportXfbOutputBlock[i] = createBlock(xfbEntryBlock->getParent(), ".exportXfbOutputInVertex" + std::to_string(i)); + exportXfbOutputBlock[i]->moveAfter(insertPos); + insertPos = exportXfbOutputBlock[i]; + } - BasicBlock *exportXfbOutputBlock = createBlock(xfbEntryBlock->getParent(), ".exportXfbOutput"); - exportXfbOutputBlock->moveAfter(endReadXfbStatInfoBlock); BasicBlock *endExportXfbOutputBlock = createBlock(xfbEntryBlock->getParent(), ".endExportXfbOutput"); - endExportXfbOutputBlock->moveAfter(exportXfbOutputBlock); + endExportXfbOutputBlock->moveAfter(insertPos); // Insert branching in current block to process transform feedback export { @@ -6321,164 +6355,18 @@ void NggPrimShader::processSwXfb(ArrayRef args) { } // Construct ".endFetchXfbOutput" block + Value *streamOutOffsets[MaxTransformFeedbackBuffers] = {}; // Stream-out offset to write transform feedback outputs { m_builder.SetInsertPoint(endFetchXfbOutputBlock); - auto firstThreadInSubgroup = m_builder.CreateICmpEQ(m_nggInputs.threadIdInSubgroup, m_builder.getInt32(0)); - m_builder.CreateCondBr(firstThreadInSubgroup, prepareXfbExportBlock, endPrepareXfbExportBlock); - } - - // Construct ".prepareXfbExport" block - { - m_builder.SetInsertPoint(prepareXfbExportBlock); - - const unsigned vertsPerPrim = m_pipelineState->getVerticesPerPrimitive(); - Value *numPrimsToWrite = m_nggInputs.primCountInSubgroup; - - Value *dwordsWritten[MaxTransformFeedbackBuffers] = {}; - Value *dwordsPerPrim[MaxTransformFeedbackBuffers] = {}; - - // Calculate numPrimsToWrite - for (unsigned i = 0; i < MaxTransformFeedbackBuffers; ++i) { - if (!bufferActive[i]) - continue; - - if (m_gfxIp.major <= 11) 
{ - if (i == firstActiveXfbBuffer) { - // ds_ordered_count - dwordsWritten[i] = m_builder.CreateIntrinsic( - Intrinsic::amdgcn_ds_ordered_add, {}, - { - m_builder.CreateIntToPtr(m_nggInputs.orderedWaveId, - PointerType::get(m_builder.getInt32Ty(), ADDR_SPACE_REGION)), // m0 - m_builder.getInt32(0), // value to add - m_builder.getInt32(0), // ordering - m_builder.getInt32(0), // scope - m_builder.getFalse(), // isVolatile - m_builder.getInt32((GDS_STRMOUT_DWORDS_WRITTEN_0 + i) | - (1 << 24)), // ordered count index, [27:24] is dword count - m_builder.getFalse(), // wave release - m_builder.getFalse(), // wave done - }); - } else { - // ds_add_gs_reg - dwordsWritten[i] = - m_builder.CreateIntrinsic(Intrinsic::amdgcn_ds_add_gs_reg_rtn, m_builder.getInt32Ty(), - {m_builder.getInt32(0), // value to add - m_builder.getInt32((GDS_STRMOUT_DWORDS_WRITTEN_0 + i) << 2)}); // count index - } - } else { - llvm_unreachable("Not implemented!"); - } - - // NUM_RECORDS = SQ_BUF_RSRC_WORD2 - Value *numRecords = m_builder.CreateExtractElement(m_streamOutBufDescs[i], 2); - // bufferSizeInDwords = numRecords >> 2 (NOTE: NUM_RECORDS is set to the byte size of stream-out buffer) - Value *bufferSizeInDwords = m_builder.CreateLShr(numRecords, 2); - // dwordsRemaining = max(0, bufferSizeInDwords - (bufferOffset + dwordsWritten)) - Value *dwordsRemaining = - m_builder.CreateSub(bufferSizeInDwords, m_builder.CreateAdd(m_streamOutBufOffsets[i], dwordsWritten[i])); - dwordsRemaining = m_builder.CreateIntrinsic(Intrinsic::smax, dwordsRemaining->getType(), - {dwordsRemaining, m_builder.getInt32(0)}); - // numPrimsToWrite = min(dwordsRemaining / dwordsPerPrim, numPrimsToWrite) - dwordsPerPrim[i] = m_builder.getInt32(vertsPerPrim * xfbStrides[i] / sizeof(unsigned)); - Value *primsCanWrite = m_builder.CreateUDiv(dwordsRemaining, dwordsPerPrim[i]); - numPrimsToWrite = - m_builder.CreateIntrinsic(Intrinsic::umin, numPrimsToWrite->getType(), {numPrimsToWrite, primsCanWrite}); - } - - // Increment dwordsWritten - for (unsigned i = 0; i < MaxTransformFeedbackBuffers; ++i) { - if (!bufferActive[i]) - continue; - - Value *dwordsToWrite = m_builder.CreateMul(numPrimsToWrite, dwordsPerPrim[i]); - - if (m_gfxIp.major <= 11) { - if (i == lastActiveXfbBuffer) { - // ds_ordered_count, wave done - dwordsWritten[i] = m_builder.CreateIntrinsic( - Intrinsic::amdgcn_ds_ordered_add, {}, - { - m_builder.CreateIntToPtr(m_nggInputs.orderedWaveId, - PointerType::get(m_builder.getInt32Ty(), ADDR_SPACE_REGION)), // m0 - dwordsToWrite, // value to add - m_builder.getInt32(0), // ordering - m_builder.getInt32(0), // scope - m_builder.getFalse(), // isVolatile - m_builder.getInt32((GDS_STRMOUT_DWORDS_WRITTEN_0 + i) | - (1 << 24)), // ordered count index, [27:24] is dword count - m_builder.getTrue(), // wave release - m_builder.getTrue(), // wave done - }); - } else { - // ds_add_gs_reg - dwordsWritten[i] = - m_builder.CreateIntrinsic(Intrinsic::amdgcn_ds_add_gs_reg_rtn, dwordsToWrite->getType(), - {dwordsToWrite, // value to add - m_builder.getInt32((GDS_STRMOUT_DWORDS_WRITTEN_0 + i) << 2)}); // count index - } - } else { - llvm_unreachable("Not implemented!"); - } - } - - // Store transform feedback statistics info to LDS and GDS - const unsigned regionStart = getLdsRegionStart(PrimShaderLdsRegion::XfbStats); - writeValueToLds(numPrimsToWrite, m_builder.getInt32(regionStart + MaxTransformFeedbackBuffers)); - for (unsigned i = 0; i < MaxTransformFeedbackBuffers; ++i) { - if (!bufferActive[i]) - continue; - - writeValueToLds(dwordsWritten[i], 
m_builder.getInt32(regionStart + i)); - } - - if (m_gfxIp.major <= 11) { - m_builder.CreateIntrinsic(Intrinsic::amdgcn_ds_add_gs_reg_rtn, m_nggInputs.primCountInSubgroup->getType(), - {m_nggInputs.primCountInSubgroup, // value to add - m_builder.getInt32(GDS_STRMOUT_PRIMS_NEEDED_0 << 2)}); // count index - - m_builder.CreateIntrinsic(Intrinsic::amdgcn_ds_add_gs_reg_rtn, numPrimsToWrite->getType(), - {numPrimsToWrite, // value to add - m_builder.getInt32(GDS_STRMOUT_PRIMS_WRITTEN_0 << 2)}); // count index - } else { - llvm_unreachable("Not implemented!"); - } - - m_builder.CreateBr(endPrepareXfbExportBlock); - } - - // Construct ".endPrepareXfbExport" block - { - m_builder.SetInsertPoint(endPrepareXfbExportBlock); + prepareSwXfb({m_nggInputs.primCountInSubgroup}); // We are going to read transform feedback statistics info and outputs from LDS and export them to transform - // feedback buffers. Make sure the output values have been all written before this. + // feedback buffers. Make all values have been written before this. createFenceAndBarrier(); - auto validThread = - m_builder.CreateICmpULT(m_nggInputs.threadIdInWave, m_builder.getInt32(1 + MaxTransformFeedbackBuffers)); - m_builder.CreateCondBr(validThread, readXfbStatInfoBlock, endReadXfbStatInfoBlock); - } - - // Construct ".readXfbStatInfo" block - Value *xfbStatInfo = nullptr; - { - m_builder.SetInsertPoint(readXfbStatInfoBlock); - - xfbStatInfo = + auto xfbStatInfo = readPerThreadDataFromLds(m_builder.getInt32Ty(), m_nggInputs.threadIdInWave, PrimShaderLdsRegion::XfbStats); - m_builder.CreateBr(endReadXfbStatInfoBlock); - } - - // Construct ".endReadXfbStatInfo" block - Value *streamOutOffsets[MaxTransformFeedbackBuffers] = {}; // Stream-out offset to write transform feedback outputs - { - m_builder.SetInsertPoint(endReadXfbStatInfoBlock); - - xfbStatInfo = createPhi( - {{xfbStatInfo, readXfbStatInfoBlock}, {PoisonValue::get(xfbStatInfo->getType()), endPrepareXfbExportBlock}}); - for (unsigned i = 0; i < MaxTransformFeedbackBuffers; ++i) { if (bufferActive[i]) { streamOutOffsets[i] = m_builder.CreateIntrinsic(m_builder.getInt32Ty(), Intrinsic::amdgcn_readlane, @@ -6491,104 +6379,105 @@ void NggPrimShader::processSwXfb(ArrayRef args) { {xfbStatInfo, m_builder.getInt32(MaxTransformFeedbackBuffers)}); auto validPrimitive = m_builder.CreateICmpULT(m_nggInputs.threadIdInSubgroup, numPrimsToWrite); - m_builder.CreateCondBr(validPrimitive, exportXfbOutputBlock, endExportXfbOutputBlock); - } - - // Construct ".exportXfbOutput" block - { - m_builder.SetInsertPoint(exportXfbOutputBlock); - - const unsigned vertsPerPrim = m_pipelineState->getVerticesPerPrimitive(); - Value *vertexIndices[3] = {}; - vertexIndices[0] = m_nggInputs.vertexIndex0; - if (vertsPerPrim > 1) - vertexIndices[1] = m_nggInputs.vertexIndex1; - if (vertsPerPrim > 2) - vertexIndices[2] = m_nggInputs.vertexIndex2; - - for (unsigned i = 0; i < vertsPerPrim; ++i) { - for (unsigned j = 0; j < xfbOutputExports.size(); ++j) { - const auto &xfbOutputExport = xfbOutputExports[j]; - auto outputValue = readXfbOutputFromLds( - xfbOutputExport.numElements > 1 ? FixedVectorType::get(m_builder.getFloatTy(), xfbOutputExport.numElements) - : m_builder.getFloatTy(), - vertexIndices[i], xfbOutputExport.offsetInVertex); - - if (xfbOutputExport.is16bit) { - // NOTE: For 16-bit transform feedbakc outputs, they are stored as 32-bit without tightly packed in LDS. 
- outputValue = m_builder.CreateBitCast( - outputValue, FixedVectorType::get(m_builder.getInt32Ty(), xfbOutputExport.numElements)); - outputValue = m_builder.CreateTrunc( - outputValue, FixedVectorType::get(m_builder.getInt16Ty(), xfbOutputExport.numElements)); - outputValue = m_builder.CreateBitCast( - outputValue, FixedVectorType::get(m_builder.getHalfTy(), xfbOutputExport.numElements)); - } + m_builder.CreateCondBr(validPrimitive, exportXfbOutputBlock[0], endExportXfbOutputBlock); + } + + Value *vertexIndices[3] = {}; + vertexIndices[0] = m_nggInputs.vertexIndex0; + vertexIndices[1] = m_nggInputs.vertexIndex1; + vertexIndices[2] = m_nggInputs.vertexIndex2; + + for (unsigned i = 0; i < possibleVertsPerPrim; ++i) { + // Construct ".exportXfbOutputInVertex[N]" block + m_builder.SetInsertPoint(exportXfbOutputBlock[i]); + + for (unsigned j = 0; j < xfbOutputExports.size(); ++j) { + const auto &xfbOutputExport = xfbOutputExports[j]; + auto outputValue = readXfbOutputFromLds( + xfbOutputExport.numElements > 1 ? FixedVectorType::get(m_builder.getFloatTy(), xfbOutputExport.numElements) + : m_builder.getFloatTy(), + vertexIndices[i], xfbOutputExport.offsetInVertex); + + if (xfbOutputExport.is16bit) { + // NOTE: For 16-bit transform feedbakc outputs, they are stored as 32-bit without tightly packed in LDS. + outputValue = m_builder.CreateBitCast( + outputValue, FixedVectorType::get(m_builder.getInt32Ty(), xfbOutputExport.numElements)); + outputValue = m_builder.CreateTrunc(outputValue, + FixedVectorType::get(m_builder.getInt16Ty(), xfbOutputExport.numElements)); + outputValue = m_builder.CreateBitCast(outputValue, + FixedVectorType::get(m_builder.getHalfTy(), xfbOutputExport.numElements)); + } - unsigned format = 0; - switch (xfbOutputExport.numElements) { - case 1: - format = xfbOutputExport.is16bit ? BUF_FORMAT_16_FLOAT : BUF_FORMAT_32_FLOAT; - break; - case 2: - format = xfbOutputExport.is16bit ? BUF_FORMAT_16_16_FLOAT : BUF_FORMAT_32_32_FLOAT_GFX11; - break; - case 3: - format = xfbOutputExport.is16bit ? BUF_FORMAT_16_16_FLOAT : BUF_FORMAT_32_32_32_FLOAT_GFX11; - break; - case 4: - format = xfbOutputExport.is16bit ? BUF_FORMAT_16_16_16_16_FLOAT_GFX11 : BUF_FORMAT_32_32_32_32_FLOAT_GFX11; - break; - default: - llvm_unreachable("Unexpected element number!"); - break; - } + unsigned format = 0; + switch (xfbOutputExport.numElements) { + case 1: + format = xfbOutputExport.is16bit ? BUF_FORMAT_16_FLOAT : BUF_FORMAT_32_FLOAT; + break; + case 2: + format = xfbOutputExport.is16bit ? BUF_FORMAT_16_16_FLOAT : BUF_FORMAT_32_32_FLOAT_GFX11; + break; + case 3: + format = xfbOutputExport.is16bit ? BUF_FORMAT_16_16_FLOAT : BUF_FORMAT_32_32_32_FLOAT_GFX11; + break; + case 4: + format = xfbOutputExport.is16bit ? 
BUF_FORMAT_16_16_16_16_FLOAT_GFX11 : BUF_FORMAT_32_32_32_32_FLOAT_GFX11; + break; + default: + llvm_unreachable("Unexpected element number!"); + break; + } - CoherentFlag coherent = {}; - if (m_pipelineState->getTargetInfo().getGfxIpVersion().major <= 11) { - coherent.bits.glc = true; - coherent.bits.slc = true; - } + CoherentFlag coherent = {}; + if (m_pipelineState->getTargetInfo().getGfxIpVersion().major <= 11) { + coherent.bits.glc = true; + coherent.bits.slc = true; + } - // vertexOffset = (threadIdInSubgroup * vertsPerPrim + vertexIndex) * xfbStride - Value *vertexOffset = - m_builder.CreateAdd(m_builder.CreateMul(m_nggInputs.threadIdInSubgroup, m_builder.getInt32(vertsPerPrim)), - m_builder.getInt32(i)); - vertexOffset = m_builder.CreateMul(vertexOffset, m_builder.getInt32(xfbStrides[xfbOutputExport.xfbBuffer])); - // xfbOutputOffset = vertexOffset + xfbOffset - Value *xfbOutputOffset = m_builder.CreateAdd(vertexOffset, m_builder.getInt32(xfbOutputExport.xfbOffset)); - - if (xfbOutputExport.is16bit && xfbOutputExport.numElements == 3) { - // NOTE: For 16vec3, HW doesn't have a corresponding buffer store instruction. We have to split it to 16vec2 - // and 16scalar. - m_builder.CreateIntrinsic(Intrinsic::amdgcn_raw_tbuffer_store, FixedVectorType::get(m_builder.getHalfTy(), 2), - {m_builder.CreateShuffleVector(outputValue, ArrayRef{0, 1}), // vdata - m_streamOutBufDescs[xfbOutputExport.xfbBuffer], // rsrc - xfbOutputOffset, // offset - streamOutOffsets[xfbOutputExport.xfbBuffer], // soffset - m_builder.getInt32(BUF_FORMAT_16_16_FLOAT), // format - m_builder.getInt32(coherent.u32All)}); // auxiliary data - - m_builder.CreateIntrinsic(Intrinsic::amdgcn_raw_tbuffer_store, m_builder.getHalfTy(), - {m_builder.CreateExtractElement(outputValue, 2), // vdata - m_streamOutBufDescs[xfbOutputExport.xfbBuffer], // rsrc - m_builder.CreateAdd(xfbOutputOffset, - m_builder.getInt32(2 * sizeof(uint16_t))), // offset - streamOutOffsets[xfbOutputExport.xfbBuffer], // soffset - m_builder.getInt32(BUF_FORMAT_16_FLOAT), // format - m_builder.getInt32(coherent.u32All)}); // auxiliary data - } else { - m_builder.CreateIntrinsic(Intrinsic::amdgcn_raw_tbuffer_store, outputValue->getType(), - {outputValue, // vdata - m_streamOutBufDescs[xfbOutputExport.xfbBuffer], // rsrc - xfbOutputOffset, // offset - streamOutOffsets[xfbOutputExport.xfbBuffer], // soffset - m_builder.getInt32(format), // format - m_builder.getInt32(coherent.u32All)}); // auxiliary data - } + // vertexOffset = (threadIdInSubgroup * vertsPerPrim + vertexIndex) * xfbStride + Value *vertexOffset = m_builder.CreateAdd( + m_builder.CreateMul(m_nggInputs.threadIdInSubgroup, m_verticesPerPrimitive), m_builder.getInt32(i)); + vertexOffset = m_builder.CreateMul(vertexOffset, m_builder.getInt32(xfbStrides[xfbOutputExport.xfbBuffer])); + // xfbOutputOffset = vertexOffset + xfbOffset + Value *xfbOutputOffset = m_builder.CreateAdd(vertexOffset, m_builder.getInt32(xfbOutputExport.xfbOffset)); + + if (xfbOutputExport.is16bit && xfbOutputExport.numElements == 3) { + // NOTE: For 16vec3, HW doesn't have a corresponding buffer store instruction. We have to split it to 16vec2 + // and 16scalar. 
+ m_builder.CreateIntrinsic(Intrinsic::amdgcn_raw_tbuffer_store, FixedVectorType::get(m_builder.getHalfTy(), 2), + {m_builder.CreateShuffleVector(outputValue, ArrayRef{0, 1}), // vdata + m_streamOutBufDescs[xfbOutputExport.xfbBuffer], // rsrc + xfbOutputOffset, // offset + streamOutOffsets[xfbOutputExport.xfbBuffer], // soffset + m_builder.getInt32(BUF_FORMAT_16_16_FLOAT), // format + m_builder.getInt32(coherent.u32All)}); // auxiliary data + + m_builder.CreateIntrinsic(Intrinsic::amdgcn_raw_tbuffer_store, m_builder.getHalfTy(), + {m_builder.CreateExtractElement(outputValue, 2), // vdata + m_streamOutBufDescs[xfbOutputExport.xfbBuffer], // rsrc + m_builder.CreateAdd(xfbOutputOffset, + m_builder.getInt32(2 * sizeof(uint16_t))), // offset + streamOutOffsets[xfbOutputExport.xfbBuffer], // soffset + m_builder.getInt32(BUF_FORMAT_16_FLOAT), // format + m_builder.getInt32(coherent.u32All)}); // auxiliary data + } else { + m_builder.CreateIntrinsic(Intrinsic::amdgcn_raw_tbuffer_store, outputValue->getType(), + {outputValue, // vdata + m_streamOutBufDescs[xfbOutputExport.xfbBuffer], // rsrc + xfbOutputOffset, // offset + streamOutOffsets[xfbOutputExport.xfbBuffer], // soffset + m_builder.getInt32(format), // format + m_builder.getInt32(coherent.u32All)}); // auxiliary data } } - m_builder.CreateBr(endExportXfbOutputBlock); + if (i == possibleVertsPerPrim - 1) { + // Last vertex + m_builder.CreateBr(endExportXfbOutputBlock); + } else { + // Not last vertex, check if we need to export outputs of next vertex + auto exportNextVertex = m_builder.CreateICmpUGT(m_verticesPerPrimitive, m_builder.getInt32(i + 1)); + m_builder.CreateCondBr(exportNextVertex, exportXfbOutputBlock[i + 1], endExportXfbOutputBlock); + } } // Construct ".endExportXfbOutput" block @@ -6605,24 +6494,12 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { const unsigned waveSize = m_pipelineState->getShaderWaveSize(ShaderStage::Geometry); assert(waveSize == 32 || waveSize == 64); - const unsigned waveCountInSubgroup = NggMaxThreadsPerSubgroup / waveSize; const auto &xfbStrides = m_pipelineState->getXfbBufferStrides(); - const auto &streamXfbBuffers = m_pipelineState->getStreamXfbBuffers(); bool bufferActive[MaxTransformFeedbackBuffers] = {}; - unsigned firstActiveXfbBuffer = InvalidValue; - unsigned lastActiveXfbBuffer = InvalidValue; - - for (unsigned i = 0; i < MaxTransformFeedbackBuffers; ++i) { + for (unsigned i = 0; i < MaxTransformFeedbackBuffers; ++i) bufferActive[i] = xfbStrides[i] > 0; - if (!bufferActive[i]) - continue; // Transform feedback buffer is inactive - - if (firstActiveXfbBuffer == InvalidValue) - firstActiveXfbBuffer = i; - lastActiveXfbBuffer = i; - } unsigned firstActiveStream = InvalidValue; unsigned lastActiveStream = InvalidValue; @@ -6636,31 +6513,18 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { lastActiveStream = i; } - unsigned xfbBufferToStream[MaxTransformFeedbackBuffers] = {}; - - for (unsigned i = 0; i < MaxTransformFeedbackBuffers; ++i) { - for (unsigned j = 0; j < MaxGsStreams; ++j) { - if ((streamXfbBuffers[j] & (1 << i)) != 0) { - // NOTE: According to GLSL spec, all outputs assigned to a given transform feedback buffer are required to - // come from a single vertex stream. 
- xfbBufferToStream[i] = j; - break; - } - } - } - // // The processing is something like this: // // NGG_GS_XFB() { - // if (threadIdInSubgroup < waveCount + 1) + // if (threadIdInSubgroup < maxWaves + 1) // Initialize per-wave and per-subgroup count of output primitives // Barrier // // if (threadIdInSubgroup < primCountInSubgroup) // Check the draw flag of output primitives and compute draw mask // - // if (threadIdInWave < waveCount - waveId) + // if (threadIdInWave < maxWaves - waveId) // Accumulate per-wave and per-subgroup count of output primitives // Barrier // @@ -6671,13 +6535,7 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { // // Mutate copy shader to fetch XFB outputs // - // if (threadIdInSubgroup == 0) { - // Acquire the control of GDS_STRMOUT_DWORDS_WRITTEN_X - // Calculate primsToWrite and dwordsToWrite - // Increment GDS_STRMOUT_DWORDS_WRITTEN_X and release the control - // Store GS XFB statistics info to LDS - // Increment GDS_STRMOUT_PRIMS_NEEDED_X and GDS_STRMOUT_PRIMS_WRITTEN_X - // } + // Prepare XFB and update its relevant counters // Barrier // // Read XFB statistics info from LDS @@ -6723,14 +6581,8 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { } } - BasicBlock *prepareXfbExportBlock = createBlock(xfbEntryBlock->getParent(), ".prepareXfbExport"); - prepareXfbExportBlock->moveAfter(insertPos); - BasicBlock *endPrepareXfbExportBlock = createBlock(xfbEntryBlock->getParent(), ".endPrepareXfbExport"); - endPrepareXfbExportBlock->moveAfter(prepareXfbExportBlock); - BasicBlock *exportXfbOutputBlock[MaxGsStreams] = {}; BasicBlock *endExportXfbOutputBlock[MaxGsStreams] = {}; - insertPos = endPrepareXfbExportBlock; for (unsigned i = 0; i < MaxGsStreams; ++i) { if (m_pipelineState->isVertexStreamActive(i)) { exportXfbOutputBlock[i] = createBlock(xfbEntryBlock->getParent(), ".exportXfbOutputInStream" + std::to_string(i)); @@ -6747,7 +6599,7 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { // Insert branching in current block to process transform feedback export { auto validWave = - m_builder.CreateICmpULT(m_nggInputs.threadIdInSubgroup, m_builder.getInt32(waveCountInSubgroup + 1)); + m_builder.CreateICmpULT(m_nggInputs.threadIdInSubgroup, m_builder.getInt32(m_maxWavesPerSubgroup + 1)); m_builder.CreateCondBr(validWave, initPrimitiveCountsBlock, endInitPrimitiveCountsBlock); } @@ -6758,7 +6610,7 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { for (unsigned i = 0; i < MaxGsStreams; ++i) { if (m_pipelineState->isVertexStreamActive(i)) { writePerThreadDataToLds(m_builder.getInt32(0), m_nggInputs.threadIdInSubgroup, - PrimShaderLdsRegion::PrimitiveCounts, (NggMaxWavesPerSubgroup + 1) * i); + PrimShaderLdsRegion::PrimitiveCounts, (m_maxWavesPerSubgroup + 1) * i); } } @@ -6784,7 +6636,7 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { if (m_pipelineState->isVertexStreamActive(i)) { // drawFlag = primData[N] != NullPrim auto primData = readPerThreadDataFromLds(m_builder.getInt32Ty(), m_nggInputs.threadIdInSubgroup, - PrimShaderLdsRegion::PrimitiveData, NggMaxThreadsPerSubgroup * i); + PrimShaderLdsRegion::PrimitiveData, m_maxThreadsPerSubgroup * i); drawFlag[i] = m_builder.CreateICmpNE(primData, m_builder.getInt32(NullPrim)); } } @@ -6814,7 +6666,7 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { primCountInWave[i] = m_builder.CreateTrunc(primCountInWave[i], m_builder.getInt32Ty()); } } - auto threadIdUpbound = m_builder.CreateSub(m_builder.getInt32(waveCountInSubgroup), m_nggInputs.waveIdInSubgroup); + auto 
threadIdUpbound = m_builder.CreateSub(m_builder.getInt32(m_maxWavesPerSubgroup), m_nggInputs.waveIdInSubgroup); auto validThread = m_builder.CreateICmpULT(m_nggInputs.threadIdInWave, threadIdUpbound); m_builder.CreateCondBr(validThread, accumPrimitiveCountsBlock, endAccumPrimitiveCountsBlock); @@ -6831,8 +6683,8 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { for (unsigned i = 0; i < MaxGsStreams; ++i) { if (m_pipelineState->isVertexStreamActive(i)) { - atomicAdd(primCountInWave[i], - m_builder.CreateAdd(ldsOffset, m_builder.getInt32(regionStart + (NggMaxWavesPerSubgroup + 1) * i))); + atomicOp(AtomicRMWInst::Add, primCountInWave[i], + m_builder.CreateAdd(ldsOffset, m_builder.getInt32(regionStart + (m_maxWavesPerSubgroup + 1) * i))); } } @@ -6853,12 +6705,12 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { auto primCountInWaves = readPerThreadDataFromLds(m_builder.getInt32Ty(), m_nggInputs.threadIdInWave, - PrimShaderLdsRegion::PrimitiveCounts, (NggMaxWavesPerSubgroup + 1) * i); + PrimShaderLdsRegion::PrimitiveCounts, (m_maxWavesPerSubgroup + 1) * i); // The last dword following dwords for all waves (each wave has one dword) stores GS output primitive count of // the entire subgroup primCountInSubgroup[i] = m_builder.CreateIntrinsic(m_builder.getInt32Ty(), Intrinsic::amdgcn_readlane, - {primCountInWaves, m_builder.getInt32(waveCountInSubgroup)}); + {primCountInWaves, m_builder.getInt32(m_maxWavesPerSubgroup)}); // Get output primitive count for all waves prior to this wave primCountInPrevWaves[i] = m_builder.CreateIntrinsic(m_builder.getInt32Ty(), Intrinsic::amdgcn_readlane, @@ -6893,7 +6745,7 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { compactedPrimitiveIndex = m_builder.CreateAdd(primCountInPrevWaves[i], compactedPrimitiveIndex); writePerThreadDataToLds(m_nggInputs.threadIdInSubgroup, compactedPrimitiveIndex, - PrimShaderLdsRegion::PrimitiveIndexMap, NggMaxThreadsPerSubgroup * i); + PrimShaderLdsRegion::PrimitiveIndexMap, m_maxThreadsPerSubgroup * i); m_builder.CreateBr(endCompactPrimitiveIndexBlock[i]); } @@ -6906,9 +6758,6 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { // Start to fetch transform feedback outputs after we finish compacting primitive index of the last vertex // stream. 
fetchXfbOutput(m_gsHandlers.copyShader, args, xfbOutputExports); - - auto firstThreadInSubgroup = m_builder.CreateICmpEQ(m_nggInputs.threadIdInSubgroup, m_builder.getInt32(0)); - m_builder.CreateCondBr(firstThreadInSubgroup, prepareXfbExportBlock, endPrepareXfbExportBlock); } else { unsigned nextActiveStream = i + 1; while (!m_pipelineState->isVertexStreamActive(nextActiveStream)) { @@ -6922,141 +6771,10 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { } } - // Construct ".prepareXfbExport" block - { - m_builder.SetInsertPoint(prepareXfbExportBlock); - - const unsigned outVertsPerPrim = m_pipelineState->getVerticesPerPrimitive(); - - Value *numPrimsToWrite[MaxGsStreams] = {}; - for (unsigned i = 0; i < MaxGsStreams; ++i) - numPrimsToWrite[i] = primCountInSubgroup[i]; - - Value *dwordsWritten[MaxTransformFeedbackBuffers] = {}; - Value *dwordsPerPrim[MaxTransformFeedbackBuffers] = {}; - - // Calculate numPrimsToWrite[N] - for (unsigned i = 0; i < MaxTransformFeedbackBuffers; ++i) { - if (!bufferActive[i]) - continue; - - if (m_gfxIp.major <= 11) { - if (i == firstActiveXfbBuffer) { - // ds_ordered_count - dwordsWritten[i] = m_builder.CreateIntrinsic( - Intrinsic::amdgcn_ds_ordered_add, {}, - { - m_builder.CreateIntToPtr(m_nggInputs.orderedWaveId, - PointerType::get(m_builder.getInt32Ty(), ADDR_SPACE_REGION)), // m0 - m_builder.getInt32(0), // value to add - m_builder.getInt32(0), // ordering - m_builder.getInt32(0), // scope - m_builder.getFalse(), // isVolatile - m_builder.getInt32((GDS_STRMOUT_DWORDS_WRITTEN_0 + i) | - (1 << 24)), // ordered count index, [27:24] is dword count - m_builder.getFalse(), // wave release - m_builder.getFalse(), // wave done - }); - } else { - // ds_add_gs_reg - dwordsWritten[i] = - m_builder.CreateIntrinsic(Intrinsic::amdgcn_ds_add_gs_reg_rtn, m_builder.getInt32Ty(), - {m_builder.getInt32(0), // value to add - m_builder.getInt32((GDS_STRMOUT_DWORDS_WRITTEN_0 + i) << 2)}); // count index - } - } else { - llvm_unreachable("Not implemented!"); - } - - // NUM_RECORDS = SQ_BUF_RSRC_WORD2 - Value *numRecords = m_builder.CreateExtractElement(m_streamOutBufDescs[i], 2); - // bufferSizeInDwords = numRecords >> 2 (NOTE: NUM_RECORDS is set to the byte size of stream-out buffer) - Value *bufferSizeInDwords = m_builder.CreateLShr(numRecords, 2); - // dwordsRemaining = max(0, bufferSizeInDwords - (bufferOffset + dwordsWritten)) - Value *dwordsRemaining = - m_builder.CreateSub(bufferSizeInDwords, m_builder.CreateAdd(m_streamOutBufOffsets[i], dwordsWritten[i])); - dwordsRemaining = m_builder.CreateIntrinsic(Intrinsic::smax, dwordsRemaining->getType(), - {dwordsRemaining, m_builder.getInt32(0)}); - // numPrimsToWrite = min(dwordsRemaining / dwordsPerPrim, numPrimsToWrite) - dwordsPerPrim[i] = m_builder.getInt32(outVertsPerPrim * xfbStrides[i] / sizeof(unsigned)); - Value *primsCanWrite = m_builder.CreateUDiv(dwordsRemaining, dwordsPerPrim[i]); - numPrimsToWrite[xfbBufferToStream[i]] = - m_builder.CreateIntrinsic(Intrinsic::umin, numPrimsToWrite[xfbBufferToStream[i]]->getType(), - {numPrimsToWrite[xfbBufferToStream[i]], primsCanWrite}); - } - - // Increment dwordsWritten - for (unsigned i = 0; i < MaxTransformFeedbackBuffers; ++i) { - if (!bufferActive[i]) - continue; - - Value *dwordsToWrite = m_builder.CreateMul(numPrimsToWrite[xfbBufferToStream[i]], dwordsPerPrim[i]); - - if (m_gfxIp.major <= 11) { - if (i == lastActiveXfbBuffer) { - // ds_ordered_count, wave done - dwordsWritten[i] = m_builder.CreateIntrinsic( - Intrinsic::amdgcn_ds_ordered_add, {}, - { - 
m_builder.CreateIntToPtr(m_nggInputs.orderedWaveId, - PointerType::get(m_builder.getInt32Ty(), ADDR_SPACE_REGION)), // m0 - dwordsToWrite, // value to add - m_builder.getInt32(0), // ordering - m_builder.getInt32(0), // scope - m_builder.getFalse(), // isVolatile - m_builder.getInt32((GDS_STRMOUT_DWORDS_WRITTEN_0 + i) | - (1 << 24)), // ordered count index, [27:24] is dword count - m_builder.getTrue(), // wave release - m_builder.getTrue(), // wave done - }); - } else { - // ds_add_gs_reg - dwordsWritten[i] = - m_builder.CreateIntrinsic(Intrinsic::amdgcn_ds_add_gs_reg_rtn, dwordsToWrite->getType(), - {dwordsToWrite, // value to add - m_builder.getInt32((GDS_STRMOUT_DWORDS_WRITTEN_0 + i) << 2)}); // count index - } - } else { - llvm_unreachable("Not implemented!"); - } - } - - // Store transform feedback statistics info to LDS and GDS - const unsigned regionStart = getLdsRegionStart(PrimShaderLdsRegion::XfbStats); - for (unsigned i = 0; i < MaxTransformFeedbackBuffers; ++i) { - if (!bufferActive[i]) - continue; - - writeValueToLds(dwordsWritten[i], m_builder.getInt32(regionStart + i)); - } - - for (unsigned i = 0; i < MaxGsStreams; ++i) { - if (!m_pipelineState->isVertexStreamActive(i)) - continue; - - writeValueToLds(numPrimsToWrite[i], m_builder.getInt32(regionStart + MaxTransformFeedbackBuffers + i)); - - if (m_gfxIp.major <= 11) { - m_builder.CreateIntrinsic(Intrinsic::amdgcn_ds_add_gs_reg_rtn, primCountInSubgroup[i]->getType(), - {primCountInSubgroup[i], // value to add - m_builder.getInt32((GDS_STRMOUT_PRIMS_NEEDED_0 + 2 * i) << 2)}); // count index - - m_builder.CreateIntrinsic(Intrinsic::amdgcn_ds_add_gs_reg_rtn, numPrimsToWrite[i]->getType(), - {numPrimsToWrite[i], // value to add - m_builder.getInt32((GDS_STRMOUT_PRIMS_WRITTEN_0 + 2 * i) << 2)}); // count index - } else { - llvm_unreachable("Not implemented!"); - } - } - - m_builder.CreateBr(endPrepareXfbExportBlock); - } - - // Construct ".endPrepareXfbExport" block Value *streamOutOffsets[MaxTransformFeedbackBuffers] = {}; // Stream-out offset to write transform feedback outputs Value *numPrimsToWrite[MaxGsStreams] = {}; { - m_builder.SetInsertPoint(endPrepareXfbExportBlock); + prepareSwXfb(primCountInSubgroup); // We are going to read transform feedback statistics info from LDS. Make sure the info has been written before // this. 
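For orientation, the statistics reads that follow depend on the XfbStats LDS layout written by prepareSwXfb (added later in this patch): one dwordsWritten dword per transform feedback buffer, then the numPrimsToWrite values (a single entry on the non-GS path, one entry per vertex stream on the GS path). A minimal sketch of the readback, assuming the same intrinsic form as the surrounding code (the helper name is illustrative, not part of the patch):
// Illustrative helper only: broadcast one XfbStats dword (stored to LDS by prepareSwXfb and read
// per-lane above) to the whole wave via readlane.
static llvm::Value *readXfbStat(llvm::IRBuilder<> &builder, llvm::Value *xfbStatInfo, unsigned index) {
  return builder.CreateIntrinsic(builder.getInt32Ty(), llvm::Intrinsic::amdgcn_readlane,
                                 {xfbStatInfo, builder.getInt32(index)});
}
// streamOutOffsets[i] = readXfbStat(builder, xfbStatInfo, i);                               // dwordsWritten for buffer i
// numPrimsToWrite     = readXfbStat(builder, xfbStatInfo, MaxTransformFeedbackBuffers);     // non-GS path
// numPrimsToWrite[s]  = readXfbStat(builder, xfbStatInfo, MaxTransformFeedbackBuffers + s); // GS path, stream s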
@@ -7098,7 +6816,7 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { Value *uncompactedPrimitiveIndex = readPerThreadDataFromLds(m_builder.getInt32Ty(), m_nggInputs.threadIdInSubgroup, - PrimShaderLdsRegion::PrimitiveIndexMap, NggMaxThreadsPerSubgroup * i); + PrimShaderLdsRegion::PrimitiveIndexMap, m_maxThreadsPerSubgroup * i); Value *vertexIndex = uncompactedPrimitiveIndex; const unsigned outVertsPerPrim = m_pipelineState->getVerticesPerPrimitive(); @@ -7110,7 +6828,7 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { vertexIndices[2] = m_builder.CreateAdd(vertexIndex, m_builder.getInt32(2)); Value *primData = readPerThreadDataFromLds(m_builder.getInt32Ty(), uncompactedPrimitiveIndex, - PrimShaderLdsRegion::PrimitiveData, NggMaxThreadsPerSubgroup * i); + PrimShaderLdsRegion::PrimitiveData, m_maxThreadsPerSubgroup * i); // NOTE: primData[N] corresponds to the forming vertex // The vertice indices in the first triangle // If provoking vertex is the first one, the vertice indices in the second triangle is , otherwise @@ -7245,6 +6963,206 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { } } +// ===================================================================================================================== +// Prepare SW emulated transform feedback. Update various counters relevant to transform feedback, such as dwordsWritten, +// primsNeeded, and primsWritten. +// +// @param primCountInSubgroup : Number of primitives in subgroup for each vertex stream +void NggPrimShader::prepareSwXfb(ArrayRef primCountInSubgroup) { + assert(m_gfxIp.major >= 11); // Must be GFX11+ + + const auto &xfbStrides = m_pipelineState->getXfbBufferStrides(); + bool bufferActive[MaxTransformFeedbackBuffers] = {}; + for (unsigned i = 0; i < MaxTransformFeedbackBuffers; ++i) + bufferActive[i] = xfbStrides[i] > 0; + + const auto &streamXfbBuffers = m_pipelineState->getStreamXfbBuffers(); + unsigned xfbBufferToStream[MaxTransformFeedbackBuffers] = {}; + if (m_hasGs) { + for (unsigned i = 0; i < MaxTransformFeedbackBuffers; ++i) { + for (unsigned j = 0; j < MaxGsStreams; ++j) { + if ((streamXfbBuffers[j] & (1 << i)) != 0) { + // NOTE: According to GLSL spec, all outputs assigned to a given transform feedback buffer are required to + // come from a single vertex stream.
+ xfbBufferToStream[i] = j; + break; + } + } + } + } + + // GFX11 SW emulated stream-out with GDS support + if (m_gfxIp.major == 11) { + // + // The processing is something like this: + // + // PREPARE_XFB() { + // if (threadIdInSubgroup == 0) { + // Acquire the control of GDS_STRMOUT_DWORDS_WRITTEN_X + // Calculate primsToWrite and dwordsToWrite + // Increment GDS_STRMOUT_DWORDS_WRITTEN_X and release the control + // Store XFB statistics info to LDS + // Increment GDS_STRMOUT_PRIMS_NEEDED_X and GDS_STRMOUT_PRIMS_WRITTEN_X + // } + // + auto insertBlock = m_builder.GetInsertBlock(); + auto primShader = insertBlock->getParent(); + + auto prepareXfbBlock = createBlock(primShader, ".prepareXfb"); + prepareXfbBlock->moveAfter(insertBlock); + + auto endPrepareXfbBlock = createBlock(primShader, ".endPrepareXfb"); + endPrepareXfbBlock->moveAfter(prepareXfbBlock); + + // Continue to construct insert block + { + auto firstThreadInSubgroup = m_builder.CreateICmpEQ(m_nggInputs.threadIdInSubgroup, m_builder.getInt32(0)); + m_builder.CreateCondBr(firstThreadInSubgroup, prepareXfbBlock, endPrepareXfbBlock); + } + + // Construct ".prepareXfb" block + { + m_builder.SetInsertPoint(prepareXfbBlock); + + unsigned firstActiveBuffer = InvalidValue; + unsigned lastActiveBuffer = InvalidValue; + + for (unsigned i = 0; i < MaxTransformFeedbackBuffers; ++i) { + if (!bufferActive[i]) + continue; // Transform feedback buffer is inactive + + if (firstActiveBuffer == InvalidValue) + firstActiveBuffer = i; + lastActiveBuffer = i; + } + + Value *numPrimsToWrite[MaxGsStreams] = {}; + for (unsigned i = 0; i < MaxGsStreams; ++i) { + if (m_pipelineState->isVertexStreamActive(i)) + numPrimsToWrite[i] = primCountInSubgroup[i]; + } + + Value *dwordsWritten[MaxTransformFeedbackBuffers] = {}; + Value *dwordsPerPrim[MaxTransformFeedbackBuffers] = {}; + + // Calculate numPrimsToWrite + for (unsigned i = 0; i < MaxTransformFeedbackBuffers; ++i) { + if (!bufferActive[i]) + continue; + + if (i == firstActiveBuffer) { + // ds_ordered_count + dwordsWritten[i] = m_builder.CreateIntrinsic( + Intrinsic::amdgcn_ds_ordered_add, {}, + { + m_builder.CreateIntToPtr(m_nggInputs.orderedWaveId, + PointerType::get(m_builder.getInt32Ty(), ADDR_SPACE_REGION)), // m0 + m_builder.getInt32(0), // value to add + m_builder.getInt32(0), // ordering + m_builder.getInt32(0), // scope + m_builder.getFalse(), // isVolatile + m_builder.getInt32((GDS_STRMOUT_DWORDS_WRITTEN_0 + i) | + (1 << 24)), // ordered count index, [27:24] is dword count + m_builder.getFalse(), // wave release + m_builder.getFalse(), // wave done + }); + } else { + // ds_add_gs_reg + dwordsWritten[i] = + m_builder.CreateIntrinsic(Intrinsic::amdgcn_ds_add_gs_reg_rtn, m_builder.getInt32Ty(), + {m_builder.getInt32(0), // value to add + m_builder.getInt32((GDS_STRMOUT_DWORDS_WRITTEN_0 + i) << 2)}); // count index + } + + // NUM_RECORDS = SQ_BUF_RSRC_WORD2 + Value *numRecords = m_builder.CreateExtractElement(m_streamOutBufDescs[i], 2); + // bufferSizeInDwords = numRecords >> 2 (NOTE: NUM_RECORDS is set to the byte size of stream-out buffer) + Value *bufferSizeInDwords = m_builder.CreateLShr(numRecords, 2); + // dwordsRemaining = max(0, bufferSizeInDwords - (bufferOffset + dwordsWritten)) + Value *dwordsRemaining = + m_builder.CreateSub(bufferSizeInDwords, m_builder.CreateAdd(m_streamOutBufOffsets[i], dwordsWritten[i])); + dwordsRemaining = m_builder.CreateIntrinsic(Intrinsic::smax, dwordsRemaining->getType(), + {dwordsRemaining, m_builder.getInt32(0)}); + // numPrimsToWrite = min(dwordsRemaining 
/ dwordsPerPrim, numPrimsToWrite) + dwordsPerPrim[i] = + m_builder.CreateMul(m_verticesPerPrimitive, m_builder.getInt32(xfbStrides[i] / sizeof(unsigned))); + Value *primsCanWrite = m_builder.CreateUDiv(dwordsRemaining, dwordsPerPrim[i]); + numPrimsToWrite[xfbBufferToStream[i]] = + m_builder.CreateIntrinsic(Intrinsic::umin, numPrimsToWrite[xfbBufferToStream[i]]->getType(), + {numPrimsToWrite[xfbBufferToStream[i]], primsCanWrite}); + } + + // Increment dwordsWritten + for (unsigned i = 0; i < MaxTransformFeedbackBuffers; ++i) { + if (!bufferActive[i]) + continue; + + Value *dwordsToWrite = m_builder.CreateMul(numPrimsToWrite[xfbBufferToStream[i]], dwordsPerPrim[i]); + + if (i == lastActiveBuffer) { + // ds_ordered_count, wave done + dwordsWritten[i] = m_builder.CreateIntrinsic( + Intrinsic::amdgcn_ds_ordered_add, {}, + { + m_builder.CreateIntToPtr(m_nggInputs.orderedWaveId, + PointerType::get(m_builder.getInt32Ty(), ADDR_SPACE_REGION)), // m0 + dwordsToWrite, // value to add + m_builder.getInt32(0), // ordering + m_builder.getInt32(0), // scope + m_builder.getFalse(), // isVolatile + m_builder.getInt32((GDS_STRMOUT_DWORDS_WRITTEN_0 + i) | + (1 << 24)), // ordered count index, [27:24] is dword count + m_builder.getTrue(), // wave release + m_builder.getTrue(), // wave done + }); + } else { + // ds_add_gs_reg + dwordsWritten[i] = + m_builder.CreateIntrinsic(Intrinsic::amdgcn_ds_add_gs_reg_rtn, dwordsToWrite->getType(), + {dwordsToWrite, // value to add + m_builder.getInt32((GDS_STRMOUT_DWORDS_WRITTEN_0 + i) << 2)}); // count index + } + } + + // Store transform feedback statistics info to LDS and GDS + const unsigned regionStart = getLdsRegionStart(PrimShaderLdsRegion::XfbStats); + for (unsigned i = 0; i < MaxTransformFeedbackBuffers; ++i) { + if (!bufferActive[i]) + continue; + + writeValueToLds(dwordsWritten[i], m_builder.getInt32(regionStart + i)); + } + + for (unsigned i = 0; i < MaxGsStreams; ++i) { + if (!m_pipelineState->isVertexStreamActive(i)) + continue; + + writeValueToLds(numPrimsToWrite[i], m_builder.getInt32(regionStart + MaxTransformFeedbackBuffers + i)); + + m_builder.CreateIntrinsic(Intrinsic::amdgcn_ds_add_gs_reg_rtn, primCountInSubgroup[i]->getType(), + {primCountInSubgroup[i], // value to add + m_builder.getInt32((GDS_STRMOUT_PRIMS_NEEDED_0 + 2 * i) << 2)}); // count index + + m_builder.CreateIntrinsic(Intrinsic::amdgcn_ds_add_gs_reg_rtn, numPrimsToWrite[i]->getType(), + {numPrimsToWrite[i], // value to add + m_builder.getInt32((GDS_STRMOUT_PRIMS_WRITTEN_0 + 2 * i) << 2)}); // count index + } + + m_builder.CreateBr(endPrepareXfbBlock); + } + + // Construct ".endPrepareXfb" block + { + m_builder.SetInsertPoint(endPrepareXfbBlock); + // Nothing to do + } + + return; + } + + llvm_unreachable("Not implemented!"); +} + // ===================================================================================================================== // Fetches transform feedback outputs by creating a fetcher cloned from the target function or just mutating // the target function and running it after that. Meanwhile, we collect the transform feedback export info. 
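To sanity-check the clamp computed in prepareSwXfb above, here is a worked example with made-up numbers (all values hypothetical, chosen only to exercise the formulas in the comments):
#include <algorithm>
// Hypothetical inputs: a 4096-byte stream-out buffer (NUM_RECORDS), buffer offset 0, 1000 dwords
// already written, triangle topology (3 vertices per primitive), 16-byte XFB stride.
unsigned bufferSizeInDwords = 4096 / 4;                                  // NUM_RECORDS >> 2 = 1024
int dwordsRemaining = std::max(0, int(bufferSizeInDwords) - (0 + 1000)); // max(0, 1024 - 1000) = 24
unsigned dwordsPerPrim = 3 * (16 / sizeof(unsigned));                    // 3 verts * 4 dwords = 12
unsigned primsCanWrite = unsigned(dwordsRemaining) / dwordsPerPrim;      // 24 / 12 = 2
unsigned numPrimsToWrite = std::min(5u, primsCanWrite);                  // 5 prims in subgroup -> only 2 are written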
@@ -7684,7 +7602,7 @@ void NggPrimShader::collectPrimitiveStats() { if (m_pipelineState->isVertexStreamActive(i)) { // drawFlag = primData[N] != NullPrim auto primData = readPerThreadDataFromLds(m_builder.getInt32Ty(), m_nggInputs.threadIdInSubgroup, - PrimShaderLdsRegion::PrimitiveData, NggMaxThreadsPerSubgroup * i); + PrimShaderLdsRegion::PrimitiveData, m_maxThreadsPerSubgroup * i); drawFlag[i] = m_builder.CreateICmpNE(primData, m_builder.getInt32(NullPrim)); } } @@ -7726,9 +7644,8 @@ void NggPrimShader::collectPrimitiveStats() { unsigned regionStart = getLdsRegionStart(PrimShaderLdsRegion::PrimitiveCounts); for (unsigned i = 0; i < MaxGsStreams; ++i) { - if (m_pipelineState->isVertexStreamActive(i)) { - atomicAdd(primCountInWave[i], m_builder.getInt32(regionStart + i)); - } + if (m_pipelineState->isVertexStreamActive(i)) + atomicOp(AtomicRMWInst::Add, primCountInWave[i], m_builder.getInt32(regionStart + i)); } m_builder.CreateBr(endCountPrimitivesBlock); @@ -8035,18 +7952,19 @@ void NggPrimShader::writeValueToLds(Value *writeValue, Value *ldsOffset, bool us } // ===================================================================================================================== -// Do atomic add operation with the value stored in LDS. +// Do atomic operation with the value stored in LDS. // -// @param valueToAdd : Value to do atomic add +// @param atomicOp : Atomic operation +// @param value : Value to do atomic operation // @param ldsOffset : Start offset to do LDS atomic operations (in dwords) -void NggPrimShader::atomicAdd(Value *ValueToAdd, Value *ldsOffset) { - assert(ValueToAdd->getType()->isIntegerTy(32)); +void NggPrimShader::atomicOp(AtomicRMWInst::BinOp atomicOp, Value *value, Value *ldsOffset) { + assert(value->getType()->isIntegerTy(32)); Value *atomicPtr = m_builder.CreateGEP(m_builder.getInt32Ty(), m_lds, ldsOffset); SyncScope::ID syncScope = m_builder.getContext().getOrInsertSyncScopeID("workgroup"); - m_builder.CreateAtomicRMW(AtomicRMWInst::BinOp::Add, atomicPtr, ValueToAdd, MaybeAlign(), - AtomicOrdering::SequentiallyConsistent, syncScope); + m_builder.CreateAtomicRMW(atomicOp, atomicPtr, value, MaybeAlign(), AtomicOrdering::SequentiallyConsistent, + syncScope); } // ===================================================================================================================== diff --git a/lgc/patch/NggPrimShader.h b/lgc/patch/NggPrimShader.h index 4d7efed2b5..7332661439 100644 --- a/lgc/patch/NggPrimShader.h +++ b/lgc/patch/NggPrimShader.h @@ -301,6 +301,7 @@ class NggPrimShader { void processSwXfb(llvm::ArrayRef args); void processSwXfbWithGs(llvm::ArrayRef args); + void prepareSwXfb(llvm::ArrayRef primCountInSubgroup); llvm::Value *fetchXfbOutput(llvm::Function *target, llvm::ArrayRef args, llvm::SmallVector &xfbOutputExports); @@ -330,7 +331,7 @@ class NggPrimShader { llvm::Value *readValueFromLds(llvm::Type *readTy, llvm::Value *ldsOffset, bool useDs128 = false); void writeValueToLds(llvm::Value *writeValue, llvm::Value *ldsOffset, bool useDs128 = false); - void atomicAdd(llvm::Value *valueToAdd, llvm::Value *ldsOffset); + void atomicOp(llvm::AtomicRMWInst::BinOp atomicOp, llvm::Value *value, llvm::Value *ldsOffset); llvm::Value *readValueFromCb(llvm::Type *readyTy, llvm::Value *bufPtr, llvm::Value *offset, bool isVolatile = false); static const unsigned NullPrim = (1u << 31); // Null primitive data (invalid) @@ -400,9 +401,14 @@ class NggPrimShader { bool m_hasTes = false; // Whether the pipeline has tessellation evaluation shader bool m_hasGs = false; // 
Whether the pipeline has geometry shader + unsigned m_maxThreadsPerSubgroup = 0; // Maximum number of threads in a NGG subgroup + unsigned m_maxWavesPerSubgroup = 0; // Maximum number of waves in a NGG subgroup + llvm::Value *m_streamOutControlBufPtr = nullptr; // Stream-out control buffer pointer llvm::Value *m_streamOutBufDescs[MaxTransformFeedbackBuffers] = {}; // Stream-out buffer descriptors llvm::Value *m_streamOutBufOffsets[MaxTransformFeedbackBuffers] = {}; // Stream-out buffer offsets + llvm::Value *m_verticesPerPrimitive = nullptr; // If topology is dynamic, it is a SGPR value from user data + // ComplexData; otherwise it is a constant. bool m_constPositionZ = false; // Whether the Z channel of vertex position data is constant diff --git a/lgc/patch/PatchBufferOp.cpp b/lgc/patch/PatchBufferOp.cpp index ad4f8716f7..4cf36678ac 100644 --- a/lgc/patch/PatchBufferOp.cpp +++ b/lgc/patch/PatchBufferOp.cpp @@ -32,6 +32,7 @@ #include "lgc/Builder.h" #include "lgc/LgcContext.h" #include "lgc/LgcDialect.h" +#include "lgc/builder/BuilderImpl.h" #include "lgc/state/IntrinsDefs.h" #include "lgc/state/PipelineState.h" #include "lgc/state/TargetInfo.h" @@ -122,7 +123,7 @@ PatchBufferOpImpl::PatchBufferOpImpl(LLVMContext &context, PipelineState &pipeli // @param [in,out] function : LLVM function to be run on // @returns : True if the module was modified by the transformation and false otherwise bool PatchBufferOpImpl::run(Function &function) { - LLVM_DEBUG(dbgs() << "Run the pass Patch-Buffer-Op\n"); + LLVM_DEBUG(dbgs() << "Run the pass Patch-Buffer-Op on: " << function.getName() << '\n'); static const auto visitor = llvm_dialects::VisitorBuilder() .nest(&BufferOpLowering::registerVisitors) @@ -195,6 +196,7 @@ void BufferOpLowering::registerVisitors(llvm_dialects::VisitorBuildergetParent() == originalPhi->getParent()) { DescriptorInfo &di = m_descriptors[newPhi]; di.divergent = true; + LLVM_DEBUG(dbgs() << "Divergent PHI of descriptor: " << *newPhi << '\n'); } } } static const auto visitor = llvm_dialects::VisitorBuilder() .add(&BufferOpLowering::postVisitLoadInst) + .add(&BufferOpLowering::postVisitLoadTfeOp) .add(&BufferOpLowering::postVisitMemCpyInst) .add(&BufferOpLowering::postVisitMemSetInst) .add(&BufferOpLowering::postVisitStoreInst) @@ -273,8 +278,10 @@ BufferOpLowering::DescriptorInfo BufferOpLowering::getDescriptorInfo(Value *desc searchWorklist.push_back(incoming); } else if (auto *select = dyn_cast(current)) { assert(select->getOperandUse(0).get() == select->getCondition()); - if (m_uniformityInfo.isDivergentUse(select->getOperandUse(0))) + if (m_uniformityInfo.isDivergentUse(select->getOperandUse(0))) { di.divergent = true; + LLVM_DEBUG(dbgs() << "Divergent descriptor: " << *select << '\n'); + } if (!di.invariant.has_value() || !di.divergent.has_value()) { searchWorklist.push_back(select->getTrueValue()); @@ -289,6 +296,8 @@ BufferOpLowering::DescriptorInfo BufferOpLowering::getDescriptorInfo(Value *desc if (!di.divergent.has_value()) { // TODO: This would be entirely unnecessary if we had updatable divergence info. di.divergent = !isConstant; + LLVM_DEBUG(dbgs() << (di.divergent.value() ? 
"Divergent" : "Uniform") << " descriptor: " << *current + << '\n'); } } } @@ -319,10 +328,12 @@ BufferOpLowering::DescriptorInfo BufferOpLowering::getDescriptorInfo(Value *desc auto &userDi = m_descriptors[user]; bool propagate = false; if (!userDi.invariant.has_value() && !di.invariant.value_or(true)) { + LLVM_DEBUG(dbgs() << "Variant descriptor: " << *user << '\n'); userDi.invariant = false; propagate = true; } if (!userDi.divergent.has_value() && di.divergent.value_or(false)) { + LLVM_DEBUG(dbgs() << "Divergent descriptor: " << *user << '\n'); userDi.divergent = true; propagate = true; } @@ -334,10 +345,14 @@ BufferOpLowering::DescriptorInfo BufferOpLowering::getDescriptorInfo(Value *desc // At this point, seen values that are not "variant"/"divergent" are known to be "invariant"/"uniform". for (Value *current : seen) { auto &di = m_descriptors[current]; - if (!di.invariant.has_value()) + if (!di.invariant.has_value()) { di.invariant = true; - if (!di.divergent.has_value()) + LLVM_DEBUG(dbgs() << "Invariant descriptor: " << *current << '\n'); + } + if (!di.divergent.has_value()) { di.divergent = false; + LLVM_DEBUG(dbgs() << "Uniform descriptor: " << *current << '\n'); + } } return m_descriptors.find(desc)->second; @@ -653,6 +668,30 @@ void BufferOpLowering::visitBitCastInst(BitCastInst &bitCastInst) { m_typeLowering.replaceInstruction(&bitCastInst, m_typeLowering.getValue(bitCastInst.getOperand(0))); } +// ===================================================================================================================== +// Lower a buffer.addr.to.ptr op, to convert an i64 address to a buffer fat pointer. +void BufferOpLowering::visitBufferAddrToPtr(BufferAddrToPtrOp &op) { + BuilderImpl builder(&m_pipelineState); + builder.setShaderStage(getShaderStage(op.getFunction())); + builder.SetInsertPoint(&op); + + // Extend the i64 address to a <4 x i32> descriptor. + Value *descriptor = builder.buildBufferCompactDesc( + builder.CreateBitCast(op.getAddr(), FixedVectorType::get(builder.getInt32Ty(), 2)), 0); + m_typeLowering.replaceInstruction(&op, {descriptor, ConstantPointerNull::get(m_offsetType)}); + + auto &di = m_descriptors[descriptor]; + +#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 458033 + // Old version of the code + di.divergent = m_uniformityInfo.isDivergent(*descriptor); +#else + // New version of the code (also handles unknown version, which we treat as latest) + di.divergent = m_uniformityInfo.isDivergent(descriptor); +#endif + LLVM_DEBUG(dbgs() << (di.divergent.value() ? "Divergent" : "Uniform") << " descriptor: " << *descriptor << '\n'); +} + // ===================================================================================================================== // Visits "buffer.desc.to.ptr" instruction. // @@ -672,6 +711,7 @@ void BufferOpLowering::visitBufferDescToPtr(BufferDescToPtrOp &descToPtr) { // New version of the code (also handles unknown version, which we treat as latest) di.divergent = m_uniformityInfo.isDivergent(descriptor); #endif + LLVM_DEBUG(dbgs() << (di.divergent.value() ? "Divergent" : "Uniform") << " descriptor: " << *descriptor << '\n'); } // ===================================================================================================================== @@ -694,6 +734,7 @@ void BufferOpLowering::visitStridedBufferDescToPtr(StridedBufferDescToPtrOp &des // New version of the code (also handles unknown version, which we treat as latest) di.divergent = m_uniformityInfo.isDivergent(descriptor); #endif + LLVM_DEBUG(dbgs() << (di.divergent.value() ? 
"Divergent" : "Uniform") << " descriptor: " << *descriptor << '\n'); } // ===================================================================================================================== @@ -889,6 +930,8 @@ void BufferOpLowering::postVisitLoadInst(LoadInst &loadInst) { // Record the load instruction so we remember to delete it later. m_typeLowering.eraseInstruction(&loadInst); + // Replace the mapping. + m_typeLowering.replaceValue(&loadInst, newLoad); loadInst.replaceAllUsesWith(newLoad); } @@ -1296,6 +1339,48 @@ void BufferOpLowering::postVisitMemSetInst(MemSetInst &memSetInst) { m_typeLowering.eraseInstruction(&memSetInst); } +// ===================================================================================================================== +// Visits "load.tfe" instruction. +// +// @param loadTfe : The instruction +void BufferOpLowering::visitLoadTfeOp(LoadTfeOp &loadTfe) { + assert(isAnyBufferPointer(loadTfe.getPointer())); + m_postVisitInsts.push_back(&loadTfe); +} + +// ===================================================================================================================== +// Visits "load.tfe" instruction after the initial pass, when phi nodes have been fixed up and potentially simplified. +// +// @param loadTfe : the instruction +void BufferOpLowering::postVisitLoadTfeOp(LoadTfeOp &loadTfe) { + Value *pointerOperand = loadTfe.getPointer(); + + m_builder.SetInsertPoint(&loadTfe); + auto pointerValues = m_typeLowering.getValue(pointerOperand); + Value *bufferDesc = pointerValues[0]; + Value *const offset = m_builder.CreatePtrToInt(pointerValues[1], m_builder.getInt32Ty()); + Instruction *bufferLoad = nullptr; + + if (pointerOperand->getType()->getPointerAddressSpace() == ADDR_SPACE_BUFFER_FAT_POINTER) { + bufferLoad = m_builder.CreateIntrinsic(Intrinsic::amdgcn_raw_buffer_load, loadTfe.getType(), + {bufferDesc, offset, m_builder.getInt32(0), m_builder.getInt32(0)}); + } else { + bufferLoad = m_builder.CreateIntrinsic( + Intrinsic::amdgcn_struct_buffer_load, loadTfe.getType(), + {bufferDesc, m_builder.getInt32(0), offset, m_builder.getInt32(0), m_builder.getInt32(0)}); + } + if (getDescriptorInfo(bufferDesc).divergent.value()) { + BuilderImpl builderImpl(&m_pipelineState); + bufferLoad = builderImpl.createWaterfallLoop(bufferLoad, 0, false); + } + + // Record the load instruction so we remember to delete it later. + m_typeLowering.eraseInstruction(&loadTfe); + // Replace the mapping. + m_typeLowering.replaceValue(&loadTfe, bufferLoad); + loadTfe.replaceAllUsesWith(bufferLoad); +} + // ===================================================================================================================== // Extract the 64-bit address from a buffer descriptor. 
//
@@ -1522,10 +1607,27 @@ Value *BufferOpLowering::replaceLoadStore(Instruction &inst) {
       coherent.bits.dlc = isDlc;
     }
     if (pointerOperand->getType()->getPointerAddressSpace() == ADDR_SPACE_BUFFER_STRIDED_POINTER) {
-      CallInst *call = m_builder.CreateIntrinsic(
-          Intrinsic::amdgcn_struct_buffer_load, intAccessType,
-          {bufferDesc, pointerValues[2], offsetVal, m_builder.getInt32(0), m_builder.getInt32(coherent.u32All)});
+      Value *indexValue = pointerValues[2];
+      CallInst *call = nullptr;
+      // When the index is a constant and the stride is known at compile time, we can create s_buffer_load
+      // instructions with a constant offset: index * stride + offset
+      if ((isInvariant && accessSize >= 4) && isa<ConstantInt>(indexValue)) {
+        Value *desc1 = m_builder.CreateExtractElement(bufferDesc, 1);
+        // The stride is held in bits 61:48 of the descriptor and is always a constant integer when the descriptor is created
+        Value *stride =
+            m_builder.CreateAnd(m_builder.CreateLShr(desc1, m_builder.getInt32(16)), m_builder.getInt32(0x3fff));
+        Value *indexOffsetVal = m_builder.CreateMul(indexValue, stride);
+        offsetVal = m_builder.CreateAdd(offsetVal, indexOffsetVal);
+        call = m_builder.CreateIntrinsic(Intrinsic::amdgcn_s_buffer_load, intAccessType,
+                                         {bufferDesc, offsetVal, m_builder.getInt32(coherent.u32All)});
+      } else {
+        call = m_builder.CreateIntrinsic(
+            Intrinsic::amdgcn_struct_buffer_load, intAccessType,
+            {bufferDesc, indexValue, offsetVal, m_builder.getInt32(0), m_builder.getInt32(coherent.u32All)});
+      }
       copyMetadata(call, &inst);
+      if (isInvariant)
+        call->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(m_builder.getContext(), {}));
       part = call;
     } else if (isInvariant && accessSize >= 4) {
       CallInst *call = m_builder.CreateIntrinsic(Intrinsic::amdgcn_s_buffer_load, intAccessType,
diff --git a/lgc/patch/PatchCheckShaderCache.cpp b/lgc/patch/PatchCheckShaderCache.cpp
index e500d4b296..c48b08991c 100644
--- a/lgc/patch/PatchCheckShaderCache.cpp
+++ b/lgc/patch/PatchCheckShaderCache.cpp
@@ -110,11 +110,10 @@ PreservedAnalyses PatchCheckShaderCache::run(Module &module, ModuleAnalysisManag
       // locations of generic outputs). We have to add it to shader hash calculation.
       streamMapEntries(resUsage->inOutUsage.gs.builtInOutLocs, stream);
     } else if (stage == ShaderStage::Mesh) {
-      // NOTE: For mesh shader, those four special maps are used to export vertex/primitive attributes.
+      // NOTE: For mesh shader, these two special maps (from built-in IDs to export locations of vertex/primitive
+      // attributes) are used to export vertex/primitive attributes.
       streamMapEntries(resUsage->inOutUsage.mesh.vertexBuiltInExportSlots, stream);
       streamMapEntries(resUsage->inOutUsage.mesh.primitiveBuiltInExportSlots, stream);
-      streamMapEntries(resUsage->inOutUsage.mesh.vertexOutputComponents, stream);
-      streamMapEntries(resUsage->inOutUsage.mesh.primitiveOutputComponents, stream);
     }
 
     // Store the result of the hash for this shader stage.
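Returning to the strided-pointer path in replaceLoadStore above: a small standalone illustration of the stride/offset arithmetic (plain C++, not LGC code; the helper and values below are hypothetical and only model the 14-bit stride field the pass reads):

```cpp
#include <cassert>
#include <cstdint>

// Descriptor bits 61:48 live in bits 29:16 of dword1 (the second 32-bit word of the V#),
// which is what CreateLShr(desc1, 16) followed by CreateAnd(..., 0x3fff) extracts.
static uint32_t extractStride(uint32_t descDword1) {
  return (descDword1 >> 16) & 0x3fffu;
}

int main() {
  // Hypothetical dword1 with a 16-byte stride packed into bits 29:16.
  const uint32_t dword1 = 16u << 16;
  assert(extractStride(dword1) == 16);

  // With a compile-time constant index of 3 and an intra-record offset of 8 bytes,
  // the folded scalar-load offset is index * stride + offset = 3 * 16 + 8 = 56.
  const uint32_t index = 3, offset = 8;
  assert(index * extractStride(dword1) + offset == 56);
  return 0;
}
```

This folding is what allows the struct-buffer index to be absorbed into a single constant s_buffer_load offset when the index is known at compile time.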
diff --git a/lgc/patch/PatchEntryPointMutate.cpp b/lgc/patch/PatchEntryPointMutate.cpp index f03f7bfb93..bfc06831de 100644 --- a/lgc/patch/PatchEntryPointMutate.cpp +++ b/lgc/patch/PatchEntryPointMutate.cpp @@ -83,6 +83,7 @@ using namespace llvm; using namespace lgc; +using namespace cps; // ===================================================================================================================== PatchEntryPointMutate::PatchEntryPointMutate() @@ -168,6 +169,11 @@ PreservedAnalyses PatchEntryPointMutate::run(Module &module, ModuleAnalysisManag // ===================================================================================================================== // Split the input into pieces of i32. +// +// @param layout : Data layout +// @param builder : IR builder +// @param input : A collection of inputs (structures, arrays, vectors, pointers, or basic primitive types) +// @param [out] output : A collection of outputs by flattening the inputs to scalar values static void splitIntoI32(const DataLayout &layout, IRBuilder<> &builder, ArrayRef input, SmallVector &output) { for (auto *x : input) { @@ -217,6 +223,10 @@ static void splitIntoI32(const DataLayout &layout, IRBuilder<> &builder, ArrayRe // ===================================================================================================================== // Merge the input into a single struct type. +// +// @param builder : IR builder +// @param input : An array of inputs to be structure members +// @returns : A structure-typed value with inputs as its members static Value *mergeIntoStruct(IRBuilder<> &builder, ArrayRef input) { SmallVector types; for (auto *v : input) @@ -230,6 +240,10 @@ static Value *mergeIntoStruct(IRBuilder<> &builder, ArrayRef input) { // ===================================================================================================================== // Construct vectors of dword, the input should be i32 type. +// +// @param builder : IR builder +// @param input : An array of i32 scalar inputs +// @returns : An arrayed value of inputs static Value *mergeDwordsIntoVector(IRBuilder<> &builder, ArrayRef input) { unsigned numElem = input.size(); Type *vecTy = FixedVectorType::get(builder.getInt32Ty(), numElem); @@ -241,27 +255,36 @@ static Value *mergeDwordsIntoVector(IRBuilder<> &builder, ArrayRef inpu } // ===================================================================================================================== +// Process LoadDriverTableEntryOp. +// +// @param module : LLVM module void PatchEntryPointMutate::processDriverTableLoad(Module &module) { - SmallVector toBeErased; + SmallVector callsToRemove; + struct Payload { - SmallVectorImpl &toBeErased; + SmallVectorImpl &callsToRemove; PatchEntryPointMutate *self; }; - Payload payload = {toBeErased, this}; + + Payload payload = {callsToRemove, this}; static auto visitor = llvm_dialects::VisitorBuilder() .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) .add([](auto &payload, auto &op) { payload.self->lowerDriverTableLoad(op); - payload.toBeErased.push_back(&op); + payload.callsToRemove.push_back(&op); }) .build(); visitor.visit(payload, module); - for (auto call : payload.toBeErased) + + for (auto call : payload.callsToRemove) call->eraseFromParent(); } // ===================================================================================================================== +// Lower LoadDriverTableEntryOp. 
+// +// @param loadDriverTablePtrOp : Call instruction to load driver table pointer void PatchEntryPointMutate::lowerDriverTableLoad(LoadDriverTableEntryOp &loadDriverTablePtrOp) { BuilderBase builder(&loadDriverTablePtrOp); Function *entryPoint = loadDriverTablePtrOp.getFunction(); @@ -276,29 +299,36 @@ void PatchEntryPointMutate::lowerDriverTableLoad(LoadDriverTableEntryOp &loadDri } // ===================================================================================================================== -// Lower GroupMemcpyOp +// Process GroupMemcpyOp. +// +// @param module : LLVM module void PatchEntryPointMutate::processGroupMemcpy(Module &module) { - SmallVector toBeErased; + SmallVector callsToRemove; + struct Payload { - SmallVectorImpl &tobeErased; + SmallVectorImpl &callsToRemove; PatchEntryPointMutate *self; }; - Payload payload = {toBeErased, this}; + + Payload payload = {callsToRemove, this}; static auto visitor = llvm_dialects::VisitorBuilder() .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) .add([](auto &payload, auto &op) { payload.self->lowerGroupMemcpy(op); - payload.tobeErased.push_back(&op); + payload.callsToRemove.push_back(&op); }) .build(); visitor.visit(payload, module); - for (auto call : payload.tobeErased) + + for (auto call : payload.callsToRemove) call->eraseFromParent(); } // ===================================================================================================================== // Lower GroupMemcpyOp - Copy memory using threads in a workgroup (scope=2) or subgroup (scope=3). +// +// @param groupMemcpyOp : Call instruction to do group memory copy void PatchEntryPointMutate::lowerGroupMemcpy(GroupMemcpyOp &groupMemcpyOp) { BuilderImpl builder(m_pipelineState); Function *entryPoint = groupMemcpyOp.getFunction(); @@ -631,7 +661,7 @@ bool PatchEntryPointMutate::lowerCpsOps(Function *func, ShaderInputs *shaderInpu unsigned vcrIndexInVgpr = haveLocalInvocationId ? 
1 : 0; auto *vcr = builder.CreateExtractValue(vgprArg, vcrIndexInVgpr); auto *vcrTy = vcr->getType(); - + Value *pendingBallot = nullptr; if (isCpsFunc) { auto *vcrShaderArg = func->getArg(numShaderArg); // When we are working with LLVM version without the llvm.amdgcn.set.inactive.chain.arg, we cannot simply declare @@ -642,11 +672,30 @@ bool PatchEntryPointMutate::lowerCpsOps(Function *func, ShaderInputs *shaderInpu vcr = builder.CreateIntrinsic(vcrTy, m_setInactiveChainArgId, {vcr, vcrShaderArg}); else vcr = builder.CreateIntrinsic(vcrTy, Intrinsic::amdgcn_set_inactive, {vcr, vcrShaderArg}); + + auto level = builder.CreateAnd(vcr, builder.getInt32(0x7)); + auto funcLevel = static_cast(cps::getCpsLevelFromFunction(*func)); + static const std::vector priorities[] = { + // RayGen: Continue with RayGen or hit shaders + {CpsLevel::Traversal, CpsLevel::ClosestHit_Miss_Callable, CpsLevel::RayGen}, + // ClosestHit_Miss_Callable: Continue with hit shaders, then resume RayGen + {CpsLevel::Traversal, CpsLevel::RayGen, CpsLevel::ClosestHit_Miss_Callable}, + // Traversal: Call Intersection or AnyHit, then call hit shaders or continue with RayGen + // Traversal can continue with traversal when it wants to wait, so try that last + {CpsLevel::Traversal, CpsLevel::RayGen, CpsLevel::ClosestHit_Miss_Callable, + CpsLevel::AnyHit_CombinedIntersection_AnyHit, CpsLevel::Intersection}, + // AnyHit_CombinedIntersection_AnyHit: Continue with AnyHit, then resume Traversal + {CpsLevel::Traversal, CpsLevel::Intersection, CpsLevel::AnyHit_CombinedIntersection_AnyHit}, + // Intersection: Continue with Intersection, then resume Traversal + {CpsLevel::Traversal, CpsLevel::AnyHit_CombinedIntersection_AnyHit, CpsLevel::Intersection}}; + // Get non-zero level execution Mask + pendingBallot = takeLevel(level, builder, waveMaskTy, priorities[funcLevel - 1]); + } else { + // Find first lane having non-null vcr, and use as next jump target. + auto *vcrMask = builder.CreateICmpNE(vcr, builder.getInt32(0)); + pendingBallot = builder.CreateIntrinsic(Intrinsic::amdgcn_ballot, waveMaskTy, vcrMask); } - // Find first lane having non-null vcr, and use as next jump target. - auto *vcrMask = builder.CreateICmpNE(vcr, builder.getInt32(0)); - auto *pendingBallot = builder.CreateIntrinsic(Intrinsic::amdgcn_ballot, waveMaskTy, vcrMask); Value *firstActive = builder.CreateIntrinsic(Intrinsic::cttz, waveMaskTy, {pendingBallot, builder.getTrue()}); if (!waveMaskTy->isIntegerTy(32)) firstActive = builder.CreateTrunc(firstActive, builder.getInt32Ty()); @@ -830,6 +879,28 @@ Function *PatchEntryPointMutate::lowerCpsFunction(Function *func, ArrayRef &builder, Type *waveMaskTy, + ArrayRef priorties) { + auto levelMask = builder.CreateICmpNE(level, builder.getInt32(0)); + Value *levelBallot = builder.CreateIntrinsic(Intrinsic::amdgcn_ballot, waveMaskTy, levelMask); + Value *cond = nullptr; + + for (auto cpsLevel : priorties) { + auto lvMask = builder.CreateICmpEQ(level, builder.getInt32(static_cast(cpsLevel))); + Value *lvBallot = builder.CreateIntrinsic(Intrinsic::amdgcn_ballot, waveMaskTy, lvMask); + cond = builder.CreateICmpNE(lvBallot, builder.getInt32(0)); + levelBallot = builder.CreateSelect(cond, lvBallot, levelBallot); + } + return levelBallot; +} + // ===================================================================================================================== // Lower cps.jump, fill cps exit information and branch to tailBlock. Return the state size. // This assume the arguments of the parent function are setup correctly. 
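A host-side sketch of the selection that takeLevel() builds in IR above, assuming the same semantics: the priority list is ordered lowest first, so the ballot of the last listed level that any lane wants to run wins, and if none of the listed levels is pending we fall back to the ballot of all lanes with a non-zero level. The level numbering below is a stand-in, not the real CpsLevel values:

```cpp
#include <cstdint>
#include <vector>

// One bit per lane, set when that lane's level matches `wanted` (or is simply non-zero
// when wanted == 0). This mirrors amdgcn.ballot over an icmp.
static uint64_t ballot(const std::vector<unsigned> &laneLevels, unsigned wanted) {
  uint64_t mask = 0;
  for (size_t lane = 0; lane < laneLevels.size(); ++lane) {
    const bool match = wanted == 0 ? laneLevels[lane] != 0 : laneLevels[lane] == wanted;
    if (match)
      mask |= uint64_t(1) << lane;
  }
  return mask;
}

// Same shape as takeLevel(): start from "any non-zero level", then let each listed level
// overwrite the result if it has at least one lane, so the last non-empty entry wins.
static uint64_t takeLevelScalar(const std::vector<unsigned> &laneLevels, const std::vector<unsigned> &priorities) {
  uint64_t chosen = ballot(laneLevels, 0);
  for (unsigned level : priorities) {
    const uint64_t levelBallot = ballot(laneLevels, level);
    if (levelBallot != 0)
      chosen = levelBallot;
  }
  return chosen;
}

int main() {
  // Stand-in levels: 1 = RayGen, 5 = Traversal (hypothetical numbering).
  const std::vector<unsigned> laneLevels = {5, 1, 0, 5};
  // RayGen-style priority list, lowest priority first: prefer RayGen over Traversal.
  const uint64_t chosen = takeLevelScalar(laneLevels, {5, 1});
  return chosen == 0b0010 ? 0 : 1; // only lane 1 is at the preferred level
}
```

With this ordering, Traversal sits first in every priority list, so it is only resumed when none of the higher-priority levels has pending lanes, matching the "try that last" note in the table above.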
@@ -1758,6 +1829,12 @@ void PatchEntryPointMutate::addSpecialUserDataArgs(SmallVectorImpl specialUserDataArgs.push_back(UserDataArg(builder.getInt32Ty(), "drawIndex", UserDataMapping::DrawIndex)); } + if ((m_pipelineState->getTargetInfo().getGfxIpVersion().major >= 11) && !m_hasGs && !m_hasTs && + m_pipelineState->enableXfb() && + (m_pipelineState->getOptions().dynamicTopology || m_pipelineState->isUnlinked())) { + specialUserDataArgs.push_back(UserDataArg(builder.getInt32Ty(), "compositeData", UserDataMapping::CompositeData, + &intfData->entryArgIdxs.vs.compositeData)); + } } else if (m_shaderStage == ShaderStage::Compute) { // Pass the gl_NumWorkgroups pointer in user data registers. // Always enable this, even if unused, if compute library is in use. @@ -1819,16 +1896,12 @@ void PatchEntryPointMutate::addSpecialUserDataArgs(SmallVectorImpl UserDataArg(builder.getInt32Ty(), "colorExpAddr", UserDataMapping::ColorExportAddr)); } - if (m_pipelineState->getShaderResourceUsage(ShaderStage::Fragment)->builtInUsage.fs.runAtSampleRate && - (m_pipelineState->isUnlinked() || m_pipelineState->getRasterizerState().dynamicSampleInfo)) { - specialUserDataArgs.push_back(UserDataArg(builder.getInt32Ty(), "sampleInfo", UserDataMapping::SampleInfo, - &intfData->entryArgIdxs.fs.sampleInfo)); - } - - if (userDataUsage->isSpecialUserDataUsed(UserDataMapping::DynamicDualSrcBlendInfo)) { - specialUserDataArgs.push_back(UserDataArg(builder.getInt32Ty(), "dualSourceBlendUpdateInfo", - UserDataMapping::DynamicDualSrcBlendInfo, - &intfData->entryArgIdxs.fs.dynamicDualSrcBlendInfo)); + bool useDynamicSampleInfo = + m_pipelineState->getShaderResourceUsage(ShaderStage::Fragment)->builtInUsage.fs.runAtSampleRate && + (m_pipelineState->isUnlinked() || m_pipelineState->getRasterizerState().dynamicSampleInfo); + if (userDataUsage->isSpecialUserDataUsed(UserDataMapping::CompositeData) || useDynamicSampleInfo) { + specialUserDataArgs.push_back(UserDataArg(builder.getInt32Ty(), "compositeData", UserDataMapping::CompositeData, + &intfData->entryArgIdxs.fs.compositeData)); } } @@ -1920,8 +1993,10 @@ void PatchEntryPointMutate::finalizeUserDataArgs(SmallVectorImpl &u : m_pipelineState->getTargetInfo().getGpuProperty().maxUserDataCount; // FIXME Restricting user data as the backend does not support more sgprs as arguments - if (m_computeWithCalls && userDataAvailable > 16) - userDataAvailable = 16; + unsigned maxCsUserDataCount = InterfaceData::MaxCsUserDataCount; + + if (m_computeWithCalls) + userDataAvailable = std::min(userDataAvailable, maxCsUserDataCount); for (const auto &userDataArg : specialUserDataArgs) userDataAvailable -= userDataArg.argDwordSize; @@ -2020,11 +2095,13 @@ void PatchEntryPointMutate::finalizeUserDataArgs(SmallVectorImpl &u // Add the special args and the spill table pointer (if any). // (specialUserDataArgs is empty for compute, and thus for compute-with-calls.) - userDataArgs.insert(userDataArgs.end(), specialUserDataArgs.begin(), specialUserDataArgs.end()); if (spill) { userDataArgs.emplace_back(builder.getInt32Ty(), "spillTable", UserDataMapping::SpillTable, &userDataUsage->spillTableEntryArgIdx); } + // Make sure the special user data is placed after generic user data because the special user data + // of shader debug address must be in the tail of all user data. 
+ userDataArgs.insert(userDataArgs.end(), specialUserDataArgs.begin(), specialUserDataArgs.end()); } // ===================================================================================================================== diff --git a/lgc/patch/PatchInOutImportExport.cpp b/lgc/patch/PatchInOutImportExport.cpp index 3fbfda80e3..fb16894137 100644 --- a/lgc/patch/PatchInOutImportExport.cpp +++ b/lgc/patch/PatchInOutImportExport.cpp @@ -1254,6 +1254,14 @@ void PatchInOutImportExport::visitReturnInst(ReturnInst &retInst) { builder.CreateIntrinsic(builder.getVoidTy(), Intrinsic::amdgcn_exp, args); } + // NOTE: In such case, last shader in the pre-rasterization doesn't export layer while fragment shader expects to + // read it. Should export 0 to fragment shader, which is required by the spec. + if (!useLayer && nextStage == ShaderStage::Fragment && nextBuiltInUsage.layer) { + assert(m_layer == nullptr); + m_layer = builder.getInt32(0); + useLayer = true; + } + // Export gl_ClipDistance[] and gl_CullDistance[] before entry-point returns if (clipDistanceCount > 0 || cullDistanceCount > 0) { assert(clipDistanceCount + cullDistanceCount <= MaxClipCullDistanceCount); @@ -1669,13 +1677,21 @@ Value *PatchInOutImportExport::performFsHalfInterpolation(BuilderBase &builder, Value *param = builder.CreateNamedCall("llvm.amdgcn.lds.param.load", builder.getFloatTy(), {channel, attr, primMask}, attribs); - // tmp = llvm.amdgcn.interp.inreg.p10.f16(p10, coordI, p0, highHalf) - result = builder.CreateNamedCall("llvm.amdgcn.interp.inreg.p10.f16", builder.getFloatTy(), - {param, coordI, param, highHalf}, attribs); - - // llvm.amdgcn.interp.inreg.p2.f16(p20, coordJ, tmp, highHalf) - result = builder.CreateNamedCall("llvm.amdgcn.interp.inreg.p2.f16", builder.getHalfTy(), - {param, coordJ, result, highHalf}, attribs); +#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 494282 + // Old version of code + auto interpP10Intrinsic = Intrinsic::amdgcn_interp_inreg_p10_f16; + auto interpP2Intrinsic = Intrinsic::amdgcn_interp_inreg_p2_f16; +#else + // New version of the code (also handles unknown version, which we treat as + // latest) + auto interpP10Intrinsic = Intrinsic::amdgcn_interp_p10_rtz_f16; + auto interpP2Intrinsic = Intrinsic::amdgcn_interp_p2_rtz_f16; +#endif + // tmp = interp.p10(p10, coordI, p0, highHalf) + result = builder.CreateIntrinsic(builder.getFloatTy(), interpP10Intrinsic, {param, coordI, param, highHalf}); + + // interp.p2(p20, coordJ, tmp, highHalf) + result = builder.CreateIntrinsic(builder.getHalfTy(), interpP2Intrinsic, {param, coordJ, result, highHalf}); } else { // llvm.amdgcn.interp.p1.f16(coordI, attr_channel, attr, highhalf, m0) result = builder.CreateNamedCall("llvm.amdgcn.interp.p1.f16", builder.getFloatTy(), @@ -2062,10 +2078,21 @@ void PatchInOutImportExport::patchGsGenericOutputExport(Value *output, unsigned void PatchInOutImportExport::patchMeshGenericOutputExport(Value *output, unsigned location, Value *locOffset, Value *compIdx, Value *vertexOrPrimitiveIdx, bool isPerPrimitive, BuilderBase &builder) { - if (output->getType()->getScalarSizeInBits() == 64) + // outputOffset = (location + locOffset) * 4 + compIdx * (bitWidth == 64 ? 
2 : 1) + Value *outputOffset = builder.CreateAdd(builder.getInt32(location), locOffset); + outputOffset = builder.CreateShl(outputOffset, 2); + + auto outputTy = output->getType(); + if (outputTy->getScalarSizeInBits() == 64) { compIdx = builder.CreateShl(compIdx, 1); + } + + outputOffset = builder.CreateAdd(outputOffset, compIdx); - builder.create(isPerPrimitive, location, locOffset, compIdx, vertexOrPrimitiveIdx, output); + if (isPerPrimitive) + builder.create(outputOffset, vertexOrPrimitiveIdx, output); + else + builder.create(outputOffset, vertexOrPrimitiveIdx, output); } // ===================================================================================================================== @@ -2637,10 +2664,10 @@ Value *PatchInOutImportExport::patchFsBuiltInInputImport(Type *inputTy, unsigned // Handle internal-use built-ins for sample position emulation case BuiltInNumSamples: { if (m_pipelineState->isUnlinked() || m_pipelineState->getRasterizerState().dynamicSampleInfo) { - assert(entryArgIdxs.sampleInfo != 0); - auto sampleInfo = getFunctionArgument(m_entryPoint, entryArgIdxs.sampleInfo); + assert(entryArgIdxs.compositeData != 0); + auto sampleInfo = getFunctionArgument(m_entryPoint, entryArgIdxs.compositeData); input = builder.CreateIntrinsic(Intrinsic::amdgcn_ubfe, builder.getInt32Ty(), - {sampleInfo, builder.getInt32(0), builder.getInt32(16)}); + {sampleInfo, builder.getInt32(2), builder.getInt32(5)}); } else { input = builder.getInt32(m_pipelineState->getRasterizerState().numSamples); } @@ -2648,10 +2675,13 @@ Value *PatchInOutImportExport::patchFsBuiltInInputImport(Type *inputTy, unsigned } case BuiltInSamplePatternIdx: { if (m_pipelineState->isUnlinked() || m_pipelineState->getRasterizerState().dynamicSampleInfo) { - assert(entryArgIdxs.sampleInfo != 0); - auto sampleInfo = getFunctionArgument(m_entryPoint, entryArgIdxs.sampleInfo); - input = builder.CreateIntrinsic(Intrinsic::amdgcn_ubfe, builder.getInt32Ty(), - {sampleInfo, builder.getInt32(16), builder.getInt32(16)}); + assert(entryArgIdxs.compositeData != 0); + auto sampleInfo = getFunctionArgument(m_entryPoint, entryArgIdxs.compositeData); + Value *numSamples = builder.CreateIntrinsic(Intrinsic::amdgcn_ubfe, builder.getInt32Ty(), + {sampleInfo, builder.getInt32(2), builder.getInt32(5)}); + numSamples = builder.CreateBinaryIntrinsic(Intrinsic::cttz, numSamples, builder.getTrue()); + input = builder.CreateMul( + numSamples, builder.getInt32(m_pipelineState->getTargetInfo().getGpuProperty().maxMsaaRasterizerSamples)); } else { input = builder.getInt32(m_pipelineState->getRasterizerState().samplePatternIdx); } @@ -3388,10 +3418,15 @@ void PatchInOutImportExport::patchMeshBuiltInOutputExport(Value *output, unsigne (void(builtInUsage)); // Unused - if (!elemIdx) - elemIdx = builder.getInt32(0); + // outputOffset = location * 4 + elemIdx + Value *outputOffset = builder.getInt32(4 * loc); + if (elemIdx) + outputOffset = builder.CreateAdd(builder.getInt32(4 * loc), elemIdx); - builder.create(isPerPrimitive, loc, builder.getInt32(0), elemIdx, vertexOrPrimitiveIdx, output); + if (isPerPrimitive) + builder.create(outputOffset, vertexOrPrimitiveIdx, output); + else + builder.create(outputOffset, vertexOrPrimitiveIdx, output); } // ===================================================================================================================== diff --git a/lgc/patch/PatchResourceCollect.cpp b/lgc/patch/PatchResourceCollect.cpp index 55f1c0f3c7..d9a091bda7 100644 --- a/lgc/patch/PatchResourceCollect.cpp +++ 
b/lgc/patch/PatchResourceCollect.cpp @@ -339,6 +339,9 @@ bool PatchResourceCollect::canUseNggCulling(Module *module) { if (tessMode.pointMode || tessMode.primitiveMode == PrimitiveMode::Isolines) return false; } else { + // Primitive topology is unknown, disable NGG culling. + if (m_pipelineState->getOptions().dynamicTopology || m_pipelineState->isUnlinked()) + return false; // Check primitive type specified in pipeline state if (primType < PrimitiveType::TriangleList) return false; @@ -583,8 +586,7 @@ bool PatchResourceCollect::checkGsOnChipValidity() { bool enableMaxVertOut = false; if (hasGs) { - unsigned maxVertOut = geometryMode.outputVertices; - assert(maxVertOut >= primAmpFactor); + unsigned maxVertOut = std::max(geometryMode.outputVertices, primAmpFactor); assert(gsInstanceCount >= 1); // Each input GS primitive can generate at most maxVertOut vertices. Each output vertex will be emitted @@ -971,8 +973,8 @@ bool PatchResourceCollect::checkGsOnChipValidity() { LLPC_OUTS("\n"); LLPC_OUTS("GS is on-chip (Mesh)\n"); } else if (m_pipelineState->getNggControl()->enableNgg) { - LLPC_OUTS("GS primitive amplification factor: " << gsResUsage->inOutUsage.gs.calcFactor.primAmpFactor << "\n"); - LLPC_OUTS("GS enable max output vertices per instance: " + LLPC_OUTS("GS primitive amplifier: " << gsResUsage->inOutUsage.gs.calcFactor.primAmpFactor << "\n"); + LLPC_OUTS("GS enable max output vertices: " << (gsResUsage->inOutUsage.gs.calcFactor.enableMaxVertOut ? "true" : "false") << "\n"); LLPC_OUTS("\n"); LLPC_OUTS("GS is on-chip (NGG)\n"); @@ -2318,62 +2320,36 @@ void PatchResourceCollect::mapBuiltInToGenericInOut() { unsigned availPerPrimitiveOutMapLoc = inOutUsage.perPrimitiveOutputMapLocCount; // Map per-vertex built-in outputs to generic ones - if (builtInUsage.mesh.position) { - inOutUsage.builtInOutputLocMap[BuiltInPosition] = availOutMapLoc; - inOutUsage.mesh.vertexOutputComponents[availOutMapLoc] = {4, BuiltInPosition}; // vec4 - ++availOutMapLoc; - } + if (builtInUsage.mesh.position) + inOutUsage.builtInOutputLocMap[BuiltInPosition] = availOutMapLoc++; - if (builtInUsage.mesh.pointSize) { - inOutUsage.builtInOutputLocMap[BuiltInPointSize] = availOutMapLoc; - inOutUsage.mesh.vertexOutputComponents[availOutMapLoc] = {1, BuiltInPointSize}; // float - ++availOutMapLoc; - } + if (builtInUsage.mesh.pointSize) + inOutUsage.builtInOutputLocMap[BuiltInPointSize] = availOutMapLoc++; if (builtInUsage.mesh.clipDistance > 0) { - inOutUsage.builtInOutputLocMap[BuiltInClipDistance] = availOutMapLoc; - inOutUsage.mesh.vertexOutputComponents[availOutMapLoc] = {static_cast(builtInUsage.mesh.clipDistance), - BuiltInClipDistance}; // float[] - ++availOutMapLoc; - + inOutUsage.builtInOutputLocMap[BuiltInClipDistance] = availOutMapLoc++; if (builtInUsage.mesh.clipDistance > 4) ++availOutMapLoc; } if (builtInUsage.mesh.cullDistance > 0) { - inOutUsage.builtInOutputLocMap[BuiltInCullDistance] = availOutMapLoc; - inOutUsage.mesh.vertexOutputComponents[availOutMapLoc] = {static_cast(builtInUsage.mesh.cullDistance), - BuiltInCullDistance}; // float[] - ++availOutMapLoc; - + inOutUsage.builtInOutputLocMap[BuiltInCullDistance] = availOutMapLoc++; if (builtInUsage.mesh.cullDistance > 4) ++availOutMapLoc; } // Map per-primitive built-in outputs to generic ones - if (builtInUsage.mesh.primitiveId) { - inOutUsage.perPrimitiveBuiltInOutputLocMap[BuiltInPrimitiveId] = availPerPrimitiveOutMapLoc; - inOutUsage.mesh.primitiveOutputComponents[availPerPrimitiveOutMapLoc] = {1, BuiltInPrimitiveId}; // int - 
++availPerPrimitiveOutMapLoc; - } + if (builtInUsage.mesh.primitiveId) + inOutUsage.perPrimitiveBuiltInOutputLocMap[BuiltInPrimitiveId] = availPerPrimitiveOutMapLoc++; - if (builtInUsage.mesh.viewportIndex) { - inOutUsage.perPrimitiveBuiltInOutputLocMap[BuiltInViewportIndex] = availPerPrimitiveOutMapLoc; - inOutUsage.mesh.primitiveOutputComponents[availPerPrimitiveOutMapLoc] = {1, BuiltInViewportIndex}; // int - ++availPerPrimitiveOutMapLoc; - } + if (builtInUsage.mesh.viewportIndex) + inOutUsage.perPrimitiveBuiltInOutputLocMap[BuiltInViewportIndex] = availPerPrimitiveOutMapLoc++; - if (builtInUsage.mesh.layer) { - inOutUsage.perPrimitiveBuiltInOutputLocMap[BuiltInLayer] = availPerPrimitiveOutMapLoc; - inOutUsage.mesh.primitiveOutputComponents[availPerPrimitiveOutMapLoc] = {1, BuiltInLayer}; // int - ++availPerPrimitiveOutMapLoc; - } + if (builtInUsage.mesh.layer) + inOutUsage.perPrimitiveBuiltInOutputLocMap[BuiltInLayer] = availPerPrimitiveOutMapLoc++; - if (builtInUsage.mesh.primitiveShadingRate) { - inOutUsage.perPrimitiveBuiltInOutputLocMap[BuiltInPrimitiveShadingRate] = availPerPrimitiveOutMapLoc; - inOutUsage.mesh.primitiveOutputComponents[availPerPrimitiveOutMapLoc] = {1, BuiltInPrimitiveShadingRate}; // int - ++availPerPrimitiveOutMapLoc; - } + if (builtInUsage.mesh.primitiveShadingRate) + inOutUsage.perPrimitiveBuiltInOutputLocMap[BuiltInPrimitiveShadingRate] = availPerPrimitiveOutMapLoc++; // Map per-vertex built-in outputs to exported locations if (nextStage == ShaderStage::Fragment) { @@ -2454,6 +2430,9 @@ void PatchResourceCollect::mapBuiltInToGenericInOut() { inOutUsage.mesh.primitiveBuiltInExportSlots[BuiltInViewportIndex] = availPerPrimitiveExportLoc++; } + inOutUsage.mesh.vertexGenericOutputExportCount = inOutUsage.outputMapLocCount; + inOutUsage.mesh.primitiveGenericOutputExportCount = inOutUsage.perPrimitiveOutputMapLocCount; + inOutUsage.outputMapLocCount = std::max(inOutUsage.outputMapLocCount, availOutMapLoc); inOutUsage.perPrimitiveOutputMapLocCount = std::max(inOutUsage.perPrimitiveOutputMapLocCount, availPerPrimitiveOutMapLoc); @@ -3011,25 +2990,6 @@ void PatchResourceCollect::updateOutputLocInfoMapWithUnpack() { if (m_shaderStage == ShaderStage::Geometry) inOutUsage.gs.outLocCount[streamId] = std::max(inOutUsage.gs.outLocCount[streamId], newLocMappedTo + 1); } - - // After location mapping is done, we update the location/components map of mesh shader vertex outputs with new - // locations. - if (m_shaderStage == ShaderStage::Mesh) { - // Make a copy and clear the old map - auto vertexOutputComponents = inOutUsage.mesh.vertexOutputComponents; - inOutUsage.mesh.vertexOutputComponents.clear(); - - // Setup a new map with new locations - for (auto &locInfoPair : outputLocInfoMap) { - const unsigned location = locInfoPair.first.getLocation(); - const unsigned newLocation = locInfoPair.second.getLocation(); - - if (vertexOutputComponents.count(location) == 0) - continue; // Skip if not found - - inOutUsage.mesh.vertexOutputComponents[newLocation] = vertexOutputComponents[location]; - } - } } // @@ -3122,25 +3082,6 @@ void PatchResourceCollect::updateOutputLocInfoMapWithUnpack() { assert(newLocMappedTo != InvalidValue); locPair.second = newLocMappedTo; } - - // After location mapping is done, we update the location/components map of mesh shader primitive outputs with - // new locations. 
- if (m_shaderStage == ShaderStage::Mesh) { - // Make a copy and clear the old map - auto primitiveOutputComponents = inOutUsage.mesh.primitiveOutputComponents; - inOutUsage.mesh.primitiveOutputComponents.clear(); - - // Setup a new map with new locations - for (auto &locPair : perPrimitiveOutputLocMap) { - const unsigned location = locPair.first; - const unsigned newLocation = locPair.second; - - if (primitiveOutputComponents.count(location) == 0) - continue; // Skip if not found - - inOutUsage.mesh.primitiveOutputComponents[newLocation] = primitiveOutputComponents[location]; - } - } } m_outputCalls.clear(); diff --git a/lgc/patch/ShaderInputs.cpp b/lgc/patch/ShaderInputs.cpp index 126972bfb2..5895a4af38 100644 --- a/lgc/patch/ShaderInputs.cpp +++ b/lgc/patch/ShaderInputs.cpp @@ -75,8 +75,8 @@ const char *ShaderInputs::getSpecialUserDataName(UserDataMapping kind) { return "StreamOutControlBuf"; case UserDataMapping::ColorExportAddr: return "ColorExportAddr"; - case UserDataMapping::DynamicDualSrcBlendInfo: - return "DualSourceBlendUpdateInfo"; + case UserDataMapping::CompositeData: + return "CompositeData"; default: return ""; } diff --git a/lgc/patch/VertexFetch.cpp b/lgc/patch/VertexFetch.cpp index 5587c8f52e..c8cf1f7e44 100644 --- a/lgc/patch/VertexFetch.cpp +++ b/lgc/patch/VertexFetch.cpp @@ -1786,7 +1786,7 @@ std::pair VertexFetchImpl::convertSrdToOffsetMode(Value *vbDes // Stride is from the third DWORD. auto srdStride = builder.CreateExtractElement(vbDesc, 3); - SqBufRsrcWord3 sqBufRsrcWord3; + SqBufRsrcWord3 sqBufRsrcWord3 = {}; sqBufRsrcWord3.bits.dstSelX = BUF_DST_SEL_X; sqBufRsrcWord3.bits.dstSelY = BUF_DST_SEL_Y; sqBufRsrcWord3.bits.dstSelZ = BUF_DST_SEL_Z; diff --git a/lgc/state/Compiler.cpp b/lgc/state/Compiler.cpp index bc123a1914..0c627f82f3 100644 --- a/lgc/state/Compiler.cpp +++ b/lgc/state/Compiler.cpp @@ -63,9 +63,11 @@ ElfLinker *createElfLinkerImpl(PipelineState *pipelineState, llvm::ArrayRef Pipeline::getShaderStage(llvm::Function *func) { +std::optional Pipeline::getShaderStage(GlobalObject *func) { return lgc::getShaderStage(func); } +// ===================================================================================================================== +// Set a function's shader subtype. Only has an effect on a compute shader or non-shader export function, +// where it causes the .shader_subtype PAL metadata item to be set to the arbitrary string given here. +void Pipeline::setShaderSubtype(GlobalObject *func, StringRef subtype) { + lgc::setShaderSubtype(func, subtype); +} + // ===================================================================================================================== // Find the shader entry-point from shader module, and set pipeline stage. // // @param module : Shader module to attach -void PipelineState::attachModule(llvm::Module *module) { +void PipelineState::attachModule(llvm::Module *module, PipelineLink pipelineLink) { if (!module) return; + m_pipelineLink = pipelineLink; // Find the shader entry-point (marked with irLink()), and get the shader stage from that. std::optional stage; @@ -136,7 +148,7 @@ std::unique_ptr PipelineState::irLink(MutableArrayRefisGraphics()) finalizeRegisterSettings(isWholePipeline); - // Set pipeline hash. + // Set pipeline hash and resource hash. 
auto pipelineHashNode = m_pipelineNode[Util::Abi::PipelineMetadataKey::InternalPipelineHash].getArray(true); const auto &options = m_pipelineState->getOptions(); pipelineHashNode[0] = options.hash[0]; pipelineHashNode[1] = options.hash[1]; + if (options.resourceHash != 0) + m_pipelineNode[Util::Abi::PipelineMetadataKey::ResourceHash] = options.resourceHash; // The rest of this function is used only for whole pipeline PAL metadata or an ELF link. if (!isWholePipeline) diff --git a/lgc/state/PipelineState.cpp b/lgc/state/PipelineState.cpp index 90148161f7..5734ce2b56 100644 --- a/lgc/state/PipelineState.cpp +++ b/lgc/state/PipelineState.cpp @@ -1542,8 +1542,11 @@ void PipelineState::setShaderDefaultWaveSize(ShaderStageEnum stage) { checkingStage = hasShaderStage(ShaderStage::TessEval) ? ShaderStage::TessEval : ShaderStage::Vertex; } - if (checkingStage == ShaderStage::Compute) - m_waveSize[checkingStage] = m_shaderModes.getComputeShaderMode().subgroupSize; + if (checkingStage == ShaderStage::Compute) { + const unsigned subgroupSize = m_shaderModes.getComputeShaderMode().subgroupSize; + m_waveSize[checkingStage] = subgroupSize; + m_subgroupSize[checkingStage] = subgroupSize; + } if (!m_waveSize[checkingStage]) { unsigned waveSize = getTargetInfo().getGpuProperty().waveSize; @@ -1686,7 +1689,7 @@ bool PipelineState::enableSwXfb() { lastVertexStage = lastVertexStage == ShaderStage::CopyShader ? ShaderStage::Geometry : lastVertexStage; if (lastVertexStage == ShaderStage::Invalid) { - assert(isUnlinked()); // Unlinked pipeline only having fragment shader. + assert(isUnlinked()); // Unlinked fragment shader or part-pipeline return false; } diff --git a/lgc/state/ShaderStage.cpp b/lgc/state/ShaderStage.cpp index 73c34b9f0b..c7eb5d2f75 100644 --- a/lgc/state/ShaderStage.cpp +++ b/lgc/state/ShaderStage.cpp @@ -43,6 +43,7 @@ using namespace llvm; // Named metadata node used on a function to show what shader stage it is part of namespace { const static char ShaderStageMetadata[] = "lgc.shaderstage"; +const static char ShaderSubtypeMetadata[] = "lgc.shadersubtype"; } // anonymous namespace // ===================================================================================================================== @@ -70,9 +71,11 @@ void lgc::setShaderStage(Module *module, std::optional stage) { // ===================================================================================================================== // Set shader stage metadata on a function // -// @param [in/out] func : Function to set shader stage on +// @param [in/out] func : Function to mark. This can instead be a GlobalVariable; that functionality is not used +// by LGC, but can be used by a front-end that uses a GlobalVariable to represent a +// part-pipeline retrieved from the cache, and wants to mark it with a shader stage // @param stage : Shader stage to set or ShaderStage::Invalid -void lgc::setShaderStage(Function *func, std::optional stage) { +void lgc::setShaderStage(GlobalObject *func, std::optional stage) { unsigned mdKindId = func->getContext().getMDKindID(ShaderStageMetadata); if (stage) { auto stageMetaNode = @@ -86,8 +89,10 @@ void lgc::setShaderStage(Function *func, std::optional stage) { // ===================================================================================================================== // Gets the shader stage from the specified LLVM function. Returns ShaderStage::Invalid if metadata not found. 
// -// @param func : LLVM function -std::optional lgc::getShaderStage(const Function *func) { +// @param func : LLVM function. This can instead be a GlobalVariable; that functionality is not used by LGC, +// but can be used by a front-end that uses a GlobalVariable to represent a part-pipeline retrieved +// from the cache, and wants to mark it with a shader stage +std::optional lgc::getShaderStage(const GlobalObject *func) { // Check for the metadata that is added by PipelineState::link. MDNode *stageMetaNode = func->getMetadata(ShaderStageMetadata); if (stageMetaNode) @@ -95,6 +100,28 @@ std::optional lgc::getShaderStage(const Function *func) { return std::nullopt; } +// ===================================================================================================================== +// Set a function's shader subtype. Only has an effect on a compute shader or non-shader export function, +// where it causes the .shader_subtype PAL metadata item to be set to the arbitrary string given here. +void lgc::setShaderSubtype(GlobalObject *func, StringRef subtype) { + unsigned mdKindId = func->getContext().getMDKindID(ShaderSubtypeMetadata); + if (!subtype.empty()) { + auto node = MDNode::get(func->getContext(), MDString::get(func->getContext(), subtype)); + func->setMetadata(mdKindId, node); + } else + func->eraseMetadata(mdKindId); +} + +// ===================================================================================================================== +// Get a function's shader subtype, or "" if none. +llvm::StringRef lgc::getShaderSubtype(GlobalObject *func) { + MDNode *node = func->getMetadata(ShaderSubtypeMetadata); + if (!node) + return ""; + MDString *stringNode = cast(node->getOperand(0)); + return stringNode->getString(); +} + // ===================================================================================================================== // Determine whether the function is a shader entry-point. // A shader entry-point is marked DLLExportStorageClass by markShaderEntryPoint() in Compiler.cpp, which the front-end @@ -166,7 +193,7 @@ Function *lgc::addFunctionArgs(Function *oldFunc, Type *retTy, ArrayRef block->insertInto(newFunc); } - // Copy attributes and shader stage from the old function. The new arguments have InReg set iff the corresponding + // Copy attributes from the old function. The new arguments have InReg set iff the corresponding // bit is set in inRegMask. AttributeList oldAttrList = oldFunc->getAttributes(); SmallVector argAttrs; @@ -192,8 +219,9 @@ Function *lgc::addFunctionArgs(Function *oldFunc, Type *retTy, ArrayRef newFunc->setAttributes( AttributeList::get(oldFunc->getContext(), oldAttrList.getFnAttrs(), oldAttrList.getRetAttrs(), argAttrs)); - // Set the shader stage on the new function (implemented with IR metadata). + // Set the shader stage and shader subtype on the new function (implemented with IR metadata). setShaderStage(newFunc, getShaderStage(oldFunc)); + setShaderSubtype(newFunc, getShaderSubtype(oldFunc)); // Replace uses of the old args. // Set inreg attributes correctly. 
We have to use removeAttr because arg attributes are actually attached diff --git a/lgc/state/TargetInfo.cpp b/lgc/state/TargetInfo.cpp index c2b2d32604..56b46f36ff 100644 --- a/lgc/state/TargetInfo.cpp +++ b/lgc/state/TargetInfo.cpp @@ -77,6 +77,7 @@ static void setGfx10BaseInfo(TargetInfo *targetInfo) { targetInfo->getGpuProperty().gsOnChipDefaultLdsSizePerSubgroup = 0; // GFX9+ does not use this targetInfo->getGpuProperty().tessFactorBufferSizePerSe = 8192; targetInfo->getGpuProperty().numShaderEngines = 4; + targetInfo->getGpuProperty().maxMsaaRasterizerSamples = 16; } // gfx10 diff --git a/lgc/test/FDot2Gfx1010.lgc b/lgc/test/FDot2Gfx1010.lgc new file mode 100644 index 0000000000..06e33debd8 --- /dev/null +++ b/lgc/test/FDot2Gfx1010.lgc @@ -0,0 +1,67 @@ +; Test fdot2 on gfx1010 because gfx1010 doesn't support llvm.amdgcn.fdot2 (v_dot2_f32_f16) + +; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o /dev/null 2>&1 - <%s | FileCheck --check-prefixes=CHECK %s +; CHECK-LABEL: IR Dump After Replay LLPC builder +; CHECK: [[TMP0:%.*]] = fpext <2 x half> %{{.*}} to <2 x float> +; CHECK: [[TMP1:%.*]] = fpext <2 x half> %{{.*}} to <2 x float> +; CHECK: [[TMP2:%.*]] = fmul <2 x float> [[TMP0]], [[TMP1]] +; CHECK: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i64 0 +; CHECK: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i64 1 +; CHECK: [[TMP5:%.*]] = fadd float [[TMP3]], [[TMP4]] +; CHECK: [[TMP6:%.*]] = fadd float [[TMP5]], %.unpack6 + +; ModuleID = 'LLPC module' +source_filename = "LLPC module" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" +target triple = "amdgcn--amdpal" + +define dllexport void @"lgc.shader.CS.7CDDB8A86039EBF37A880DA3889512E6:main"() !lgc.shaderstage !5 !llpcfe.resource.mapping !6 { + %1 = call i32 (...) @lgc.create.read.builtin.input.i32(i32 29, i32 0, i32 poison, i32 poison) + %2 = call ptr addrspace(7) @lgc.load.buffer.desc(i64 4294967296, i32 0, i32 0, i32 266) + %3 = shl i32 %1, 4 + %4 = getelementptr i8, ptr addrspace(7) %2, i32 %3 + %.unpack = load half, ptr addrspace(7) %4, align 2 + %.elt1 = getelementptr inbounds i8, ptr addrspace(7) %4, i32 2 + %.unpack2 = load half, ptr addrspace(7) %.elt1, align 2 + %5 = getelementptr i8, ptr addrspace(7) %4, i32 4 + %.unpack3 = load half, ptr addrspace(7) %5, align 2 + %.elt4 = getelementptr i8, ptr addrspace(7) %4, i32 6 + %.unpack5 = load half, ptr addrspace(7) %.elt4, align 2 + %6 = getelementptr i8, ptr addrspace(7) %4, i32 8 + %.unpack6 = load float, ptr addrspace(7) %6, align 4 + %7 = insertelement <2 x half> poison, half %.unpack, i64 0 + %8 = insertelement <2 x half> %7, half %.unpack2, i64 1 + %9 = insertelement <2 x half> poison, half %.unpack3, i64 0 + %10 = insertelement <2 x half> %9, half %.unpack5, i64 1 + %11 = call float (...) @lgc.create.fdot2.f32(<2 x half> %8, <2 x half> %10, float %.unpack6, i1 false) + %12 = getelementptr i8, ptr addrspace(7) %4, i32 12 + store float %11, ptr addrspace(7) %12, align 4 + ret void +} + +; Function Attrs: nounwind willreturn memory(read) +declare i32 @lgc.create.read.builtin.input.i32(...) #0 + +; Function Attrs: nounwind willreturn memory(none) +declare ptr addrspace(7) @lgc.load.buffer.desc(i64, i32, i32, i32) #1 + +; Function Attrs: nounwind memory(none) +declare float @lgc.create.fdot2.f32(...) 
#2 + +attributes #0 = { nounwind willreturn memory(read) } +attributes #1 = { nounwind willreturn memory(none) } +attributes #2 = { nounwind memory(none) } + +!llpc.compute.mode = !{!0} +!lgc.client = !{!1} +!lgc.options = !{!2} +!lgc.options.CS = !{!3} +!lgc.user.data.nodes = !{!4} + +!0 = !{i32 8, i32 8, i32 1} +!1 = !{!"DX12"} +!2 = !{i32 -1010443957, i32 1479304606, i32 -922603727, i32 -992904437, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 256, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 3, i32 16777216, i32 16842752, i32 0, i32 0, i32 0, i32 -410708944, i32 1187695996} +!3 = !{i32 -1010443957, i32 1479304606, i32 -922603727, i32 -992904437, i32 0, i32 0, i32 0, i32 -1, i32 -1} +!4 = !{!"DescriptorBufferCompact", i32 17, i32 255, i32 3, i32 2, i64 4294967296, i32 0, i32 2} +!5 = !{i32 7} +!6 = !{i64 5101115464294405168} diff --git a/lgc/test/TaskShaderOps.lgc b/lgc/test/TaskShaderOps.lgc index 04ee28a150..16d98df3dd 100644 --- a/lgc/test/TaskShaderOps.lgc +++ b/lgc/test/TaskShaderOps.lgc @@ -36,7 +36,7 @@ ; CHECK-NEXT: [[newBaseAddr:%[0-9]*]] = add nuw nsw i64 [[baseAddr]], [[wrappedEntryIndex64]] ; CHECK-NEXT: [[newDescWord0:%[0-9]*]] = trunc i64 [[newBaseAddr]] to i32 ; CHECK-NEXT: [[newBaseAddrHi64:%[a-z.]*]] = lshr i64 [[newBaseAddr]], 32 -; CHECK-NEXT: [[newBaseAddrHi32:%[0-9]*]] = trunc i64 [[newBaseAddrHi64]] to i32 +; CHECK-NEXT: [[newBaseAddrHi32:%[0-9]*]] = trunc {{(nuw nsw )?}}i64 [[newBaseAddrHi64]] to i32 ; CHECK-NEXT: [[newBaseAddrHi:%[0-9]*]] = and i32 [[newBaseAddrHi32]], 65535 ; CHECK-NEXT: [[newDescWord1Tmp:%[0-9]*]] = and i32 [[descWord1]], -65536 ; CHECK-NEXT: [[newDescWord1:%[0-9]*]] = or {{(disjoint )?}}i32 [[newDescWord1Tmp]], [[newBaseAddrHi]] diff --git a/lgc/test/Transforms/CombineCooperativeMatrix/constants.lgc b/lgc/test/Transforms/CombineCooperativeMatrix/constants.lgc index 9c5b754480..95b65d3c05 100644 --- a/lgc/test/Transforms/CombineCooperativeMatrix/constants.lgc +++ b/lgc/test/Transforms/CombineCooperativeMatrix/constants.lgc @@ -5,7 +5,7 @@ define <8 x float> @transpose_undef() { ; CHECK-LABEL: @transpose_undef( ; CHECK-NEXT: ret <8 x float> undef ; - %r = call <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float> undef, i32 1, i32 0) + %r = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> undef, i32 1, i32 0) ret <8 x float> %r } @@ -13,7 +13,7 @@ define <8 x float> @transpose_poison() { ; CHECK-LABEL: @transpose_poison( ; CHECK-NEXT: ret <8 x float> poison ; - %r = call <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float> poison, i32 1, i32 0) + %r = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> poison, i32 1, i32 0) ret <8 x float> %r } @@ -21,7 +21,7 @@ define <8 x float> @transpose_zero() { ; CHECK-LABEL: @transpose_zero( ; CHECK-NEXT: ret <8 x float> zeroinitializer ; - %r = call <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float> zeroinitializer, i32 1, i32 0) + %r = call <8 x float> (...) 
@lgc.cooperative.matrix.transpose__v8f32(<8 x float> zeroinitializer, i32 1, i32 0) ret <8 x float> %r } @@ -29,7 +29,7 @@ define <8 x float> @relayout_undef() { ; CHECK-LABEL: @relayout_undef( ; CHECK-NEXT: ret <8 x float> undef ; - %r = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> undef, i32 1, i32 1, i32 0, i32 1) + %r = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> undef, i32 1, i32 1, i32 0, i32 1) ret <8 x float> %r } @@ -37,7 +37,7 @@ define <8 x float> @relayout_poison() { ; CHECK-LABEL: @relayout_poison( ; CHECK-NEXT: ret <8 x float> poison ; - %r = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> poison, i32 1, i32 1, i32 0, i32 1) + %r = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> poison, i32 1, i32 1, i32 0, i32 1) ret <8 x float> %r } @@ -45,16 +45,16 @@ define <8 x float> @relayout_zero() { ; CHECK-LABEL: @relayout_zero( ; CHECK-NEXT: ret <8 x float> zeroinitializer ; - %r = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> zeroinitializer, i32 1, i32 1, i32 0, i32 1) + %r = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> zeroinitializer, i32 1, i32 1, i32 0, i32 1) ret <8 x float> %r } define <8 x float> @fptrunc_undef() { ; CHECK-LABEL: @fptrunc_undef( -; CHECK-NEXT: [[R:%.*]] = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 45, <8 x float> undef, i32 2, i32 1, i32 0, i32 0) +; CHECK-NEXT: [[R:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 45, <8 x float> undef, i32 2, i32 1, i32 0, i32 0) ; CHECK-NEXT: ret <8 x float> [[R]] ; - %r = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 45, <8 x float> undef, i32 2, i32 1, i32 0, i32 0) + %r = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 45, <8 x float> undef, i32 2, i32 1, i32 0, i32 0) ret <8 x float> %r } @@ -62,7 +62,7 @@ define <8 x float> @fptrunc_poison() { ; CHECK-LABEL: @fptrunc_poison( ; CHECK-NEXT: ret <8 x float> poison ; - %r = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 45, <8 x float> poison, i32 2, i32 1, i32 0, i32 0) + %r = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 45, <8 x float> poison, i32 2, i32 1, i32 0, i32 0) ret <8 x float> %r } @@ -70,16 +70,16 @@ define <8 x float> @fptrunc_zero() { ; CHECK-LABEL: @fptrunc_zero( ; CHECK-NEXT: ret <8 x float> zeroinitializer ; - %r = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 45, <8 x float> zeroinitializer, i32 2, i32 1, i32 0, i32 0) + %r = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 45, <8 x float> zeroinitializer, i32 2, i32 1, i32 0, i32 0) ret <8 x float> %r } define <8 x float> @fpext_undef() { ; CHECK-LABEL: @fpext_undef( -; CHECK-NEXT: [[R:%.*]] = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 46, <8 x float> undef, i32 1, i32 2, i32 0, i32 0) +; CHECK-NEXT: [[R:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 46, <8 x float> undef, i32 1, i32 2, i32 0, i32 0) ; CHECK-NEXT: ret <8 x float> [[R]] ; - %r = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 46, <8 x float> undef, i32 1, i32 2, i32 0, i32 0) + %r = call <8 x float> (...) 
@lgc.cooperative.matrix.convert__v8f32(i32 46, <8 x float> undef, i32 1, i32 2, i32 0, i32 0) ret <8 x float> %r } @@ -87,7 +87,7 @@ define <8 x float> @fpext_poison() { ; CHECK-LABEL: @fpext_poison( ; CHECK-NEXT: ret <8 x float> poison ; - %r = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 46, <8 x float> poison, i32 1, i32 2, i32 0, i32 0) + %r = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 46, <8 x float> poison, i32 1, i32 2, i32 0, i32 0) ret <8 x float> %r } @@ -95,16 +95,16 @@ define <8 x float> @fpext_zero() { ; CHECK-LABEL: @fpext_zero( ; CHECK-NEXT: ret <8 x float> zeroinitializer ; - %r = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 46, <8 x float> zeroinitializer, i32 1, i32 2, i32 0, i32 0) + %r = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 46, <8 x float> zeroinitializer, i32 1, i32 2, i32 0, i32 0) ret <8 x float> %r } define <8 x i32> @trunc_undef() { ; CHECK-LABEL: @trunc_undef( -; CHECK-NEXT: [[R:%.*]] = call <8 x i32> @lgc.cooperative.matrix.convert.v8i32.i32.v8i32.i32.i32.i32.i32(i32 38, <8 x i32> undef, i32 5, i32 4, i32 0, i32 0) +; CHECK-NEXT: [[R:%.*]] = call <8 x i32> (...) @lgc.cooperative.matrix.convert__v8i32(i32 38, <8 x i32> undef, i32 5, i32 4, i32 0, i32 0) ; CHECK-NEXT: ret <8 x i32> [[R]] ; - %r = call <8 x i32> @lgc.cooperative.matrix.convert.v8i32.i32.v8i32.i32.i32.i32.i32(i32 38, <8 x i32> undef, i32 5, i32 4, i32 0, i32 0) + %r = call <8 x i32> (...) @lgc.cooperative.matrix.convert__v8i32(i32 38, <8 x i32> undef, i32 5, i32 4, i32 0, i32 0) ret <8 x i32> %r } @@ -112,7 +112,7 @@ define <8 x i32> @trunc_poison() { ; CHECK-LABEL: @trunc_poison( ; CHECK-NEXT: ret <8 x i32> poison ; - %r = call <8 x i32> @lgc.cooperative.matrix.convert.v8i32.i32.v8i32.i32.i32.i32.i32(i32 38, <8 x i32> poison, i32 5, i32 4, i32 0, i32 0) + %r = call <8 x i32> (...) @lgc.cooperative.matrix.convert__v8i32(i32 38, <8 x i32> poison, i32 5, i32 4, i32 0, i32 0) ret <8 x i32> %r } @@ -120,16 +120,16 @@ define <8 x i32> @trunc_zero() { ; CHECK-LABEL: @trunc_zero( ; CHECK-NEXT: ret <8 x i32> zeroinitializer ; - %r = call <8 x i32> @lgc.cooperative.matrix.convert.v8i32.i32.v8i32.i32.i32.i32.i32(i32 38, <8 x i32> zeroinitializer, i32 5, i32 4, i32 0, i32 0) + %r = call <8 x i32> (...) @lgc.cooperative.matrix.convert__v8i32(i32 38, <8 x i32> zeroinitializer, i32 5, i32 4, i32 0, i32 0) ret <8 x i32> %r } define <8 x i32> @zext_undef() { ; CHECK-LABEL: @zext_undef( -; CHECK-NEXT: [[R:%.*]] = call <8 x i32> @lgc.cooperative.matrix.convert.v8i32.i32.v8i32.i32.i32.i32.i32(i32 39, <8 x i32> undef, i32 4, i32 5, i32 0, i32 0) +; CHECK-NEXT: [[R:%.*]] = call <8 x i32> (...) @lgc.cooperative.matrix.convert__v8i32(i32 39, <8 x i32> undef, i32 4, i32 5, i32 0, i32 0) ; CHECK-NEXT: ret <8 x i32> [[R]] ; - %r = call <8 x i32> @lgc.cooperative.matrix.convert.v8i32.i32.v8i32.i32.i32.i32.i32(i32 39, <8 x i32> undef, i32 4, i32 5, i32 0, i32 0) + %r = call <8 x i32> (...) @lgc.cooperative.matrix.convert__v8i32(i32 39, <8 x i32> undef, i32 4, i32 5, i32 0, i32 0) ret <8 x i32> %r } @@ -137,7 +137,7 @@ define <8 x i32> @zext_poison() { ; CHECK-LABEL: @zext_poison( ; CHECK-NEXT: ret <8 x i32> poison ; - %r = call <8 x i32> @lgc.cooperative.matrix.convert.v8i32.i32.v8i32.i32.i32.i32.i32(i32 39, <8 x i32> poison, i32 4, i32 5, i32 0, i32 0) + %r = call <8 x i32> (...) 
@lgc.cooperative.matrix.convert__v8i32(i32 39, <8 x i32> poison, i32 4, i32 5, i32 0, i32 0) ret <8 x i32> %r } @@ -145,10 +145,10 @@ define <8 x i32> @zext_zero() { ; CHECK-LABEL: @zext_zero( ; CHECK-NEXT: ret <8 x i32> zeroinitializer ; - %r = call <8 x i32> @lgc.cooperative.matrix.convert.v8i32.i32.v8i32.i32.i32.i32.i32(i32 39, <8 x i32> zeroinitializer, i32 4, i32 5, i32 0, i32 0) + %r = call <8 x i32> (...) @lgc.cooperative.matrix.convert__v8i32(i32 39, <8 x i32> zeroinitializer, i32 4, i32 5, i32 0, i32 0) ret <8 x i32> %r } -declare <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float>, i32, i32) -declare <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32, <8 x float>, i32, i32, i32, i32) -declare <8 x i32> @lgc.cooperative.matrix.convert.v8i32.i32.v8i32.i32.i32.i32.i32(i32, <8 x i32>, i32, i32, i32, i32) +declare <8 x float> @lgc.cooperative.matrix.transpose__v8f32(...) +declare <8 x float> @lgc.cooperative.matrix.convert__v8f32(...) +declare <8 x i32> @lgc.cooperative.matrix.convert__v8i32(...) diff --git a/lgc/test/Transforms/CombineCooperativeMatrix/lit.local.cfg b/lgc/test/Transforms/CombineCooperativeMatrix/lit.local.cfg deleted file mode 100644 index a4266bc874..0000000000 --- a/lgc/test/Transforms/CombineCooperativeMatrix/lit.local.cfg +++ /dev/null @@ -1,2 +0,0 @@ -if "vki_cooperative_matrix" not in config.available_features: - config.unsupported = True diff --git a/lgc/test/Transforms/CombineCooperativeMatrix/matmul-loop.lgc b/lgc/test/Transforms/CombineCooperativeMatrix/matmul-loop.lgc index 27adadc0b1..7ed782e130 100644 --- a/lgc/test/Transforms/CombineCooperativeMatrix/matmul-loop.lgc +++ b/lgc/test/Transforms/CombineCooperativeMatrix/matmul-loop.lgc @@ -5,21 +5,21 @@ define void @matmul_f16(ptr %ptr) { ; CHECK-LABEL: define void @matmul_f16 ; CHECK-SAME: (ptr [[PTR:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ACCUM_LOAD:%.*]] = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr [[PTR]], i32 4, i1 false, i32 1, i32 1, i32 0) +; CHECK-NEXT: [[ACCUM_LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr [[PTR]], i32 4, i1 false, i32 1, i32 1, i32 0) ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[ACCUM_PHI:%.*]] = phi <8 x float> [ [[ACCUM_LOAD]], [[ENTRY:%.*]] ], [ [[MULADD:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[A:%.*]] = call <8 x float> @getmat1() ; CHECK-NEXT: [[B:%.*]] = call <8 x float> @getmat1() -; CHECK-NEXT: [[MULADD]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM_PHI]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; CHECK-NEXT: [[MULADD]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM_PHI]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) ; CHECK-NEXT: [[CC:%.*]] = call i1 @getcc() ; CHECK-NEXT: br i1 [[CC]], label [[LOOP]], label [[END:%.*]] ; CHECK: end: -; CHECK-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[PTR]], i32 4, i1 true, i32 1, i32 1, i32 0, <8 x float> [[MULADD]]) +; CHECK-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[PTR]], i32 4, i1 true, i32 1, i32 1, i32 0, i32 16, <8 x float> [[MULADD]]) ; CHECK-NEXT: ret void ; entry: - %accum.load = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr %ptr, i32 4, i1 false, i32 1, i32 0, i32 0) + %accum.load = call <8 x float> (...) 
@lgc.cooperative.matrix.load__v8f32(ptr %ptr, i32 4, i1 false, i32 1, i32 0, i32 0) br label %loop loop: @@ -28,15 +28,15 @@ loop: %a = call <8 x float> @getmat1() %b = call <8 x float> @getmat1() - %accum.cvt = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %accum.phi, i32 1, i32 1, i32 0, i32 1) - %muladd = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %accum.cvt, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %accum.next = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %muladd, i32 1, i32 1, i32 1, i32 0) + %accum.cvt = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %accum.phi, i32 1, i32 1, i32 0, i32 1) + %muladd = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum.cvt, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %accum.next = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %muladd, i32 1, i32 1, i32 1, i32 0) %cc = call i1 @getcc() br i1 %cc, label %loop, label %end end: - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %ptr, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %accum.next) + call void (...) @lgc.cooperative.matrix.store(ptr %ptr, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %accum.next) ret void } @@ -49,11 +49,11 @@ define void @matmul_f16_initzero(ptr %ptr) { ; CHECK-NEXT: [[ACCUM_PHI:%.*]] = phi <8 x float> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[MULADD:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[A:%.*]] = call <8 x float> @getmat1() ; CHECK-NEXT: [[B:%.*]] = call <8 x float> @getmat1() -; CHECK-NEXT: [[MULADD]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM_PHI]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; CHECK-NEXT: [[MULADD]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM_PHI]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) ; CHECK-NEXT: [[CC:%.*]] = call i1 @getcc() ; CHECK-NEXT: br i1 [[CC]], label [[LOOP]], label [[END:%.*]] ; CHECK: end: -; CHECK-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[PTR]], i32 4, i1 true, i32 1, i32 1, i32 0, <8 x float> [[MULADD]]) +; CHECK-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[PTR]], i32 4, i1 true, i32 1, i32 1, i32 0, i32 16, <8 x float> [[MULADD]]) ; CHECK-NEXT: ret void ; entry: @@ -65,22 +65,22 @@ loop: %a = call <8 x float> @getmat1() %b = call <8 x float> @getmat1() - %accum.cvt = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %accum.phi, i32 1, i32 1, i32 0, i32 1) - %muladd = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %accum.cvt, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %accum.next = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %muladd, i32 1, i32 1, i32 1, i32 0) + %accum.cvt = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %accum.phi, i32 1, i32 1, i32 0, i32 1) + %muladd = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum.cvt, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %accum.next = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %muladd, i32 1, i32 1, i32 1, i32 0) %cc = call i1 @getcc() br i1 %cc, label %loop, label %end end: - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %ptr, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %accum.next) + call void (...) @lgc.cooperative.matrix.store(ptr %ptr, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %accum.next) ret void } declare i1 @getcc() declare <8 x float> @getmat1() -declare <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr, i32, i1, i32, i32, i32) -declare <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32, <8 x float>, i32, i32, i32, i32) -declare void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr, i32, i1, i32, i32, i32, <8 x float>) -declare <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float>, <8 x float>, <8 x float>, i1, i1, i1, i1, i32, i32) +declare <8 x float> @lgc.cooperative.matrix.load__v8f32(...) +declare <8 x float> @lgc.cooperative.matrix.convert__v8f32(...) +declare void @lgc.cooperative.matrix.store(...) +declare <8 x float> @lgc.cooperative.matrix.muladd__v8f32(...) diff --git a/lgc/test/Transforms/CombineCooperativeMatrix/packed-accumulators.lgc b/lgc/test/Transforms/CombineCooperativeMatrix/packed-accumulators.lgc index cfe6d4d85f..9bff238d2c 100644 --- a/lgc/test/Transforms/CombineCooperativeMatrix/packed-accumulators.lgc +++ b/lgc/test/Transforms/CombineCooperativeMatrix/packed-accumulators.lgc @@ -5,20 +5,20 @@ define void @matmul_f16_pack_simple(ptr %out0, ptr %out1, <8 x float> %a, <8 x f ; GFX11-LABEL: define void @matmul_f16_pack_simple ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> @lgc.cooperative.matrix.pack.v8f32.v8f32.v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) #[[ATTR0:[0-9]+]] -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[MULADDHI]], i1 false) #[[ATTR0]] -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[TMP1]]) -; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[MULADDHI]], i1 true) #[[ATTR0]] -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[TMP2]]) +; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI]], i1 false) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP1]]) +; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI]], i1 true) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP2]]) ; GFX11-NEXT: ret void ; entry: - %muladdLo = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %muladdLo) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %muladdHi) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %muladdLo) + call void (...) 
@lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %muladdHi) ret void } @@ -26,24 +26,24 @@ define void @matmul_f16_pack_chain_sequential(ptr %out0, ptr %out1, <8 x float> ; GFX11-LABEL: define void @matmul_f16_pack_chain_sequential ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> @lgc.cooperative.matrix.pack.v8f32.v8f32.v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) #[[ATTR0]] -; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_2]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN1_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[CHAIN1_2]], i1 false) #[[ATTR0]] -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[TMP1]]) -; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[CHAIN1_2]], i1 true) #[[ATTR0]] -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[TMP2]]) +; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) +; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_2]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN1_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_2]], i1 false) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP1]]) +; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_2]], i1 true) +; GFX11-NEXT: call void (...) 
@lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP2]]) ; GFX11-NEXT: ret void ; entry: - %chain0.1 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain0.2 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.1 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.2 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %chain0.2) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %chain1.2) + %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain0.2) + call void (...) 
@lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain1.2) ret void } @@ -51,24 +51,24 @@ define void @matmul_f16_pack_chain_alternating(ptr %out0, ptr %out1, <8 x float> ; GFX11-LABEL: define void @matmul_f16_pack_chain_alternating ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> @lgc.cooperative.matrix.pack.v8f32.v8f32.v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) #[[ATTR0]] -; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN1_1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_2]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[CHAIN1_2]], i1 false) #[[ATTR0]] -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[TMP1]]) -; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[CHAIN1_2]], i1 true) #[[ATTR0]] -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[TMP2]]) +; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) +; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN1_1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_2]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_2]], i1 false) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP1]]) +; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_2]], i1 true) +; GFX11-NEXT: call void (...) 
@lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP2]]) ; GFX11-NEXT: ret void ; entry: - %chain0.1 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.1 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain0.2 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.2 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %chain0.2) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %chain1.2) + %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain0.2) + call void (...) 
@lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain1.2) ret void } @@ -76,24 +76,24 @@ define void @matmul_f16_pack_chain_nested(ptr %out0, ptr %out1, <8 x float> %a, ; GFX11-LABEL: define void @matmul_f16_pack_chain_nested ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> @lgc.cooperative.matrix.pack.v8f32.v8f32.v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) #[[ATTR0]] -; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN1_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN1_2]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[CHAIN0_2]], i1 false) #[[ATTR0]] -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[TMP1]]) -; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[CHAIN0_2]], i1 true) #[[ATTR0]] -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[TMP2]]) +; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) +; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN1_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN1_2]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN0_2]], i1 false) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP1]]) +; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN0_2]], i1 true) +; GFX11-NEXT: call void (...) 
@lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP2]]) ; GFX11-NEXT: ret void ; entry: - %chain0.1 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.1 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.2 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain0.2 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %chain0.2) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %chain1.2) + %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain0.2) + call void (...) @lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain1.2) ret void } @@ -101,15 +101,15 @@ define void @matmul_f16_no_packable_chain(ptr %out0, ptr %out1, <8 x float> %a, ; GFX11-LABEL: define void @matmul_f16_no_packable_chain ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[C0]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[CHAIN1_1]]) +; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[C0]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[CHAIN1_1]]) ; GFX11-NEXT: ret void ; entry: - %chain0.1 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.1 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %chain1.1) + %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + call void (...) @lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain1.1) ret void } @@ -117,72 +117,73 @@ define void @matmul_f16_chain_loop(ptr %out0, ptr %out1, <8 x float> %a, <8 x fl ; GFX11-LABEL: define void @matmul_f16_chain_loop ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> @lgc.cooperative.matrix.pack.v8f32.v8f32.v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) #[[ATTR0]] -; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[CHAIN1_1]], i1 false) #[[ATTR0]] -; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[CHAIN1_1]], i1 true) #[[ATTR0]] -; GFX11-NEXT: [[TMP3:%.*]] = call <8 x float> @lgc.cooperative.matrix.pack.v8f32.v8f32.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]]) #[[ATTR0]] +; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) +; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_1]], i1 false) +; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_1]], i1 true) +; GFX11-NEXT: [[TMP3:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]]) ; GFX11-NEXT: br label [[LOOP:%.*]] ; GFX11: loop: ; GFX11-NEXT: [[ACCUM1_PHI:%.*]] = phi <8 x float> [ [[TMP3]], [[ENTRY:%.*]] ], [ [[CHAIN1_2:%.*]], [[LOOP]] ] -; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM1_PHI]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_2]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_2]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM1_PHI]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_2]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_2]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) ; GFX11-NEXT: [[CC:%.*]] = call i1 @getcc() ; GFX11-NEXT: br i1 [[CC]], label [[LOOP]], label [[END:%.*]] ; GFX11: end: -; GFX11-NEXT: [[TMP4:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[CHAIN1_2]], i1 false) #[[ATTR0]] -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[TMP4]]) -; GFX11-NEXT: [[TMP5:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[CHAIN1_2]], i1 true) #[[ATTR0]] -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[TMP5]]) +; GFX11-NEXT: [[TMP4:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_2]], i1 false) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP4]]) +; GFX11-NEXT: [[TMP5:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_2]], i1 true) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP5]]) ; GFX11-NEXT: ret void ; entry: - %chain0.1 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.1 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain1.1 = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) br label %loop loop: %accum0.phi = phi <8 x float> [ %chain0.1, %entry ], [ %chain0.2, %loop ] %accum1.phi = phi <8 x float> [ %chain1.1, %entry ], [ %chain1.2, %loop ] - %chain0.2 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %accum0.phi, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.2 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %accum1.phi, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum0.phi, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum1.phi, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) %cc = call i1 @getcc() br i1 %cc, label %loop, label %end end: - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %chain0.2) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %chain1.2) + call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain0.2) + call void (...) @lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain1.2) ret void } -define void @matmul_f16_chain_loop_phis(ptr %out0, ptr %out1, <8 x float> %a, <8 x float> %b, <8 x float> %c0, <8 x float> %c1) {; GFX11-LABEL: define void @matmul_f16_chain_loop_phis +define void @matmul_f16_chain_loop_phis(ptr %out0, ptr %out1, <8 x float> %a, <8 x float> %b, <8 x float> %c0, <8 x float> %c1) { +; GFX11-LABEL: define void @matmul_f16_chain_loop_phis ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[ACCUM0_LOAD:%.*]] = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr [[OUT0]], i32 4, i1 false, i32 1, i32 1, i32 0) -; GFX11-NEXT: [[ACCUM1_LOAD:%.*]] = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr [[OUT1]], i32 4, i1 false, i32 1, i32 1, i32 0) -; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> @lgc.cooperative.matrix.pack.v8f32.v8f32.v8f32(<8 x float> [[ACCUM0_LOAD]], <8 x float> [[ACCUM1_LOAD]]) #[[ATTR0]] +; GFX11-NEXT: [[ACCUM0_LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr [[OUT0]], i32 4, i1 false, i32 1, i32 1, i32 0, i32 16) +; GFX11-NEXT: [[ACCUM1_LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr [[OUT1]], i32 4, i1 false, i32 1, i32 1, i32 0, i32 16) +; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.pack__v8f32(<8 x float> [[ACCUM0_LOAD]], <8 x float> [[ACCUM1_LOAD]]) ; GFX11-NEXT: br label [[HEADER:%.*]] ; GFX11: header: ; GFX11-NEXT: [[ACCUM1_PHI:%.*]] = phi <8 x float> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[MULADDHI:%.*]], [[LOOP:%.*]] ] ; GFX11-NEXT: [[CC:%.*]] = call i1 @getcc() ; GFX11-NEXT: br i1 [[CC]], label [[LOOP]], label [[END:%.*]] ; GFX11: loop: -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM1_PHI]], i1 false, i1 false, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 false, i1 false, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM1_PHI]], i1 false, i1 false, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 false, i1 false, i1 true, i1 true, i32 1, i32 1) ; GFX11-NEXT: br label [[HEADER]] ; GFX11: end: -; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[ACCUM1_PHI]], i1 false) #[[ATTR0]] -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 1, i32 0, <8 x float> [[TMP1]]) -; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[ACCUM1_PHI]], i1 true) #[[ATTR0]] -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 1, i32 0, <8 x float> [[TMP2]]) +; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[ACCUM1_PHI]], i1 false) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 1, i32 0, i32 16, <8 x float> [[TMP1]]) +; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[ACCUM1_PHI]], i1 true) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 1, i32 0, i32 16, <8 x float> [[TMP2]]) ; GFX11-NEXT: ret void ; entry: - %accum0.load = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr %out0, i32 4, i1 false, i32 1, i32 0, i32 0) #0 - %accum1.load = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr %out1, i32 4, i1 false, i32 1, i32 0, i32 0) #0 + %accum0.load = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr %out0, i32 4, i1 false, i32 1, i32 0, i32 0, i32 16) #0 + %accum1.load = call <8 x float> (...) 
@lgc.cooperative.matrix.load__v8f32(ptr %out1, i32 4, i1 false, i32 1, i32 0, i32 0, i32 16) #0 br label %header header: @@ -192,18 +193,18 @@ header: br i1 %cc, label %loop, label %end loop: - %accum0.cvt = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %accum0.phi, i32 1, i32 1, i32 0, i32 1) - %accum1.cvt = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %accum1.phi, i32 1, i32 1, i32 0, i32 1) - %muladdLo = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %accum0.cvt, i1 false, i1 false, i1 false, i1 false, i32 1, i32 1) - %muladdHi = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %accum1.cvt, i1 false, i1 false, i1 false, i1 false, i32 1, i32 1) - %accum0.next = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %muladdLo, i32 1, i32 1, i32 1, i32 0) - %accum1.next = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %muladdHi, i32 1, i32 1, i32 1, i32 0) + %accum0.cvt = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %accum0.phi, i32 1, i32 1, i32 0, i32 1) + %accum1.cvt = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %accum1.phi, i32 1, i32 1, i32 0, i32 1) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum0.cvt, i1 false, i1 false, i1 false, i1 false, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum1.cvt, i1 false, i1 false, i1 false, i1 false, i32 1, i32 1) + %accum0.next = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %muladdLo, i32 1, i32 1, i32 1, i32 0) + %accum1.next = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %muladdHi, i32 1, i32 1, i32 1, i32 0) br label %header end: - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %accum0.phi) #2 - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %accum1.phi) #2 + call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %accum0.phi) #2 + call void (...) 
@lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %accum1.phi) #2 ret void } @@ -211,49 +212,49 @@ define void @matmul_f16_chain_branch(ptr %out0, ptr %out1, <8 x float> %a, <8 x ; GFX11-LABEL: define void @matmul_f16_chain_branch ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> @lgc.cooperative.matrix.pack.v8f32.v8f32.v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) #[[ATTR0]] -; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) +; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) ; GFX11-NEXT: [[CC:%.*]] = call i1 @getcc() ; GFX11-NEXT: br i1 [[CC]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]] ; GFX11: if_true: -; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[CHAIN1_1]], i1 false) #[[ATTR0]] -; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) -; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[CHAIN1_1]], i1 true) #[[ATTR0]] -; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP2]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_1]], i1 false) +; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_1]], i1 true) +; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP2]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) ; GFX11-NEXT: br label [[END:%.*]] ; GFX11: if_false: ; GFX11-NEXT: [[A_FALSE:%.*]] = call <8 x float> @getmat1() ; GFX11-NEXT: [[B_FALSE:%.*]] = call <8 x float> @getmat1() -; GFX11-NEXT: [[TMP3:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[CHAIN1_1]], i1 false) #[[ATTR0]] -; GFX11-NEXT: [[CHAIN0_3:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A_FALSE]], <8 x float> [[B_FALSE]], <8 x float> [[TMP3]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) -; GFX11-NEXT: [[TMP4:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[CHAIN1_1]], i1 true) #[[ATTR0]] -; GFX11-NEXT: [[CHAIN1_3:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A_FALSE]], <8 x float> [[B_FALSE]], <8 x float> [[TMP4]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[TMP3:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_1]], i1 false) +; GFX11-NEXT: [[CHAIN0_3:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A_FALSE]], <8 x float> [[B_FALSE]], <8 x float> [[TMP3]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[TMP4:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_1]], i1 true) +; GFX11-NEXT: [[CHAIN1_3:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A_FALSE]], <8 x float> [[B_FALSE]], <8 x float> [[TMP4]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) ; GFX11-NEXT: br label [[END]] ; GFX11: end: ; GFX11-NEXT: [[ACCUM0_PHI:%.*]] = phi <8 x float> [ [[CHAIN0_2]], [[IF_TRUE]] ], [ [[CHAIN0_3]], [[IF_FALSE]] ] ; GFX11-NEXT: [[ACCUM1_PHI:%.*]] = phi <8 x float> [ [[CHAIN1_2]], [[IF_TRUE]] ], [ [[CHAIN1_3]], [[IF_FALSE]] ] -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[ACCUM0_PHI]]) -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[ACCUM1_PHI]]) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[ACCUM0_PHI]]) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[ACCUM1_PHI]]) ; GFX11-NEXT: ret void ; entry: - %chain0.1 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.1 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain1.1 = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) %cc = call i1 @getcc() br i1 %cc, label %if_true, label %if_false if_true: - %chain0.2 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.2 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) br label %end if_false: %a.false = call <8 x float> @getmat1() %b.false = call <8 x float> @getmat1() - %chain0.3 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a.false, <8 x float> %b.false, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.3 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a.false, <8 x float> %b.false, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.3 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a.false, <8 x float> %b.false, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain1.3 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a.false, <8 x float> %b.false, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) br label %end @@ -261,8 +262,8 @@ end: %accum0.phi = phi <8 x float> [ %chain0.2, %if_true ], [ %chain0.3, %if_false ] %accum1.phi = phi <8 x float> [ %chain1.2, %if_true ], [ %chain1.3, %if_false ] - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %accum0.phi) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %accum1.phi) + call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %accum0.phi) + call void (...) 
@lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %accum1.phi) ret void } @@ -270,36 +271,36 @@ define void @matmul_f16_chain_diff_bbs(ptr %out0, ptr %out1, <8 x float> %a, <8 ; GFX11-LABEL: define void @matmul_f16_chain_diff_bbs ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> @lgc.cooperative.matrix.pack.v8f32.v8f32.v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) #[[ATTR0]] -; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) +; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) ; GFX11-NEXT: [[CC:%.*]] = call i1 @getcc() ; GFX11-NEXT: br label [[CONT:%.*]] ; GFX11: cont: -; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[CHAIN1_1]], i1 false) #[[ATTR0]] -; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) -; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[CHAIN1_1]], i1 true) #[[ATTR0]] -; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP2]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_1]], i1 false) +; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_1]], i1 true) +; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP2]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) ; GFX11-NEXT: br label [[END:%.*]] ; GFX11: end: -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[CHAIN0_2]]) -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[CHAIN1_2]]) +; GFX11-NEXT: call void (...) 
@lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[CHAIN0_2]]) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[CHAIN1_2]]) ; GFX11-NEXT: ret void ; entry: - %chain0.1 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.1 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) %cc = call i1 @getcc() br label %cont cont: - %chain0.2 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.2 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) br label %end end: - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %chain0.2) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %chain1.2) + call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain0.2) + call void (...) @lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain1.2) ret void } @@ -307,28 +308,28 @@ define void @matmul_f16_pack_loop(ptr %out0, ptr %out1) { ; GFX11-LABEL: define void @matmul_f16_pack_loop ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[ACCUM0_LOAD:%.*]] = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr [[OUT0]], i32 4, i1 false, i32 1, i32 1, i32 0) -; GFX11-NEXT: [[ACCUM1_LOAD:%.*]] = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr [[OUT1]], i32 4, i1 false, i32 1, i32 1, i32 0) -; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> @lgc.cooperative.matrix.pack.v8f32.v8f32.v8f32(<8 x float> [[ACCUM0_LOAD]], <8 x float> [[ACCUM1_LOAD]]) #[[ATTR0]] +; GFX11-NEXT: [[ACCUM0_LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr [[OUT0]], i32 4, i1 false, i32 1, i32 1, i32 0, i32 16) +; GFX11-NEXT: [[ACCUM1_LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr [[OUT1]], i32 4, i1 false, i32 1, i32 1, i32 0, i32 16) +; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.pack__v8f32(<8 x float> [[ACCUM0_LOAD]], <8 x float> [[ACCUM1_LOAD]]) ; GFX11-NEXT: br label [[LOOP:%.*]] ; GFX11: loop: ; GFX11-NEXT: [[ACCUM1_PHI:%.*]] = phi <8 x float> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[MULADDHI:%.*]], [[LOOP]] ] ; GFX11-NEXT: [[A:%.*]] = call <8 x float> @getmat1() ; GFX11-NEXT: [[B:%.*]] = call <8 x float> @getmat1() -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM1_PHI]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM1_PHI]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) ; GFX11-NEXT: [[CC:%.*]] = call i1 @getcc() ; GFX11-NEXT: br i1 [[CC]], label [[LOOP]], label [[END:%.*]] ; GFX11: end: -; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[MULADDHI]], i1 false) #[[ATTR0]] -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 1, i32 0, <8 x float> [[TMP1]]) -; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[MULADDHI]], i1 true) #[[ATTR0]] -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 1, i32 0, <8 x float> [[TMP2]]) +; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI]], i1 false) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 1, i32 0, i32 16, <8 x float> [[TMP1]]) +; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI]], i1 true) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 1, i32 0, i32 16, <8 x float> [[TMP2]]) ; GFX11-NEXT: ret void ; entry: - %accum0.load = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr %out0, i32 4, i1 false, i32 1, i32 0, i32 0) - %accum1.load = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr %out1, i32 4, i1 false, i32 1, i32 0, i32 0) + %accum0.load = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr %out0, i32 4, i1 false, i32 1, i32 0, i32 0, i32 16) + %accum1.load = call <8 x float> (...) 
@lgc.cooperative.matrix.load__v8f32(ptr %out1, i32 4, i1 false, i32 1, i32 0, i32 0, i32 16) br label %loop loop: @@ -338,19 +339,19 @@ loop: %a = call <8 x float> @getmat1() %b = call <8 x float> @getmat1() - %accum0.cvt = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %accum0.phi, i32 1, i32 1, i32 0, i32 1) - %accum1.cvt = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %accum1.phi, i32 1, i32 1, i32 0, i32 1) - %muladdLo = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %accum0.cvt, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %accum1.cvt, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %accum0.next = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %muladdLo, i32 1, i32 1, i32 1, i32 0) - %accum1.next = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %muladdHi, i32 1, i32 1, i32 1, i32 0) + %accum0.cvt = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %accum0.phi, i32 1, i32 1, i32 0, i32 1) + %accum1.cvt = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %accum1.phi, i32 1, i32 1, i32 0, i32 1) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum0.cvt, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum1.cvt, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %accum0.next = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %muladdLo, i32 1, i32 1, i32 1, i32 0) + %accum1.next = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %muladdHi, i32 1, i32 1, i32 1, i32 0) %cc = call i1 @getcc() br i1 %cc, label %loop, label %end end: - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %accum0.next) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %accum1.next) + call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %accum0.next) + call void (...) 
@lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %accum1.next) ret void } @@ -358,23 +359,23 @@ define void @matmul_f16_pack_scalar_same(ptr %out0, ptr %out1, <8 x float> %a, < ; GFX11-LABEL: define void @matmul_f16_pack_scalar_same ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> @lgc.cooperative.matrix.pack.v8f32.v8f32.v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) #[[ATTR0]] -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.v2f16.i32.i32(<8 x float> [[MULADDHI]], <2 x half> , i32 6, i32 1) #[[ATTR1:[0-9]+]] -; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[TMP1]], i1 false) #[[ATTR0]] -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[TMP2]]) -; GFX11-NEXT: [[TMP3:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[TMP1]], i1 true) #[[ATTR0]] -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[TMP3]]) +; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> [[MULADDHI]], <2 x half> , i32 6, i32 1) +; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[TMP1]], i1 false) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP2]]) +; GFX11-NEXT: [[TMP3:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[TMP1]], i1 true) +; GFX11-NEXT: call void (...) 
@lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP3]]) ; GFX11-NEXT: ret void ; entry: - %muladdLo = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %scaledLo = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> %muladdLo, half 0xH310F, i32 1, i32 1) - %scaledHi = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> %muladdHi, half 0xH310F, i32 1, i32 1) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %scaledLo) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %scaledHi) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %scaledLo = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %muladdLo, half 0xH310F, i32 1, i32 1) + %scaledHi = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %muladdHi, half 0xH310F, i32 1, i32 1) + call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %scaledLo) + call void (...) 
@lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %scaledHi) ret void } @@ -382,23 +383,23 @@ define void @matmul_f16_pack_scalar_different(ptr %out0, ptr %out1, <8 x float> ; GFX11-LABEL: define void @matmul_f16_pack_scalar_different ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> @lgc.cooperative.matrix.pack.v8f32.v8f32.v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) #[[ATTR0]] -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.v2f16.i32.i32(<8 x float> [[MULADDHI]], <2 x half> , i32 6, i32 1) #[[ATTR1]] -; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[TMP1]], i1 false) #[[ATTR0]] -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[TMP2]]) -; GFX11-NEXT: [[TMP3:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[TMP1]], i1 true) #[[ATTR0]] -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[TMP3]]) +; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> [[MULADDHI]], <2 x half> , i32 6, i32 1) +; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[TMP1]], i1 false) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP2]]) +; GFX11-NEXT: [[TMP3:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[TMP1]], i1 true) +; GFX11-NEXT: call void (...) 
@lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP3]]) ; GFX11-NEXT: ret void ; entry: - %muladdLo = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %scaledLo = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> %muladdLo, half 0xH310F, i32 1, i32 1) - %scaledHi = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> %muladdHi, half 0xH3100, i32 1, i32 1) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %scaledLo) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %scaledHi) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %scaledLo = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %muladdLo, half 0xH310F, i32 1, i32 1) + %scaledHi = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %muladdHi, half 0xH3100, i32 1, i32 1) + call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %scaledLo) + call void (...) 
@lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %scaledHi) ret void } @@ -406,22 +407,22 @@ define void @matmul_f16_pack_scalar_only_lo(ptr %out0, ptr %out1, <8 x float> %a ; GFX11-LABEL: define void @matmul_f16_pack_scalar_only_lo ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> @lgc.cooperative.matrix.pack.v8f32.v8f32.v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) #[[ATTR0]] -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[MULADDHI]], i1 false) #[[ATTR0]] -; GFX11-NEXT: [[SCALEDLO:%.*]] = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> [[TMP1]], half 0xH310F, i32 1, i32 1) -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[SCALEDLO]]) -; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[MULADDHI]], i1 true) #[[ATTR0]] -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[TMP2]]) +; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI]], i1 false) +; GFX11-NEXT: [[SCALEDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> [[TMP1]], half 0xH310F, i32 1, i32 1) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[SCALEDLO]]) +; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI]], i1 true) +; GFX11-NEXT: call void (...) 
@lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP2]]) ; GFX11-NEXT: ret void ; entry: - %muladdLo = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %scaledLo = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> %muladdLo, half 0xH310F, i32 1, i32 1) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %scaledLo) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %muladdHi) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %scaledLo = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %muladdLo, half 0xH310F, i32 1, i32 1) + call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %scaledLo) + call void (...) @lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %muladdHi) ret void } @@ -429,22 +430,22 @@ define void @matmul_f16_pack_scalar_only_hi(ptr %out0, ptr %out1, <8 x float> %a ; GFX11-LABEL: define void @matmul_f16_pack_scalar_only_hi ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> @lgc.cooperative.matrix.pack.v8f32.v8f32.v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) #[[ATTR0]] -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[MULADDHI]], i1 true) #[[ATTR0]] -; GFX11-NEXT: [[SCALEDHI:%.*]] = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> [[TMP1]], half 0xH3100, i32 1, i32 1) -; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[MULADDHI]], i1 false) #[[ATTR0]] -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[TMP2]]) -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[SCALEDHI]]) +; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI]], i1 true) +; GFX11-NEXT: [[SCALEDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> [[TMP1]], half 0xH3100, i32 1, i32 1) +; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI]], i1 false) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP2]]) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[SCALEDHI]]) ; GFX11-NEXT: ret void ; entry: - %muladdLo = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %scaledHi = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> %muladdHi, half 0xH3100, i32 1, i32 1) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %muladdLo) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %scaledHi) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %scaledHi = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %muladdHi, half 0xH3100, i32 1, i32 1) + call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %muladdLo) + call void (...) 
@lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %scaledHi) ret void } @@ -452,39 +453,39 @@ define void @matmul_f16_pack_scalar_diff_bbs(ptr %out0, ptr %out1, <8 x float> % ; GFX11-LABEL: define void @matmul_f16_pack_scalar_diff_bbs ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> @lgc.cooperative.matrix.pack.v8f32.v8f32.v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) #[[ATTR0]] -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) ; GFX11-NEXT: br label [[SCALE_LO:%.*]] ; GFX11: scale_lo: -; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[MULADDHI]], i1 false) #[[ATTR0]] -; GFX11-NEXT: [[SCALEDLO:%.*]] = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> [[TMP1]], half 0xH310F, i32 1, i32 1) +; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI]], i1 false) +; GFX11-NEXT: [[SCALEDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> [[TMP1]], half 0xH310F, i32 1, i32 1) ; GFX11-NEXT: br label [[SCALE_HI:%.*]] ; GFX11: scale_hi: -; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[MULADDHI]], i1 true) #[[ATTR0]] -; GFX11-NEXT: [[SCALEDHI:%.*]] = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> [[TMP2]], half 0xH310F, i32 1, i32 1) +; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI]], i1 true) +; GFX11-NEXT: [[SCALEDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> [[TMP2]], half 0xH310F, i32 1, i32 1) ; GFX11-NEXT: br label [[END:%.*]] ; GFX11: end: -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[SCALEDLO]]) -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[SCALEDHI]]) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[SCALEDLO]]) +; GFX11-NEXT: call void (...) 
@lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[SCALEDHI]]) ; GFX11-NEXT: ret void ; entry: - %muladdLo = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) br label %scale_lo scale_lo: - %scaledLo = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> %muladdLo, half 0xH310F, i32 1, i32 1) + %scaledLo = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %muladdLo, half 0xH310F, i32 1, i32 1) br label %scale_hi scale_hi: - %scaledHi = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> %muladdHi, half 0xH310F, i32 1, i32 1) + %scaledHi = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %muladdHi, half 0xH310F, i32 1, i32 1) br label %end end: - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %scaledLo) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %scaledHi) + call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %scaledLo) + call void (...) @lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %scaledHi) ret void } @@ -492,36 +493,36 @@ define void @matmul_f16_pack_user_between_scalar(ptr %out0, ptr %out1, <8 x floa ; GFX11-LABEL: define void @matmul_f16_pack_user_between_scalar ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> @lgc.cooperative.matrix.pack.v8f32.v8f32.v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) #[[ATTR0]] -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) ; GFX11-NEXT: br label [[SCALE:%.*]] ; GFX11: scale: -; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[MULADDHI]], i1 false) #[[ATTR0]] -; GFX11-NEXT: [[SCALEDLO:%.*]] = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> [[TMP1]], half 0xH310F, i32 1, i32 1) -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[SCALEDLO]]) -; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[MULADDHI]], i1 true) #[[ATTR0]] -; GFX11-NEXT: [[SCALEDHI:%.*]] = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> [[TMP2]], half 0xH310F, i32 1, i32 1) +; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI]], i1 false) +; GFX11-NEXT: [[SCALEDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> [[TMP1]], half 0xH310F, i32 1, i32 1) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[SCALEDLO]]) +; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI]], i1 true) +; GFX11-NEXT: [[SCALEDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> [[TMP2]], half 0xH310F, i32 1, i32 1) ; GFX11-NEXT: br label [[END:%.*]] ; GFX11: end: -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[SCALEDLO]]) -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[SCALEDHI]]) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[SCALEDLO]]) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[SCALEDHI]]) ; GFX11-NEXT: ret void ; entry: - %muladdLo = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdHi = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) br label %scale scale: - %scaledLo = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> %muladdLo, half 0xH310F, i32 1, i32 1) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %scaledLo) - %scaledHi = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> %muladdHi, half 0xH310F, i32 1, i32 1) + %scaledLo = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %muladdLo, half 0xH310F, i32 1, i32 1) + call void (...) @lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %scaledLo) + %scaledHi = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %muladdHi, half 0xH310F, i32 1, i32 1) br label %end end: - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %scaledLo) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %scaledHi) + call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %scaledLo) + call void (...) @lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %scaledHi) ret void } @@ -529,36 +530,36 @@ define void @matmul_f16_pack_factor_between_scalar(ptr %in, ptr %out0, ptr %out1 ; GFX11-LABEL: define void @matmul_f16_pack_factor_between_scalar ; GFX11-SAME: (ptr [[IN:%.*]], ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> @lgc.cooperative.matrix.pack.v8f32.v8f32.v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) #[[ATTR0]] -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) ; GFX11-NEXT: br label [[SCALE:%.*]] ; GFX11: scale: ; GFX11-NEXT: [[FACTORHI:%.*]] = load half, ptr [[IN]], align 2 ; GFX11-NEXT: [[TMP1:%.*]] = insertelement <2 x half> , half [[FACTORHI]], i32 1 -; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.v2f16.i32.i32(<8 x float> [[MULADDHI]], <2 x half> [[TMP1]], i32 6, i32 1) #[[ATTR1]] +; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> [[MULADDHI]], <2 x half> [[TMP1]], i32 6, i32 1) ; GFX11-NEXT: br label [[END:%.*]] ; GFX11: end: -; GFX11-NEXT: [[TMP3:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[TMP2]], i1 false) #[[ATTR0]] -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[TMP3]]) -; GFX11-NEXT: [[TMP4:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[TMP2]], i1 true) #[[ATTR0]] -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[TMP4]]) +; GFX11-NEXT: [[TMP3:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[TMP2]], i1 false) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP3]]) +; GFX11-NEXT: [[TMP4:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[TMP2]], i1 true) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP4]]) ; GFX11-NEXT: ret void ; entry: - %muladdLo = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) br label %scale scale: - %scaledLo = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> %muladdLo, half 0xH310F, i32 1, i32 1) + %scaledLo = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %muladdLo, half 0xH310F, i32 1, i32 1) %factorHi = load half, ptr %in - %scaledHi = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> %muladdHi, half %factorHi, i32 1, i32 1) + %scaledHi = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %muladdHi, half %factorHi, i32 1, i32 1) br label %end end: - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %scaledLo) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %scaledHi) + call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %scaledLo) + call void (...) 
@lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %scaledHi) ret void } @@ -566,31 +567,31 @@ define void @matmul_f16_pack_binop_fadd(ptr %out0, ptr %out1, <8 x float> %a, <8 ; GFX11-LABEL: define void @matmul_f16_pack_binop_fadd ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]], <8 x float> [[C2:%.*]], <8 x float> [[C3:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> @lgc.cooperative.matrix.pack.v8f32.v8f32.v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) #[[ATTR0]] -; GFX11-NEXT: [[MULADDLO0:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI0:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO0]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> @lgc.cooperative.matrix.pack.v8f32.v8f32.v8f32(<8 x float> [[C2]], <8 x float> [[C3]]) #[[ATTR0]] -; GFX11-NEXT: [[MULADDLO1:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI1:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[MULADDHI0]], i1 false) #[[ATTR0]] -; GFX11-NEXT: [[TMP3:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[MULADDHI1]], i1 false) #[[ATTR0]] -; GFX11-NEXT: [[BINOPLO:%.*]] = call <8 x float> @lgc.cooperative.matrix.binop.v8f32.i32.v8f32.v8f32.i32.i32(i32 1, <8 x float> [[TMP2]], <8 x float> [[TMP3]], i32 1, i32 1) -; GFX11-NEXT: [[TMP4:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[MULADDHI0]], i1 true) #[[ATTR0]] -; GFX11-NEXT: [[TMP5:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[MULADDHI1]], i1 true) #[[ATTR0]] -; GFX11-NEXT: [[BINOPHI:%.*]] = call <8 x float> @lgc.cooperative.matrix.binop.v8f32.i32.v8f32.v8f32.i32.i32(i32 1, <8 x float> [[TMP4]], <8 x float> [[TMP5]], i32 1, i32 1) -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[BINOPLO]]) -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[BINOPHI]]) +; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) +; GFX11-NEXT: [[MULADDLO0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI0:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO0]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C2]], <8 x float> [[C3]]) +; GFX11-NEXT: [[MULADDLO1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI0]], i1 false) +; GFX11-NEXT: [[TMP3:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI1]], i1 false) +; GFX11-NEXT: [[BINOPLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> [[TMP2]], <8 x float> [[TMP3]], i32 1, i32 1) +; GFX11-NEXT: [[TMP4:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI0]], i1 true) +; GFX11-NEXT: [[TMP5:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI1]], i1 true) +; GFX11-NEXT: [[BINOPHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> [[TMP4]], <8 x float> [[TMP5]], i32 1, i32 1) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[BINOPLO]]) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[BINOPHI]]) ; GFX11-NEXT: ret void ; entry: - %muladdLo0 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi0 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdLo1 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c2, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi1 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c3, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %binOpLo = call <8 x float> @lgc.cooperative.matrix.binop.v8f32.i32.v8f32.v8f32.i32.i32(i32 1, <8 x float> %muladdLo0, <8 x float> %muladdLo1, i32 1, i32 1) #3 - %binOpHi = call <8 x float> @lgc.cooperative.matrix.binop.v8f32.i32.v8f32.v8f32.i32.i32(i32 1, <8 x float> %muladdHi0, <8 x float> %muladdHi1, i32 1, i32 1) #3 - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %binOpLo) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %binOpHi) + %muladdLo0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdHi0 = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c2, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdHi1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c3, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %binOpLo = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> %muladdLo0, <8 x float> %muladdLo1, i32 1, i32 1) #3 + %binOpHi = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> %muladdHi0, <8 x float> %muladdHi1, i32 1, i32 1) #3 + call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %binOpLo) + call void (...) @lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %binOpHi) ret void } @@ -598,31 +599,31 @@ define void @matmul_f16_pack_binop_incompatible_matrices(ptr %out0, ptr %out1, < ; GFX11-LABEL: define void @matmul_f16_pack_binop_incompatible_matrices ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]], <8 x float> [[C2:%.*]], <8 x float> [[C3:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> @lgc.cooperative.matrix.pack.v8f32.v8f32.v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) #[[ATTR0]] -; GFX11-NEXT: [[MULADDLO0:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI0:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO0]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> @lgc.cooperative.matrix.pack.v8f32.v8f32.v8f32(<8 x float> [[C2]], <8 x float> [[C3]]) #[[ATTR0]] -; GFX11-NEXT: [[MULADDLO1:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI1:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[MULADDHI0]], i1 false) #[[ATTR0]] -; GFX11-NEXT: [[TMP3:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[MULADDHI1]], i1 false) #[[ATTR0]] -; GFX11-NEXT: [[BINOPLO:%.*]] = call <8 x float> @lgc.cooperative.matrix.binop.v8f32.i32.v8f32.v8f32.i32.i32(i32 1, <8 x float> [[TMP2]], <8 x float> [[TMP3]], i32 1, i32 1) -; GFX11-NEXT: [[TMP4:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[MULADDHI0]], i1 true) #[[ATTR0]] -; GFX11-NEXT: [[TMP5:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[MULADDHI0]], i1 true) #[[ATTR0]] -; GFX11-NEXT: [[BINOPHI:%.*]] = call <8 x float> @lgc.cooperative.matrix.binop.v8f32.i32.v8f32.v8f32.i32.i32(i32 1, <8 x float> 
[[TMP5]], <8 x float> [[TMP4]], i32 1, i32 1) -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[BINOPLO]]) -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[BINOPHI]]) +; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) +; GFX11-NEXT: [[MULADDLO0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO0]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C2]], <8 x float> [[C3]]) +; GFX11-NEXT: [[MULADDLO1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI0]], i1 false) +; GFX11-NEXT: [[TMP3:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI1]], i1 false) +; GFX11-NEXT: [[BINOPLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> [[TMP2]], <8 x float> [[TMP3]], i32 1, i32 1) +; GFX11-NEXT: [[TMP4:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI0]], i1 true) +; GFX11-NEXT: [[TMP5:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI0]], i1 true) +; GFX11-NEXT: [[BINOPHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> [[TMP5]], <8 x float> [[TMP4]], i32 1, i32 1) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[BINOPLO]]) +; GFX11-NEXT: call void (...) 
@lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[BINOPHI]]) ; GFX11-NEXT: ret void ; entry: - %muladdLo0 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi0 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdLo1 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c2, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi1 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c3, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %binOpLo = call <8 x float> @lgc.cooperative.matrix.binop.v8f32.i32.v8f32.v8f32.i32.i32(i32 1, <8 x float> %muladdLo0, <8 x float> %muladdLo1, i32 1, i32 1) #3 - %binOpHi = call <8 x float> @lgc.cooperative.matrix.binop.v8f32.i32.v8f32.v8f32.i32.i32(i32 1, <8 x float> %muladdHi0, <8 x float> %muladdHi0, i32 1, i32 1) #3 - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %binOpLo) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %binOpHi) + %muladdLo0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdHi0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c2, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdHi1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c3, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %binOpLo = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> %muladdLo0, <8 x float> %muladdLo1, i32 1, i32 1) #3 + %binOpHi = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> %muladdHi0, <8 x float> %muladdHi0, i32 1, i32 1) #3 + call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %binOpLo) + call void (...) 
@lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %binOpHi) ret void } @@ -630,31 +631,31 @@ define void @matmul_f16_pack_binop_incompatible_arithop(ptr %out0, ptr %out1, <8 ; GFX11-LABEL: define void @matmul_f16_pack_binop_incompatible_arithop ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]], <8 x float> [[C2:%.*]], <8 x float> [[C3:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> @lgc.cooperative.matrix.pack.v8f32.v8f32.v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) #[[ATTR0]] -; GFX11-NEXT: [[MULADDLO0:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI0:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO0]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> @lgc.cooperative.matrix.pack.v8f32.v8f32.v8f32(<8 x float> [[C2]], <8 x float> [[C3]]) #[[ATTR0]] -; GFX11-NEXT: [[MULADDLO1:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI1:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[MULADDHI0]], i1 false) #[[ATTR0]] -; GFX11-NEXT: [[TMP3:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[MULADDHI1]], i1 false) #[[ATTR0]] -; GFX11-NEXT: [[BINOPLO:%.*]] = call <8 x float> @lgc.cooperative.matrix.binop.v8f32.i32.v8f32.v8f32.i32.i32(i32 1, <8 x float> [[TMP2]], <8 x float> [[TMP3]], i32 1, i32 1) -; GFX11-NEXT: [[TMP4:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[MULADDHI0]], i1 true) #[[ATTR0]] -; GFX11-NEXT: [[TMP5:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[MULADDHI1]], i1 true) #[[ATTR0]] -; GFX11-NEXT: [[BINOPHI:%.*]] = call <8 x float> @lgc.cooperative.matrix.binop.v8f32.i32.v8f32.v8f32.i32.i32(i32 3, <8 x float> [[TMP4]], <8 x float> [[TMP5]], i32 1, i32 1) -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[BINOPLO]]) -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[BINOPHI]]) +; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) +; GFX11-NEXT: [[MULADDLO0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI0:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO0]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C2]], <8 x float> [[C3]]) +; GFX11-NEXT: [[MULADDLO1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI0]], i1 false) +; GFX11-NEXT: [[TMP3:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI1]], i1 false) +; GFX11-NEXT: [[BINOPLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> [[TMP2]], <8 x float> [[TMP3]], i32 1, i32 1) +; GFX11-NEXT: [[TMP4:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI0]], i1 true) +; GFX11-NEXT: [[TMP5:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI1]], i1 true) +; GFX11-NEXT: [[BINOPHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 3, <8 x float> [[TMP4]], <8 x float> [[TMP5]], i32 1, i32 1) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[BINOPLO]]) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[BINOPHI]]) ; GFX11-NEXT: ret void ; entry: - %muladdLo0 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi0 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdLo1 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c2, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi1 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c3, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %binOpLo = call <8 x float> @lgc.cooperative.matrix.binop.v8f32.i32.v8f32.v8f32.i32.i32(i32 1, <8 x float> %muladdLo0, <8 x float> %muladdLo1, i32 1, i32 1) #3 - %binOpHi = call <8 x float> @lgc.cooperative.matrix.binop.v8f32.i32.v8f32.v8f32.i32.i32(i32 3, <8 x float> %muladdHi0, <8 x float> %muladdHi1, i32 1, i32 1) #3 - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %binOpLo) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %binOpHi) + %muladdLo0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdHi0 = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdLo1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c2, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdHi1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c3, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %binOpLo = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> %muladdLo0, <8 x float> %muladdLo1, i32 1, i32 1) #3 + %binOpHi = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 3, <8 x float> %muladdHi0, <8 x float> %muladdHi1, i32 1, i32 1) #3 + call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %binOpLo) + call void (...) @lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %binOpHi) ret void } @@ -662,31 +663,31 @@ define void @matmul_f16_unpack_before_convert(ptr %out0, ptr %out1, <8 x float> ; GFX11-LABEL: define void @matmul_f16_unpack_before_convert ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> @lgc.cooperative.matrix.pack.v8f32.v8f32.v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) #[[ATTR0]] -; GFX11-NEXT: [[MULADDLO0:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI0:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO0]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[MULADDHI0]], i1 false) #[[ATTR0]] -; GFX11-NEXT: [[CONVERTLO:%.*]] = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> [[TMP1]], i32 1, i32 1, i32 1, i32 0) -; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[MULADDHI0]], i1 true) #[[ATTR0]] -; GFX11-NEXT: [[CONVERTHI:%.*]] = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> [[TMP2]], i32 1, i32 1, i32 1, i32 0) -; GFX11-NEXT: [[TMP3:%.*]] = call <8 x float> @lgc.cooperative.matrix.pack.v8f32.v8f32.v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) #[[ATTR0]] -; GFX11-NEXT: [[MULADDLO1:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[CONVERTLO]], <8 x float> [[B]], <8 x float> [[TMP3]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI1:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[CONVERTHI]], <8 x float> [[MULADDLO1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[TMP4:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[MULADDHI1]], i1 false) #[[ATTR0]] -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x 
float> [[TMP4]]) -; GFX11-NEXT: [[TMP5:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[MULADDHI1]], i1 true) #[[ATTR0]] -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[TMP5]]) +; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) +; GFX11-NEXT: [[MULADDLO0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[MULADDLO0]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI0]], i1 false) +; GFX11-NEXT: [[CONVERTLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> [[TMP1]], i32 1, i32 1, i32 1, i32 0) +; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI0]], i1 true) +; GFX11-NEXT: [[CONVERTHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> [[TMP2]], i32 1, i32 1, i32 1, i32 0) +; GFX11-NEXT: [[TMP3:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) +; GFX11-NEXT: [[MULADDLO1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[CONVERTLO]], <8 x float> [[B]], <8 x float> [[TMP3]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[CONVERTHI]], <8 x float> [[MULADDLO1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[TMP4:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI1]], i1 false) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP4]]) +; GFX11-NEXT: [[TMP5:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[MULADDHI1]], i1 true) +; GFX11-NEXT: call void (...) 
@lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP5]]) ; GFX11-NEXT: ret void ; entry: - %muladdLo0 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi0 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %convertLo = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %muladdLo0, i32 1, i32 1, i32 1, i32 0) - %convertHi = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %muladdHi0, i32 1, i32 1, i32 1, i32 0) - %muladdLo1 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %convertLo, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %muladdHi1 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %convertHi, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %muladdLo1) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %muladdHi1) + %muladdLo0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdHi0 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %convertLo = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %muladdLo0, i32 1, i32 1, i32 1, i32 0) + %convertHi = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %muladdHi0, i32 1, i32 1, i32 1, i32 0) + %muladdLo1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %convertLo, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %muladdHi1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %convertHi, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %muladdLo1) + call void (...) 
@lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %muladdHi1) ret void } @@ -694,17 +695,17 @@ define void @matmul_f32_no_pack(ptr %out0, ptr %out1, <8 x float> %a, <8 x float ; GFX11-LABEL: define void @matmul_f32_no_pack ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[C0]], i1 true, i1 true, i1 false, i1 false, i32 2, i32 2) -; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[C1]], i1 true, i1 true, i1 false, i1 false, i32 2, i32 2) -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT0]], i32 4, i1 true, i32 2, i32 0, i32 0, <8 x float> [[MULADDLO]]) -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT1]], i32 4, i1 true, i32 2, i32 0, i32 0, <8 x float> [[MULADDHI]]) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[C0]], i1 true, i1 true, i1 false, i1 false, i32 2, i32 2) +; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[C1]], i1 true, i1 true, i1 false, i1 false, i32 2, i32 2) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 2, i32 0, i32 0, <8 x float> [[MULADDLO]]) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 2, i32 0, i32 0, <8 x float> [[MULADDHI]]) ; GFX11-NEXT: ret void ; entry: - %muladdLo = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 2, i32 2) - %muladdHi = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 2, i32 2) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out0, i32 4, i1 true, i32 2, i32 0, i32 0, <8 x float> %muladdLo) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out1, i32 4, i1 true, i32 2, i32 0, i32 0, <8 x float> %muladdHi) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 2, i32 2) + %muladdHi = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 2, i32 2) + call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 2, i32 0, i32 0, <8 x float> %muladdLo) + call void (...) 
@lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 2, i32 0, i32 0, <8 x float> %muladdHi) ret void } @@ -712,19 +713,19 @@ define void @matmul_f16_modified_accumulator(ptr %out0, ptr %out1, <8 x float> % ; GFX11-LABEL: define void @matmul_f16_modified_accumulator ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[C0]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) -; GFX11-NEXT: [[ACCUM_C2:%.*]] = call <8 x float> @lgc.cooperative.matrix.binop.v8f32.i32.v8f32.v8f32.i32.i32(i32 1, <8 x float> [[MULADDLO]], <8 x float> [[C1]], i32 1, i32 1) -; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM_C2]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[MULADDLO]]) -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[MULADDHI]]) +; GFX11-NEXT: [[MULADDLO:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[C0]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: [[ACCUM_C2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> [[MULADDLO]], <8 x float> [[C1]], i32 1, i32 1) +; GFX11-NEXT: [[MULADDHI:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[ACCUM_C2]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[MULADDLO]]) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[MULADDHI]]) ; GFX11-NEXT: ret void ; entry: - %muladdLo = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %accum.c2 = call <8 x float> @lgc.cooperative.matrix.binop.v8f32.i32.v8f32.v8f32.i32.i32(i32 1, <8 x float> %muladdLo, <8 x float> %c1, i32 1, i32 1) - %muladdHi = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %accum.c2, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %muladdLo) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %muladdHi) + %muladdLo = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %accum.c2 = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> %muladdLo, <8 x float> %c1, i32 1, i32 1) + %muladdHi = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %accum.c2, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %muladdLo) + call void (...) @lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %muladdHi) ret void } @@ -732,24 +733,24 @@ define void @matmul_f16_store_between_muladds(ptr %out0, ptr %out1, <8 x float> ; GFX11-LABEL: define void @matmul_f16_store_between_muladds ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> @lgc.cooperative.matrix.pack.v8f32.v8f32.v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) #[[ATTR0]] -; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[CHAIN1_1]], i1 true) #[[ATTR0]] -; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[CHAIN0_2]]) -; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[CHAIN1_1]], i1 true) #[[ATTR0]] -; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP2]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[CHAIN1_2]]) +; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) +; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_1]], i1 true) +; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP1]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[CHAIN0_2]]) +; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN1_1]], i1 true) +; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP2]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[CHAIN1_2]]) ; GFX11-NEXT: ret void ; entry: - %chain0.1 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.1 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain0.2 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %chain0.2) - %chain1.2 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %chain1.2) + %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain0.2) + %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + call void (...) 
@lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain1.2) ret void } @@ -757,33 +758,33 @@ define void @matmul_f16_store_within_chain(ptr %out0, ptr %out1, <8 x float> %a, ; GFX11-LABEL: define void @matmul_f16_store_within_chain ; GFX11-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C0:%.*]], <8 x float> [[C1:%.*]]) { ; GFX11-NEXT: entry: -; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> @lgc.cooperative.matrix.pack.v8f32.v8f32.v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) #[[ATTR0]] -; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN1_1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) -; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[CHAIN0_2]], i1 false) #[[ATTR0]] -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[TMP1]]) -; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> @lgc.cooperative.matrix.unpack.v8f32.v8f32.i1(<8 x float> [[CHAIN0_2]], i1 true) #[[ATTR0]] -; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP2]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) -; GFX11-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[CHAIN1_2]]) +; GFX11-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> [[C0]], <8 x float> [[C1]]) +; GFX11-NEXT: [[CHAIN0_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP0]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN1_1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN0_1]], i1 true, i1 true, i1 true, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[CHAIN0_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[CHAIN1_1]], i1 true, i1 true, i1 false, i1 true, i32 1, i32 1) +; GFX11-NEXT: [[TMP1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN0_2]], i1 false) +; GFX11-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[OUT0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP1]]) +; GFX11-NEXT: [[TMP2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> [[CHAIN0_2]], i1 true) +; GFX11-NEXT: [[CHAIN1_2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP2]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; GFX11-NEXT: call void (...) 
@lgc.cooperative.matrix.store(ptr [[OUT1]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[CHAIN1_2]]) ; GFX11-NEXT: ret void ; entry: - %chain0.1 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain1.1 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %chain0.2 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %chain0.2) - %chain1.2 = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %chain1.2) + %chain0.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain1.1 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %chain0.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain0.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + call void (...) @lgc.cooperative.matrix.store(ptr %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain0.2) + %chain1.2 = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %chain1.1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + call void (...) @lgc.cooperative.matrix.store(ptr %out1, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %chain1.2) ret void } declare i1 @getcc() declare <8 x float> @getmat1() -declare <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr, i32, i1, i32, i32, i32) -declare <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32, <8 x float>, i32, i32, i32, i32) -declare void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr, i32, i1, i32, i32, i32, <8 x float>) -declare <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float>, <8 x float>, <8 x float>, i1, i1, i1, i1, i32, i32) -declare <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float>, half, i32, i32) -declare <8 x float> @lgc.cooperative.matrix.binop.v8f32.i32.v8f32.v8f32.i32.i32(i32, <8 x float>, <8 x float>, i32, i32) +declare <8 x float> @lgc.cooperative.matrix.load__v8f32(...) +declare <8 x float> @lgc.cooperative.matrix.convert__v8f32(...) +declare void @lgc.cooperative.matrix.store(...) +declare <8 x float> @lgc.cooperative.matrix.muladd__v8f32(...) +declare <8 x float> @lgc.cooperative.matrix.times.scalar__v8f32(...) +declare <8 x float> @lgc.cooperative.matrix.binop__v8f32(...) 
diff --git a/lgc/test/Transforms/CombineCooperativeMatrix/simple.lgc b/lgc/test/Transforms/CombineCooperativeMatrix/simple.lgc index 117600ccc8..c08bdd15c4 100644 --- a/lgc/test/Transforms/CombineCooperativeMatrix/simple.lgc +++ b/lgc/test/Transforms/CombineCooperativeMatrix/simple.lgc @@ -3,11 +3,11 @@ define <8 x float> @noop_transpose(<8 x float> %x) { ; CHECK-LABEL: @noop_transpose( -; CHECK-NEXT: [[T:%.*]] = call <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float> [[X:%.*]], i32 1, i32 0) +; CHECK-NEXT: [[T:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> [[X:%.*]], i32 1, i32 0) ; CHECK-NEXT: ret <8 x float> [[T]] ; - %t = call <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float> %x, i32 1, i32 0) + %t = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> %x, i32 1, i32 0) ret <8 x float> %t } @@ -16,37 +16,37 @@ define <8 x float> @collapse_transpose(<8 x float> %x) { ; CHECK-NEXT: ret <8 x float> [[X:%.*]] ; - %t1 = call <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float> %x, i32 1, i32 0) - %t2 = call <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float> %t1, i32 1, i32 0) + %t1 = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> %x, i32 1, i32 0) + %t2 = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> %t1, i32 1, i32 0) ret <8 x float> %t2 } define <8 x float> @test_load_transpose(ptr addrspace(3) %ptr) { ; CHECK-LABEL: @test_load_transpose( -; CHECK-NEXT: [[A:%.*]] = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p3.i32.i1.i32.i32.i32(ptr addrspace(3) [[PTR:%.*]], i32 4, i1 false, i32 1, i32 0, i32 0) +; CHECK-NEXT: [[A:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(3) [[PTR:%.*]], i32 4, i1 false, i32 1, i32 0, i32 0) ; CHECK-NEXT: ret <8 x float> [[A]] ; - %a = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p3.i32.i1.i32.i32.i32(ptr addrspace(3) %ptr, i32 4, i1 true, i32 1, i32 0, i32 0) - %t = call <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float> %a, i32 1, i32 0) + %a = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(3) %ptr, i32 4, i1 true, i32 1, i32 0, i32 0) + %t = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> %a, i32 1, i32 0) ret <8 x float> %t } define void @test_store_transpose(ptr addrspace(3) %ptr, <8 x float> %a) { ; CHECK-LABEL: @test_store_transpose( -; CHECK-NEXT: call void @lgc.cooperative.matrix.store.p3.i32.i1.i32.i32.i32.v8f32(ptr addrspace(3) [[PTR:%.*]], i32 4, i1 false, i32 1, i32 0, i32 0, <8 x float> [[A:%.*]]) +; CHECK-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) [[PTR:%.*]], i32 4, i1 false, i32 1, i32 0, i32 0, i32 16, <8 x float> [[A:%.*]]) ; CHECK-NEXT: ret void ; - %t = call <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float> %a, i32 1, i32 0) - call void @lgc.cooperative.matrix.store.p3.i32.i1.i32.i32.i32.v8f32(ptr addrspace(3) %ptr, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %t) + %t = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> %a, i32 1, i32 0) + call void (...) 
@lgc.cooperative.matrix.store(ptr addrspace(3) %ptr, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %t) ret void } define void @test_phi_transpose(ptr addrspace(7) %ptr, <8 x float> %init) { ; CHECK-LABEL: @test_phi_transpose( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float> [[INIT:%.*]], i32 1, i32 0) +; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> [[INIT:%.*]], i32 1, i32 0) ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[MATRIX:%.*]] = phi <8 x float> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP:%.*]], [[LOOP]] ] @@ -54,7 +54,7 @@ define void @test_phi_transpose(ptr addrspace(7) %ptr, <8 x float> %init) { ; CHECK-NEXT: [[CC:%.*]] = call i1 @getcc() ; CHECK-NEXT: br i1 [[CC]], label [[LOOP]], label [[END:%.*]] ; CHECK: end: -; CHECK-NEXT: call void @lgc.cooperative.matrix.store.p7.i32.i1.i32.i32.i32.v8f32(ptr addrspace(7) [[PTR:%.*]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[TMP]]) +; CHECK-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr addrspace(7) [[PTR:%.*]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP]]) ; CHECK-NEXT: ret void ; entry: @@ -62,15 +62,15 @@ entry: loop: %matrix = phi <8 x float> [ %init, %entry ], [ %matrix.new, %loop ] - %t1 = call <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float> %matrix, i32 1, i32 0) + %t1 = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> %matrix, i32 1, i32 0) %tmp = call <8 x float> @process1(<8 x float> %t1) - %matrix.new = call <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float> %tmp, i32 1, i32 0) + %matrix.new = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> %tmp, i32 1, i32 0) %cc = call i1 @getcc() br i1 %cc, label %loop, label %end end: - call void @lgc.cooperative.matrix.store.p7.i32.i1.i32.i32.i32.v8f32(ptr addrspace(7) %ptr, i32 4, i1 false, i32 1, i32 0, i32 0, <8 x float> %matrix.new) + call void (...) @lgc.cooperative.matrix.store(ptr addrspace(7) %ptr, i32 4, i1 false, i32 1, i32 0, i32 0, i32 16, <8 x float> %matrix.new) ret void } @@ -78,8 +78,8 @@ define <8 x float> @test_relayout_simple(<8 x float> %ab) { ; CHECK-LABEL: @test_relayout_simple( ; CHECK-NEXT: ret <8 x float> [[AB:%.*]] ; - %b = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %ab, i32 1, i32 1, i32 0, i32 1) - %c = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %b, i32 1, i32 1, i32 1, i32 0) + %b = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %ab, i32 1, i32 1, i32 0, i32 1) + %c = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %b, i32 1, i32 1, i32 1, i32 0) ret <8 x float> %c } @@ -87,56 +87,55 @@ define <8 x float> @test_relayout_simple_reverse(<8 x float> %cd) { ; CHECK-LABEL: @test_relayout_simple_reverse( ; CHECK-NEXT: ret <8 x float> [[CD:%.*]] ; - %b = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %cd, i32 1, i32 1, i32 1, i32 0) - %c = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %b, i32 1, i32 1, i32 0, i32 1) + %b = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %cd, i32 1, i32 1, i32 1, i32 0) + %c = call <8 x float> (...) 
@lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %b, i32 1, i32 1, i32 0, i32 1) ret <8 x float> %c } define <8 x float> @test_relayout_load(ptr addrspace(3) %ptr) { ; CHECK-LABEL: @test_relayout_load( -; CHECK-NEXT: [[A:%.*]] = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p3.i32.i1.i32.i32.i32(ptr addrspace(3) [[PTR:%.*]], i32 4, i1 true, i32 1, i32 1, i32 0) +; CHECK-NEXT: [[A:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(3) [[PTR:%.*]], i32 4, i1 true, i32 1, i32 1, i32 0) ; CHECK-NEXT: ret <8 x float> [[A]] ; - %a = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p3.i32.i1.i32.i32.i32(ptr addrspace(3) %ptr, i32 4, i1 true, i32 1, i32 0, i32 0) - %b = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %a, i32 1, i32 1, i32 0, i32 1) + %a = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(3) %ptr, i32 4, i1 true, i32 1, i32 0, i32 0) + %b = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %a, i32 1, i32 1, i32 0, i32 1) ret <8 x float> %b } define <8 x float> @test_relayout_load2(ptr addrspace(3) %ptr) { ; CHECK-LABEL: @test_relayout_load2( -; CHECK-NEXT: [[A:%.*]] = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p3.i32.i1.i32.i32.i32(ptr addrspace(3) [[PTR:%.*]], i32 4, i1 true, i32 1, i32 0, i32 0) +; CHECK-NEXT: [[A:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(3) [[PTR:%.*]], i32 4, i1 true, i32 1, i32 0, i32 0) ; CHECK-NEXT: ret <8 x float> [[A]] ; - %a = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p3.i32.i1.i32.i32.i32(ptr addrspace(3) %ptr, i32 4, i1 true, i32 1, i32 1, i32 0) - %b = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %a, i32 1, i32 1, i32 1, i32 0) + %a = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(3) %ptr, i32 4, i1 true, i32 1, i32 1, i32 0) + %b = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %a, i32 1, i32 1, i32 1, i32 0) ret <8 x float> %b } define void @test_relayout_store(ptr addrspace(3) %ptr, <8 x float> %a) { ; CHECK-LABEL: @test_relayout_store( -; CHECK-NEXT: call void @lgc.cooperative.matrix.store.p3.i32.i1.i32.i32.i32.v8f32(ptr addrspace(3) [[PTR:%.*]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[A:%.*]]) +; CHECK-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) [[PTR:%.*]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[A:%.*]]) ; CHECK-NEXT: ret void ; - %b = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %a, i32 1, i32 1, i32 0, i32 1) - call void @lgc.cooperative.matrix.store.p3.i32.i1.i32.i32.i32.v8f32(ptr addrspace(3) %ptr, i32 4, i1 true, i32 1, i32 1, i32 0, <8 x float> %b) + %b = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %a, i32 1, i32 1, i32 0, i32 1) + call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) %ptr, i32 4, i1 true, i32 1, i32 1, i32 0, i32 16, <8 x float> %b) ret void } define void @test_relayout_store2(ptr addrspace(3) %ptr, <8 x float> %a) { ; CHECK-LABEL: @test_relayout_store2( -; CHECK-NEXT: call void @lgc.cooperative.matrix.store.p3.i32.i1.i32.i32.i32.v8f32(ptr addrspace(3) [[PTR:%.*]], i32 4, i1 true, i32 1, i32 1, i32 0, <8 x float> [[A:%.*]]) +; CHECK-NEXT: call void (...) 
@lgc.cooperative.matrix.store(ptr addrspace(3) [[PTR:%.*]], i32 4, i1 true, i32 1, i32 1, i32 0, i32 16, <8 x float> [[A:%.*]]) ; CHECK-NEXT: ret void ; - %b = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %a, i32 1, i32 1, i32 1, i32 0) - call void @lgc.cooperative.matrix.store.p3.i32.i1.i32.i32.i32.v8f32(ptr addrspace(3) %ptr, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %b) + %b = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %a, i32 1, i32 1, i32 1, i32 0) + call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) %ptr, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %b) ret void } declare i1 @getcc() declare <8 x float> @process1(<8 x float>) -declare <8 x float> @lgc.cooperative.matrix.load.v8f32.p3.i32.i1.i32.i32.i32(ptr addrspace(3), i32, i1, i32, i32, i32) -declare <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float>, i32, i32) -declare <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32, <8 x float>, i32, i32, i32, i32) -declare void @lgc.cooperative.matrix.store.p3.i32.i1.i32.i32.i32.v8f32(ptr addrspace(3), i32, i1, i32, i32, i32, <8 x float>) -declare void @lgc.cooperative.matrix.store.p7.i32.i1.i32.i32.i32.v8f32(ptr addrspace(7), i32, i1, i32, i32, i32, <8 x float>) +declare <8 x float> @lgc.cooperative.matrix.load__v8f32(...) +declare <8 x float> @lgc.cooperative.matrix.transpose__v8f32(...) +declare <8 x float> @lgc.cooperative.matrix.convert__v8f32(...) +declare void @lgc.cooperative.matrix.store(...) diff --git a/lgc/test/Transforms/CombineCooperativeMatrix/unhandled-inout.lgc b/lgc/test/Transforms/CombineCooperativeMatrix/unhandled-inout.lgc index 08f079cb89..727138420e 100644 --- a/lgc/test/Transforms/CombineCooperativeMatrix/unhandled-inout.lgc +++ b/lgc/test/Transforms/CombineCooperativeMatrix/unhandled-inout.lgc @@ -10,16 +10,16 @@ define <8 x float> @insert_transpose(<8 x float> %x) { ; CHECK: loop: ; CHECK-NEXT: [[V_LOOP:%.*]] = phi <8 x float> [ [[X]], [[ENTRY:%.*]] ], [ [[MULADD:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[F:%.*]] = call <8 x float> @getmat1() -; CHECK-NEXT: [[MULADD]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[F]], <8 x float> [[F]], <8 x float> [[V_LOOP]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; CHECK-NEXT: [[MULADD]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[F]], <8 x float> [[F]], <8 x float> [[V_LOOP]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) ; CHECK-NEXT: [[CC:%.*]] = call i1 @getcc() ; CHECK-NEXT: br i1 [[CC]], label [[LOOP]], label [[END]] ; CHECK: end: ; CHECK-NEXT: [[R:%.*]] = phi <8 x float> [ [[MULADD]], [[LOOP]] ], [ [[X]], [[ENTRY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float> [[R]], i32 1, i32 0) +; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> [[R]], i32 1, i32 0) ; CHECK-NEXT: ret <8 x float> [[TMP0]] ; entry: - %in.t = call <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float> %x, i32 1, i32 0) + %in.t = call <8 x float> (...) 
@lgc.cooperative.matrix.transpose__v8f32(<8 x float> %x, i32 1, i32 0) %guard = call i1 @getcc() br i1 %guard, label %loop, label %end @@ -27,9 +27,9 @@ loop: %v.loop = phi <8 x float> [ %in.t, %entry ], [ %v.next, %loop ] %f = call <8 x float> @getmat1() - %pre.t = call <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float> %v.loop, i32 1, i32 0) - %muladd = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %f, <8 x float> %f, <8 x float> %pre.t, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %v.next = call <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float> %muladd, i32 1, i32 0) + %pre.t = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> %v.loop, i32 1, i32 0) + %muladd = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %f, <8 x float> %f, <8 x float> %pre.t, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %v.next = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> %muladd, i32 1, i32 0) %cc = call i1 @getcc() br i1 %cc, label %loop, label %end @@ -42,13 +42,13 @@ end: define <8 x float> @reuse_transpose(<8 x float> %x) { ; CHECK-LABEL: define <8 x float> @reuse_transpose ; CHECK-SAME: (<8 x float> [[X:%.*]]) { -; CHECK-NEXT: [[T1:%.*]] = call <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float> [[X]], i32 1, i32 0) -; CHECK-NEXT: [[R:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[T1]], <8 x float> [[X]], <8 x float> zeroinitializer, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; CHECK-NEXT: [[T1:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> [[X]], i32 1, i32 0) +; CHECK-NEXT: [[R:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[T1]], <8 x float> [[X]], <8 x float> zeroinitializer, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) ; CHECK-NEXT: ret <8 x float> [[R]] ; - %t1 = call <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float> %x, i32 1, i32 0) - %t2 = call <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float> %t1, i32 1, i32 0) - %r = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %t1, <8 x float> %t2, <8 x float> zeroinitializer, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %t1 = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> %x, i32 1, i32 0) + %t2 = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> %t1, i32 1, i32 0) + %r = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %t1, <8 x float> %t2, <8 x float> zeroinitializer, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) ret <8 x float> %r } @@ -57,22 +57,22 @@ define <8 x float> @insert_convert(ptr %ptr) { ; CHECK-LABEL: define <8 x float> @insert_convert ; CHECK-SAME: (ptr [[PTR:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[LOAD:%.*]] = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr [[PTR]], i32 4, i1 false, i32 1, i32 1, i32 0) +; CHECK-NEXT: [[LOAD:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.load__v8f32(ptr [[PTR]], i32 4, i1 false, i32 1, i32 1, i32 0) ; CHECK-NEXT: [[GUARD:%.*]] = call i1 @getcc() ; CHECK-NEXT: br i1 [[GUARD]], label [[LOOP:%.*]], label [[END:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[V_LOOP:%.*]] = phi <8 x float> [ [[LOAD]], [[ENTRY:%.*]] ], [ [[MULADD:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[F:%.*]] = call <8 x float> @getmat1() -; CHECK-NEXT: [[MULADD]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[F]], <8 x float> [[F]], <8 x float> [[V_LOOP]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; CHECK-NEXT: [[MULADD]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[F]], <8 x float> [[F]], <8 x float> [[V_LOOP]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) ; CHECK-NEXT: [[CC:%.*]] = call i1 @getcc() ; CHECK-NEXT: br i1 [[CC]], label [[LOOP]], label [[END]] ; CHECK: end: ; CHECK-NEXT: [[R:%.*]] = phi <8 x float> [ [[MULADD]], [[LOOP]] ], [ [[LOAD]], [[ENTRY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> [[R]], i32 1, i32 1, i32 1, i32 0) +; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> [[R]], i32 1, i32 1, i32 1, i32 0) ; CHECK-NEXT: ret <8 x float> [[TMP0]] ; entry: - %load = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr %ptr, i32 4, i1 false, i32 1, i32 0, i32 0) + %load = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr %ptr, i32 4, i1 false, i32 1, i32 0, i32 0) %guard = call i1 @getcc() br i1 %guard, label %loop, label %end @@ -80,9 +80,9 @@ loop: %v.loop = phi <8 x float> [ %load, %entry ], [ %v.next, %loop ] %f = call <8 x float> @getmat1() - %pre = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %v.loop, i32 1, i32 1, i32 0, i32 1) - %muladd = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %f, <8 x float> %f, <8 x float> %pre, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %v.next = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %muladd, i32 1, i32 1, i32 1, i32 0) + %pre = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %v.loop, i32 1, i32 1, i32 0, i32 1) + %muladd = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %f, <8 x float> %f, <8 x float> %pre, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %v.next = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %muladd, i32 1, i32 1, i32 1, i32 0) %cc = call i1 @getcc() br i1 %cc, label %loop, label %end @@ -95,13 +95,13 @@ end: define <8 x float> @reuse_convert(<8 x float> %x) { ; CHECK-LABEL: define <8 x float> @reuse_convert ; CHECK-SAME: (<8 x float> [[X:%.*]]) { -; CHECK-NEXT: [[CVT1:%.*]] = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> [[X]], i32 1, i32 1, i32 0, i32 1) -; CHECK-NEXT: [[R:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[X]], <8 x float> [[X]], <8 x float> [[CVT1]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; CHECK-NEXT: [[CVT1:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> [[X]], i32 1, i32 1, i32 0, i32 1) +; CHECK-NEXT: [[R:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[X]], <8 x float> [[X]], <8 x float> [[CVT1]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) ; CHECK-NEXT: ret <8 x float> [[R]] ; - %cvt1 = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %x, i32 1, i32 1, i32 0, i32 1) - %cvt2 = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %cvt1, i32 1, i32 1, i32 1, i32 0) - %r = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %cvt2, <8 x float> %cvt2, <8 x float> %cvt1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %cvt1 = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %x, i32 1, i32 1, i32 0, i32 1) + %cvt2 = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %cvt1, i32 1, i32 1, i32 1, i32 0) + %r = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %cvt2, <8 x float> %cvt2, <8 x float> %cvt1, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) ret <8 x float> %r } @@ -109,20 +109,20 @@ define void @convert_to_acc_inner_binop(ptr %ptr0, ptr %ptr1) { ; CHECK-LABEL: define void @convert_to_acc_inner_binop ; CHECK-SAME: (ptr [[PTR0:%.*]], ptr [[PTR1:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[LOAD_A:%.*]] = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr [[PTR0]], i32 4, i1 false, i32 1, i32 0, i32 0) -; CHECK-NEXT: [[LOAD_B:%.*]] = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr [[PTR1]], i32 4, i1 false, i32 1, i32 0, i32 0) -; CHECK-NEXT: [[BINOP:%.*]] = call <8 x float> @lgc.cooperative.matrix.binop.v8f32.i32.v8f32.v8f32.i32.i32(i32 1, <8 x float> [[LOAD_A]], <8 x float> [[LOAD_B]], i32 1, i32 0) -; CHECK-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[PTR0]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[BINOP]]) +; CHECK-NEXT: [[LOAD_A:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr [[PTR0]], i32 4, i1 false, i32 1, i32 1, i32 0) +; CHECK-NEXT: [[LOAD_B:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr [[PTR1]], i32 4, i1 false, i32 1, i32 1, i32 0) +; CHECK-NEXT: [[BINOP:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> [[LOAD_A]], <8 x float> [[LOAD_B]], i32 1, i32 1) +; CHECK-NEXT: call void (...) 
@lgc.cooperative.matrix.store(ptr [[PTR0]], i32 4, i1 true, i32 1, i32 1, i32 0, i32 16, <8 x float> [[BINOP]]) ; CHECK-NEXT: ret void ; entry: - %load.a = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr %ptr0, i32 4, i1 false, i32 1, i32 0, i32 0) - %load.b = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr %ptr1, i32 4, i1 false, i32 1, i32 0, i32 0) - %conv.a = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %load.a, i32 1, i32 1, i32 0, i32 1) - %conv.b = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %load.b, i32 1, i32 1, i32 0, i32 1) - %binop = call <8 x float> @lgc.cooperative.matrix.binop.v8f32.i32.v8f32.v8f32.i32.i32(i32 1, <8 x float> %conv.a, <8 x float> %conv.b, i32 1, i32 1) - %conv.post = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %binop, i32 1, i32 1, i32 1, i32 0) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %ptr0, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %conv.post) + %load.a = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr %ptr0, i32 4, i1 false, i32 1, i32 0, i32 0) + %load.b = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr %ptr1, i32 4, i1 false, i32 1, i32 0, i32 0) + %conv.a = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %load.a, i32 1, i32 1, i32 0, i32 1) + %conv.b = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %load.b, i32 1, i32 1, i32 0, i32 1) + %binop = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> %conv.a, <8 x float> %conv.b, i32 1, i32 1) + %conv.post = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %binop, i32 1, i32 1, i32 1, i32 0) + call void (...) @lgc.cooperative.matrix.store(ptr %ptr0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %conv.post) ret void } @@ -130,17 +130,17 @@ define void @convert_to_acc_inner_times_scalar(ptr %ptr) { ; CHECK-LABEL: define void @convert_to_acc_inner_times_scalar ; CHECK-SAME: (ptr [[PTR:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[LOAD:%.*]] = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr [[PTR]], i32 4, i1 false, i32 1, i32 0, i32 0) -; CHECK-NEXT: [[SCALAR:%.*]] = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> [[LOAD]], half 0xH310F, i32 1, i32 0) -; CHECK-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[PTR]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[SCALAR]]) +; CHECK-NEXT: [[LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr [[PTR]], i32 4, i1 false, i32 1, i32 0, i32 0) +; CHECK-NEXT: [[SCALAR:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> [[LOAD]], half 0xH310F, i32 1, i32 0) +; CHECK-NEXT: call void (...) 
@lgc.cooperative.matrix.store(ptr [[PTR]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[SCALAR]]) ; CHECK-NEXT: ret void ; entry: - %load = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr %ptr, i32 4, i1 false, i32 1, i32 0, i32 0) - %conv.pre = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %load, i32 1, i32 1, i32 0, i32 1) - %scalar = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> %conv.pre, half 0xH310F, i32 1, i32 1) - %conv.post = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %scalar, i32 1, i32 1, i32 1, i32 0) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %ptr, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %conv.post) + %load = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr %ptr, i32 4, i1 false, i32 1, i32 0, i32 0) + %conv.pre = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %load, i32 1, i32 1, i32 0, i32 1) + %scalar = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %conv.pre, half 0xH310F, i32 1, i32 1) + %conv.post = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %scalar, i32 1, i32 1, i32 1, i32 0) + call void (...) @lgc.cooperative.matrix.store(ptr %ptr, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %conv.post) ret void } @@ -148,20 +148,20 @@ define void @convert_to_fact_inner_binop(ptr %ptr0, ptr %ptr1) { ; CHECK-LABEL: define void @convert_to_fact_inner_binop ; CHECK-SAME: (ptr [[PTR0:%.*]], ptr [[PTR1:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[LOAD_A:%.*]] = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr [[PTR0]], i32 4, i1 false, i32 1, i32 1, i32 0) -; CHECK-NEXT: [[LOAD_B:%.*]] = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr [[PTR1]], i32 4, i1 false, i32 1, i32 1, i32 0) -; CHECK-NEXT: [[BINOP:%.*]] = call <8 x float> @lgc.cooperative.matrix.binop.v8f32.i32.v8f32.v8f32.i32.i32(i32 1, <8 x float> [[LOAD_A]], <8 x float> [[LOAD_B]], i32 1, i32 1) -; CHECK-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[PTR0]], i32 4, i1 true, i32 1, i32 1, i32 0, <8 x float> [[BINOP]]) +; CHECK-NEXT: [[LOAD_A:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr [[PTR0]], i32 4, i1 false, i32 1, i32 0, i32 0) +; CHECK-NEXT: [[LOAD_B:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr [[PTR1]], i32 4, i1 false, i32 1, i32 0, i32 0) +; CHECK-NEXT: [[BINOP:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> [[LOAD_A]], <8 x float> [[LOAD_B]], i32 1, i32 0) +; CHECK-NEXT: call void (...) 
@lgc.cooperative.matrix.store(ptr [[PTR0]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[BINOP]]) ; CHECK-NEXT: ret void ; entry: - %load.a = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr %ptr0, i32 4, i1 false, i32 1, i32 1, i32 0) - %load.b = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr %ptr1, i32 4, i1 false, i32 1, i32 1, i32 0) - %conv.a = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %load.a, i32 1, i32 1, i32 1, i32 0) - %conv.b = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %load.b, i32 1, i32 1, i32 1, i32 0) - %binop = call <8 x float> @lgc.cooperative.matrix.binop.v8f32.i32.v8f32.v8f32.i32.i32(i32 1, <8 x float> %conv.a, <8 x float> %conv.b, i32 1, i32 0) - %conv.post = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %binop, i32 1, i32 1, i32 0, i32 1) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %ptr0, i32 4, i1 true, i32 1, i32 1, i32 0, <8 x float> %conv.post) + %load.a = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr %ptr0, i32 4, i1 false, i32 1, i32 1, i32 0) + %load.b = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr %ptr1, i32 4, i1 false, i32 1, i32 1, i32 0) + %conv.a = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %load.a, i32 1, i32 1, i32 1, i32 0) + %conv.b = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %load.b, i32 1, i32 1, i32 1, i32 0) + %binop = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> %conv.a, <8 x float> %conv.b, i32 1, i32 0) + %conv.post = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %binop, i32 1, i32 1, i32 0, i32 1) + call void (...) @lgc.cooperative.matrix.store(ptr %ptr0, i32 4, i1 true, i32 1, i32 1, i32 0, i32 16, <8 x float> %conv.post) ret void } @@ -169,17 +169,17 @@ define void @convert_to_fact_inner_times_scalar(ptr %ptr) { ; CHECK-LABEL: define void @convert_to_fact_inner_times_scalar ; CHECK-SAME: (ptr [[PTR:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[LOAD:%.*]] = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr [[PTR]], i32 4, i1 false, i32 1, i32 1, i32 0) -; CHECK-NEXT: [[SCALAR:%.*]] = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> [[LOAD]], half 0xH310F, i32 1, i32 1) -; CHECK-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[PTR]], i32 4, i1 true, i32 1, i32 1, i32 0, <8 x float> [[SCALAR]]) +; CHECK-NEXT: [[LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr [[PTR]], i32 4, i1 false, i32 1, i32 1, i32 0) +; CHECK-NEXT: [[SCALAR:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> [[LOAD]], half 0xH310F, i32 1, i32 1) +; CHECK-NEXT: call void (...) 
@lgc.cooperative.matrix.store(ptr [[PTR]], i32 4, i1 true, i32 1, i32 1, i32 0, i32 16, <8 x float> [[SCALAR]]) ; CHECK-NEXT: ret void ; entry: - %load = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr %ptr, i32 4, i1 false, i32 1, i32 1, i32 0) - %conv.pre = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %load, i32 1, i32 1, i32 1, i32 0) - %scalar = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> %conv.pre, half 0xH310F, i32 1, i32 0) - %conv.post = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %scalar, i32 1, i32 1, i32 0, i32 1) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %ptr, i32 4, i1 true, i32 1, i32 1, i32 0, <8 x float> %conv.post) + %load = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr %ptr, i32 4, i1 false, i32 1, i32 1, i32 0) + %conv.pre = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %load, i32 1, i32 1, i32 1, i32 0) + %scalar = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %conv.pre, half 0xH310F, i32 1, i32 0) + %conv.post = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %scalar, i32 1, i32 1, i32 0, i32 1) + call void (...) @lgc.cooperative.matrix.store(ptr %ptr, i32 4, i1 true, i32 1, i32 1, i32 0, i32 16, <8 x float> %conv.post) ret void } @@ -187,25 +187,25 @@ define void @convert_to_acc_inner_chain(ptr %ptr) { ; CHECK-LABEL: define void @convert_to_acc_inner_chain ; CHECK-SAME: (ptr [[PTR:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[LOAD:%.*]] = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr [[PTR]], i32 4, i1 false, i32 1, i32 1, i32 0) +; CHECK-NEXT: [[LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr [[PTR]], i32 4, i1 false, i32 1, i32 1, i32 0) ; CHECK-NEXT: [[GUARD:%.*]] = call i1 @getcc() ; CHECK-NEXT: br i1 [[GUARD]], label [[LOOP:%.*]], label [[END:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[V_LOOP:%.*]] = phi <8 x float> [ [[LOAD]], [[ENTRY:%.*]] ], [ [[SCALAR:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[F:%.*]] = call <8 x float> @getmat1() -; CHECK-NEXT: [[MULADD:%.*]] = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> [[F]], <8 x float> [[F]], <8 x float> [[V_LOOP]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) -; CHECK-NEXT: [[BINOP:%.*]] = call <8 x float> @lgc.cooperative.matrix.binop.v8f32.i32.v8f32.v8f32.i32.i32(i32 1, <8 x float> [[MULADD]], <8 x float> [[MULADD]], i32 1, i32 1) -; CHECK-NEXT: [[SCALAR]] = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> [[BINOP]], half 0xH310F, i32 1, i32 1) +; CHECK-NEXT: [[MULADD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> [[F]], <8 x float> [[F]], <8 x float> [[V_LOOP]], i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) +; CHECK-NEXT: [[BINOP:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> [[MULADD]], <8 x float> [[MULADD]], i32 1, i32 1) +; CHECK-NEXT: [[SCALAR]] = call <8 x float> (...) 
@lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> [[BINOP]], half 0xH310F, i32 1, i32 1) ; CHECK-NEXT: [[CC:%.*]] = call i1 @getcc() ; CHECK-NEXT: br i1 [[CC]], label [[LOOP]], label [[END]] ; CHECK: end: ; CHECK-NEXT: [[PHI_END:%.*]] = phi <8 x float> [ [[SCALAR]], [[LOOP]] ], [ [[LOAD]], [[ENTRY]] ] -; CHECK-NEXT: [[SCALAR_END:%.*]] = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> [[PHI_END]], half 0xH312F, i32 1, i32 1) -; CHECK-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[PTR]], i32 4, i1 true, i32 1, i32 1, i32 0, <8 x float> [[SCALAR_END]]) +; CHECK-NEXT: [[SCALAR_END:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> [[PHI_END]], half 0xH312F, i32 1, i32 1) +; CHECK-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[PTR]], i32 4, i1 true, i32 1, i32 1, i32 0, i32 16, <8 x float> [[SCALAR_END]]) ; CHECK-NEXT: ret void ; entry: - %load = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr %ptr, i32 4, i1 false, i32 1, i32 0, i32 0) + %load = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr %ptr, i32 4, i1 false, i32 1, i32 0, i32 0) %guard = call i1 @getcc() br i1 %guard, label %loop, label %end @@ -213,19 +213,19 @@ loop: %v.loop = phi <8 x float> [ %load, %entry ], [ %v.next, %loop ] %f = call <8 x float> @getmat1() - %pre = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %v.loop, i32 1, i32 1, i32 0, i32 1) - %muladd = call <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float> %f, <8 x float> %f, <8 x float> %pre, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) - %binop = call <8 x float> @lgc.cooperative.matrix.binop.v8f32.i32.v8f32.v8f32.i32.i32(i32 1, <8 x float> %muladd, <8 x float> %muladd, i32 1, i32 1) - %scalar = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> %binop, half 0xH310F, i32 1, i32 1) - %v.next = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %scalar, i32 1, i32 1, i32 1, i32 0) + %pre = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %v.loop, i32 1, i32 1, i32 0, i32 1) + %muladd = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f32(<8 x float> %f, <8 x float> %f, <8 x float> %pre, i1 true, i1 true, i1 false, i1 false, i32 1, i32 1) + %binop = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> %muladd, <8 x float> %muladd, i32 1, i32 1) + %scalar = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %binop, half 0xH310F, i32 1, i32 1) + %v.next = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %scalar, i32 1, i32 1, i32 1, i32 0) %cc = call i1 @getcc() br i1 %cc, label %loop, label %end end: %phi.end = phi <8 x float> [ %v.next, %loop ], [ %load, %entry ] - %scalar.end = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> %phi.end, half 0xH312F, i32 1, i32 0) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %ptr, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %scalar.end) + %scalar.end = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %phi.end, half 0xH312F, i32 1, i32 0) + call void (...) 
@lgc.cooperative.matrix.store(ptr %ptr, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %scalar.end) ret void } @@ -233,24 +233,24 @@ define void @convert_to_fact_inner_chain(ptr %ptr) { ; CHECK-LABEL: define void @convert_to_fact_inner_chain ; CHECK-SAME: (ptr [[PTR:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[LOAD:%.*]] = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr [[PTR]], i32 4, i1 false, i32 1, i32 0, i32 0) +; CHECK-NEXT: [[LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr [[PTR]], i32 4, i1 false, i32 1, i32 0, i32 0) ; CHECK-NEXT: [[GUARD:%.*]] = call i1 @getcc() ; CHECK-NEXT: br i1 [[GUARD]], label [[LOOP:%.*]], label [[END:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[V_LOOP:%.*]] = phi <8 x float> [ [[LOAD]], [[ENTRY:%.*]] ], [ [[SCALAR:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[F:%.*]] = call <8 x float> @getmat1() -; CHECK-NEXT: [[BINOP:%.*]] = call <8 x float> @lgc.cooperative.matrix.binop.v8f32.i32.v8f32.v8f32.i32.i32(i32 1, <8 x float> [[V_LOOP]], <8 x float> [[V_LOOP]], i32 1, i32 0) -; CHECK-NEXT: [[SCALAR]] = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> [[BINOP]], half 0xH310F, i32 1, i32 0) +; CHECK-NEXT: [[BINOP:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> [[V_LOOP]], <8 x float> [[V_LOOP]], i32 1, i32 0) +; CHECK-NEXT: [[SCALAR]] = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> [[BINOP]], half 0xH310F, i32 1, i32 0) ; CHECK-NEXT: [[CC:%.*]] = call i1 @getcc() ; CHECK-NEXT: br i1 [[CC]], label [[LOOP]], label [[END]] ; CHECK: end: ; CHECK-NEXT: [[PHI_END:%.*]] = phi <8 x float> [ [[SCALAR]], [[LOOP]] ], [ [[LOAD]], [[ENTRY]] ] -; CHECK-NEXT: [[SCALAR_END:%.*]] = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> [[PHI_END]], half 0xH312F, i32 1, i32 0) -; CHECK-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[PTR]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[SCALAR_END]]) +; CHECK-NEXT: [[SCALAR_END:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> [[PHI_END]], half 0xH312F, i32 1, i32 0) +; CHECK-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[PTR]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[SCALAR_END]]) ; CHECK-NEXT: ret void ; entry: - %load = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr %ptr, i32 4, i1 false, i32 1, i32 1, i32 0) + %load = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr %ptr, i32 4, i1 false, i32 1, i32 1, i32 0) %guard = call i1 @getcc() br i1 %guard, label %loop, label %end @@ -258,18 +258,18 @@ loop: %v.loop = phi <8 x float> [ %load, %entry ], [ %v.next, %loop ] %f = call <8 x float> @getmat1() - %pre = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %v.loop, i32 1, i32 1, i32 1, i32 0) - %binop = call <8 x float> @lgc.cooperative.matrix.binop.v8f32.i32.v8f32.v8f32.i32.i32(i32 1, <8 x float> %pre, <8 x float> %pre, i32 1, i32 0) - %scalar = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> %binop, half 0xH310F, i32 1, i32 0) - %v.next = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %scalar, i32 1, i32 1, i32 0, i32 1) + %pre = call <8 x float> (...) 
@lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %v.loop, i32 1, i32 1, i32 1, i32 0) + %binop = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> %pre, <8 x float> %pre, i32 1, i32 0) + %scalar = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %binop, half 0xH310F, i32 1, i32 0) + %v.next = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %scalar, i32 1, i32 1, i32 0, i32 1) %cc = call i1 @getcc() br i1 %cc, label %loop, label %end end: %phi.end = phi <8 x float> [ %v.next, %loop ], [ %load, %entry ] - %scalar.end = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> %phi.end, half 0xH312F, i32 1, i32 1) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %ptr, i32 4, i1 true, i32 1, i32 1, i32 0, <8 x float> %scalar.end) + %scalar.end = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %phi.end, half 0xH312F, i32 1, i32 1) + call void (...) @lgc.cooperative.matrix.store(ptr %ptr, i32 4, i1 true, i32 1, i32 1, i32 0, i32 16, <8 x float> %scalar.end) ret void } @@ -277,17 +277,17 @@ define void @transpose_fact_inner_binop(ptr %ptr) { ; CHECK-LABEL: define void @transpose_fact_inner_binop ; CHECK-SAME: (ptr [[PTR:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[LOAD:%.*]] = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr [[PTR]], i32 4, i1 true, i32 1, i32 0, i32 0) -; CHECK-NEXT: [[BINOP:%.*]] = call <8 x float> @lgc.cooperative.matrix.binop.v8f32.i32.v8f32.v8f32.i32.i32(i32 1, <8 x float> [[LOAD]], <8 x float> [[LOAD]], i32 1, i32 0) -; CHECK-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[PTR]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[LOAD]]) +; CHECK-NEXT: [[LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr [[PTR]], i32 4, i1 true, i32 1, i32 0, i32 0) +; CHECK-NEXT: [[BINOP:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> [[LOAD]], <8 x float> [[LOAD]], i32 1, i32 0) +; CHECK-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[PTR]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[LOAD]]) ; CHECK-NEXT: ret void ; entry: - %load = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr %ptr, i32 4, i1 false, i32 1, i32 0, i32 0) - %trans.pre = call <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float> %load, i32 1, i32 0) - %binop = call <8 x float> @lgc.cooperative.matrix.binop.v8f32.i32.v8f32.v8f32.i32.i32(i32 1, <8 x float> %trans.pre, <8 x float> %trans.pre, i32 1, i32 0) - %trans.post = call <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float> %load, i32 1, i32 0) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %ptr, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %trans.post) + %load = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr %ptr, i32 4, i1 false, i32 1, i32 0, i32 0) + %trans.pre = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> %load, i32 1, i32 0) + %binop = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> %trans.pre, <8 x float> %trans.pre, i32 1, i32 0) + %trans.post = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> %load, i32 1, i32 0) + call void (...) 
@lgc.cooperative.matrix.store(ptr %ptr, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %trans.post) ret void } @@ -295,17 +295,17 @@ define void @transpose_acc_inner_binop(ptr %ptr) { ; CHECK-LABEL: define void @transpose_acc_inner_binop ; CHECK-SAME: (ptr [[PTR:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[LOAD:%.*]] = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr [[PTR]], i32 4, i1 true, i32 1, i32 1, i32 0) -; CHECK-NEXT: [[BINOP:%.*]] = call <8 x float> @lgc.cooperative.matrix.binop.v8f32.i32.v8f32.v8f32.i32.i32(i32 1, <8 x float> [[LOAD]], <8 x float> [[LOAD]], i32 1, i32 1) -; CHECK-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[PTR]], i32 4, i1 true, i32 1, i32 1, i32 0, <8 x float> [[LOAD]]) +; CHECK-NEXT: [[LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr [[PTR]], i32 4, i1 true, i32 1, i32 1, i32 0) +; CHECK-NEXT: [[BINOP:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> [[LOAD]], <8 x float> [[LOAD]], i32 1, i32 1) +; CHECK-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[PTR]], i32 4, i1 true, i32 1, i32 1, i32 0, i32 16, <8 x float> [[LOAD]]) ; CHECK-NEXT: ret void ; entry: - %load = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr %ptr, i32 4, i1 false, i32 1, i32 1, i32 0) - %trans.pre = call <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float> %load, i32 1, i32 1) - %binop = call <8 x float> @lgc.cooperative.matrix.binop.v8f32.i32.v8f32.v8f32.i32.i32(i32 1, <8 x float> %trans.pre, <8 x float> %trans.pre, i32 1, i32 1) - %trans.post = call <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float> %load, i32 1, i32 1) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %ptr, i32 4, i1 true, i32 1, i32 1, i32 0, <8 x float> %trans.post) + %load = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr %ptr, i32 4, i1 false, i32 1, i32 1, i32 0) + %trans.pre = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> %load, i32 1, i32 1) + %binop = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> %trans.pre, <8 x float> %trans.pre, i32 1, i32 1) + %trans.post = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> %load, i32 1, i32 1) + call void (...) @lgc.cooperative.matrix.store(ptr %ptr, i32 4, i1 true, i32 1, i32 1, i32 0, i32 16, <8 x float> %trans.post) ret void } @@ -313,17 +313,17 @@ define void @transpose_fact_inner_times_scalar(ptr %ptr) { ; CHECK-LABEL: define void @transpose_fact_inner_times_scalar ; CHECK-SAME: (ptr [[PTR:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[LOAD:%.*]] = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr [[PTR]], i32 4, i1 true, i32 1, i32 0, i32 0) -; CHECK-NEXT: [[SCALAR:%.*]] = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> [[LOAD]], half 0xH310F, i32 1, i32 0) -; CHECK-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[PTR]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[SCALAR]]) +; CHECK-NEXT: [[LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr [[PTR]], i32 4, i1 true, i32 1, i32 0, i32 0) +; CHECK-NEXT: [[SCALAR:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> [[LOAD]], half 0xH310F, i32 1, i32 0) +; CHECK-NEXT: call void (...) 
@lgc.cooperative.matrix.store(ptr [[PTR]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[SCALAR]]) ; CHECK-NEXT: ret void ; entry: - %load = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr %ptr, i32 4, i1 false, i32 1, i32 0, i32 0) - %trans.pre = call <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float> %load, i32 1, i32 0) - %scalar = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> %trans.pre, half 0xH310F, i32 1, i32 0) - %trans.post = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %scalar, i32 1, i32 1, i32 1, i32 0) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %ptr, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %trans.post) + %load = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr %ptr, i32 4, i1 false, i32 1, i32 0, i32 0) + %trans.pre = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> %load, i32 1, i32 0) + %scalar = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %trans.pre, half 0xH310F, i32 1, i32 0) + %trans.post = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %scalar, i32 1, i32 1, i32 1, i32 0) + call void (...) @lgc.cooperative.matrix.store(ptr %ptr, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %trans.post) ret void } @@ -331,17 +331,17 @@ define void @transpose_acc_inner_times_scalar(ptr %ptr) { ; CHECK-LABEL: define void @transpose_acc_inner_times_scalar ; CHECK-SAME: (ptr [[PTR:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[LOAD:%.*]] = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr [[PTR]], i32 4, i1 true, i32 1, i32 1, i32 0) -; CHECK-NEXT: [[SCALAR:%.*]] = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> [[LOAD]], half 0xH310F, i32 1, i32 1) -; CHECK-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[PTR]], i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> [[SCALAR]]) +; CHECK-NEXT: [[LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr [[PTR]], i32 4, i1 true, i32 1, i32 1, i32 0) +; CHECK-NEXT: [[SCALAR:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> [[LOAD]], half 0xH310F, i32 1, i32 1) +; CHECK-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[PTR]], i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[SCALAR]]) ; CHECK-NEXT: ret void ; entry: - %load = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr %ptr, i32 4, i1 false, i32 1, i32 1, i32 0) - %trans.pre = call <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float> %load, i32 1, i32 1) - %scalar = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> %trans.pre, half 0xH310F, i32 1, i32 1) - %trans.post = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %scalar, i32 1, i32 1, i32 1, i32 1) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %ptr, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %trans.post) + %load = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr %ptr, i32 4, i1 false, i32 1, i32 1, i32 0) + %trans.pre = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> %load, i32 1, i32 1) + %scalar = call <8 x float> (...) 
@lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %trans.pre, half 0xH310F, i32 1, i32 1) + %trans.post = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %scalar, i32 1, i32 1, i32 1, i32 1) + call void (...) @lgc.cooperative.matrix.store(ptr %ptr, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %trans.post) ret void } @@ -349,24 +349,24 @@ define void @transpose_inner_chain(ptr %ptr) { ; CHECK-LABEL: define void @transpose_inner_chain ; CHECK-SAME: (ptr [[PTR:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[LOAD:%.*]] = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr [[PTR]], i32 4, i1 true, i32 1, i32 0, i32 0) +; CHECK-NEXT: [[LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr [[PTR]], i32 4, i1 true, i32 1, i32 0, i32 0) ; CHECK-NEXT: [[GUARD:%.*]] = call i1 @getcc() ; CHECK-NEXT: br i1 [[GUARD]], label [[LOOP:%.*]], label [[END:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[V_LOOP:%.*]] = phi <8 x float> [ [[LOAD]], [[ENTRY:%.*]] ], [ [[SCALAR:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[F:%.*]] = call <8 x float> @getmat1() -; CHECK-NEXT: [[BINOP:%.*]] = call <8 x float> @lgc.cooperative.matrix.binop.v8f32.i32.v8f32.v8f32.i32.i32(i32 1, <8 x float> [[V_LOOP]], <8 x float> [[V_LOOP]], i32 1, i32 0) -; CHECK-NEXT: [[SCALAR]] = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> [[BINOP]], half 0xH310F, i32 1, i32 0) +; CHECK-NEXT: [[BINOP:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> [[V_LOOP]], <8 x float> [[V_LOOP]], i32 1, i32 0) +; CHECK-NEXT: [[SCALAR]] = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> [[BINOP]], half 0xH310F, i32 1, i32 0) ; CHECK-NEXT: [[CC:%.*]] = call i1 @getcc() ; CHECK-NEXT: br i1 [[CC]], label [[LOOP]], label [[END]] ; CHECK: end: ; CHECK-NEXT: [[PHI_END:%.*]] = phi <8 x float> [ [[SCALAR]], [[LOOP]] ], [ [[LOAD]], [[ENTRY]] ] -; CHECK-NEXT: [[SCALAR_END:%.*]] = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> [[PHI_END]], half 0xH312F, i32 1, i32 0) -; CHECK-NEXT: call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr [[PTR]], i32 4, i1 false, i32 1, i32 0, i32 0, <8 x float> [[SCALAR_END]]) +; CHECK-NEXT: [[SCALAR_END:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> [[PHI_END]], half 0xH312F, i32 1, i32 0) +; CHECK-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr [[PTR]], i32 4, i1 false, i32 1, i32 0, i32 0, i32 16, <8 x float> [[SCALAR_END]]) ; CHECK-NEXT: ret void ; entry: - %load = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr %ptr, i32 4, i1 false, i32 1, i32 0, i32 0) + %load = call <8 x float> (...) 
@lgc.cooperative.matrix.load__v8f32(ptr %ptr, i32 4, i1 false, i32 1, i32 0, i32 0) %guard = call i1 @getcc() br i1 %guard, label %loop, label %end @@ -374,28 +374,28 @@ loop: %v.loop = phi <8 x float> [ %load, %entry ], [ %v.next, %loop ] %f = call <8 x float> @getmat1() - %trans.pre = call <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float> %v.loop, i32 1, i32 0) - %binop = call <8 x float> @lgc.cooperative.matrix.binop.v8f32.i32.v8f32.v8f32.i32.i32(i32 1, <8 x float> %trans.pre, <8 x float> %trans.pre, i32 1, i32 0) - %scalar = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> %binop, half 0xH310F, i32 1, i32 0) - %v.next = call <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float> %scalar, i32 1, i32 0) + %trans.pre = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> %v.loop, i32 1, i32 0) + %binop = call <8 x float> (...) @lgc.cooperative.matrix.binop__v8f32(i32 1, <8 x float> %trans.pre, <8 x float> %trans.pre, i32 1, i32 0) + %scalar = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %binop, half 0xH310F, i32 1, i32 0) + %v.next = call <8 x float> (...) @lgc.cooperative.matrix.transpose__v8f32(<8 x float> %scalar, i32 1, i32 0) %cc = call i1 @getcc() br i1 %cc, label %loop, label %end end: %phi.end = phi <8 x float> [ %v.next, %loop ], [ %load, %entry ] - %scalar.end = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float> %phi.end, half 0xH312F, i32 1, i32 0) - call void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr %ptr, i32 4, i1 true, i32 1, i32 0, i32 0, <8 x float> %scalar.end) + %scalar.end = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %phi.end, half 0xH312F, i32 1, i32 0) + call void (...) @lgc.cooperative.matrix.store(ptr %ptr, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %scalar.end) ret void } declare i1 @getcc() declare <8 x float> @getmat1() -declare <8 x float> @lgc.cooperative.matrix.load.v8f32.p0.i32.i1.i32.i32.i32(ptr, i32, i1, i32, i32, i32) -declare <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float>, i32, i32) -declare <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32, <8 x float>, i32, i32, i32, i32) -declare <8 x float> @lgc.cooperative.matrix.muladd.v8f32.v8f32.v8f32.v8f32.i1.i1.i1.i1.i32.i32(<8 x float>, <8 x float>, <8 x float>, i1, i1, i1, i1, i32, i32) -declare <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.f16.i32.i32(<8 x float>, half, i32, i32) -declare <8 x float> @lgc.cooperative.matrix.binop.v8f32.i32.v8f32.v8f32.i32.i32(i32, <8 x float>, <8 x float>, i32, i32) -declare void @lgc.cooperative.matrix.store.p0.i32.i1.i32.i32.i32.v8f32(ptr, i32, i1, i32, i32, i32, <8 x float>) +declare <8 x float> @lgc.cooperative.matrix.load__v8f32(...) +declare <8 x float> @lgc.cooperative.matrix.transpose__v8f32(...) +declare <8 x float> @lgc.cooperative.matrix.convert__v8f32(...) +declare <8 x float> @lgc.cooperative.matrix.muladd__v8f32(...) +declare <8 x float> @lgc.cooperative.matrix.times.scalar__v8f32(...) +declare <8 x float> @lgc.cooperative.matrix.binop__v8f32(...) +declare void @lgc.cooperative.matrix.store(...) 
diff --git a/lgc/test/Transforms/Continufy/simple.lgc b/lgc/test/Transforms/Continufy/simple.lgc index f9e5197e16..9149ce9784 100644 --- a/lgc/test/Transforms/Continufy/simple.lgc +++ b/lgc/test/Transforms/Continufy/simple.lgc @@ -1,7 +1,21 @@ -; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --tool lgc --function-signature --check-globals -; RUN: lgc -mcpu=gfx1030 -o - -passes='require,lgc-continufy' %s | FileCheck --check-prefixes=CHECK %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --function-signature +; RUN: lgc -mcpu=gfx1030 -o - -passes="require,lgc-continufy" %s | FileCheck --check-prefixes=CHECK %s define spir_func void @raygen() !lgc.shaderstage !{i32 7} !continufy.stage !{i32 0} { +; CHECK-LABEL: define {{[^@]+}}@raygen +; CHECK-SAME: ({} [[STATE:%.*]], i32 [[RCR:%.*]], i32 [[SHADER_INDEX:%.*]]) !lgc.shaderstage [[META2:![0-9]+]] !continufy.stage [[META3:![0-9]+]] !lgc.cps [[META4:![0-9]+]] { +; CHECK-NEXT: [[PUSHCONST:%.*]] = call ptr addrspace(4) @lgc.user.data(i32 0) +; CHECK-NEXT: [[FN:%.*]] = load ptr, ptr addrspace(4) [[PUSHCONST]], align 8 +; CHECK-NEXT: [[P8:%.*]] = getelementptr i8, ptr addrspace(4) [[PUSHCONST]], i32 8 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr addrspace(4) [[P8]], align 4 +; CHECK-NEXT: [[P16:%.*]] = getelementptr i8, ptr addrspace(4) [[PUSHCONST]], i32 16 +; CHECK-NEXT: [[DST:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[P16]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[FN]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], 3 +; CHECK-NEXT: [[TMP3:%.*]] = call [2 x i32] (...) @lgc.cps.await__a2i32(i32 [[TMP2]], i32 8, i32 poison, i32 [[X]], ptr addrspace(1) [[DST]]) +; CHECK-NEXT: store [2 x i32] [[TMP3]], ptr addrspace(1) [[DST]], align 4 +; CHECK-NEXT: ret void +; %pushconst = call ptr addrspace(4) @lgc.user.data(i32 0) %fn = load ptr, ptr addrspace(4) %pushconst %p8 = getelementptr i8, ptr addrspace(4) %pushconst, i32 8 @@ -14,6 +28,16 @@ define spir_func void @raygen() !lgc.shaderstage !{i32 7} !continufy.stage !{i32 } define spir_func i32 @chs(i32 %x) !lgc.shaderstage !{i32 7} !continufy.stage !{i32 3} { +; CHECK-LABEL: define {{[^@]+}}@chs +; CHECK-SAME: ({} [[STATE:%.*]], i32 [[RCR:%.*]], i32 [[SHADER_INDEX:%.*]], i32 [[X:%.*]]) !lgc.shaderstage [[META2]] !continufy.stage [[META5:![0-9]+]] !lgc.cps [[META6:![0-9]+]] { +; CHECK-NEXT: [[PUSHCONST:%.*]] = call ptr addrspace(4) @lgc.user.data(i32 24) +; CHECK-NEXT: [[FN:%.*]] = load ptr, ptr addrspace(4) [[PUSHCONST]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[FN]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = call i32 (...) @lgc.cps.await__i32(i32 [[TMP2]], i32 4, i32 poison, i32 [[X]]) +; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RCR]], i32 8, {} poison, i32 poison, i32 poison, i32 [[TMP3]]) +; CHECK-NEXT: unreachable +; %pushconst = call ptr addrspace(4) @lgc.user.data(i32 24) %fn = load ptr, ptr addrspace(4) %pushconst %y = call spir_func i32 %fn(i32 %x), !continufy.stage !{i32 5} @@ -22,6 +46,22 @@ define spir_func i32 @chs(i32 %x) !lgc.shaderstage !{i32 7} !continufy.stage !{i ; Note: No !continufy.stage metadata here define dllexport void @lgc.shader.CS.main() !lgc.shaderstage !{i32 7} { +; CHECK-LABEL: define {{[^@]+}}@lgc.shader.CS.main +; CHECK-SAME: () !lgc.shaderstage [[META2]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ID:%.*]] = call i32 @lgc.shader.input.LocalInvocationId(i32 49) +; CHECK-NEXT: [[LIVE:%.*]] = icmp ult i32 [[ID]], 29 +; CHECK-NEXT: br i1 [[LIVE]], label [[MAIN:%.*]], label [[EXIT:%.*]] +; CHECK: main: +; CHECK-NEXT: [[PUSHCONST:%.*]] = call ptr addrspace(4) @lgc.user.data(i32 32) +; CHECK-NEXT: [[FN:%.*]] = load ptr, ptr addrspace(4) [[PUSHCONST]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[FN]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[TMP0]], 1 +; CHECK-NEXT: call void (...) @lgc.cps.await__isVoid(i32 [[TMP1]], i32 2, i32 poison) +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; entry: %id = call i32 @lgc.shader.input.LocalInvocationId(i32 49) %live = icmp ult i32 %id, 29 @@ -39,54 +79,3 @@ exit: declare ptr addrspace(4) @lgc.user.data(i32) declare i32 @lgc.shader.input.LocalInvocationId(i32) -; CHECK-LABEL: define {{[^@]+}}@raygen -; CHECK-SAME: ({} [[STATE:%.*]], i32 [[RCR:%.*]], i32 [[SHADER_INDEX:%.*]]) !lgc.shaderstage !2 !continufy.stage !3 !lgc.cps !3 { -; CHECK-NEXT: [[PUSHCONST:%.*]] = call ptr addrspace(4) @lgc.user.data(i32 0) -; CHECK-NEXT: [[FN:%.*]] = load ptr, ptr addrspace(4) [[PUSHCONST]], align 8 -; CHECK-NEXT: [[P8:%.*]] = getelementptr i8, ptr addrspace(4) [[PUSHCONST]], i32 8 -; CHECK-NEXT: [[X:%.*]] = load i32, ptr addrspace(4) [[P8]], align 4 -; CHECK-NEXT: [[P16:%.*]] = getelementptr i8, ptr addrspace(4) [[PUSHCONST]], i32 16 -; CHECK-NEXT: [[DST:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[P16]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[FN]] to i32 -; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], 2 -; CHECK-NEXT: [[TMP3:%.*]] = call [2 x i32] (...) @lgc.cps.await__a2i32(i32 [[TMP2]], i32 4, i32 poison, i32 [[X]], ptr addrspace(1) [[DST]]) -; CHECK-NEXT: store [2 x i32] [[TMP3]], ptr addrspace(1) [[DST]], align 4 -; CHECK-NEXT: ret void -; -; -; CHECK-LABEL: define {{[^@]+}}@chs -; CHECK-SAME: ({} [[STATE:%.*]], i32 [[RCR:%.*]], i32 [[SHADER_INDEX:%.*]], i32 [[X:%.*]]) !lgc.shaderstage !2 !continufy.stage !4 !lgc.cps !5 { -; CHECK-NEXT: [[PUSHCONST:%.*]] = call ptr addrspace(4) @lgc.user.data(i32 24) -; CHECK-NEXT: [[FN:%.*]] = load ptr, ptr addrspace(4) [[PUSHCONST]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[FN]] to i32 -; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = call i32 (...) @lgc.cps.await__i32(i32 [[TMP2]], i32 2, i32 poison, i32 [[X]]) -; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RCR]], i32 4, {} poison, i32 poison, i32 poison, i32 [[TMP3]]) -; CHECK-NEXT: unreachable -; -; -; CHECK-LABEL: define {{[^@]+}}@lgc.shader.CS.main() !lgc.shaderstage !2 { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ID:%.*]] = call i32 @lgc.shader.input.LocalInvocationId(i32 49) -; CHECK-NEXT: [[LIVE:%.*]] = icmp ult i32 [[ID]], 29 -; CHECK-NEXT: br i1 [[LIVE]], label [[MAIN:%.*]], label [[EXIT:%.*]] -; CHECK: main: -; CHECK-NEXT: [[PUSHCONST:%.*]] = call ptr addrspace(4) @lgc.user.data(i32 32) -; CHECK-NEXT: [[FN:%.*]] = load ptr, ptr addrspace(4) [[PUSHCONST]], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[FN]] to i32 -; CHECK-NEXT: call void (...) @lgc.cps.await__isVoid(i32 [[TMP0]], i32 1, i32 poison) -; CHECK-NEXT: br label [[EXIT]] -; CHECK: exit: -; CHECK-NEXT: ret void -; -;. -; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind willreturn } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { noreturn } -;. -; CHECK: [[META0:![0-9]+]] = !{!""} -; CHECK: [[META1:![0-9]+]] = !{!"\82\B0amdpal.pipelines\91\82\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AEamdpal.version\92\03\00"} -; CHECK: [[META2:![0-9]+]] = !{i32 7} -; CHECK: [[META3:![0-9]+]] = !{i32 0} -; CHECK: [[META4:![0-9]+]] = !{i32 3} -; CHECK: [[META5:![0-9]+]] = !{i32 1} -;. diff --git a/lgc/test/Transforms/CpsLowering/continuation-basic.lgc b/lgc/test/Transforms/CpsLowering/continuation-basic.lgc index d93e64a2c0..fa8b13b879 100644 --- a/lgc/test/Transforms/CpsLowering/continuation-basic.lgc +++ b/lgc/test/Transforms/CpsLowering/continuation-basic.lgc @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --function-signature -; RUN: lgc -mcpu=gfx1030 -o - -passes='require,lgc-patch-entry-point-mutate' %s | FileCheck --check-prefixes=CHECK %s +; RUN: lgc -mcpu=gfx1030 -o - -passes="require,lgc-patch-entry-point-mutate" %s | FileCheck --check-prefixes=CHECK %s declare void @lgc.cps.jump(i32 %target, i32 %levels, {i32} %state, ...) 
noreturn @@ -37,46 +37,59 @@ define void @test({i32} %state, i32 %arg, ptr %table) !lgc.cps !0 !lgc.shadersta ; CHECK-NEXT: [[TMP16:%.*]] = insertvalue { <3 x i32>, i32, ptr addrspace(5), i32 } [[TMP15]], i32 [[THEN_ARG]], 3 ; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <3 x i32>, i32, ptr addrspace(5), i32 } [[TMP16]], 1 ; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 [[TMP17]], i32 [[VCR]]) -; CHECK-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 -; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP19]]) -; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP20]], i1 true) -; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP18]], i32 [[TMP21]]) -; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP18]], [[TMP22]] -; CHECK-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP23]]) -; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP22]]) -; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP24]]) -; CHECK-NEXT: [[TMP27:%.*]] = and i32 [[TMP25]], -64 -; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP27]], i64 0 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <2 x i32> [[TMP28]] to i64 -; CHECK-NEXT: [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr -; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 -; CHECK-NEXT: [[TMP32:%.*]] = bitcast i64 [[TMP31]] to <2 x i32> -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x i32> [[TMP32]], i64 0 -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[TMP32]], i64 1 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <20 x i32> [[TMP38]], i32 [[TMP33]], i64 1 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <20 x i32> [[TMP39]], i32 [[TMP34]], i64 2 -; CHECK-NEXT: [[TMP41:%.*]] = insertelement <20 x i32> [[TMP40]], i32 [[PAD0]], i64 3 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <20 x i32> [[TMP41]], i32 [[PAD1]], i64 4 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <20 x i32> [[TMP42]], i32 [[PAD2]], i64 5 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <20 x i32> [[TMP43]], i32 [[PAD3]], i64 6 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <20 x i32> [[TMP44]], i32 [[PAD4]], i64 7 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <20 x i32> [[TMP45]], i32 [[PAD5]], i64 8 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <20 x i32> [[TMP46]], i32 [[PAD6]], i64 9 -; CHECK-NEXT: [[TMP48:%.*]] = insertelement <20 x i32> [[TMP47]], i32 [[PAD7]], i64 10 -; CHECK-NEXT: [[TMP49:%.*]] = insertelement <20 x i32> [[TMP48]], i32 [[PAD8]], i64 11 -; CHECK-NEXT: [[TMP50:%.*]] = insertelement <20 x i32> [[TMP49]], i32 [[PAD9]], i64 12 -; CHECK-NEXT: [[TMP51:%.*]] = insertelement <20 x i32> [[TMP50]], i32 [[PAD10]], i64 13 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <20 x i32> [[TMP51]], i32 [[PAD11]], i64 14 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <20 x i32> [[TMP52]], i32 [[SPILLTABLE]], i64 15 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <20 x i32> [[TMP53]], i32 [[TMP35]], i64 16 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <20 x i32> [[TMP54]], i32 [[TMP36]], i64 17 -; CHECK-NEXT: [[TMP56:%.*]] = insertelement <20 x i32> [[TMP55]], i32 [[TMP37]], i64 18 -; CHECK-NEXT: [[TMP57:%.*]] = insertelement 
<20 x i32> [[TMP56]], i32 [[MULTIDISPATCHINFO]], i64 19 -; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, ptr addrspace(5), i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32p5i32s(ptr inreg [[TMP30]], i32 inreg [[TMP26]], <20 x i32> inreg [[TMP57]], { <3 x i32>, i32, ptr addrspace(5), i32 } [[TMP16]], i32 0) +; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[TMP18]], 7 +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP20]]) +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP19]], 3 +; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP22]]) +; CHECK-NEXT: [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0 +; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i32 [[TMP23]], i32 [[TMP21]] +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP19]], 2 +; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP26]]) +; CHECK-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0 +; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[TMP27]], i32 [[TMP25]] +; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP19]], 1 +; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP30]]) +; CHECK-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[TMP31]], i32 [[TMP29]] +; CHECK-NEXT: [[TMP34:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP33]], i1 true) +; CHECK-NEXT: [[TMP35:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP18]], i32 [[TMP34]]) +; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i32 [[TMP18]], [[TMP35]] +; CHECK-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP36]]) +; CHECK-NEXT: [[TMP38:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP35]]) +; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP37]]) +; CHECK-NEXT: [[TMP40:%.*]] = and i32 [[TMP38]], -64 +; CHECK-NEXT: [[TMP41:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP40]], i64 0 +; CHECK-NEXT: [[TMP42:%.*]] = bitcast <2 x i32> [[TMP41]] to i64 +; CHECK-NEXT: [[TMP43:%.*]] = inttoptr i64 [[TMP42]] to ptr +; CHECK-NEXT: [[TMP44:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 +; CHECK-NEXT: [[TMP45:%.*]] = bitcast i64 [[TMP44]] to <2 x i32> +; CHECK-NEXT: [[TMP46:%.*]] = extractelement <2 x i32> [[TMP45]], i64 0 +; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x i32> [[TMP45]], i64 1 +; CHECK-NEXT: [[TMP48:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 +; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 +; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 +; CHECK-NEXT: [[TMP51:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <20 x i32> [[TMP51]], i32 [[TMP46]], i64 1 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <20 x i32> [[TMP52]], i32 [[TMP47]], i64 2 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <20 x i32> [[TMP53]], i32 [[PAD0]], i64 3 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <20 x i32> [[TMP54]], i32 [[PAD1]], i64 4 +; CHECK-NEXT: [[TMP56:%.*]] = insertelement <20 x i32> [[TMP55]], i32 [[PAD2]], i64 5 +; CHECK-NEXT: [[TMP57:%.*]] = insertelement <20 x i32> [[TMP56]], i32 [[PAD3]], i64 6 +; CHECK-NEXT: [[TMP58:%.*]] = insertelement <20 x i32> [[TMP57]], i32 [[PAD4]], i64 7 +; CHECK-NEXT: [[TMP59:%.*]] = insertelement <20 x i32> [[TMP58]], i32 [[PAD5]], i64 8 +; CHECK-NEXT: [[TMP60:%.*]] = insertelement <20 x i32> [[TMP59]], i32 [[PAD6]], i64 9 +; CHECK-NEXT: [[TMP61:%.*]] = insertelement <20 x i32> [[TMP60]], i32 
[[PAD7]], i64 10 +; CHECK-NEXT: [[TMP62:%.*]] = insertelement <20 x i32> [[TMP61]], i32 [[PAD8]], i64 11 +; CHECK-NEXT: [[TMP63:%.*]] = insertelement <20 x i32> [[TMP62]], i32 [[PAD9]], i64 12 +; CHECK-NEXT: [[TMP64:%.*]] = insertelement <20 x i32> [[TMP63]], i32 [[PAD10]], i64 13 +; CHECK-NEXT: [[TMP65:%.*]] = insertelement <20 x i32> [[TMP64]], i32 [[PAD11]], i64 14 +; CHECK-NEXT: [[TMP66:%.*]] = insertelement <20 x i32> [[TMP65]], i32 [[SPILLTABLE]], i64 15 +; CHECK-NEXT: [[TMP67:%.*]] = insertelement <20 x i32> [[TMP66]], i32 [[TMP48]], i64 16 +; CHECK-NEXT: [[TMP68:%.*]] = insertelement <20 x i32> [[TMP67]], i32 [[TMP49]], i64 17 +; CHECK-NEXT: [[TMP69:%.*]] = insertelement <20 x i32> [[TMP68]], i32 [[TMP50]], i64 18 +; CHECK-NEXT: [[TMP70:%.*]] = insertelement <20 x i32> [[TMP69]], i32 [[MULTIDISPATCHINFO]], i64 19 +; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, ptr addrspace(5), i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32p5i32s(ptr inreg [[TMP43]], i32 inreg [[TMP39]], <20 x i32> inreg [[TMP70]], { <3 x i32>, i32, ptr addrspace(5), i32 } [[TMP16]], i32 0) ; CHECK-NEXT: unreachable ; entry: diff --git a/lgc/test/Transforms/CpsLowering/cps-entry-point.lgc b/lgc/test/Transforms/CpsLowering/cps-entry-point.lgc index 4764fa42b5..84fd25e3ba 100644 --- a/lgc/test/Transforms/CpsLowering/cps-entry-point.lgc +++ b/lgc/test/Transforms/CpsLowering/cps-entry-point.lgc @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --function-signature -; RUN: lgc -mcpu=gfx1030 -o - -passes='require,lgc-patch-entry-point-mutate' %s | FileCheck --check-prefixes=CHECK %s +; RUN: lgc -mcpu=gfx1030 -o - -passes="require,lgc-patch-entry-point-mutate" %s | FileCheck --check-prefixes=CHECK %s declare void @lgc.cps.jump(i32, i32, { i32 }, ...) 
#0 diff --git a/lgc/test/Transforms/CpsLowering/cps-from-continufy.lgc b/lgc/test/Transforms/CpsLowering/cps-from-continufy.lgc index d70431730b..0d97e2cb53 100644 --- a/lgc/test/Transforms/CpsLowering/cps-from-continufy.lgc +++ b/lgc/test/Transforms/CpsLowering/cps-from-continufy.lgc @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 4 -; RUN: lgc -mcpu=gfx1030 -o - -passes='require,lgc-patch-entry-point-mutate' %s | FileCheck --check-prefixes=CHECK %s +; RUN: lgc -mcpu=gfx1030 -o - -passes="require,lgc-patch-entry-point-mutate" %s | FileCheck --check-prefixes=CHECK %s %_rgen_1.Frame = type { ptr addrspace(7), ptr addrspace(7), i32 } @@ -85,51 +85,64 @@ define spir_func void @_rgen_1({} %state, i32 %rcr) #0 !spirv.ExecutionModel !15 ; CHECK-NEXT: [[TMP67:%.*]] = insertvalue { <3 x i32>, i32, ptr addrspace(5), i32, i32, i32 } poison, <3 x i32> [[LOCALINVOCATIONID]], 0 ; CHECK-NEXT: [[TMP68:%.*]] = insertvalue { <3 x i32>, i32, ptr addrspace(5), i32, i32, i32 } [[TMP67]], i32 [[TMP63]], 1 ; CHECK-NEXT: [[TMP69:%.*]] = insertvalue { <3 x i32>, i32, ptr addrspace(5), i32, i32, i32 } [[TMP68]], ptr addrspace(5) [[TMP66]], 2 -; CHECK-NEXT: [[TMP70:%.*]] = insertvalue { <3 x i32>, i32, ptr addrspace(5), i32, i32, i32 } [[TMP69]], i32 ptrtoint (ptr @_rgen_1.resume.0 to i32), 3 +; CHECK-NEXT: [[TMP70:%.*]] = insertvalue { <3 x i32>, i32, ptr addrspace(5), i32, i32, i32 } [[TMP69]], i32 add (i32 ptrtoint (ptr @_rgen_1.resume.0 to i32), i32 1), 3 ; CHECK-NEXT: [[TMP71:%.*]] = insertvalue { <3 x i32>, i32, ptr addrspace(5), i32, i32, i32 } [[TMP70]], i32 undef, 4 ; CHECK-NEXT: [[TMP72:%.*]] = insertvalue { <3 x i32>, i32, ptr addrspace(5), i32, i32, i32 } [[TMP71]], i32 [[TMP51]], 5 ; CHECK-NEXT: [[TMP73:%.*]] = extractvalue { <3 x i32>, i32, ptr addrspace(5), i32, i32, i32 } [[TMP72]], 1 ; CHECK-NEXT: [[TMP74:%.*]] = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 [[TMP73]], i32 [[VCR]]) -; CHECK-NEXT: [[TMP75:%.*]] = icmp ne i32 [[TMP74]], 0 -; CHECK-NEXT: [[TMP76:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP75]]) -; CHECK-NEXT: [[TMP77:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP76]], i1 true) -; CHECK-NEXT: [[TMP78:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP74]], i32 [[TMP77]]) -; CHECK-NEXT: [[TMP79:%.*]] = icmp eq i32 [[TMP74]], [[TMP78]] -; CHECK-NEXT: [[TMP80:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP79]]) -; CHECK-NEXT: [[TMP81:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP78]]) -; CHECK-NEXT: [[TMP82:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP80]]) -; CHECK-NEXT: [[TMP83:%.*]] = and i32 [[TMP81]], -64 -; CHECK-NEXT: [[TMP84:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP83]], i64 0 -; CHECK-NEXT: [[TMP85:%.*]] = bitcast <2 x i32> [[TMP84]] to i64 -; CHECK-NEXT: [[TMP86:%.*]] = inttoptr i64 [[TMP85]] to ptr -; CHECK-NEXT: [[TMP87:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 -; CHECK-NEXT: [[TMP88:%.*]] = bitcast i64 [[TMP87]] to <2 x i32> -; CHECK-NEXT: [[TMP89:%.*]] = extractelement <2 x i32> [[TMP88]], i64 0 -; CHECK-NEXT: [[TMP90:%.*]] = extractelement <2 x i32> [[TMP88]], i64 1 -; CHECK-NEXT: [[TMP91:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 -; CHECK-NEXT: [[TMP92:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 -; CHECK-NEXT: [[TMP93:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 -; CHECK-NEXT: [[TMP94:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 -; CHECK-NEXT: [[TMP95:%.*]] = insertelement <20 x i32> [[TMP94]], i32 
[[TMP89]], i64 1 -; CHECK-NEXT: [[TMP96:%.*]] = insertelement <20 x i32> [[TMP95]], i32 [[TMP90]], i64 2 -; CHECK-NEXT: [[TMP97:%.*]] = insertelement <20 x i32> [[TMP96]], i32 [[USERDATA0]], i64 3 -; CHECK-NEXT: [[TMP98:%.*]] = insertelement <20 x i32> [[TMP97]], i32 [[USERDATA1]], i64 4 -; CHECK-NEXT: [[TMP99:%.*]] = insertelement <20 x i32> [[TMP98]], i32 [[USERDATA2]], i64 5 -; CHECK-NEXT: [[TMP100:%.*]] = insertelement <20 x i32> [[TMP99]], i32 [[USERDATA3]], i64 6 -; CHECK-NEXT: [[TMP101:%.*]] = insertelement <20 x i32> [[TMP100]], i32 [[USERDATA4]], i64 7 -; CHECK-NEXT: [[TMP102:%.*]] = insertelement <20 x i32> [[TMP101]], i32 [[USERDATA5]], i64 8 -; CHECK-NEXT: [[TMP103:%.*]] = insertelement <20 x i32> [[TMP102]], i32 [[PAD6]], i64 9 -; CHECK-NEXT: [[TMP104:%.*]] = insertelement <20 x i32> [[TMP103]], i32 [[PAD7]], i64 10 -; CHECK-NEXT: [[TMP105:%.*]] = insertelement <20 x i32> [[TMP104]], i32 [[PAD8]], i64 11 -; CHECK-NEXT: [[TMP106:%.*]] = insertelement <20 x i32> [[TMP105]], i32 [[PAD9]], i64 12 -; CHECK-NEXT: [[TMP107:%.*]] = insertelement <20 x i32> [[TMP106]], i32 [[PAD10]], i64 13 -; CHECK-NEXT: [[TMP108:%.*]] = insertelement <20 x i32> [[TMP107]], i32 [[PAD11]], i64 14 -; CHECK-NEXT: [[TMP109:%.*]] = insertelement <20 x i32> [[TMP108]], i32 [[SPILLTABLE]], i64 15 -; CHECK-NEXT: [[TMP110:%.*]] = insertelement <20 x i32> [[TMP109]], i32 [[TMP91]], i64 16 -; CHECK-NEXT: [[TMP111:%.*]] = insertelement <20 x i32> [[TMP110]], i32 [[TMP92]], i64 17 -; CHECK-NEXT: [[TMP112:%.*]] = insertelement <20 x i32> [[TMP111]], i32 [[TMP93]], i64 18 -; CHECK-NEXT: [[TMP113:%.*]] = insertelement <20 x i32> [[TMP112]], i32 [[MULTIDISPATCHINFO]], i64 19 -; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, ptr addrspace(5), i32, i32, i32 }, i32, ...) 
@llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32p5i32i32i32s(ptr inreg [[TMP86]], i32 inreg [[TMP82]], <20 x i32> inreg [[TMP113]], { <3 x i32>, i32, ptr addrspace(5), i32, i32, i32 } [[TMP72]], i32 0) +; CHECK-NEXT: [[TMP75:%.*]] = and i32 [[TMP74]], 7 +; CHECK-NEXT: [[TMP76:%.*]] = icmp ne i32 [[TMP75]], 0 +; CHECK-NEXT: [[TMP77:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP76]]) +; CHECK-NEXT: [[TMP78:%.*]] = icmp eq i32 [[TMP75]], 3 +; CHECK-NEXT: [[TMP79:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP78]]) +; CHECK-NEXT: [[TMP80:%.*]] = icmp ne i32 [[TMP79]], 0 +; CHECK-NEXT: [[TMP81:%.*]] = select i1 [[TMP80]], i32 [[TMP79]], i32 [[TMP77]] +; CHECK-NEXT: [[TMP82:%.*]] = icmp eq i32 [[TMP75]], 2 +; CHECK-NEXT: [[TMP83:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP82]]) +; CHECK-NEXT: [[TMP84:%.*]] = icmp ne i32 [[TMP83]], 0 +; CHECK-NEXT: [[TMP85:%.*]] = select i1 [[TMP84]], i32 [[TMP83]], i32 [[TMP81]] +; CHECK-NEXT: [[TMP86:%.*]] = icmp eq i32 [[TMP75]], 1 +; CHECK-NEXT: [[TMP87:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP86]]) +; CHECK-NEXT: [[TMP88:%.*]] = icmp ne i32 [[TMP87]], 0 +; CHECK-NEXT: [[TMP89:%.*]] = select i1 [[TMP88]], i32 [[TMP87]], i32 [[TMP85]] +; CHECK-NEXT: [[TMP90:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP89]], i1 true) +; CHECK-NEXT: [[TMP91:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP74]], i32 [[TMP90]]) +; CHECK-NEXT: [[TMP92:%.*]] = icmp eq i32 [[TMP74]], [[TMP91]] +; CHECK-NEXT: [[TMP93:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP92]]) +; CHECK-NEXT: [[TMP94:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP91]]) +; CHECK-NEXT: [[TMP95:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP93]]) +; CHECK-NEXT: [[TMP96:%.*]] = and i32 [[TMP94]], -64 +; CHECK-NEXT: [[TMP97:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP96]], i64 0 +; CHECK-NEXT: [[TMP98:%.*]] = bitcast <2 x i32> [[TMP97]] to i64 +; CHECK-NEXT: [[TMP99:%.*]] = inttoptr i64 [[TMP98]] to ptr +; CHECK-NEXT: [[TMP100:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 +; CHECK-NEXT: [[TMP101:%.*]] = bitcast i64 [[TMP100]] to <2 x i32> +; CHECK-NEXT: [[TMP102:%.*]] = extractelement <2 x i32> [[TMP101]], i64 0 +; CHECK-NEXT: [[TMP103:%.*]] = extractelement <2 x i32> [[TMP101]], i64 1 +; CHECK-NEXT: [[TMP104:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 +; CHECK-NEXT: [[TMP105:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 +; CHECK-NEXT: [[TMP106:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 +; CHECK-NEXT: [[TMP107:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 +; CHECK-NEXT: [[TMP108:%.*]] = insertelement <20 x i32> [[TMP107]], i32 [[TMP102]], i64 1 +; CHECK-NEXT: [[TMP109:%.*]] = insertelement <20 x i32> [[TMP108]], i32 [[TMP103]], i64 2 +; CHECK-NEXT: [[TMP110:%.*]] = insertelement <20 x i32> [[TMP109]], i32 [[USERDATA0]], i64 3 +; CHECK-NEXT: [[TMP111:%.*]] = insertelement <20 x i32> [[TMP110]], i32 [[USERDATA1]], i64 4 +; CHECK-NEXT: [[TMP112:%.*]] = insertelement <20 x i32> [[TMP111]], i32 [[USERDATA2]], i64 5 +; CHECK-NEXT: [[TMP113:%.*]] = insertelement <20 x i32> [[TMP112]], i32 [[USERDATA3]], i64 6 +; CHECK-NEXT: [[TMP114:%.*]] = insertelement <20 x i32> [[TMP113]], i32 [[USERDATA4]], i64 7 +; CHECK-NEXT: [[TMP115:%.*]] = insertelement <20 x i32> [[TMP114]], i32 [[USERDATA5]], i64 8 +; CHECK-NEXT: [[TMP116:%.*]] = insertelement <20 x i32> [[TMP115]], i32 [[PAD6]], i64 9 +; CHECK-NEXT: [[TMP117:%.*]] = insertelement <20 x i32> [[TMP116]], i32 [[PAD7]], i64 10 +; CHECK-NEXT: [[TMP118:%.*]] = insertelement <20 x i32> 
[[TMP117]], i32 [[PAD8]], i64 11 +; CHECK-NEXT: [[TMP119:%.*]] = insertelement <20 x i32> [[TMP118]], i32 [[PAD9]], i64 12 +; CHECK-NEXT: [[TMP120:%.*]] = insertelement <20 x i32> [[TMP119]], i32 [[PAD10]], i64 13 +; CHECK-NEXT: [[TMP121:%.*]] = insertelement <20 x i32> [[TMP120]], i32 [[PAD11]], i64 14 +; CHECK-NEXT: [[TMP122:%.*]] = insertelement <20 x i32> [[TMP121]], i32 [[SPILLTABLE]], i64 15 +; CHECK-NEXT: [[TMP123:%.*]] = insertelement <20 x i32> [[TMP122]], i32 [[TMP104]], i64 16 +; CHECK-NEXT: [[TMP124:%.*]] = insertelement <20 x i32> [[TMP123]], i32 [[TMP105]], i64 17 +; CHECK-NEXT: [[TMP125:%.*]] = insertelement <20 x i32> [[TMP124]], i32 [[TMP106]], i64 18 +; CHECK-NEXT: [[TMP126:%.*]] = insertelement <20 x i32> [[TMP125]], i32 [[MULTIDISPATCHINFO]], i64 19 +; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, ptr addrspace(5), i32, i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32p5i32i32i32s(ptr inreg [[TMP99]], i32 inreg [[TMP95]], <20 x i32> inreg [[TMP126]], { <3 x i32>, i32, ptr addrspace(5), i32, i32, i32 } [[TMP72]], i32 0) ; CHECK-NEXT: unreachable ; .entry: @@ -237,49 +250,62 @@ define void @_rgen_1.resume.0({} %0, i32 %1, [1 x i32] %2) !spirv.ExecutionModel ; CHECK-NEXT: [[TMP30:%.*]] = insertvalue { <3 x i32>, i32, ptr addrspace(5) } [[TMP29]], ptr addrspace(5) poison, 2 ; CHECK-NEXT: [[TMP31:%.*]] = extractvalue { <3 x i32>, i32, ptr addrspace(5) } [[TMP30]], 1 ; CHECK-NEXT: [[TMP32:%.*]] = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 [[TMP31]], i32 [[VCR]]) -; CHECK-NEXT: [[TMP33:%.*]] = icmp ne i32 [[TMP32]], 0 -; CHECK-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP33]]) -; CHECK-NEXT: [[TMP35:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP34]], i1 true) -; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP32]], i32 [[TMP35]]) -; CHECK-NEXT: [[TMP37:%.*]] = icmp eq i32 [[TMP32]], [[TMP36]] -; CHECK-NEXT: [[TMP38:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP37]]) -; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP36]]) -; CHECK-NEXT: [[TMP40:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP38]]) -; CHECK-NEXT: [[TMP41:%.*]] = icmp eq i32 [[TMP39]], 0 -; CHECK-NEXT: br i1 [[TMP41]], label [[RET_BLOCK:%.*]], label [[CHAIN_BLOCK:%.*]] +; CHECK-NEXT: [[TMP33:%.*]] = and i32 [[TMP32]], 7 +; CHECK-NEXT: [[TMP34:%.*]] = icmp ne i32 [[TMP33]], 0 +; CHECK-NEXT: [[TMP35:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP34]]) +; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i32 [[TMP33]], 3 +; CHECK-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP36]]) +; CHECK-NEXT: [[TMP38:%.*]] = icmp ne i32 [[TMP37]], 0 +; CHECK-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[TMP37]], i32 [[TMP35]] +; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i32 [[TMP33]], 2 +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP40]]) +; CHECK-NEXT: [[TMP42:%.*]] = icmp ne i32 [[TMP41]], 0 +; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP41]], i32 [[TMP39]] +; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP33]], 1 +; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP44]]) +; CHECK-NEXT: [[TMP46:%.*]] = icmp ne i32 [[TMP45]], 0 +; CHECK-NEXT: [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[TMP45]], i32 [[TMP43]] +; CHECK-NEXT: [[TMP48:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP47]], i1 true) +; CHECK-NEXT: [[TMP49:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP32]], i32 [[TMP48]]) +; CHECK-NEXT: [[TMP50:%.*]] = icmp eq i32 [[TMP32]], [[TMP49]] +; CHECK-NEXT: [[TMP51:%.*]] = 
call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP50]]) +; CHECK-NEXT: [[TMP52:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP49]]) +; CHECK-NEXT: [[TMP53:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP51]]) +; CHECK-NEXT: [[TMP54:%.*]] = icmp eq i32 [[TMP52]], 0 +; CHECK-NEXT: br i1 [[TMP54]], label [[RET_BLOCK:%.*]], label [[CHAIN_BLOCK:%.*]] ; CHECK: chain.block: -; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP39]], -64 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP42]], i64 0 -; CHECK-NEXT: [[TMP44:%.*]] = bitcast <2 x i32> [[TMP43]] to i64 -; CHECK-NEXT: [[TMP45:%.*]] = inttoptr i64 [[TMP44]] to ptr -; CHECK-NEXT: [[TMP46:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 -; CHECK-NEXT: [[TMP47:%.*]] = bitcast i64 [[TMP46]] to <2 x i32> -; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP47]], i64 0 -; CHECK-NEXT: [[TMP49:%.*]] = extractelement <2 x i32> [[TMP47]], i64 1 -; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 -; CHECK-NEXT: [[TMP51:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 -; CHECK-NEXT: [[TMP52:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <20 x i32> [[TMP53]], i32 [[TMP48]], i64 1 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <20 x i32> [[TMP54]], i32 [[TMP49]], i64 2 -; CHECK-NEXT: [[TMP56:%.*]] = insertelement <20 x i32> [[TMP55]], i32 [[USERDATA0]], i64 3 -; CHECK-NEXT: [[TMP57:%.*]] = insertelement <20 x i32> [[TMP56]], i32 [[USERDATA1]], i64 4 -; CHECK-NEXT: [[TMP58:%.*]] = insertelement <20 x i32> [[TMP57]], i32 [[USERDATA2]], i64 5 -; CHECK-NEXT: [[TMP59:%.*]] = insertelement <20 x i32> [[TMP58]], i32 [[USERDATA3]], i64 6 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <20 x i32> [[TMP59]], i32 [[USERDATA4]], i64 7 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <20 x i32> [[TMP60]], i32 [[USERDATA5]], i64 8 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <20 x i32> [[TMP61]], i32 [[PAD6]], i64 9 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <20 x i32> [[TMP62]], i32 [[PAD7]], i64 10 -; CHECK-NEXT: [[TMP64:%.*]] = insertelement <20 x i32> [[TMP63]], i32 [[PAD8]], i64 11 -; CHECK-NEXT: [[TMP65:%.*]] = insertelement <20 x i32> [[TMP64]], i32 [[PAD9]], i64 12 -; CHECK-NEXT: [[TMP66:%.*]] = insertelement <20 x i32> [[TMP65]], i32 [[PAD10]], i64 13 -; CHECK-NEXT: [[TMP67:%.*]] = insertelement <20 x i32> [[TMP66]], i32 [[PAD11]], i64 14 -; CHECK-NEXT: [[TMP68:%.*]] = insertelement <20 x i32> [[TMP67]], i32 [[SPILLTABLE]], i64 15 -; CHECK-NEXT: [[TMP69:%.*]] = insertelement <20 x i32> [[TMP68]], i32 [[TMP50]], i64 16 -; CHECK-NEXT: [[TMP70:%.*]] = insertelement <20 x i32> [[TMP69]], i32 [[TMP51]], i64 17 -; CHECK-NEXT: [[TMP71:%.*]] = insertelement <20 x i32> [[TMP70]], i32 [[TMP52]], i64 18 -; CHECK-NEXT: [[TMP72:%.*]] = insertelement <20 x i32> [[TMP71]], i32 [[MULTIDISPATCHINFO]], i64 19 -; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, ptr addrspace(5) }, i32, ...) 
@llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32p5s(ptr inreg [[TMP45]], i32 inreg [[TMP40]], <20 x i32> inreg [[TMP72]], { <3 x i32>, i32, ptr addrspace(5) } [[TMP30]], i32 0) +; CHECK-NEXT: [[TMP55:%.*]] = and i32 [[TMP52]], -64 +; CHECK-NEXT: [[TMP56:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP55]], i64 0 +; CHECK-NEXT: [[TMP57:%.*]] = bitcast <2 x i32> [[TMP56]] to i64 +; CHECK-NEXT: [[TMP58:%.*]] = inttoptr i64 [[TMP57]] to ptr +; CHECK-NEXT: [[TMP59:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 +; CHECK-NEXT: [[TMP60:%.*]] = bitcast i64 [[TMP59]] to <2 x i32> +; CHECK-NEXT: [[TMP61:%.*]] = extractelement <2 x i32> [[TMP60]], i64 0 +; CHECK-NEXT: [[TMP62:%.*]] = extractelement <2 x i32> [[TMP60]], i64 1 +; CHECK-NEXT: [[TMP63:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 +; CHECK-NEXT: [[TMP64:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 +; CHECK-NEXT: [[TMP65:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 +; CHECK-NEXT: [[TMP66:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 +; CHECK-NEXT: [[TMP67:%.*]] = insertelement <20 x i32> [[TMP66]], i32 [[TMP61]], i64 1 +; CHECK-NEXT: [[TMP68:%.*]] = insertelement <20 x i32> [[TMP67]], i32 [[TMP62]], i64 2 +; CHECK-NEXT: [[TMP69:%.*]] = insertelement <20 x i32> [[TMP68]], i32 [[USERDATA0]], i64 3 +; CHECK-NEXT: [[TMP70:%.*]] = insertelement <20 x i32> [[TMP69]], i32 [[USERDATA1]], i64 4 +; CHECK-NEXT: [[TMP71:%.*]] = insertelement <20 x i32> [[TMP70]], i32 [[USERDATA2]], i64 5 +; CHECK-NEXT: [[TMP72:%.*]] = insertelement <20 x i32> [[TMP71]], i32 [[USERDATA3]], i64 6 +; CHECK-NEXT: [[TMP73:%.*]] = insertelement <20 x i32> [[TMP72]], i32 [[USERDATA4]], i64 7 +; CHECK-NEXT: [[TMP74:%.*]] = insertelement <20 x i32> [[TMP73]], i32 [[USERDATA5]], i64 8 +; CHECK-NEXT: [[TMP75:%.*]] = insertelement <20 x i32> [[TMP74]], i32 [[PAD6]], i64 9 +; CHECK-NEXT: [[TMP76:%.*]] = insertelement <20 x i32> [[TMP75]], i32 [[PAD7]], i64 10 +; CHECK-NEXT: [[TMP77:%.*]] = insertelement <20 x i32> [[TMP76]], i32 [[PAD8]], i64 11 +; CHECK-NEXT: [[TMP78:%.*]] = insertelement <20 x i32> [[TMP77]], i32 [[PAD9]], i64 12 +; CHECK-NEXT: [[TMP79:%.*]] = insertelement <20 x i32> [[TMP78]], i32 [[PAD10]], i64 13 +; CHECK-NEXT: [[TMP80:%.*]] = insertelement <20 x i32> [[TMP79]], i32 [[PAD11]], i64 14 +; CHECK-NEXT: [[TMP81:%.*]] = insertelement <20 x i32> [[TMP80]], i32 [[SPILLTABLE]], i64 15 +; CHECK-NEXT: [[TMP82:%.*]] = insertelement <20 x i32> [[TMP81]], i32 [[TMP63]], i64 16 +; CHECK-NEXT: [[TMP83:%.*]] = insertelement <20 x i32> [[TMP82]], i32 [[TMP64]], i64 17 +; CHECK-NEXT: [[TMP84:%.*]] = insertelement <20 x i32> [[TMP83]], i32 [[TMP65]], i64 18 +; CHECK-NEXT: [[TMP85:%.*]] = insertelement <20 x i32> [[TMP84]], i32 [[MULTIDISPATCHINFO]], i64 19 +; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, ptr addrspace(5) }, i32, ...) 
@llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32p5s(ptr inreg [[TMP58]], i32 inreg [[TMP53]], <20 x i32> inreg [[TMP85]], { <3 x i32>, i32, ptr addrspace(5) } [[TMP30]], i32 0) ; CHECK-NEXT: unreachable ; CHECK: ret.block: ; CHECK-NEXT: ret void @@ -356,7 +382,7 @@ attributes #7 = { nounwind willreturn memory(inaccessiblemem: read) } !14 = !{!"\82\B0amdpal.pipelines\91\83\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\C4jyX\05\E6M\0F\CF\03b\DD\05\C5\B6\DB\B9\AD.llpc_version\A467.0\AEamdpal.version\92\03\00"} !15 = !{i32 5313} !16 = !{i32 7} -!17 = !{i32 0} +!17 = !{i32 1} !18 = !{ptr @_rgen_1} ; @@ -365,5 +391,5 @@ attributes #7 = { nounwind willreturn memory(inaccessiblemem: read) } ;. ; CHECK: [[META16]] = !{i32 7} ; CHECK: [[META17]] = !{ptr @_rgen_1} -; CHECK: [[META18]] = !{i32 0} +; CHECK: [[META18]] = !{i32 1} ;. diff --git a/lgc/test/Transforms/CpsLowering/cps-stack-lowering.lgc b/lgc/test/Transforms/CpsLowering/cps-stack-lowering.lgc index 118d06b073..170c39683d 100644 --- a/lgc/test/Transforms/CpsLowering/cps-stack-lowering.lgc +++ b/lgc/test/Transforms/CpsLowering/cps-stack-lowering.lgc @@ -1,5 +1,5 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --function-signature --check-globals -; RUN: lgc -mcpu=gfx1030 -o - -passes='require,lgc-patch-entry-point-mutate' %s | FileCheck --check-prefixes=CHECK %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --function-signature +; RUN: lgc -mcpu=gfx1030 -o - -passes="require,lgc-patch-entry-point-mutate" %s | FileCheck --check-prefixes=CHECK %s declare void @lgc.cps.jump(...) noreturn declare ptr addrspace(32) @lgc.cps.alloc(i32) @@ -53,46 +53,59 @@ define void @test.0({} %unused) !lgc.cps !{i32 1} !lgc.shaderstage !{i32 7} { ; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <3 x i32>, i32, ptr addrspace(5), i32, i32 } [[TMP26]], i32 [[TMP14]], 4 ; CHECK-NEXT: [[TMP28:%.*]] = extractvalue { <3 x i32>, i32, ptr addrspace(5), i32, i32 } [[TMP27]], 1 ; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 [[TMP28]], i32 [[VCR]]) -; CHECK-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 -; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP30]]) -; CHECK-NEXT: [[TMP32:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP31]], i1 true) -; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP29]], i32 [[TMP32]]) -; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i32 [[TMP29]], [[TMP33]] -; CHECK-NEXT: [[TMP35:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP34]]) -; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP33]]) -; CHECK-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP35]]) -; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP36]], -64 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP38]], i64 0 -; CHECK-NEXT: [[TMP40:%.*]] = bitcast <2 x i32> [[TMP39]] to i64 -; CHECK-NEXT: [[TMP41:%.*]] = inttoptr i64 [[TMP40]] to ptr -; CHECK-NEXT: [[TMP42:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 -; CHECK-NEXT: [[TMP43:%.*]] = bitcast i64 [[TMP42]] to <2 x i32> -; CHECK-NEXT: [[TMP44:%.*]] = extractelement <2 x i32> [[TMP43]], i64 0 -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <2 x i32> [[TMP43]], i64 1 -; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 -; CHECK-NEXT: [[TMP47:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 -; CHECK-NEXT: [[TMP48:%.*]] = extractelement <3 x 
i32> [[WORKGROUPID]], i64 2 -; CHECK-NEXT: [[TMP49:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 -; CHECK-NEXT: [[TMP50:%.*]] = insertelement <20 x i32> [[TMP49]], i32 [[TMP44]], i64 1 -; CHECK-NEXT: [[TMP51:%.*]] = insertelement <20 x i32> [[TMP50]], i32 [[TMP45]], i64 2 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <20 x i32> [[TMP51]], i32 [[PAD0]], i64 3 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <20 x i32> [[TMP52]], i32 [[PAD1]], i64 4 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <20 x i32> [[TMP53]], i32 [[PAD2]], i64 5 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <20 x i32> [[TMP54]], i32 [[PAD3]], i64 6 -; CHECK-NEXT: [[TMP56:%.*]] = insertelement <20 x i32> [[TMP55]], i32 [[PAD4]], i64 7 -; CHECK-NEXT: [[TMP57:%.*]] = insertelement <20 x i32> [[TMP56]], i32 [[PAD5]], i64 8 -; CHECK-NEXT: [[TMP58:%.*]] = insertelement <20 x i32> [[TMP57]], i32 [[PAD6]], i64 9 -; CHECK-NEXT: [[TMP59:%.*]] = insertelement <20 x i32> [[TMP58]], i32 [[PAD7]], i64 10 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <20 x i32> [[TMP59]], i32 [[PAD8]], i64 11 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <20 x i32> [[TMP60]], i32 [[PAD9]], i64 12 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <20 x i32> [[TMP61]], i32 [[PAD10]], i64 13 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <20 x i32> [[TMP62]], i32 [[PAD11]], i64 14 -; CHECK-NEXT: [[TMP64:%.*]] = insertelement <20 x i32> [[TMP63]], i32 [[SPILLTABLE]], i64 15 -; CHECK-NEXT: [[TMP65:%.*]] = insertelement <20 x i32> [[TMP64]], i32 [[TMP46]], i64 16 -; CHECK-NEXT: [[TMP66:%.*]] = insertelement <20 x i32> [[TMP65]], i32 [[TMP47]], i64 17 -; CHECK-NEXT: [[TMP67:%.*]] = insertelement <20 x i32> [[TMP66]], i32 [[TMP48]], i64 18 -; CHECK-NEXT: [[TMP68:%.*]] = insertelement <20 x i32> [[TMP67]], i32 [[MULTIDISPATCHINFO]], i64 19 -; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, ptr addrspace(5), i32, i32 }, i32, ...) 
@llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32p5i32i32s(ptr inreg [[TMP41]], i32 inreg [[TMP37]], <20 x i32> inreg [[TMP68]], { <3 x i32>, i32, ptr addrspace(5), i32, i32 } [[TMP27]], i32 0) +; CHECK-NEXT: [[TMP30:%.*]] = and i32 [[TMP29]], 7 +; CHECK-NEXT: [[TMP31:%.*]] = icmp ne i32 [[TMP30]], 0 +; CHECK-NEXT: [[TMP32:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP31]]) +; CHECK-NEXT: [[TMP33:%.*]] = icmp eq i32 [[TMP30]], 3 +; CHECK-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP33]]) +; CHECK-NEXT: [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0 +; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 [[TMP34]], i32 [[TMP32]] +; CHECK-NEXT: [[TMP37:%.*]] = icmp eq i32 [[TMP30]], 2 +; CHECK-NEXT: [[TMP38:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP37]]) +; CHECK-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 +; CHECK-NEXT: [[TMP40:%.*]] = select i1 [[TMP39]], i32 [[TMP38]], i32 [[TMP36]] +; CHECK-NEXT: [[TMP41:%.*]] = icmp eq i32 [[TMP30]], 1 +; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP41]]) +; CHECK-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 +; CHECK-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP42]], i32 [[TMP40]] +; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP44]], i1 true) +; CHECK-NEXT: [[TMP46:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP29]], i32 [[TMP45]]) +; CHECK-NEXT: [[TMP47:%.*]] = icmp eq i32 [[TMP29]], [[TMP46]] +; CHECK-NEXT: [[TMP48:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP47]]) +; CHECK-NEXT: [[TMP49:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP46]]) +; CHECK-NEXT: [[TMP50:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP48]]) +; CHECK-NEXT: [[TMP51:%.*]] = and i32 [[TMP49]], -64 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP51]], i64 0 +; CHECK-NEXT: [[TMP53:%.*]] = bitcast <2 x i32> [[TMP52]] to i64 +; CHECK-NEXT: [[TMP54:%.*]] = inttoptr i64 [[TMP53]] to ptr +; CHECK-NEXT: [[TMP55:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 +; CHECK-NEXT: [[TMP56:%.*]] = bitcast i64 [[TMP55]] to <2 x i32> +; CHECK-NEXT: [[TMP57:%.*]] = extractelement <2 x i32> [[TMP56]], i64 0 +; CHECK-NEXT: [[TMP58:%.*]] = extractelement <2 x i32> [[TMP56]], i64 1 +; CHECK-NEXT: [[TMP59:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 +; CHECK-NEXT: [[TMP60:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 +; CHECK-NEXT: [[TMP61:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 +; CHECK-NEXT: [[TMP62:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 +; CHECK-NEXT: [[TMP63:%.*]] = insertelement <20 x i32> [[TMP62]], i32 [[TMP57]], i64 1 +; CHECK-NEXT: [[TMP64:%.*]] = insertelement <20 x i32> [[TMP63]], i32 [[TMP58]], i64 2 +; CHECK-NEXT: [[TMP65:%.*]] = insertelement <20 x i32> [[TMP64]], i32 [[PAD0]], i64 3 +; CHECK-NEXT: [[TMP66:%.*]] = insertelement <20 x i32> [[TMP65]], i32 [[PAD1]], i64 4 +; CHECK-NEXT: [[TMP67:%.*]] = insertelement <20 x i32> [[TMP66]], i32 [[PAD2]], i64 5 +; CHECK-NEXT: [[TMP68:%.*]] = insertelement <20 x i32> [[TMP67]], i32 [[PAD3]], i64 6 +; CHECK-NEXT: [[TMP69:%.*]] = insertelement <20 x i32> [[TMP68]], i32 [[PAD4]], i64 7 +; CHECK-NEXT: [[TMP70:%.*]] = insertelement <20 x i32> [[TMP69]], i32 [[PAD5]], i64 8 +; CHECK-NEXT: [[TMP71:%.*]] = insertelement <20 x i32> [[TMP70]], i32 [[PAD6]], i64 9 +; CHECK-NEXT: [[TMP72:%.*]] = insertelement <20 x i32> [[TMP71]], i32 [[PAD7]], i64 10 +; CHECK-NEXT: [[TMP73:%.*]] = insertelement <20 x i32> [[TMP72]], i32 [[PAD8]], i64 11 +; CHECK-NEXT: [[TMP74:%.*]] = insertelement 
<20 x i32> [[TMP73]], i32 [[PAD9]], i64 12 +; CHECK-NEXT: [[TMP75:%.*]] = insertelement <20 x i32> [[TMP74]], i32 [[PAD10]], i64 13 +; CHECK-NEXT: [[TMP76:%.*]] = insertelement <20 x i32> [[TMP75]], i32 [[PAD11]], i64 14 +; CHECK-NEXT: [[TMP77:%.*]] = insertelement <20 x i32> [[TMP76]], i32 [[SPILLTABLE]], i64 15 +; CHECK-NEXT: [[TMP78:%.*]] = insertelement <20 x i32> [[TMP77]], i32 [[TMP59]], i64 16 +; CHECK-NEXT: [[TMP79:%.*]] = insertelement <20 x i32> [[TMP78]], i32 [[TMP60]], i64 17 +; CHECK-NEXT: [[TMP80:%.*]] = insertelement <20 x i32> [[TMP79]], i32 [[TMP61]], i64 18 +; CHECK-NEXT: [[TMP81:%.*]] = insertelement <20 x i32> [[TMP80]], i32 [[MULTIDISPATCHINFO]], i64 19 +; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32p5i32i32s(ptr inreg [[TMP54]], i32 inreg [[TMP50]], <20 x i32> inreg [[TMP81]], { <3 x i32>, i32, ptr addrspace(5), i32, i32 } [[TMP27]], i32 0) ; CHECK-NEXT: unreachable ; %mem = call ptr addrspace(32) @lgc.cps.alloc(i32 10) ; round up to 12 during lowering @@ -142,46 +155,59 @@ define void @test.1({} %no_state, ptr addrspace(32) %p2, i32 %q1) !lgc.cps !{i32 ; CHECK-NEXT: [[TMP18:%.*]] = insertvalue { <3 x i32>, i32, ptr addrspace(5) } [[TMP17]], ptr addrspace(5) [[TMP15]], 2 ; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <3 x i32>, i32, ptr addrspace(5) } [[TMP18]], 1 ; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 [[TMP19]], i32 [[VCR]]) -; CHECK-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 -; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP21]]) -; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP22]], i1 true) -; CHECK-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP20]], i32 [[TMP23]]) -; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP20]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP25]]) -; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP24]]) -; CHECK-NEXT: [[TMP28:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP26]]) -; CHECK-NEXT: [[TMP29:%.*]] = and i32 [[TMP27]], -64 -; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP29]], i64 0 -; CHECK-NEXT: [[TMP31:%.*]] = bitcast <2 x i32> [[TMP30]] to i64 -; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr -; CHECK-NEXT: [[TMP33:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 -; CHECK-NEXT: [[TMP34:%.*]] = bitcast i64 [[TMP33]] to <2 x i32> -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x i32> [[TMP34]], i64 0 -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <2 x i32> [[TMP34]], i64 1 -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 -; CHECK-NEXT: [[TMP41:%.*]] = insertelement <20 x i32> [[TMP40]], i32 [[TMP35]], i64 1 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <20 x i32> [[TMP41]], i32 [[TMP36]], i64 2 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <20 x i32> [[TMP42]], i32 [[PAD0]], i64 3 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <20 x i32> [[TMP43]], i32 [[PAD1]], i64 4 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <20 x i32> [[TMP44]], i32 [[PAD2]], i64 5 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <20 x i32> [[TMP45]], i32 [[PAD3]], i64 6 -; CHECK-NEXT: 
[[TMP47:%.*]] = insertelement <20 x i32> [[TMP46]], i32 [[PAD4]], i64 7 -; CHECK-NEXT: [[TMP48:%.*]] = insertelement <20 x i32> [[TMP47]], i32 [[PAD5]], i64 8 -; CHECK-NEXT: [[TMP49:%.*]] = insertelement <20 x i32> [[TMP48]], i32 [[PAD6]], i64 9 -; CHECK-NEXT: [[TMP50:%.*]] = insertelement <20 x i32> [[TMP49]], i32 [[PAD7]], i64 10 -; CHECK-NEXT: [[TMP51:%.*]] = insertelement <20 x i32> [[TMP50]], i32 [[PAD8]], i64 11 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <20 x i32> [[TMP51]], i32 [[PAD9]], i64 12 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <20 x i32> [[TMP52]], i32 [[PAD10]], i64 13 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <20 x i32> [[TMP53]], i32 [[PAD11]], i64 14 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <20 x i32> [[TMP54]], i32 [[SPILLTABLE]], i64 15 -; CHECK-NEXT: [[TMP56:%.*]] = insertelement <20 x i32> [[TMP55]], i32 [[TMP37]], i64 16 -; CHECK-NEXT: [[TMP57:%.*]] = insertelement <20 x i32> [[TMP56]], i32 [[TMP38]], i64 17 -; CHECK-NEXT: [[TMP58:%.*]] = insertelement <20 x i32> [[TMP57]], i32 [[TMP39]], i64 18 -; CHECK-NEXT: [[TMP59:%.*]] = insertelement <20 x i32> [[TMP58]], i32 [[MULTIDISPATCHINFO]], i64 19 -; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, ptr addrspace(5) }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32p5s(ptr inreg [[TMP32]], i32 inreg [[TMP28]], <20 x i32> inreg [[TMP59]], { <3 x i32>, i32, ptr addrspace(5) } [[TMP18]], i32 0) +; CHECK-NEXT: [[TMP21:%.*]] = and i32 [[TMP20]], 7 +; CHECK-NEXT: [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0 +; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP22]]) +; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP21]], 3 +; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP24]]) +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0 +; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[TMP25]], i32 [[TMP23]] +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP21]], 2 +; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP28]]) +; CHECK-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 +; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], i32 [[TMP29]], i32 [[TMP27]] +; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i32 [[TMP21]], 1 +; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP32]]) +; CHECK-NEXT: [[TMP34:%.*]] = icmp ne i32 [[TMP33]], 0 +; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP33]], i32 [[TMP31]] +; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP35]], i1 true) +; CHECK-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP20]], i32 [[TMP36]]) +; CHECK-NEXT: [[TMP38:%.*]] = icmp eq i32 [[TMP20]], [[TMP37]] +; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP38]]) +; CHECK-NEXT: [[TMP40:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP37]]) +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP39]]) +; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP40]], -64 +; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP42]], i64 0 +; CHECK-NEXT: [[TMP44:%.*]] = bitcast <2 x i32> [[TMP43]] to i64 +; CHECK-NEXT: [[TMP45:%.*]] = inttoptr i64 [[TMP44]] to ptr +; CHECK-NEXT: [[TMP46:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 +; CHECK-NEXT: [[TMP47:%.*]] = bitcast i64 [[TMP46]] to <2 x i32> +; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP47]], i64 0 +; CHECK-NEXT: [[TMP49:%.*]] = extractelement <2 x i32> [[TMP47]], i64 1 +; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 +; CHECK-NEXT: 
[[TMP51:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 +; CHECK-NEXT: [[TMP52:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <20 x i32> [[TMP53]], i32 [[TMP48]], i64 1 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <20 x i32> [[TMP54]], i32 [[TMP49]], i64 2 +; CHECK-NEXT: [[TMP56:%.*]] = insertelement <20 x i32> [[TMP55]], i32 [[PAD0]], i64 3 +; CHECK-NEXT: [[TMP57:%.*]] = insertelement <20 x i32> [[TMP56]], i32 [[PAD1]], i64 4 +; CHECK-NEXT: [[TMP58:%.*]] = insertelement <20 x i32> [[TMP57]], i32 [[PAD2]], i64 5 +; CHECK-NEXT: [[TMP59:%.*]] = insertelement <20 x i32> [[TMP58]], i32 [[PAD3]], i64 6 +; CHECK-NEXT: [[TMP60:%.*]] = insertelement <20 x i32> [[TMP59]], i32 [[PAD4]], i64 7 +; CHECK-NEXT: [[TMP61:%.*]] = insertelement <20 x i32> [[TMP60]], i32 [[PAD5]], i64 8 +; CHECK-NEXT: [[TMP62:%.*]] = insertelement <20 x i32> [[TMP61]], i32 [[PAD6]], i64 9 +; CHECK-NEXT: [[TMP63:%.*]] = insertelement <20 x i32> [[TMP62]], i32 [[PAD7]], i64 10 +; CHECK-NEXT: [[TMP64:%.*]] = insertelement <20 x i32> [[TMP63]], i32 [[PAD8]], i64 11 +; CHECK-NEXT: [[TMP65:%.*]] = insertelement <20 x i32> [[TMP64]], i32 [[PAD9]], i64 12 +; CHECK-NEXT: [[TMP66:%.*]] = insertelement <20 x i32> [[TMP65]], i32 [[PAD10]], i64 13 +; CHECK-NEXT: [[TMP67:%.*]] = insertelement <20 x i32> [[TMP66]], i32 [[PAD11]], i64 14 +; CHECK-NEXT: [[TMP68:%.*]] = insertelement <20 x i32> [[TMP67]], i32 [[SPILLTABLE]], i64 15 +; CHECK-NEXT: [[TMP69:%.*]] = insertelement <20 x i32> [[TMP68]], i32 [[TMP50]], i64 16 +; CHECK-NEXT: [[TMP70:%.*]] = insertelement <20 x i32> [[TMP69]], i32 [[TMP51]], i64 17 +; CHECK-NEXT: [[TMP71:%.*]] = insertelement <20 x i32> [[TMP70]], i32 [[TMP52]], i64 18 +; CHECK-NEXT: [[TMP72:%.*]] = insertelement <20 x i32> [[TMP71]], i32 [[MULTIDISPATCHINFO]], i64 19 +; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, ptr addrspace(5) }, i32, ...) 
@llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32p5s(ptr inreg [[TMP45]], i32 inreg [[TMP41]], <20 x i32> inreg [[TMP72]], { <3 x i32>, i32, ptr addrspace(5) } [[TMP18]], i32 0) ; CHECK-NEXT: unreachable ; %p1 = inttoptr i32 %q1 to ptr addrspace(32) @@ -227,49 +253,62 @@ define void @test.2({ ptr addrspace(32) } %state) !lgc.cps !{i32 1} !lgc.shaders ; CHECK-NEXT: [[TMP21:%.*]] = insertvalue { <3 x i32>, i32, ptr addrspace(5) } [[TMP20]], ptr addrspace(5) poison, 2 ; CHECK-NEXT: [[TMP22:%.*]] = extractvalue { <3 x i32>, i32, ptr addrspace(5) } [[TMP21]], 1 ; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 [[TMP22]], i32 [[VCR]]) -; CHECK-NEXT: [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0 -; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP24]]) -; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP25]], i1 true) -; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP23]], i32 [[TMP26]]) -; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP23]], [[TMP27]] -; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP28]]) -; CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP27]]) -; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP29]]) -; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i32 [[TMP30]], 0 -; CHECK-NEXT: br i1 [[TMP32]], label [[RET_BLOCK:%.*]], label [[CHAIN_BLOCK:%.*]] +; CHECK-NEXT: [[TMP24:%.*]] = and i32 [[TMP23]], 7 +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne i32 [[TMP24]], 0 +; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP25]]) +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i32 [[TMP24]], 3 +; CHECK-NEXT: [[TMP28:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP27]]) +; CHECK-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[TMP28]], i32 [[TMP26]] +; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i32 [[TMP24]], 2 +; CHECK-NEXT: [[TMP32:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP31]]) +; CHECK-NEXT: [[TMP33:%.*]] = icmp ne i32 [[TMP32]], 0 +; CHECK-NEXT: [[TMP34:%.*]] = select i1 [[TMP33]], i32 [[TMP32]], i32 [[TMP30]] +; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i32 [[TMP24]], 1 +; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP35]]) +; CHECK-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0 +; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP36]], i32 [[TMP34]] +; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP38]], i1 true) +; CHECK-NEXT: [[TMP40:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP23]], i32 [[TMP39]]) +; CHECK-NEXT: [[TMP41:%.*]] = icmp eq i32 [[TMP23]], [[TMP40]] +; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP41]]) +; CHECK-NEXT: [[TMP43:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP40]]) +; CHECK-NEXT: [[TMP44:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP42]]) +; CHECK-NEXT: [[TMP45:%.*]] = icmp eq i32 [[TMP43]], 0 +; CHECK-NEXT: br i1 [[TMP45]], label [[RET_BLOCK:%.*]], label [[CHAIN_BLOCK:%.*]] ; CHECK: chain.block: -; CHECK-NEXT: [[TMP33:%.*]] = and i32 [[TMP30]], -64 -; CHECK-NEXT: [[TMP34:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP33]], i64 0 -; CHECK-NEXT: [[TMP35:%.*]] = bitcast <2 x i32> [[TMP34]] to i64 -; CHECK-NEXT: [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr -; CHECK-NEXT: [[TMP37:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 -; CHECK-NEXT: [[TMP38:%.*]] = bitcast i64 [[TMP37]] to <2 x i32> -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i32> [[TMP38]], i64 0 -; CHECK-NEXT: [[TMP40:%.*]] = 
extractelement <2 x i32> [[TMP38]], i64 1 -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 -; CHECK-NEXT: [[TMP42:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 -; CHECK-NEXT: [[TMP43:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <20 x i32> [[TMP44]], i32 [[TMP39]], i64 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <20 x i32> [[TMP45]], i32 [[TMP40]], i64 2 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <20 x i32> [[TMP46]], i32 [[PAD0]], i64 3 -; CHECK-NEXT: [[TMP48:%.*]] = insertelement <20 x i32> [[TMP47]], i32 [[PAD1]], i64 4 -; CHECK-NEXT: [[TMP49:%.*]] = insertelement <20 x i32> [[TMP48]], i32 [[PAD2]], i64 5 -; CHECK-NEXT: [[TMP50:%.*]] = insertelement <20 x i32> [[TMP49]], i32 [[PAD3]], i64 6 -; CHECK-NEXT: [[TMP51:%.*]] = insertelement <20 x i32> [[TMP50]], i32 [[PAD4]], i64 7 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <20 x i32> [[TMP51]], i32 [[PAD5]], i64 8 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <20 x i32> [[TMP52]], i32 [[PAD6]], i64 9 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <20 x i32> [[TMP53]], i32 [[PAD7]], i64 10 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <20 x i32> [[TMP54]], i32 [[PAD8]], i64 11 -; CHECK-NEXT: [[TMP56:%.*]] = insertelement <20 x i32> [[TMP55]], i32 [[PAD9]], i64 12 -; CHECK-NEXT: [[TMP57:%.*]] = insertelement <20 x i32> [[TMP56]], i32 [[PAD10]], i64 13 -; CHECK-NEXT: [[TMP58:%.*]] = insertelement <20 x i32> [[TMP57]], i32 [[PAD11]], i64 14 -; CHECK-NEXT: [[TMP59:%.*]] = insertelement <20 x i32> [[TMP58]], i32 [[SPILLTABLE]], i64 15 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <20 x i32> [[TMP59]], i32 [[TMP41]], i64 16 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <20 x i32> [[TMP60]], i32 [[TMP42]], i64 17 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <20 x i32> [[TMP61]], i32 [[TMP43]], i64 18 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <20 x i32> [[TMP62]], i32 [[MULTIDISPATCHINFO]], i64 19 -; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, ptr addrspace(5) }, i32, ...) 
@llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32p5s(ptr inreg [[TMP36]], i32 inreg [[TMP31]], <20 x i32> inreg [[TMP63]], { <3 x i32>, i32, ptr addrspace(5) } [[TMP21]], i32 0) +; CHECK-NEXT: [[TMP46:%.*]] = and i32 [[TMP43]], -64 +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP46]], i64 0 +; CHECK-NEXT: [[TMP48:%.*]] = bitcast <2 x i32> [[TMP47]] to i64 +; CHECK-NEXT: [[TMP49:%.*]] = inttoptr i64 [[TMP48]] to ptr +; CHECK-NEXT: [[TMP50:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 +; CHECK-NEXT: [[TMP51:%.*]] = bitcast i64 [[TMP50]] to <2 x i32> +; CHECK-NEXT: [[TMP52:%.*]] = extractelement <2 x i32> [[TMP51]], i64 0 +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <2 x i32> [[TMP51]], i64 1 +; CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 +; CHECK-NEXT: [[TMP55:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 +; CHECK-NEXT: [[TMP56:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 +; CHECK-NEXT: [[TMP57:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 +; CHECK-NEXT: [[TMP58:%.*]] = insertelement <20 x i32> [[TMP57]], i32 [[TMP52]], i64 1 +; CHECK-NEXT: [[TMP59:%.*]] = insertelement <20 x i32> [[TMP58]], i32 [[TMP53]], i64 2 +; CHECK-NEXT: [[TMP60:%.*]] = insertelement <20 x i32> [[TMP59]], i32 [[PAD0]], i64 3 +; CHECK-NEXT: [[TMP61:%.*]] = insertelement <20 x i32> [[TMP60]], i32 [[PAD1]], i64 4 +; CHECK-NEXT: [[TMP62:%.*]] = insertelement <20 x i32> [[TMP61]], i32 [[PAD2]], i64 5 +; CHECK-NEXT: [[TMP63:%.*]] = insertelement <20 x i32> [[TMP62]], i32 [[PAD3]], i64 6 +; CHECK-NEXT: [[TMP64:%.*]] = insertelement <20 x i32> [[TMP63]], i32 [[PAD4]], i64 7 +; CHECK-NEXT: [[TMP65:%.*]] = insertelement <20 x i32> [[TMP64]], i32 [[PAD5]], i64 8 +; CHECK-NEXT: [[TMP66:%.*]] = insertelement <20 x i32> [[TMP65]], i32 [[PAD6]], i64 9 +; CHECK-NEXT: [[TMP67:%.*]] = insertelement <20 x i32> [[TMP66]], i32 [[PAD7]], i64 10 +; CHECK-NEXT: [[TMP68:%.*]] = insertelement <20 x i32> [[TMP67]], i32 [[PAD8]], i64 11 +; CHECK-NEXT: [[TMP69:%.*]] = insertelement <20 x i32> [[TMP68]], i32 [[PAD9]], i64 12 +; CHECK-NEXT: [[TMP70:%.*]] = insertelement <20 x i32> [[TMP69]], i32 [[PAD10]], i64 13 +; CHECK-NEXT: [[TMP71:%.*]] = insertelement <20 x i32> [[TMP70]], i32 [[PAD11]], i64 14 +; CHECK-NEXT: [[TMP72:%.*]] = insertelement <20 x i32> [[TMP71]], i32 [[SPILLTABLE]], i64 15 +; CHECK-NEXT: [[TMP73:%.*]] = insertelement <20 x i32> [[TMP72]], i32 [[TMP54]], i64 16 +; CHECK-NEXT: [[TMP74:%.*]] = insertelement <20 x i32> [[TMP73]], i32 [[TMP55]], i64 17 +; CHECK-NEXT: [[TMP75:%.*]] = insertelement <20 x i32> [[TMP74]], i32 [[TMP56]], i64 18 +; CHECK-NEXT: [[TMP76:%.*]] = insertelement <20 x i32> [[TMP75]], i32 [[MULTIDISPATCHINFO]], i64 19 +; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, ptr addrspace(5) }, i32, ...) 
@llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32p5s(ptr inreg [[TMP49]], i32 inreg [[TMP44]], <20 x i32> inreg [[TMP76]], { <3 x i32>, i32, ptr addrspace(5) } [[TMP21]], i32 0) ; CHECK-NEXT: unreachable ; CHECK: ret.block: ; CHECK-NEXT: ret void @@ -337,46 +376,59 @@ define void @test.gep({} %unused) !lgc.cps !{i32 1} !lgc.shaderstage !{i32 7} { ; CHECK-NEXT: [[TMP36:%.*]] = insertvalue { <3 x i32>, i32, ptr addrspace(5), i32, i32 } [[TMP35]], i32 [[TMP27]], 4 ; CHECK-NEXT: [[TMP37:%.*]] = extractvalue { <3 x i32>, i32, ptr addrspace(5), i32, i32 } [[TMP36]], 1 ; CHECK-NEXT: [[TMP38:%.*]] = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 [[TMP37]], i32 [[VCR]]) -; CHECK-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], 0 -; CHECK-NEXT: [[TMP40:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP39]]) -; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP40]], i1 true) -; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP38]], i32 [[TMP41]]) -; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[TMP38]], [[TMP42]] -; CHECK-NEXT: [[TMP44:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP43]]) -; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP42]]) -; CHECK-NEXT: [[TMP46:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP44]]) -; CHECK-NEXT: [[TMP47:%.*]] = and i32 [[TMP45]], -64 -; CHECK-NEXT: [[TMP48:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP47]], i64 0 -; CHECK-NEXT: [[TMP49:%.*]] = bitcast <2 x i32> [[TMP48]] to i64 -; CHECK-NEXT: [[TMP50:%.*]] = inttoptr i64 [[TMP49]] to ptr -; CHECK-NEXT: [[TMP51:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 -; CHECK-NEXT: [[TMP52:%.*]] = bitcast i64 [[TMP51]] to <2 x i32> -; CHECK-NEXT: [[TMP53:%.*]] = extractelement <2 x i32> [[TMP52]], i64 0 -; CHECK-NEXT: [[TMP54:%.*]] = extractelement <2 x i32> [[TMP52]], i64 1 -; CHECK-NEXT: [[TMP55:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 -; CHECK-NEXT: [[TMP56:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 -; CHECK-NEXT: [[TMP57:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 -; CHECK-NEXT: [[TMP58:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 -; CHECK-NEXT: [[TMP59:%.*]] = insertelement <20 x i32> [[TMP58]], i32 [[TMP53]], i64 1 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <20 x i32> [[TMP59]], i32 [[TMP54]], i64 2 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <20 x i32> [[TMP60]], i32 [[PAD0]], i64 3 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <20 x i32> [[TMP61]], i32 [[PAD1]], i64 4 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <20 x i32> [[TMP62]], i32 [[PAD2]], i64 5 -; CHECK-NEXT: [[TMP64:%.*]] = insertelement <20 x i32> [[TMP63]], i32 [[PAD3]], i64 6 -; CHECK-NEXT: [[TMP65:%.*]] = insertelement <20 x i32> [[TMP64]], i32 [[PAD4]], i64 7 -; CHECK-NEXT: [[TMP66:%.*]] = insertelement <20 x i32> [[TMP65]], i32 [[PAD5]], i64 8 -; CHECK-NEXT: [[TMP67:%.*]] = insertelement <20 x i32> [[TMP66]], i32 [[PAD6]], i64 9 -; CHECK-NEXT: [[TMP68:%.*]] = insertelement <20 x i32> [[TMP67]], i32 [[PAD7]], i64 10 -; CHECK-NEXT: [[TMP69:%.*]] = insertelement <20 x i32> [[TMP68]], i32 [[PAD8]], i64 11 -; CHECK-NEXT: [[TMP70:%.*]] = insertelement <20 x i32> [[TMP69]], i32 [[PAD9]], i64 12 -; CHECK-NEXT: [[TMP71:%.*]] = insertelement <20 x i32> [[TMP70]], i32 [[PAD10]], i64 13 -; CHECK-NEXT: [[TMP72:%.*]] = insertelement <20 x i32> [[TMP71]], i32 [[PAD11]], i64 14 -; CHECK-NEXT: [[TMP73:%.*]] = insertelement <20 x i32> [[TMP72]], i32 [[SPILLTABLE]], i64 15 -; CHECK-NEXT: [[TMP74:%.*]] = insertelement <20 x i32> [[TMP73]], 
i32 [[TMP55]], i64 16 -; CHECK-NEXT: [[TMP75:%.*]] = insertelement <20 x i32> [[TMP74]], i32 [[TMP56]], i64 17 -; CHECK-NEXT: [[TMP76:%.*]] = insertelement <20 x i32> [[TMP75]], i32 [[TMP57]], i64 18 -; CHECK-NEXT: [[TMP77:%.*]] = insertelement <20 x i32> [[TMP76]], i32 [[MULTIDISPATCHINFO]], i64 19 -; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32p5i32i32s(ptr inreg [[TMP50]], i32 inreg [[TMP46]], <20 x i32> inreg [[TMP77]], { <3 x i32>, i32, ptr addrspace(5), i32, i32 } [[TMP36]], i32 0) +; CHECK-NEXT: [[TMP39:%.*]] = and i32 [[TMP38]], 7 +; CHECK-NEXT: [[TMP40:%.*]] = icmp ne i32 [[TMP39]], 0 +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP40]]) +; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i32 [[TMP39]], 3 +; CHECK-NEXT: [[TMP43:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP42]]) +; CHECK-NEXT: [[TMP44:%.*]] = icmp ne i32 [[TMP43]], 0 +; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP43]], i32 [[TMP41]] +; CHECK-NEXT: [[TMP46:%.*]] = icmp eq i32 [[TMP39]], 2 +; CHECK-NEXT: [[TMP47:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP46]]) +; CHECK-NEXT: [[TMP48:%.*]] = icmp ne i32 [[TMP47]], 0 +; CHECK-NEXT: [[TMP49:%.*]] = select i1 [[TMP48]], i32 [[TMP47]], i32 [[TMP45]] +; CHECK-NEXT: [[TMP50:%.*]] = icmp eq i32 [[TMP39]], 1 +; CHECK-NEXT: [[TMP51:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP50]]) +; CHECK-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 +; CHECK-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], i32 [[TMP51]], i32 [[TMP49]] +; CHECK-NEXT: [[TMP54:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP53]], i1 true) +; CHECK-NEXT: [[TMP55:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP38]], i32 [[TMP54]]) +; CHECK-NEXT: [[TMP56:%.*]] = icmp eq i32 [[TMP38]], [[TMP55]] +; CHECK-NEXT: [[TMP57:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP56]]) +; CHECK-NEXT: [[TMP58:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP55]]) +; CHECK-NEXT: [[TMP59:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP57]]) +; CHECK-NEXT: [[TMP60:%.*]] = and i32 [[TMP58]], -64 +; CHECK-NEXT: [[TMP61:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP60]], i64 0 +; CHECK-NEXT: [[TMP62:%.*]] = bitcast <2 x i32> [[TMP61]] to i64 +; CHECK-NEXT: [[TMP63:%.*]] = inttoptr i64 [[TMP62]] to ptr +; CHECK-NEXT: [[TMP64:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 +; CHECK-NEXT: [[TMP65:%.*]] = bitcast i64 [[TMP64]] to <2 x i32> +; CHECK-NEXT: [[TMP66:%.*]] = extractelement <2 x i32> [[TMP65]], i64 0 +; CHECK-NEXT: [[TMP67:%.*]] = extractelement <2 x i32> [[TMP65]], i64 1 +; CHECK-NEXT: [[TMP68:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 +; CHECK-NEXT: [[TMP69:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 +; CHECK-NEXT: [[TMP70:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 +; CHECK-NEXT: [[TMP71:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 +; CHECK-NEXT: [[TMP72:%.*]] = insertelement <20 x i32> [[TMP71]], i32 [[TMP66]], i64 1 +; CHECK-NEXT: [[TMP73:%.*]] = insertelement <20 x i32> [[TMP72]], i32 [[TMP67]], i64 2 +; CHECK-NEXT: [[TMP74:%.*]] = insertelement <20 x i32> [[TMP73]], i32 [[PAD0]], i64 3 +; CHECK-NEXT: [[TMP75:%.*]] = insertelement <20 x i32> [[TMP74]], i32 [[PAD1]], i64 4 +; CHECK-NEXT: [[TMP76:%.*]] = insertelement <20 x i32> [[TMP75]], i32 [[PAD2]], i64 5 +; CHECK-NEXT: [[TMP77:%.*]] = insertelement <20 x i32> [[TMP76]], i32 [[PAD3]], i64 6 +; CHECK-NEXT: [[TMP78:%.*]] = insertelement <20 x i32> [[TMP77]], i32 
[[PAD4]], i64 7 +; CHECK-NEXT: [[TMP79:%.*]] = insertelement <20 x i32> [[TMP78]], i32 [[PAD5]], i64 8 +; CHECK-NEXT: [[TMP80:%.*]] = insertelement <20 x i32> [[TMP79]], i32 [[PAD6]], i64 9 +; CHECK-NEXT: [[TMP81:%.*]] = insertelement <20 x i32> [[TMP80]], i32 [[PAD7]], i64 10 +; CHECK-NEXT: [[TMP82:%.*]] = insertelement <20 x i32> [[TMP81]], i32 [[PAD8]], i64 11 +; CHECK-NEXT: [[TMP83:%.*]] = insertelement <20 x i32> [[TMP82]], i32 [[PAD9]], i64 12 +; CHECK-NEXT: [[TMP84:%.*]] = insertelement <20 x i32> [[TMP83]], i32 [[PAD10]], i64 13 +; CHECK-NEXT: [[TMP85:%.*]] = insertelement <20 x i32> [[TMP84]], i32 [[PAD11]], i64 14 +; CHECK-NEXT: [[TMP86:%.*]] = insertelement <20 x i32> [[TMP85]], i32 [[SPILLTABLE]], i64 15 +; CHECK-NEXT: [[TMP87:%.*]] = insertelement <20 x i32> [[TMP86]], i32 [[TMP68]], i64 16 +; CHECK-NEXT: [[TMP88:%.*]] = insertelement <20 x i32> [[TMP87]], i32 [[TMP69]], i64 17 +; CHECK-NEXT: [[TMP89:%.*]] = insertelement <20 x i32> [[TMP88]], i32 [[TMP70]], i64 18 +; CHECK-NEXT: [[TMP90:%.*]] = insertelement <20 x i32> [[TMP89]], i32 [[MULTIDISPATCHINFO]], i64 19 +; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32p5i32i32s(ptr inreg [[TMP63]], i32 inreg [[TMP59]], <20 x i32> inreg [[TMP90]], { <3 x i32>, i32, ptr addrspace(5), i32, i32 } [[TMP36]], i32 0) ; CHECK-NEXT: unreachable ; %mem = call ptr addrspace(32) @lgc.cps.alloc(i32 10) ; round up to 12 during lowering @@ -441,46 +493,59 @@ define void @test.nested.gep({} %unused) !lgc.cps !{i32 1} !lgc.shaderstage !{i3 ; CHECK-NEXT: [[TMP24:%.*]] = insertvalue { <3 x i32>, i32, ptr addrspace(5), i32, i32 } [[TMP23]], i32 [[TMP15]], 4 ; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { <3 x i32>, i32, ptr addrspace(5), i32, i32 } [[TMP24]], 1 ; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 [[TMP25]], i32 [[VCR]]) -; CHECK-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP27]]) -; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP28]], i1 true) -; CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP26]], i32 [[TMP29]]) -; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i32 [[TMP26]], [[TMP30]] -; CHECK-NEXT: [[TMP32:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP31]]) -; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP30]]) -; CHECK-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP32]]) -; CHECK-NEXT: [[TMP35:%.*]] = and i32 [[TMP33]], -64 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP35]], i64 0 -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <2 x i32> [[TMP36]] to i64 -; CHECK-NEXT: [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr -; CHECK-NEXT: [[TMP39:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 -; CHECK-NEXT: [[TMP40:%.*]] = bitcast i64 [[TMP39]] to <2 x i32> -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <2 x i32> [[TMP40]], i64 0 -; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[TMP40]], i64 1 -; CHECK-NEXT: [[TMP43:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 -; CHECK-NEXT: [[TMP44:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <20 x i32> [[TMP46]], i32 [[TMP41]], i64 1 -; CHECK-NEXT: [[TMP48:%.*]] = 
insertelement <20 x i32> [[TMP47]], i32 [[TMP42]], i64 2 -; CHECK-NEXT: [[TMP49:%.*]] = insertelement <20 x i32> [[TMP48]], i32 [[PAD0]], i64 3 -; CHECK-NEXT: [[TMP50:%.*]] = insertelement <20 x i32> [[TMP49]], i32 [[PAD1]], i64 4 -; CHECK-NEXT: [[TMP51:%.*]] = insertelement <20 x i32> [[TMP50]], i32 [[PAD2]], i64 5 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <20 x i32> [[TMP51]], i32 [[PAD3]], i64 6 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <20 x i32> [[TMP52]], i32 [[PAD4]], i64 7 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <20 x i32> [[TMP53]], i32 [[PAD5]], i64 8 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <20 x i32> [[TMP54]], i32 [[PAD6]], i64 9 -; CHECK-NEXT: [[TMP56:%.*]] = insertelement <20 x i32> [[TMP55]], i32 [[PAD7]], i64 10 -; CHECK-NEXT: [[TMP57:%.*]] = insertelement <20 x i32> [[TMP56]], i32 [[PAD8]], i64 11 -; CHECK-NEXT: [[TMP58:%.*]] = insertelement <20 x i32> [[TMP57]], i32 [[PAD9]], i64 12 -; CHECK-NEXT: [[TMP59:%.*]] = insertelement <20 x i32> [[TMP58]], i32 [[PAD10]], i64 13 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <20 x i32> [[TMP59]], i32 [[PAD11]], i64 14 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <20 x i32> [[TMP60]], i32 [[SPILLTABLE]], i64 15 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <20 x i32> [[TMP61]], i32 [[TMP43]], i64 16 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <20 x i32> [[TMP62]], i32 [[TMP44]], i64 17 -; CHECK-NEXT: [[TMP64:%.*]] = insertelement <20 x i32> [[TMP63]], i32 [[TMP45]], i64 18 -; CHECK-NEXT: [[TMP65:%.*]] = insertelement <20 x i32> [[TMP64]], i32 [[MULTIDISPATCHINFO]], i64 19 -; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32p5i32i32s(ptr inreg [[TMP38]], i32 inreg [[TMP34]], <20 x i32> inreg [[TMP65]], { <3 x i32>, i32, ptr addrspace(5), i32, i32 } [[TMP24]], i32 0) +; CHECK-NEXT: [[TMP27:%.*]] = and i32 [[TMP26]], 7 +; CHECK-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0 +; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP28]]) +; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP27]], 3 +; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP30]]) +; CHECK-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[TMP31]], i32 [[TMP29]] +; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i32 [[TMP27]], 2 +; CHECK-NEXT: [[TMP35:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP34]]) +; CHECK-NEXT: [[TMP36:%.*]] = icmp ne i32 [[TMP35]], 0 +; CHECK-NEXT: [[TMP37:%.*]] = select i1 [[TMP36]], i32 [[TMP35]], i32 [[TMP33]] +; CHECK-NEXT: [[TMP38:%.*]] = icmp eq i32 [[TMP27]], 1 +; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP38]]) +; CHECK-NEXT: [[TMP40:%.*]] = icmp ne i32 [[TMP39]], 0 +; CHECK-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP39]], i32 [[TMP37]] +; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP41]], i1 true) +; CHECK-NEXT: [[TMP43:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP26]], i32 [[TMP42]]) +; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP26]], [[TMP43]] +; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP44]]) +; CHECK-NEXT: [[TMP46:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP43]]) +; CHECK-NEXT: [[TMP47:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP45]]) +; CHECK-NEXT: [[TMP48:%.*]] = and i32 [[TMP46]], -64 +; CHECK-NEXT: [[TMP49:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP48]], i64 0 +; CHECK-NEXT: [[TMP50:%.*]] = bitcast <2 x i32> [[TMP49]] to i64 +; CHECK-NEXT: 
[[TMP51:%.*]] = inttoptr i64 [[TMP50]] to ptr +; CHECK-NEXT: [[TMP52:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 +; CHECK-NEXT: [[TMP53:%.*]] = bitcast i64 [[TMP52]] to <2 x i32> +; CHECK-NEXT: [[TMP54:%.*]] = extractelement <2 x i32> [[TMP53]], i64 0 +; CHECK-NEXT: [[TMP55:%.*]] = extractelement <2 x i32> [[TMP53]], i64 1 +; CHECK-NEXT: [[TMP56:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 +; CHECK-NEXT: [[TMP57:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 +; CHECK-NEXT: [[TMP58:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 +; CHECK-NEXT: [[TMP59:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 +; CHECK-NEXT: [[TMP60:%.*]] = insertelement <20 x i32> [[TMP59]], i32 [[TMP54]], i64 1 +; CHECK-NEXT: [[TMP61:%.*]] = insertelement <20 x i32> [[TMP60]], i32 [[TMP55]], i64 2 +; CHECK-NEXT: [[TMP62:%.*]] = insertelement <20 x i32> [[TMP61]], i32 [[PAD0]], i64 3 +; CHECK-NEXT: [[TMP63:%.*]] = insertelement <20 x i32> [[TMP62]], i32 [[PAD1]], i64 4 +; CHECK-NEXT: [[TMP64:%.*]] = insertelement <20 x i32> [[TMP63]], i32 [[PAD2]], i64 5 +; CHECK-NEXT: [[TMP65:%.*]] = insertelement <20 x i32> [[TMP64]], i32 [[PAD3]], i64 6 +; CHECK-NEXT: [[TMP66:%.*]] = insertelement <20 x i32> [[TMP65]], i32 [[PAD4]], i64 7 +; CHECK-NEXT: [[TMP67:%.*]] = insertelement <20 x i32> [[TMP66]], i32 [[PAD5]], i64 8 +; CHECK-NEXT: [[TMP68:%.*]] = insertelement <20 x i32> [[TMP67]], i32 [[PAD6]], i64 9 +; CHECK-NEXT: [[TMP69:%.*]] = insertelement <20 x i32> [[TMP68]], i32 [[PAD7]], i64 10 +; CHECK-NEXT: [[TMP70:%.*]] = insertelement <20 x i32> [[TMP69]], i32 [[PAD8]], i64 11 +; CHECK-NEXT: [[TMP71:%.*]] = insertelement <20 x i32> [[TMP70]], i32 [[PAD9]], i64 12 +; CHECK-NEXT: [[TMP72:%.*]] = insertelement <20 x i32> [[TMP71]], i32 [[PAD10]], i64 13 +; CHECK-NEXT: [[TMP73:%.*]] = insertelement <20 x i32> [[TMP72]], i32 [[PAD11]], i64 14 +; CHECK-NEXT: [[TMP74:%.*]] = insertelement <20 x i32> [[TMP73]], i32 [[SPILLTABLE]], i64 15 +; CHECK-NEXT: [[TMP75:%.*]] = insertelement <20 x i32> [[TMP74]], i32 [[TMP56]], i64 16 +; CHECK-NEXT: [[TMP76:%.*]] = insertelement <20 x i32> [[TMP75]], i32 [[TMP57]], i64 17 +; CHECK-NEXT: [[TMP77:%.*]] = insertelement <20 x i32> [[TMP76]], i32 [[TMP58]], i64 18 +; CHECK-NEXT: [[TMP78:%.*]] = insertelement <20 x i32> [[TMP77]], i32 [[MULTIDISPATCHINFO]], i64 19 +; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, ptr addrspace(5), i32, i32 }, i32, ...) 
@llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32p5i32i32s(ptr inreg [[TMP51]], i32 inreg [[TMP47]], <20 x i32> inreg [[TMP78]], { <3 x i32>, i32, ptr addrspace(5), i32, i32 } [[TMP24]], i32 0) ; CHECK-NEXT: unreachable ; %mem = call ptr addrspace(32) @lgc.cps.alloc(i32 10) ; round up to 12 during lowering @@ -526,46 +591,59 @@ define void @test.i64.reference({} %no_state, ptr addrspace(32) %p2, i32 %q1) !l ; CHECK-NEXT: [[TMP18:%.*]] = insertvalue { <3 x i32>, i32, ptr addrspace(5) } [[TMP17]], ptr addrspace(5) [[TMP15]], 2 ; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <3 x i32>, i32, ptr addrspace(5) } [[TMP18]], 1 ; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 [[TMP19]], i32 [[VCR]]) -; CHECK-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 -; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP21]]) -; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP22]], i1 true) -; CHECK-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP20]], i32 [[TMP23]]) -; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP20]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP25]]) -; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP24]]) -; CHECK-NEXT: [[TMP28:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP26]]) -; CHECK-NEXT: [[TMP29:%.*]] = and i32 [[TMP27]], -64 -; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP29]], i64 0 -; CHECK-NEXT: [[TMP31:%.*]] = bitcast <2 x i32> [[TMP30]] to i64 -; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr -; CHECK-NEXT: [[TMP33:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 -; CHECK-NEXT: [[TMP34:%.*]] = bitcast i64 [[TMP33]] to <2 x i32> -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x i32> [[TMP34]], i64 0 -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <2 x i32> [[TMP34]], i64 1 -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 -; CHECK-NEXT: [[TMP41:%.*]] = insertelement <20 x i32> [[TMP40]], i32 [[TMP35]], i64 1 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <20 x i32> [[TMP41]], i32 [[TMP36]], i64 2 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <20 x i32> [[TMP42]], i32 [[PAD0]], i64 3 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <20 x i32> [[TMP43]], i32 [[PAD1]], i64 4 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <20 x i32> [[TMP44]], i32 [[PAD2]], i64 5 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <20 x i32> [[TMP45]], i32 [[PAD3]], i64 6 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <20 x i32> [[TMP46]], i32 [[PAD4]], i64 7 -; CHECK-NEXT: [[TMP48:%.*]] = insertelement <20 x i32> [[TMP47]], i32 [[PAD5]], i64 8 -; CHECK-NEXT: [[TMP49:%.*]] = insertelement <20 x i32> [[TMP48]], i32 [[PAD6]], i64 9 -; CHECK-NEXT: [[TMP50:%.*]] = insertelement <20 x i32> [[TMP49]], i32 [[PAD7]], i64 10 -; CHECK-NEXT: [[TMP51:%.*]] = insertelement <20 x i32> [[TMP50]], i32 [[PAD8]], i64 11 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <20 x i32> [[TMP51]], i32 [[PAD9]], i64 12 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <20 x i32> [[TMP52]], i32 [[PAD10]], i64 13 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <20 x i32> [[TMP53]], i32 [[PAD11]], i64 14 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <20 x i32> [[TMP54]], i32 [[SPILLTABLE]], i64 15 -; 
CHECK-NEXT: [[TMP56:%.*]] = insertelement <20 x i32> [[TMP55]], i32 [[TMP37]], i64 16 -; CHECK-NEXT: [[TMP57:%.*]] = insertelement <20 x i32> [[TMP56]], i32 [[TMP38]], i64 17 -; CHECK-NEXT: [[TMP58:%.*]] = insertelement <20 x i32> [[TMP57]], i32 [[TMP39]], i64 18 -; CHECK-NEXT: [[TMP59:%.*]] = insertelement <20 x i32> [[TMP58]], i32 [[MULTIDISPATCHINFO]], i64 19 -; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, ptr addrspace(5) }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32p5s(ptr inreg [[TMP32]], i32 inreg [[TMP28]], <20 x i32> inreg [[TMP59]], { <3 x i32>, i32, ptr addrspace(5) } [[TMP18]], i32 0) +; CHECK-NEXT: [[TMP21:%.*]] = and i32 [[TMP20]], 7 +; CHECK-NEXT: [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0 +; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP22]]) +; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP21]], 3 +; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP24]]) +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne i32 [[TMP25]], 0 +; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 [[TMP25]], i32 [[TMP23]] +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP21]], 2 +; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP28]]) +; CHECK-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 +; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], i32 [[TMP29]], i32 [[TMP27]] +; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i32 [[TMP21]], 1 +; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP32]]) +; CHECK-NEXT: [[TMP34:%.*]] = icmp ne i32 [[TMP33]], 0 +; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP34]], i32 [[TMP33]], i32 [[TMP31]] +; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP35]], i1 true) +; CHECK-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP20]], i32 [[TMP36]]) +; CHECK-NEXT: [[TMP38:%.*]] = icmp eq i32 [[TMP20]], [[TMP37]] +; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP38]]) +; CHECK-NEXT: [[TMP40:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP37]]) +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP39]]) +; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP40]], -64 +; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP42]], i64 0 +; CHECK-NEXT: [[TMP44:%.*]] = bitcast <2 x i32> [[TMP43]] to i64 +; CHECK-NEXT: [[TMP45:%.*]] = inttoptr i64 [[TMP44]] to ptr +; CHECK-NEXT: [[TMP46:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 +; CHECK-NEXT: [[TMP47:%.*]] = bitcast i64 [[TMP46]] to <2 x i32> +; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP47]], i64 0 +; CHECK-NEXT: [[TMP49:%.*]] = extractelement <2 x i32> [[TMP47]], i64 1 +; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 +; CHECK-NEXT: [[TMP51:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 +; CHECK-NEXT: [[TMP52:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 +; CHECK-NEXT: [[TMP53:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <20 x i32> [[TMP53]], i32 [[TMP48]], i64 1 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <20 x i32> [[TMP54]], i32 [[TMP49]], i64 2 +; CHECK-NEXT: [[TMP56:%.*]] = insertelement <20 x i32> [[TMP55]], i32 [[PAD0]], i64 3 +; CHECK-NEXT: [[TMP57:%.*]] = insertelement <20 x i32> [[TMP56]], i32 [[PAD1]], i64 4 +; CHECK-NEXT: [[TMP58:%.*]] = insertelement <20 x i32> [[TMP57]], i32 [[PAD2]], i64 5 +; CHECK-NEXT: [[TMP59:%.*]] = insertelement <20 x i32> [[TMP58]], i32 [[PAD3]], i64 6 +; CHECK-NEXT: [[TMP60:%.*]] = 
insertelement <20 x i32> [[TMP59]], i32 [[PAD4]], i64 7 +; CHECK-NEXT: [[TMP61:%.*]] = insertelement <20 x i32> [[TMP60]], i32 [[PAD5]], i64 8 +; CHECK-NEXT: [[TMP62:%.*]] = insertelement <20 x i32> [[TMP61]], i32 [[PAD6]], i64 9 +; CHECK-NEXT: [[TMP63:%.*]] = insertelement <20 x i32> [[TMP62]], i32 [[PAD7]], i64 10 +; CHECK-NEXT: [[TMP64:%.*]] = insertelement <20 x i32> [[TMP63]], i32 [[PAD8]], i64 11 +; CHECK-NEXT: [[TMP65:%.*]] = insertelement <20 x i32> [[TMP64]], i32 [[PAD9]], i64 12 +; CHECK-NEXT: [[TMP66:%.*]] = insertelement <20 x i32> [[TMP65]], i32 [[PAD10]], i64 13 +; CHECK-NEXT: [[TMP67:%.*]] = insertelement <20 x i32> [[TMP66]], i32 [[PAD11]], i64 14 +; CHECK-NEXT: [[TMP68:%.*]] = insertelement <20 x i32> [[TMP67]], i32 [[SPILLTABLE]], i64 15 +; CHECK-NEXT: [[TMP69:%.*]] = insertelement <20 x i32> [[TMP68]], i32 [[TMP50]], i64 16 +; CHECK-NEXT: [[TMP70:%.*]] = insertelement <20 x i32> [[TMP69]], i32 [[TMP51]], i64 17 +; CHECK-NEXT: [[TMP71:%.*]] = insertelement <20 x i32> [[TMP70]], i32 [[TMP52]], i64 18 +; CHECK-NEXT: [[TMP72:%.*]] = insertelement <20 x i32> [[TMP71]], i32 [[MULTIDISPATCHINFO]], i64 19 +; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, ptr addrspace(5) }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32p5s(ptr inreg [[TMP45]], i32 inreg [[TMP41]], <20 x i32> inreg [[TMP72]], { <3 x i32>, i32, ptr addrspace(5) } [[TMP18]], i32 0) ; CHECK-NEXT: unreachable ; %p1 = inttoptr i32 %q1 to ptr addrspace(32) @@ -589,16 +667,3 @@ define void @test.i64.reference({} %no_state, ptr addrspace(32) %p2, i32 %q1) !l ; ; ; -;. -; CHECK: attributes #[[ATTR0:[0-9]+]] = { noreturn } -; CHECK: attributes #[[ATTR1]] = { memory(readwrite) "amdgpu-memory-bound"="false" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" } -; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nocallback nofree nounwind willreturn memory(none) } -; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind speculatable willreturn memory(none) } -; CHECK: attributes #[[ATTR5:[0-9]+]] = { convergent noreturn nounwind } -;. -; CHECK: [[META0:![0-9]+]] = !{!""} -; CHECK: [[META1:![0-9]+]] = !{!"\82\B0amdpal.pipelines\91\83\B1.shader_functions\86\A6test.0\81\B4.frontend_stack_size\10\A6test.1\81\B4.frontend_stack_size\00\A6test.2\81\B4.frontend_stack_size\00\A8test.gep\81\B4.frontend_stack_size\0C\B2test.i64.reference\81\B4.frontend_stack_size\00\AFtest.nested.gep\81\B4.frontend_stack_size\0C\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AEamdpal.version\92\03\00"} -; CHECK: [[META2]] = !{i32 1} -; CHECK: [[META3]] = !{i32 7} -;. diff --git a/lgc/test/Transforms/CpsLowering/cps-unify-exits.lgc b/lgc/test/Transforms/CpsLowering/cps-unify-exits.lgc index 403bba58f2..9e3e27882d 100644 --- a/lgc/test/Transforms/CpsLowering/cps-unify-exits.lgc +++ b/lgc/test/Transforms/CpsLowering/cps-unify-exits.lgc @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --function-signature -; RUN: lgc -mcpu=gfx1030 -o - -passes='require,lgc-patch-entry-point-mutate' %s | FileCheck --check-prefixes=CHECK %s +; RUN: lgc -mcpu=gfx1030 -o - -passes="require,lgc-patch-entry-point-mutate" %s | FileCheck --check-prefixes=CHECK %s declare void @lgc.cps.jump(...) 
noreturn @@ -53,46 +53,59 @@ define void @unify_jumps({i32} %state, i32 %arg, ptr %table) !lgc.cps !0 !lgc.sh ; CHECK-NEXT: [[TMP24:%.*]] = insertvalue { <3 x i32>, i32, ptr addrspace(5), i32, i32 } [[TMP23]], i32 [[TMP19]], 4 ; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { <3 x i32>, i32, ptr addrspace(5), i32, i32 } [[TMP24]], 1 ; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 [[TMP25]], i32 [[VCR]]) -; CHECK-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP27]]) -; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP28]], i1 true) -; CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP26]], i32 [[TMP29]]) -; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i32 [[TMP26]], [[TMP30]] -; CHECK-NEXT: [[TMP32:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP31]]) -; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP30]]) -; CHECK-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP32]]) -; CHECK-NEXT: [[TMP35:%.*]] = and i32 [[TMP33]], -64 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP35]], i64 0 -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <2 x i32> [[TMP36]] to i64 -; CHECK-NEXT: [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr -; CHECK-NEXT: [[TMP39:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 -; CHECK-NEXT: [[TMP40:%.*]] = bitcast i64 [[TMP39]] to <2 x i32> -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <2 x i32> [[TMP40]], i64 0 -; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[TMP40]], i64 1 -; CHECK-NEXT: [[TMP43:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 -; CHECK-NEXT: [[TMP44:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <20 x i32> [[TMP46]], i32 [[TMP41]], i64 1 -; CHECK-NEXT: [[TMP48:%.*]] = insertelement <20 x i32> [[TMP47]], i32 [[TMP42]], i64 2 -; CHECK-NEXT: [[TMP49:%.*]] = insertelement <20 x i32> [[TMP48]], i32 [[PAD0]], i64 3 -; CHECK-NEXT: [[TMP50:%.*]] = insertelement <20 x i32> [[TMP49]], i32 [[PAD1]], i64 4 -; CHECK-NEXT: [[TMP51:%.*]] = insertelement <20 x i32> [[TMP50]], i32 [[PAD2]], i64 5 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <20 x i32> [[TMP51]], i32 [[PAD3]], i64 6 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <20 x i32> [[TMP52]], i32 [[PAD4]], i64 7 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <20 x i32> [[TMP53]], i32 [[PAD5]], i64 8 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <20 x i32> [[TMP54]], i32 [[PAD6]], i64 9 -; CHECK-NEXT: [[TMP56:%.*]] = insertelement <20 x i32> [[TMP55]], i32 [[PAD7]], i64 10 -; CHECK-NEXT: [[TMP57:%.*]] = insertelement <20 x i32> [[TMP56]], i32 [[PAD8]], i64 11 -; CHECK-NEXT: [[TMP58:%.*]] = insertelement <20 x i32> [[TMP57]], i32 [[PAD9]], i64 12 -; CHECK-NEXT: [[TMP59:%.*]] = insertelement <20 x i32> [[TMP58]], i32 [[PAD10]], i64 13 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <20 x i32> [[TMP59]], i32 [[PAD11]], i64 14 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <20 x i32> [[TMP60]], i32 [[SPILLTABLE]], i64 15 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <20 x i32> [[TMP61]], i32 [[TMP43]], i64 16 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <20 x i32> [[TMP62]], i32 [[TMP44]], i64 17 -; CHECK-NEXT: [[TMP64:%.*]] = insertelement <20 x i32> [[TMP63]], i32 [[TMP45]], i64 18 -; CHECK-NEXT: [[TMP65:%.*]] = 
insertelement <20 x i32> [[TMP64]], i32 [[MULTIDISPATCHINFO]], i64 19 -; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32p5i32i32s(ptr inreg [[TMP38]], i32 inreg [[TMP34]], <20 x i32> inreg [[TMP65]], { <3 x i32>, i32, ptr addrspace(5), i32, i32 } [[TMP24]], i32 0) +; CHECK-NEXT: [[TMP27:%.*]] = and i32 [[TMP26]], 7 +; CHECK-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0 +; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP28]]) +; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP27]], 3 +; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP30]]) +; CHECK-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP32]], i32 [[TMP31]], i32 [[TMP29]] +; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i32 [[TMP27]], 2 +; CHECK-NEXT: [[TMP35:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP34]]) +; CHECK-NEXT: [[TMP36:%.*]] = icmp ne i32 [[TMP35]], 0 +; CHECK-NEXT: [[TMP37:%.*]] = select i1 [[TMP36]], i32 [[TMP35]], i32 [[TMP33]] +; CHECK-NEXT: [[TMP38:%.*]] = icmp eq i32 [[TMP27]], 1 +; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP38]]) +; CHECK-NEXT: [[TMP40:%.*]] = icmp ne i32 [[TMP39]], 0 +; CHECK-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[TMP39]], i32 [[TMP37]] +; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP41]], i1 true) +; CHECK-NEXT: [[TMP43:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP26]], i32 [[TMP42]]) +; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i32 [[TMP26]], [[TMP43]] +; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP44]]) +; CHECK-NEXT: [[TMP46:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP43]]) +; CHECK-NEXT: [[TMP47:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP45]]) +; CHECK-NEXT: [[TMP48:%.*]] = and i32 [[TMP46]], -64 +; CHECK-NEXT: [[TMP49:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP48]], i64 0 +; CHECK-NEXT: [[TMP50:%.*]] = bitcast <2 x i32> [[TMP49]] to i64 +; CHECK-NEXT: [[TMP51:%.*]] = inttoptr i64 [[TMP50]] to ptr +; CHECK-NEXT: [[TMP52:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 +; CHECK-NEXT: [[TMP53:%.*]] = bitcast i64 [[TMP52]] to <2 x i32> +; CHECK-NEXT: [[TMP54:%.*]] = extractelement <2 x i32> [[TMP53]], i64 0 +; CHECK-NEXT: [[TMP55:%.*]] = extractelement <2 x i32> [[TMP53]], i64 1 +; CHECK-NEXT: [[TMP56:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 +; CHECK-NEXT: [[TMP57:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 +; CHECK-NEXT: [[TMP58:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 +; CHECK-NEXT: [[TMP59:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 +; CHECK-NEXT: [[TMP60:%.*]] = insertelement <20 x i32> [[TMP59]], i32 [[TMP54]], i64 1 +; CHECK-NEXT: [[TMP61:%.*]] = insertelement <20 x i32> [[TMP60]], i32 [[TMP55]], i64 2 +; CHECK-NEXT: [[TMP62:%.*]] = insertelement <20 x i32> [[TMP61]], i32 [[PAD0]], i64 3 +; CHECK-NEXT: [[TMP63:%.*]] = insertelement <20 x i32> [[TMP62]], i32 [[PAD1]], i64 4 +; CHECK-NEXT: [[TMP64:%.*]] = insertelement <20 x i32> [[TMP63]], i32 [[PAD2]], i64 5 +; CHECK-NEXT: [[TMP65:%.*]] = insertelement <20 x i32> [[TMP64]], i32 [[PAD3]], i64 6 +; CHECK-NEXT: [[TMP66:%.*]] = insertelement <20 x i32> [[TMP65]], i32 [[PAD4]], i64 7 +; CHECK-NEXT: [[TMP67:%.*]] = insertelement <20 x i32> [[TMP66]], i32 [[PAD5]], i64 8 +; CHECK-NEXT: [[TMP68:%.*]] = insertelement <20 x i32> [[TMP67]], i32 [[PAD6]], i64 9 +; CHECK-NEXT: [[TMP69:%.*]] = insertelement 
<20 x i32> [[TMP68]], i32 [[PAD7]], i64 10 +; CHECK-NEXT: [[TMP70:%.*]] = insertelement <20 x i32> [[TMP69]], i32 [[PAD8]], i64 11 +; CHECK-NEXT: [[TMP71:%.*]] = insertelement <20 x i32> [[TMP70]], i32 [[PAD9]], i64 12 +; CHECK-NEXT: [[TMP72:%.*]] = insertelement <20 x i32> [[TMP71]], i32 [[PAD10]], i64 13 +; CHECK-NEXT: [[TMP73:%.*]] = insertelement <20 x i32> [[TMP72]], i32 [[PAD11]], i64 14 +; CHECK-NEXT: [[TMP74:%.*]] = insertelement <20 x i32> [[TMP73]], i32 [[SPILLTABLE]], i64 15 +; CHECK-NEXT: [[TMP75:%.*]] = insertelement <20 x i32> [[TMP74]], i32 [[TMP56]], i64 16 +; CHECK-NEXT: [[TMP76:%.*]] = insertelement <20 x i32> [[TMP75]], i32 [[TMP57]], i64 17 +; CHECK-NEXT: [[TMP77:%.*]] = insertelement <20 x i32> [[TMP76]], i32 [[TMP58]], i64 18 +; CHECK-NEXT: [[TMP78:%.*]] = insertelement <20 x i32> [[TMP77]], i32 [[MULTIDISPATCHINFO]], i64 19 +; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32p5i32i32s(ptr inreg [[TMP51]], i32 inreg [[TMP47]], <20 x i32> inreg [[TMP78]], { <3 x i32>, i32, ptr addrspace(5), i32, i32 } [[TMP24]], i32 0) ; CHECK-NEXT: unreachable ; entry: @@ -159,49 +172,62 @@ define void @unify_jump_ret({i32} %state, i32 %arg, ptr %table) !lgc.cps !0 !lgc ; CHECK-NEXT: [[TMP19:%.*]] = insertvalue { <3 x i32>, i32, ptr addrspace(5), i32 } [[TMP18]], i32 [[TMP15]], 3 ; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { <3 x i32>, i32, ptr addrspace(5), i32 } [[TMP19]], 1 ; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.amdgcn.set.inactive.chain.arg.i32(i32 [[TMP20]], i32 [[VCR]]) -; CHECK-NEXT: [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP22]]) -; CHECK-NEXT: [[TMP24:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP23]], i1 true) -; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP21]], i32 [[TMP24]]) -; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP21]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP26]]) -; CHECK-NEXT: [[TMP28:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP25]]) -; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP27]]) -; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i32 [[TMP28]], 0 -; CHECK-NEXT: br i1 [[TMP30]], label [[RET_BLOCK:%.*]], label [[CHAIN_BLOCK:%.*]] +; CHECK-NEXT: [[TMP22:%.*]] = and i32 [[TMP21]], 7 +; CHECK-NEXT: [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP23]]) +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP22]], 3 +; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP25]]) +; CHECK-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i32 [[TMP26]], i32 [[TMP24]] +; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i32 [[TMP22]], 2 +; CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP29]]) +; CHECK-NEXT: [[TMP31:%.*]] = icmp ne i32 [[TMP30]], 0 +; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP30]], i32 [[TMP28]] +; CHECK-NEXT: [[TMP33:%.*]] = icmp eq i32 [[TMP22]], 1 +; CHECK-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP33]]) +; CHECK-NEXT: [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0 +; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 [[TMP34]], i32 [[TMP32]] +; CHECK-NEXT: [[TMP37:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP36]], i1 true) +; CHECK-NEXT: [[TMP38:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP21]], i32 [[TMP37]]) +; CHECK-NEXT: [[TMP39:%.*]] = 
icmp eq i32 [[TMP21]], [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP39]]) +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP38]]) +; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP40]]) +; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[TMP41]], 0 +; CHECK-NEXT: br i1 [[TMP43]], label [[RET_BLOCK:%.*]], label [[CHAIN_BLOCK:%.*]] ; CHECK: chain.block: -; CHECK-NEXT: [[TMP31:%.*]] = and i32 [[TMP28]], -64 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP31]], i64 0 -; CHECK-NEXT: [[TMP33:%.*]] = bitcast <2 x i32> [[TMP32]] to i64 -; CHECK-NEXT: [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr -; CHECK-NEXT: [[TMP35:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 -; CHECK-NEXT: [[TMP36:%.*]] = bitcast i64 [[TMP35]] to <2 x i32> -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x i32> [[TMP36]], i64 0 -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <2 x i32> [[TMP36]], i64 1 -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 -; CHECK-NEXT: [[TMP40:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <20 x i32> [[TMP42]], i32 [[TMP37]], i64 1 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <20 x i32> [[TMP43]], i32 [[TMP38]], i64 2 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <20 x i32> [[TMP44]], i32 [[PAD0]], i64 3 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <20 x i32> [[TMP45]], i32 [[PAD1]], i64 4 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <20 x i32> [[TMP46]], i32 [[PAD2]], i64 5 -; CHECK-NEXT: [[TMP48:%.*]] = insertelement <20 x i32> [[TMP47]], i32 [[PAD3]], i64 6 -; CHECK-NEXT: [[TMP49:%.*]] = insertelement <20 x i32> [[TMP48]], i32 [[PAD4]], i64 7 -; CHECK-NEXT: [[TMP50:%.*]] = insertelement <20 x i32> [[TMP49]], i32 [[PAD5]], i64 8 -; CHECK-NEXT: [[TMP51:%.*]] = insertelement <20 x i32> [[TMP50]], i32 [[PAD6]], i64 9 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <20 x i32> [[TMP51]], i32 [[PAD7]], i64 10 -; CHECK-NEXT: [[TMP53:%.*]] = insertelement <20 x i32> [[TMP52]], i32 [[PAD8]], i64 11 -; CHECK-NEXT: [[TMP54:%.*]] = insertelement <20 x i32> [[TMP53]], i32 [[PAD9]], i64 12 -; CHECK-NEXT: [[TMP55:%.*]] = insertelement <20 x i32> [[TMP54]], i32 [[PAD10]], i64 13 -; CHECK-NEXT: [[TMP56:%.*]] = insertelement <20 x i32> [[TMP55]], i32 [[PAD11]], i64 14 -; CHECK-NEXT: [[TMP57:%.*]] = insertelement <20 x i32> [[TMP56]], i32 [[SPILLTABLE]], i64 15 -; CHECK-NEXT: [[TMP58:%.*]] = insertelement <20 x i32> [[TMP57]], i32 [[TMP39]], i64 16 -; CHECK-NEXT: [[TMP59:%.*]] = insertelement <20 x i32> [[TMP58]], i32 [[TMP40]], i64 17 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <20 x i32> [[TMP59]], i32 [[TMP41]], i64 18 -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <20 x i32> [[TMP60]], i32 [[MULTIDISPATCHINFO]], i64 19 -; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, ptr addrspace(5), i32 }, i32, ...) 
@llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32p5i32s(ptr inreg [[TMP34]], i32 inreg [[TMP29]], <20 x i32> inreg [[TMP61]], { <3 x i32>, i32, ptr addrspace(5), i32 } [[TMP19]], i32 0) +; CHECK-NEXT: [[TMP44:%.*]] = and i32 [[TMP41]], -64 +; CHECK-NEXT: [[TMP45:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP44]], i64 0 +; CHECK-NEXT: [[TMP46:%.*]] = bitcast <2 x i32> [[TMP45]] to i64 +; CHECK-NEXT: [[TMP47:%.*]] = inttoptr i64 [[TMP46]] to ptr +; CHECK-NEXT: [[TMP48:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 +; CHECK-NEXT: [[TMP49:%.*]] = bitcast i64 [[TMP48]] to <2 x i32> +; CHECK-NEXT: [[TMP50:%.*]] = extractelement <2 x i32> [[TMP49]], i64 0 +; CHECK-NEXT: [[TMP51:%.*]] = extractelement <2 x i32> [[TMP49]], i64 1 +; CHECK-NEXT: [[TMP52:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 0 +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 1 +; CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i32> [[WORKGROUPID]], i64 2 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <20 x i32> poison, i32 [[GLOBALTABLE]], i64 0 +; CHECK-NEXT: [[TMP56:%.*]] = insertelement <20 x i32> [[TMP55]], i32 [[TMP50]], i64 1 +; CHECK-NEXT: [[TMP57:%.*]] = insertelement <20 x i32> [[TMP56]], i32 [[TMP51]], i64 2 +; CHECK-NEXT: [[TMP58:%.*]] = insertelement <20 x i32> [[TMP57]], i32 [[PAD0]], i64 3 +; CHECK-NEXT: [[TMP59:%.*]] = insertelement <20 x i32> [[TMP58]], i32 [[PAD1]], i64 4 +; CHECK-NEXT: [[TMP60:%.*]] = insertelement <20 x i32> [[TMP59]], i32 [[PAD2]], i64 5 +; CHECK-NEXT: [[TMP61:%.*]] = insertelement <20 x i32> [[TMP60]], i32 [[PAD3]], i64 6 +; CHECK-NEXT: [[TMP62:%.*]] = insertelement <20 x i32> [[TMP61]], i32 [[PAD4]], i64 7 +; CHECK-NEXT: [[TMP63:%.*]] = insertelement <20 x i32> [[TMP62]], i32 [[PAD5]], i64 8 +; CHECK-NEXT: [[TMP64:%.*]] = insertelement <20 x i32> [[TMP63]], i32 [[PAD6]], i64 9 +; CHECK-NEXT: [[TMP65:%.*]] = insertelement <20 x i32> [[TMP64]], i32 [[PAD7]], i64 10 +; CHECK-NEXT: [[TMP66:%.*]] = insertelement <20 x i32> [[TMP65]], i32 [[PAD8]], i64 11 +; CHECK-NEXT: [[TMP67:%.*]] = insertelement <20 x i32> [[TMP66]], i32 [[PAD9]], i64 12 +; CHECK-NEXT: [[TMP68:%.*]] = insertelement <20 x i32> [[TMP67]], i32 [[PAD10]], i64 13 +; CHECK-NEXT: [[TMP69:%.*]] = insertelement <20 x i32> [[TMP68]], i32 [[PAD11]], i64 14 +; CHECK-NEXT: [[TMP70:%.*]] = insertelement <20 x i32> [[TMP69]], i32 [[SPILLTABLE]], i64 15 +; CHECK-NEXT: [[TMP71:%.*]] = insertelement <20 x i32> [[TMP70]], i32 [[TMP52]], i64 16 +; CHECK-NEXT: [[TMP72:%.*]] = insertelement <20 x i32> [[TMP71]], i32 [[TMP53]], i64 17 +; CHECK-NEXT: [[TMP73:%.*]] = insertelement <20 x i32> [[TMP72]], i32 [[TMP54]], i64 18 +; CHECK-NEXT: [[TMP74:%.*]] = insertelement <20 x i32> [[TMP73]], i32 [[MULTIDISPATCHINFO]], i64 19 +; CHECK-NEXT: call void (ptr, i32, <20 x i32>, { <3 x i32>, i32, ptr addrspace(5), i32 }, i32, ...) 
@llvm.amdgcn.cs.chain.p0.i32.v20i32.sl_v3i32i32p5i32s(ptr inreg [[TMP47]], i32 inreg [[TMP42]], <20 x i32> inreg [[TMP74]], { <3 x i32>, i32, ptr addrspace(5), i32 } [[TMP19]], i32 0) ; CHECK-NEXT: unreachable ; CHECK: ret.block: ; CHECK-NEXT: ret void diff --git a/lgc/test/Transforms/LowerCooperativeMatrix/convert.lgc b/lgc/test/Transforms/LowerCooperativeMatrix/convert.lgc index 32bb1988f8..8e0650f523 100644 --- a/lgc/test/Transforms/LowerCooperativeMatrix/convert.lgc +++ b/lgc/test/Transforms/LowerCooperativeMatrix/convert.lgc @@ -4,132 +4,76 @@ define <8 x float> @convert_f16_to_accumulator(<8 x float> %fact) { ; CHECK-LABEL: @convert_f16_to_accumulator( ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP1]]) -; CHECK-NEXT: [[TMP3:%.*]] = udiv i32 [[TMP2]], 16 -; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP3]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x float> [[FACT:%.*]] to <8 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[FACT]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[FACT]], i64 1 -; CHECK-NEXT: [[TMP9:%.*]] = and i32 [[TMP3]], 2 -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], float [[TMP7]], float [[TMP8]] -; CHECK-NEXT: [[ACCUM1:%.*]] = insertelement <4 x float> poison, float [[TMP11]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x float> [[FACT]], i64 2 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x float> [[FACT]], i64 3 -; CHECK-NEXT: [[TMP14:%.*]] = and i32 [[TMP3]], 2 -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], 0 -; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], float [[TMP12]], float [[TMP13]] -; CHECK-NEXT: [[ACCUM2:%.*]] = insertelement <4 x float> [[ACCUM1]], float [[TMP16]], i64 1 -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x float> [[FACT]], i64 4 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x float> [[FACT]], i64 5 -; CHECK-NEXT: [[TMP19:%.*]] = and i32 [[TMP3]], 2 -; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i32 [[TMP19]], 0 -; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], float [[TMP17]], float [[TMP18]] -; CHECK-NEXT: [[ACCUM3:%.*]] = insertelement <4 x float> [[ACCUM2]], float [[TMP21]], i64 2 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x float> [[FACT]], i64 6 -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <8 x float> [[FACT]], i64 7 -; CHECK-NEXT: [[TMP24:%.*]] = and i32 [[TMP3]], 2 -; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i32 [[TMP24]], 0 -; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP22]], float [[TMP23]] -; CHECK-NEXT: [[ACCUM4:%.*]] = insertelement <4 x float> [[ACCUM3]], float [[TMP26]], i64 3 -; CHECK-NEXT: [[TMP27:%.*]] = bitcast <4 x float> [[ACCUM4]] to <4 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP5]], <4 x i32> zeroinitializer, <4 x i32> -; CHECK-NEXT: [[ACCUM5:%.*]] = lshr <4 x i32> [[TMP27]], [[TMP28]] -; CHECK-NEXT: [[ACCUM6:%.*]] = bitcast <4 x i32> [[ACCUM5]] to <4 x float> -; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x float> [[ACCUM6]], <4 x float> poison, <8 x i32> -; CHECK-NEXT: ret <8 x float> [[TMP29]] +; CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x float> [[FACT:%.*]] to <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP4]], <8 x i32> zeroinitializer, <8 x i32> +; 
CHECK-NEXT: [[RESHAPE16BIT:%.*]] = lshr <8 x i32> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[RESHAPE16BIT1:%.*]] = bitcast <8 x i32> [[RESHAPE16BIT]] to <8 x float> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[RESHAPE16BIT1]], <8 x float> poison, <8 x i32> +; CHECK-NEXT: ret <8 x float> [[TMP7]] ; - %accum = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %fact, i32 1, i32 1, i32 0, i32 1) + %accum = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %fact, i32 1, i32 1, i32 0, i32 1) ret <8 x float> %accum } define <8 x float> @convert_f16_to_factor(<8 x float> %accum) { ; CHECK-LABEL: @convert_f16_to_factor( ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP1]]) -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x float> [[ACCUM:%.*]] to <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = udiv i32 [[TMP2]], 16 -; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP7]], i64 0 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP7]], i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 [[TMP8]], i32 [[TMP9]], i32 1985229328, i32 -19088744, i1 false, i1 false) -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP7]], i64 1 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i32> [[TMP7]], i64 1 -; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 [[TMP11]], i32 [[TMP12]], i32 1985229328, i32 -19088744, i1 false, i1 false) -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[TMP7]], i64 2 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[TMP7]], i64 2 -; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 [[TMP14]], i32 [[TMP15]], i32 1985229328, i32 -19088744, i1 false, i1 false) -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP7]], i64 3 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i32> [[TMP7]], i64 3 -; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 [[TMP17]], i32 [[TMP18]], i32 1985229328, i32 -19088744, i1 false, i1 false) -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP7]], i64 4 -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <8 x i32> [[TMP7]], i64 4 -; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 [[TMP20]], i32 [[TMP21]], i32 1985229328, i32 -19088744, i1 false, i1 false) -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP7]], i64 5 -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <8 x i32> [[TMP7]], i64 5 -; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 [[TMP23]], i32 [[TMP24]], i32 1985229328, i32 -19088744, i1 false, i1 false) -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[TMP7]], i64 6 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x i32> [[TMP7]], i64 6 -; CHECK-NEXT: [[TMP28:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 [[TMP26]], i32 [[TMP27]], i32 1985229328, i32 -19088744, i1 false, i1 false) -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[TMP7]], i64 7 -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i32> [[TMP7]], i64 7 -; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 [[TMP29]], i32 [[TMP30]], i32 1985229328, i32 -19088744, i1 false, i1 false) -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <8 x 
i32> poison, i32 [[TMP10]], i64 0 -; CHECK-NEXT: [[TMP33:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP13]], i64 1 -; CHECK-NEXT: [[TMP34:%.*]] = insertelement <8 x i32> [[TMP33]], i32 [[TMP16]], i64 2 -; CHECK-NEXT: [[TMP35:%.*]] = insertelement <8 x i32> [[TMP34]], i32 [[TMP19]], i64 3 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <8 x i32> [[TMP35]], i32 [[TMP22]], i64 4 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <8 x i32> [[TMP36]], i32 [[TMP25]], i64 5 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <8 x i32> [[TMP37]], i32 [[TMP28]], i64 6 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <8 x i32> [[TMP38]], i32 [[TMP31]], i64 7 -; CHECK-NEXT: [[TMP40:%.*]] = select i1 [[TMP6]], <8 x i32> [[TMP7]], <8 x i32> [[TMP39]] -; CHECK-NEXT: [[TMP41:%.*]] = select i1 [[TMP6]], <8 x i32> [[TMP39]], <8 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP42:%.*]] = and <8 x i32> [[TMP40]], -; CHECK-NEXT: [[TMP43:%.*]] = shl <8 x i32> [[TMP41]], -; CHECK-NEXT: [[TMP44:%.*]] = or <8 x i32> [[TMP42]], [[TMP43]] -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <8 x i32> [[TMP44]], i64 0 -; CHECK-NEXT: [[TMP46:%.*]] = call i32 @llvm.amdgcn.permlane64(i32 [[TMP45]]) -; CHECK-NEXT: [[TMP47:%.*]] = extractelement <8 x i32> [[TMP44]], i64 1 -; CHECK-NEXT: [[TMP48:%.*]] = call i32 @llvm.amdgcn.permlane64(i32 [[TMP47]]) -; CHECK-NEXT: [[TMP49:%.*]] = extractelement <8 x i32> [[TMP44]], i64 2 -; CHECK-NEXT: [[TMP50:%.*]] = call i32 @llvm.amdgcn.permlane64(i32 [[TMP49]]) -; CHECK-NEXT: [[TMP51:%.*]] = extractelement <8 x i32> [[TMP44]], i64 3 -; CHECK-NEXT: [[TMP52:%.*]] = call i32 @llvm.amdgcn.permlane64(i32 [[TMP51]]) -; CHECK-NEXT: [[TMP53:%.*]] = extractelement <8 x i32> [[TMP44]], i64 4 -; CHECK-NEXT: [[TMP54:%.*]] = call i32 @llvm.amdgcn.permlane64(i32 [[TMP53]]) -; CHECK-NEXT: [[TMP55:%.*]] = extractelement <8 x i32> [[TMP44]], i64 5 -; CHECK-NEXT: [[TMP56:%.*]] = call i32 @llvm.amdgcn.permlane64(i32 [[TMP55]]) -; CHECK-NEXT: [[TMP57:%.*]] = extractelement <8 x i32> [[TMP44]], i64 6 -; CHECK-NEXT: [[TMP58:%.*]] = call i32 @llvm.amdgcn.permlane64(i32 [[TMP57]]) -; CHECK-NEXT: [[TMP59:%.*]] = extractelement <8 x i32> [[TMP44]], i64 7 -; CHECK-NEXT: [[TMP60:%.*]] = call i32 @llvm.amdgcn.permlane64(i32 [[TMP59]]) -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <8 x i32> poison, i32 [[TMP46]], i64 0 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <8 x i32> [[TMP61]], i32 [[TMP48]], i64 1 -; CHECK-NEXT: [[TMP63:%.*]] = insertelement <8 x i32> [[TMP62]], i32 [[TMP50]], i64 2 -; CHECK-NEXT: [[TMP64:%.*]] = insertelement <8 x i32> [[TMP63]], i32 [[TMP52]], i64 3 -; CHECK-NEXT: [[TMP65:%.*]] = insertelement <8 x i32> [[TMP64]], i32 [[TMP54]], i64 4 -; CHECK-NEXT: [[TMP66:%.*]] = insertelement <8 x i32> [[TMP65]], i32 [[TMP56]], i64 5 -; CHECK-NEXT: [[TMP67:%.*]] = insertelement <8 x i32> [[TMP66]], i32 [[TMP58]], i64 6 -; CHECK-NEXT: [[TMP68:%.*]] = insertelement <8 x i32> [[TMP67]], i32 [[TMP60]], i64 7 -; CHECK-NEXT: [[TMP69:%.*]] = icmp ult i32 [[TMP2]], 32 -; CHECK-NEXT: [[TMP70:%.*]] = select i1 [[TMP69]], <8 x i32> [[TMP44]], <8 x i32> [[TMP68]] -; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP69]], <8 x i32> [[TMP68]], <8 x i32> [[TMP44]] -; CHECK-NEXT: [[FACT1:%.*]] = shufflevector <8 x i32> [[TMP70]], <8 x i32> [[TMP71]], <8 x i32> -; CHECK-NEXT: [[TMP72:%.*]] = bitcast <8 x i32> [[FACT1]] to <8 x float> -; CHECK-NEXT: ret <8 x float> [[TMP72]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x float> [[ACCUM:%.*]] to <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = udiv i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP3]], 1 +; 
CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 [[TMP7]], i32 [[TMP8]], i32 1985229328, i32 -19088744, i1 false, i1 false) +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP6]], i64 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP6]], i64 1 +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 [[TMP10]], i32 [[TMP11]], i32 1985229328, i32 -19088744, i1 false, i1 false) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP6]], i64 2 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[TMP6]], i64 2 +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 [[TMP13]], i32 [[TMP14]], i32 1985229328, i32 -19088744, i1 false, i1 false) +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP6]], i64 3 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP6]], i64 3 +; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 [[TMP16]], i32 [[TMP17]], i32 1985229328, i32 -19088744, i1 false, i1 false) +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP6]], i64 4 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP6]], i64 4 +; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 [[TMP19]], i32 [[TMP20]], i32 1985229328, i32 -19088744, i1 false, i1 false) +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP6]], i64 5 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP6]], i64 5 +; CHECK-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 [[TMP22]], i32 [[TMP23]], i32 1985229328, i32 -19088744, i1 false, i1 false) +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP6]], i64 6 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[TMP6]], i64 6 +; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 [[TMP25]], i32 [[TMP26]], i32 1985229328, i32 -19088744, i1 false, i1 false) +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7 +; CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 [[TMP28]], i32 [[TMP29]], i32 1985229328, i32 -19088744, i1 false, i1 false) +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <8 x i32> poison, i32 [[TMP9]], i64 0 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP12]], i64 1 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP15]], i64 2 +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <8 x i32> [[TMP33]], i32 [[TMP18]], i64 3 +; CHECK-NEXT: [[TMP35:%.*]] = insertelement <8 x i32> [[TMP34]], i32 [[TMP21]], i64 4 +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <8 x i32> [[TMP35]], i32 [[TMP24]], i64 5 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <8 x i32> [[TMP36]], i32 [[TMP27]], i64 6 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <8 x i32> [[TMP37]], i32 [[TMP30]], i64 7 +; CHECK-NEXT: [[TMP39:%.*]] = select i1 [[TMP5]], <8 x i32> [[TMP6]], <8 x i32> [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = select i1 [[TMP5]], <8 x i32> [[TMP38]], <8 x i32> [[TMP6]] +; CHECK-NEXT: [[TMP41:%.*]] = and <8 x i32> [[TMP39]], +; CHECK-NEXT: [[TMP42:%.*]] = shl <8 x i32> [[TMP40]], +; CHECK-NEXT: [[TMP43:%.*]] = or <8 x i32> [[TMP41]], [[TMP42]] +; CHECK-NEXT: [[TMP44:%.*]] = bitcast <8 x 
i32> [[TMP43]] to <8 x float> +; CHECK-NEXT: ret <8 x float> [[TMP44]] ; - %fact = call <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32 0, <8 x float> %accum, i32 1, i32 1, i32 1, i32 0) + %fact = call <8 x float> (...) @lgc.cooperative.matrix.convert__v8f32(i32 0, <8 x float> %accum, i32 1, i32 1, i32 1, i32 0) ret <8 x float> %fact } declare i1 @getcc() declare <8 x float> @process1(<8 x float>) -declare <8 x float> @lgc.cooperative.matrix.load.v8f32.p3.i32.i1.i32.i32.i32(ptr addrspace(3), i32, i1, i32, i32, i32) -declare <8 x float> @lgc.cooperative.matrix.transpose.v8f32.v8f32.i32.i32(<8 x float>, i32, i32) -declare <8 x float> @lgc.cooperative.matrix.convert.v8f32.i32.v8f32.i32.i32.i32.i32(i32, <8 x float>, i32, i32, i32, i32) -declare void @lgc.cooperative.matrix.store.p3.i32.i1.i32.i32.i32.v8f32(ptr addrspace(3), i32, i1, i32, i32, i32, <8 x float>) -declare void @lgc.cooperative.matrix.store.p7.i32.i1.i32.i32.i32.v8f32(ptr addrspace(7), i32, i1, i32, i32, i32, <8 x float>) +declare <8 x float> @lgc.cooperative.matrix.load__v8f32(...) +declare <8 x float> @lgc.cooperative.matrix.transpose__v8f32(...) +declare <8 x float> @lgc.cooperative.matrix.convert__v8f32(...) +declare void @lgc.cooperative.matrix.store(...) diff --git a/lgc/test/Transforms/LowerCooperativeMatrix/extract-insert.lgc b/lgc/test/Transforms/LowerCooperativeMatrix/extract-insert.lgc index 6ddd7c21dd..626bb2ab54 100644 --- a/lgc/test/Transforms/LowerCooperativeMatrix/extract-insert.lgc +++ b/lgc/test/Transforms/LowerCooperativeMatrix/extract-insert.lgc @@ -3,19 +3,20 @@ define i32 @test_length_f16() !spirv.ExecutionModel !8 !lgc.shaderstage !9 { ; CHECK-LABEL: @test_length_f16( -; CHECK-NEXT: ret i32 16 +; CHECK-NEXT: [[A:%.*]] = call i32 (...) @lgc.cooperative.matrix.length__i32(i32 1, i32 0) +; CHECK-NEXT: ret i32 [[A]] ; - %a = call i32 @lgc.cooperative.matrix.length.i32.i32.i32(i32 1, i32 0) + %a = call i32 (...) @lgc.cooperative.matrix.length__i32(i32 1, i32 0) ret i32 %a } define half @test_extract_f16(<8 x float> %matrix) !spirv.ExecutionModel !8 !lgc.shaderstage !9 { ; CHECK-LABEL: @test_extract_f16( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float> [[MATRIX:%.*]] to <16 x half> -; CHECK-NEXT: [[R:%.*]] = extractelement <16 x half> [[TMP1]], i32 5 -; CHECK-NEXT: ret half [[R]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x half> [[TMP1]], i32 5 +; CHECK-NEXT: ret half [[TMP2]] ; - %r = call half @lgc.cooperative.matrix.extract.f16.v8f32.i32.i32.i32(<8 x float> %matrix, i32 5, i32 1, i32 0) + %r = call half (...) @lgc.cooperative.matrix.extract__f16(<8 x float> %matrix, i32 5, i32 1, i32 0) ret half %r } @@ -23,16 +24,16 @@ define <8 x float> @test_insert_f16(<8 x float> %matrix, half %x) !spirv.Executi ; CHECK-LABEL: @test_insert_f16( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float> [[MATRIX:%.*]] to <16 x half> ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x half> [[TMP1]], half [[X:%.*]], i32 5 -; CHECK-NEXT: [[R:%.*]] = bitcast <16 x half> [[TMP2]] to <8 x float> -; CHECK-NEXT: ret <8 x float> [[R]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x half> [[TMP2]] to <8 x float> +; CHECK-NEXT: ret <8 x float> [[TMP3]] ; - %r = call <8 x float> @lgc.cooperative.matrix.insert.v8f32.v8f32.f16.i32.i32.i32(<8 x float> %matrix, half %x, i32 5, i32 1, i32 0) + %r = call <8 x float> (...) 
@lgc.cooperative.matrix.insert__v8f32(<8 x float> %matrix, half %x, i32 5, i32 1, i32 0) ret <8 x float> %r } -declare i32 @lgc.cooperative.matrix.length.i32.i32.i32(i32, i32) -declare half @lgc.cooperative.matrix.extract.f16.v8f32.i32.i32.i32(<8 x float>, i32, i32, i32) -declare <8 x float> @lgc.cooperative.matrix.insert.v8f32.v8f32.f16.i32.i32.i32(<8 x float>, half, i32, i32, i32) +declare i32 @lgc.cooperative.matrix.length__i32(...) +declare half @lgc.cooperative.matrix.extract__f16(...) +declare <8 x float> @lgc.cooperative.matrix.insert__v8f32(...) !llpc.compute.mode = !{!0} !lgc.client = !{!1} diff --git a/lgc/test/Transforms/LowerCooperativeMatrix/lit.local.cfg b/lgc/test/Transforms/LowerCooperativeMatrix/lit.local.cfg deleted file mode 100644 index a4266bc874..0000000000 --- a/lgc/test/Transforms/LowerCooperativeMatrix/lit.local.cfg +++ /dev/null @@ -1,2 +0,0 @@ -if "vki_cooperative_matrix" not in config.available_features: - config.unsupported = True diff --git a/lgc/test/Transforms/LowerCooperativeMatrix/load-wave64.lgc b/lgc/test/Transforms/LowerCooperativeMatrix/load-wave64.lgc index 44cde67243..1d245d0a41 100644 --- a/lgc/test/Transforms/LowerCooperativeMatrix/load-wave64.lgc +++ b/lgc/test/Transforms/LowerCooperativeMatrix/load-wave64.lgc @@ -8,89 +8,73 @@ define <8 x float> @test_f16_ab_layout(ptr addrspace(7) %ptr) !spirv.ExecutionMo ; CHECK-NEXT: [[TMP3:%.*]] = srem i32 [[TMP2]], 16 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 0, [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(7) [[PTR:%.*]], i32 [[TMP6]] -; CHECK-NEXT: [[A1:%.*]] = load half, ptr addrspace(7) [[TMP7]], align 2 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x half> poison, half [[A1]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr half, ptr addrspace(7) [[PTR:%.*]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = load half, ptr addrspace(7) [[TMP6]], align 2 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x half> poison, half [[TMP7]], i64 0 ; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP4]], 160 -; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP10]] -; CHECK-NEXT: [[A2:%.*]] = load half, ptr addrspace(7) [[TMP11]], align 2 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x half> [[TMP8]], half [[A2]], i64 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = load half, ptr addrspace(7) [[TMP10]], align 2 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x half> [[TMP8]], half [[TMP11]], i64 1 ; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP4]], 320 -; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP14]] -; CHECK-NEXT: [[A3:%.*]] = load half, ptr addrspace(7) [[TMP15]], align 2 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x half> [[TMP12]], half [[A3]], i64 2 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = load half, ptr addrspace(7) [[TMP14]], align 2 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x half> [[TMP12]], half [[TMP15]], i64 2 ; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP4]], 480 -; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP17]], 0 -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP18]] -; CHECK-NEXT: [[A4:%.*]] = load half, ptr addrspace(7) 
[[TMP19]], align 2 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x half> [[TMP16]], half [[A4]], i64 3 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = load half, ptr addrspace(7) [[TMP18]], align 2 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x half> [[TMP16]], half [[TMP19]], i64 3 ; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP4]], 640 -; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP22]] -; CHECK-NEXT: [[A5:%.*]] = load half, ptr addrspace(7) [[TMP23]], align 2 -; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x half> [[TMP20]], half [[A5]], i64 4 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = load half, ptr addrspace(7) [[TMP22]], align 2 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x half> [[TMP20]], half [[TMP23]], i64 4 ; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP4]], 800 -; CHECK-NEXT: [[TMP26:%.*]] = add i32 [[TMP25]], 0 -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP26]] -; CHECK-NEXT: [[A6:%.*]] = load half, ptr addrspace(7) [[TMP27]], align 2 -; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x half> [[TMP24]], half [[A6]], i64 5 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = load half, ptr addrspace(7) [[TMP26]], align 2 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x half> [[TMP24]], half [[TMP27]], i64 5 ; CHECK-NEXT: [[TMP29:%.*]] = add i32 [[TMP4]], 960 -; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP29]], 0 -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP30]] -; CHECK-NEXT: [[A7:%.*]] = load half, ptr addrspace(7) [[TMP31]], align 2 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <16 x half> [[TMP28]], half [[A7]], i64 6 +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = load half, ptr addrspace(7) [[TMP30]], align 2 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <16 x half> [[TMP28]], half [[TMP31]], i64 6 ; CHECK-NEXT: [[TMP33:%.*]] = add i32 [[TMP4]], 1120 -; CHECK-NEXT: [[TMP34:%.*]] = add i32 [[TMP33]], 0 -; CHECK-NEXT: [[TMP35:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP34]] -; CHECK-NEXT: [[A8:%.*]] = load half, ptr addrspace(7) [[TMP35]], align 2 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <16 x half> [[TMP32]], half [[A8]], i64 7 +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP33]] +; CHECK-NEXT: [[TMP35:%.*]] = load half, ptr addrspace(7) [[TMP34]], align 2 +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <16 x half> [[TMP32]], half [[TMP35]], i64 7 ; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP4]], 1280 -; CHECK-NEXT: [[TMP38:%.*]] = add i32 [[TMP37]], 0 -; CHECK-NEXT: [[TMP39:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP38]] -; CHECK-NEXT: [[A9:%.*]] = load half, ptr addrspace(7) [[TMP39]], align 2 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x half> [[TMP36]], half [[A9]], i64 8 +; CHECK-NEXT: [[TMP38:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP37]] +; CHECK-NEXT: [[TMP39:%.*]] = load half, ptr addrspace(7) [[TMP38]], align 2 +; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x half> [[TMP36]], half [[TMP39]], i64 8 ; CHECK-NEXT: [[TMP41:%.*]] = add i32 [[TMP4]], 1440 -; CHECK-NEXT: [[TMP42:%.*]] = add i32 [[TMP41]], 0 -; 
CHECK-NEXT: [[TMP43:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP42]] -; CHECK-NEXT: [[A10:%.*]] = load half, ptr addrspace(7) [[TMP43]], align 2 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <16 x half> [[TMP40]], half [[A10]], i64 9 +; CHECK-NEXT: [[TMP42:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP41]] +; CHECK-NEXT: [[TMP43:%.*]] = load half, ptr addrspace(7) [[TMP42]], align 2 +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <16 x half> [[TMP40]], half [[TMP43]], i64 9 ; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP4]], 1600 -; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP45]], 0 -; CHECK-NEXT: [[TMP47:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP46]] -; CHECK-NEXT: [[A11:%.*]] = load half, ptr addrspace(7) [[TMP47]], align 2 -; CHECK-NEXT: [[TMP48:%.*]] = insertelement <16 x half> [[TMP44]], half [[A11]], i64 10 +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP45]] +; CHECK-NEXT: [[TMP47:%.*]] = load half, ptr addrspace(7) [[TMP46]], align 2 +; CHECK-NEXT: [[TMP48:%.*]] = insertelement <16 x half> [[TMP44]], half [[TMP47]], i64 10 ; CHECK-NEXT: [[TMP49:%.*]] = add i32 [[TMP4]], 1760 -; CHECK-NEXT: [[TMP50:%.*]] = add i32 [[TMP49]], 0 -; CHECK-NEXT: [[TMP51:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP50]] -; CHECK-NEXT: [[A12:%.*]] = load half, ptr addrspace(7) [[TMP51]], align 2 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <16 x half> [[TMP48]], half [[A12]], i64 11 +; CHECK-NEXT: [[TMP50:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP49]] +; CHECK-NEXT: [[TMP51:%.*]] = load half, ptr addrspace(7) [[TMP50]], align 2 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <16 x half> [[TMP48]], half [[TMP51]], i64 11 ; CHECK-NEXT: [[TMP53:%.*]] = add i32 [[TMP4]], 1920 -; CHECK-NEXT: [[TMP54:%.*]] = add i32 [[TMP53]], 0 -; CHECK-NEXT: [[TMP55:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP54]] -; CHECK-NEXT: [[A13:%.*]] = load half, ptr addrspace(7) [[TMP55]], align 2 -; CHECK-NEXT: [[TMP56:%.*]] = insertelement <16 x half> [[TMP52]], half [[A13]], i64 12 +; CHECK-NEXT: [[TMP54:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP53]] +; CHECK-NEXT: [[TMP55:%.*]] = load half, ptr addrspace(7) [[TMP54]], align 2 +; CHECK-NEXT: [[TMP56:%.*]] = insertelement <16 x half> [[TMP52]], half [[TMP55]], i64 12 ; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP4]], 2080 -; CHECK-NEXT: [[TMP58:%.*]] = add i32 [[TMP57]], 0 -; CHECK-NEXT: [[TMP59:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP58]] -; CHECK-NEXT: [[A14:%.*]] = load half, ptr addrspace(7) [[TMP59]], align 2 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <16 x half> [[TMP56]], half [[A14]], i64 13 +; CHECK-NEXT: [[TMP58:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP57]] +; CHECK-NEXT: [[TMP59:%.*]] = load half, ptr addrspace(7) [[TMP58]], align 2 +; CHECK-NEXT: [[TMP60:%.*]] = insertelement <16 x half> [[TMP56]], half [[TMP59]], i64 13 ; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP4]], 2240 -; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP61]], 0 -; CHECK-NEXT: [[TMP63:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP62]] -; CHECK-NEXT: [[A15:%.*]] = load half, ptr addrspace(7) [[TMP63]], align 2 -; CHECK-NEXT: [[TMP64:%.*]] = insertelement <16 x half> [[TMP60]], half [[A15]], i64 14 +; CHECK-NEXT: [[TMP62:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP61]] +; CHECK-NEXT: [[TMP63:%.*]] = load half, ptr addrspace(7) [[TMP62]], align 2 +; CHECK-NEXT: [[TMP64:%.*]] 
= insertelement <16 x half> [[TMP60]], half [[TMP63]], i64 14 ; CHECK-NEXT: [[TMP65:%.*]] = add i32 [[TMP4]], 2400 -; CHECK-NEXT: [[TMP66:%.*]] = add i32 [[TMP65]], 0 -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP66]] -; CHECK-NEXT: [[A16:%.*]] = load half, ptr addrspace(7) [[TMP67]], align 2 -; CHECK-NEXT: [[TMP68:%.*]] = insertelement <16 x half> [[TMP64]], half [[A16]], i64 15 +; CHECK-NEXT: [[TMP66:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP65]] +; CHECK-NEXT: [[TMP67:%.*]] = load half, ptr addrspace(7) [[TMP66]], align 2 +; CHECK-NEXT: [[TMP68:%.*]] = insertelement <16 x half> [[TMP64]], half [[TMP67]], i64 15 ; CHECK-NEXT: [[TMP69:%.*]] = bitcast <16 x half> [[TMP68]] to <8 x float> ; CHECK-NEXT: ret <8 x float> [[TMP69]] ; - %a = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p7.i32.i1.i32.i32.i32(ptr addrspace(7) %ptr, i32 320, i1 false, i32 1, i32 0, i32 0) + %a = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) %ptr, i32 320, i1 false, i32 1, i32 0, i32 0, i32 16) ret <8 x float> %a } @@ -103,30 +87,26 @@ define <8 x float> @test_f16_cd_layout(ptr addrspace(7) %ptr) !spirv.ExecutionMo ; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], 160 ; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], [[TMP3]] ; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr half, ptr addrspace(7) [[PTR:%.*]], i32 [[TMP8]] -; CHECK-NEXT: [[A1:%.*]] = load half, ptr addrspace(7) [[TMP9]], align 2 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x half> poison, half [[A1]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr half, ptr addrspace(7) [[PTR:%.*]], i32 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = load half, ptr addrspace(7) [[TMP8]], align 2 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x half> poison, half [[TMP9]], i64 0 ; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP6]], 640 -; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP12]] -; CHECK-NEXT: [[A2:%.*]] = load half, ptr addrspace(7) [[TMP13]], align 2 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x half> [[TMP10]], half [[A2]], i64 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load half, ptr addrspace(7) [[TMP12]], align 2 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x half> [[TMP10]], half [[TMP13]], i64 1 ; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], 1280 -; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 0 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP16]] -; CHECK-NEXT: [[A3:%.*]] = load half, ptr addrspace(7) [[TMP17]], align 2 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x half> [[TMP14]], half [[A3]], i64 2 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = load half, ptr addrspace(7) [[TMP16]], align 2 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x half> [[TMP14]], half [[TMP17]], i64 2 ; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP6]], 1920 -; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 0 -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP20]] -; CHECK-NEXT: [[A4:%.*]] = load half, ptr addrspace(7) [[TMP21]], align 2 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x half> [[TMP18]], half [[A4]], i64 3 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr half, ptr 
addrspace(7) [[PTR]], i32 [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = load half, ptr addrspace(7) [[TMP20]], align 2 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x half> [[TMP18]], half [[TMP21]], i64 3 ; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x half> [[TMP22]], <4 x half> poison, <16 x i32> ; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x half> [[TMP23]] to <8 x float> ; CHECK-NEXT: ret <8 x float> [[TMP24]] ; - %a = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p7.i32.i1.i32.i32.i32(ptr addrspace(7) %ptr, i32 320, i1 false, i32 1, i32 1, i32 0) + %a = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) %ptr, i32 320, i1 false, i32 1, i32 1, i32 0, i32 16) ret <8 x float> %a } @@ -137,89 +117,73 @@ define <8 x i32> @test_i16_ab_layout(ptr addrspace(7) %ptr) !spirv.ExecutionMode ; CHECK-NEXT: [[TMP3:%.*]] = srem i32 [[TMP2]], 16 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 0, [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR:%.*]], i32 [[TMP6]] -; CHECK-NEXT: [[A1:%.*]] = load i16, ptr addrspace(7) [[TMP7]], align 2 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i16> poison, i16 [[A1]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR:%.*]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr addrspace(7) [[TMP6]], align 2 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i16> poison, i16 [[TMP7]], i64 0 ; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP4]], 160 -; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP10]] -; CHECK-NEXT: [[A2:%.*]] = load i16, ptr addrspace(7) [[TMP11]], align 2 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i16> [[TMP8]], i16 [[A2]], i64 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = load i16, ptr addrspace(7) [[TMP10]], align 2 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i16> [[TMP8]], i16 [[TMP11]], i64 1 ; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP4]], 320 -; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP14]] -; CHECK-NEXT: [[A3:%.*]] = load i16, ptr addrspace(7) [[TMP15]], align 2 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x i16> [[TMP12]], i16 [[A3]], i64 2 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = load i16, ptr addrspace(7) [[TMP14]], align 2 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x i16> [[TMP12]], i16 [[TMP15]], i64 2 ; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP4]], 480 -; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP17]], 0 -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP18]] -; CHECK-NEXT: [[A4:%.*]] = load i16, ptr addrspace(7) [[TMP19]], align 2 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x i16> [[TMP16]], i16 [[A4]], i64 3 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = load i16, ptr addrspace(7) [[TMP18]], align 2 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x i16> [[TMP16]], i16 [[TMP19]], i64 3 ; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP4]], 640 -; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP22]] -; CHECK-NEXT: [[A5:%.*]] 
= load i16, ptr addrspace(7) [[TMP23]], align 2 -; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x i16> [[TMP20]], i16 [[A5]], i64 4 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = load i16, ptr addrspace(7) [[TMP22]], align 2 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x i16> [[TMP20]], i16 [[TMP23]], i64 4 ; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP4]], 800 -; CHECK-NEXT: [[TMP26:%.*]] = add i32 [[TMP25]], 0 -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP26]] -; CHECK-NEXT: [[A6:%.*]] = load i16, ptr addrspace(7) [[TMP27]], align 2 -; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i16> [[TMP24]], i16 [[A6]], i64 5 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = load i16, ptr addrspace(7) [[TMP26]], align 2 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i16> [[TMP24]], i16 [[TMP27]], i64 5 ; CHECK-NEXT: [[TMP29:%.*]] = add i32 [[TMP4]], 960 -; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP29]], 0 -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP30]] -; CHECK-NEXT: [[A7:%.*]] = load i16, ptr addrspace(7) [[TMP31]], align 2 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <16 x i16> [[TMP28]], i16 [[A7]], i64 6 +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = load i16, ptr addrspace(7) [[TMP30]], align 2 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <16 x i16> [[TMP28]], i16 [[TMP31]], i64 6 ; CHECK-NEXT: [[TMP33:%.*]] = add i32 [[TMP4]], 1120 -; CHECK-NEXT: [[TMP34:%.*]] = add i32 [[TMP33]], 0 -; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP34]] -; CHECK-NEXT: [[A8:%.*]] = load i16, ptr addrspace(7) [[TMP35]], align 2 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <16 x i16> [[TMP32]], i16 [[A8]], i64 7 +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP33]] +; CHECK-NEXT: [[TMP35:%.*]] = load i16, ptr addrspace(7) [[TMP34]], align 2 +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <16 x i16> [[TMP32]], i16 [[TMP35]], i64 7 ; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP4]], 1280 -; CHECK-NEXT: [[TMP38:%.*]] = add i32 [[TMP37]], 0 -; CHECK-NEXT: [[TMP39:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP38]] -; CHECK-NEXT: [[A9:%.*]] = load i16, ptr addrspace(7) [[TMP39]], align 2 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x i16> [[TMP36]], i16 [[A9]], i64 8 +; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP37]] +; CHECK-NEXT: [[TMP39:%.*]] = load i16, ptr addrspace(7) [[TMP38]], align 2 +; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x i16> [[TMP36]], i16 [[TMP39]], i64 8 ; CHECK-NEXT: [[TMP41:%.*]] = add i32 [[TMP4]], 1440 -; CHECK-NEXT: [[TMP42:%.*]] = add i32 [[TMP41]], 0 -; CHECK-NEXT: [[TMP43:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP42]] -; CHECK-NEXT: [[A10:%.*]] = load i16, ptr addrspace(7) [[TMP43]], align 2 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <16 x i16> [[TMP40]], i16 [[A10]], i64 9 +; CHECK-NEXT: [[TMP42:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP41]] +; CHECK-NEXT: [[TMP43:%.*]] = load i16, ptr addrspace(7) [[TMP42]], align 2 +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <16 x i16> [[TMP40]], i16 [[TMP43]], i64 9 ; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP4]], 1600 -; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP45]], 0 -; CHECK-NEXT: 
[[TMP47:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP46]] -; CHECK-NEXT: [[A11:%.*]] = load i16, ptr addrspace(7) [[TMP47]], align 2 -; CHECK-NEXT: [[TMP48:%.*]] = insertelement <16 x i16> [[TMP44]], i16 [[A11]], i64 10 +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP45]] +; CHECK-NEXT: [[TMP47:%.*]] = load i16, ptr addrspace(7) [[TMP46]], align 2 +; CHECK-NEXT: [[TMP48:%.*]] = insertelement <16 x i16> [[TMP44]], i16 [[TMP47]], i64 10 ; CHECK-NEXT: [[TMP49:%.*]] = add i32 [[TMP4]], 1760 -; CHECK-NEXT: [[TMP50:%.*]] = add i32 [[TMP49]], 0 -; CHECK-NEXT: [[TMP51:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP50]] -; CHECK-NEXT: [[A12:%.*]] = load i16, ptr addrspace(7) [[TMP51]], align 2 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <16 x i16> [[TMP48]], i16 [[A12]], i64 11 +; CHECK-NEXT: [[TMP50:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP49]] +; CHECK-NEXT: [[TMP51:%.*]] = load i16, ptr addrspace(7) [[TMP50]], align 2 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <16 x i16> [[TMP48]], i16 [[TMP51]], i64 11 ; CHECK-NEXT: [[TMP53:%.*]] = add i32 [[TMP4]], 1920 -; CHECK-NEXT: [[TMP54:%.*]] = add i32 [[TMP53]], 0 -; CHECK-NEXT: [[TMP55:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP54]] -; CHECK-NEXT: [[A13:%.*]] = load i16, ptr addrspace(7) [[TMP55]], align 2 -; CHECK-NEXT: [[TMP56:%.*]] = insertelement <16 x i16> [[TMP52]], i16 [[A13]], i64 12 +; CHECK-NEXT: [[TMP54:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP53]] +; CHECK-NEXT: [[TMP55:%.*]] = load i16, ptr addrspace(7) [[TMP54]], align 2 +; CHECK-NEXT: [[TMP56:%.*]] = insertelement <16 x i16> [[TMP52]], i16 [[TMP55]], i64 12 ; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP4]], 2080 -; CHECK-NEXT: [[TMP58:%.*]] = add i32 [[TMP57]], 0 -; CHECK-NEXT: [[TMP59:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP58]] -; CHECK-NEXT: [[A14:%.*]] = load i16, ptr addrspace(7) [[TMP59]], align 2 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <16 x i16> [[TMP56]], i16 [[A14]], i64 13 +; CHECK-NEXT: [[TMP58:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP57]] +; CHECK-NEXT: [[TMP59:%.*]] = load i16, ptr addrspace(7) [[TMP58]], align 2 +; CHECK-NEXT: [[TMP60:%.*]] = insertelement <16 x i16> [[TMP56]], i16 [[TMP59]], i64 13 ; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP4]], 2240 -; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP61]], 0 -; CHECK-NEXT: [[TMP63:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP62]] -; CHECK-NEXT: [[A15:%.*]] = load i16, ptr addrspace(7) [[TMP63]], align 2 -; CHECK-NEXT: [[TMP64:%.*]] = insertelement <16 x i16> [[TMP60]], i16 [[A15]], i64 14 +; CHECK-NEXT: [[TMP62:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP61]] +; CHECK-NEXT: [[TMP63:%.*]] = load i16, ptr addrspace(7) [[TMP62]], align 2 +; CHECK-NEXT: [[TMP64:%.*]] = insertelement <16 x i16> [[TMP60]], i16 [[TMP63]], i64 14 ; CHECK-NEXT: [[TMP65:%.*]] = add i32 [[TMP4]], 2400 -; CHECK-NEXT: [[TMP66:%.*]] = add i32 [[TMP65]], 0 -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP66]] -; CHECK-NEXT: [[A16:%.*]] = load i16, ptr addrspace(7) [[TMP67]], align 2 -; CHECK-NEXT: [[TMP68:%.*]] = insertelement <16 x i16> [[TMP64]], i16 [[A16]], i64 15 +; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP65]] +; CHECK-NEXT: [[TMP67:%.*]] = load i16, ptr addrspace(7) [[TMP66]], align 2 +; CHECK-NEXT: [[TMP68:%.*]] = insertelement <16 x i16> [[TMP64]], i16 [[TMP67]], i64 
15 ; CHECK-NEXT: [[TMP69:%.*]] = bitcast <16 x i16> [[TMP68]] to <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP69]] ; - %a = call <8 x i32> @lgc.cooperative.matrix.load.v8i32.p7.i32.i1.i32.i32.i32(ptr addrspace(7) %ptr, i32 320, i1 false, i32 4, i32 0, i32 0) + %a = call <8 x i32> (...) @lgc.cooperative.matrix.load__v8i32(ptr addrspace(7) %ptr, i32 320, i1 false, i32 4, i32 0, i32 0, i32 16) ret <8 x i32> %a } @@ -232,30 +196,26 @@ define <8 x i32> @test_i16_cd_layout(ptr addrspace(7) %ptr) !spirv.ExecutionMode ; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], 160 ; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], [[TMP3]] ; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR:%.*]], i32 [[TMP8]] -; CHECK-NEXT: [[A1:%.*]] = load i16, ptr addrspace(7) [[TMP9]], align 2 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i16> poison, i16 [[A1]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR:%.*]], i32 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = load i16, ptr addrspace(7) [[TMP8]], align 2 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i16> poison, i16 [[TMP9]], i64 0 ; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP6]], 640 -; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP12]] -; CHECK-NEXT: [[A2:%.*]] = load i16, ptr addrspace(7) [[TMP13]], align 2 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i16> [[TMP10]], i16 [[A2]], i64 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load i16, ptr addrspace(7) [[TMP12]], align 2 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i16> [[TMP10]], i16 [[TMP13]], i64 1 ; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], 1280 -; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 0 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP16]] -; CHECK-NEXT: [[A3:%.*]] = load i16, ptr addrspace(7) [[TMP17]], align 2 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i16> [[TMP14]], i16 [[A3]], i64 2 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = load i16, ptr addrspace(7) [[TMP16]], align 2 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i16> [[TMP14]], i16 [[TMP17]], i64 2 ; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP6]], 1920 -; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 0 -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP20]] -; CHECK-NEXT: [[A4:%.*]] = load i16, ptr addrspace(7) [[TMP21]], align 2 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i16> [[TMP18]], i16 [[A4]], i64 3 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = load i16, ptr addrspace(7) [[TMP20]], align 2 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i16> [[TMP18]], i16 [[TMP21]], i64 3 ; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i16> [[TMP22]], <4 x i16> poison, <16 x i32> ; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i16> [[TMP23]] to <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP24]] ; - %a = call <8 x i32> @lgc.cooperative.matrix.load.v8i32.p7.i32.i1.i32.i32.i32(ptr addrspace(7) %ptr, i32 320, i1 false, i32 4, i32 1, i32 0) + %a = call <8 x i32> (...) 
@lgc.cooperative.matrix.load__v8i32(ptr addrspace(7) %ptr, i32 320, i1 false, i32 4, i32 1, i32 0, i32 16) ret <8 x i32> %a } @@ -268,29 +228,25 @@ define <8 x float> @test_f32_cd_layout(ptr addrspace(7) %ptr) !spirv.ExecutionMo ; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], 160 ; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], [[TMP3]] ; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr float, ptr addrspace(7) [[PTR:%.*]], i32 [[TMP8]] -; CHECK-NEXT: [[A1:%.*]] = load float, ptr addrspace(7) [[TMP9]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> poison, float [[A1]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, ptr addrspace(7) [[PTR:%.*]], i32 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(7) [[TMP8]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> poison, float [[TMP9]], i64 0 ; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP6]], 640 -; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr float, ptr addrspace(7) [[PTR]], i32 [[TMP12]] -; CHECK-NEXT: [[A2:%.*]] = load float, ptr addrspace(7) [[TMP13]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP10]], float [[A2]], i64 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr float, ptr addrspace(7) [[PTR]], i32 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr addrspace(7) [[TMP12]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP13]], i64 1 ; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], 1280 -; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 0 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr float, ptr addrspace(7) [[PTR]], i32 [[TMP16]] -; CHECK-NEXT: [[A3:%.*]] = load float, ptr addrspace(7) [[TMP17]], align 4 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP14]], float [[A3]], i64 2 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr float, ptr addrspace(7) [[PTR]], i32 [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr addrspace(7) [[TMP16]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP17]], i64 2 ; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP6]], 1920 -; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 0 -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr float, ptr addrspace(7) [[PTR]], i32 [[TMP20]] -; CHECK-NEXT: [[A4:%.*]] = load float, ptr addrspace(7) [[TMP21]], align 4 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP18]], float [[A4]], i64 3 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr float, ptr addrspace(7) [[PTR]], i32 [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = load float, ptr addrspace(7) [[TMP20]], align 4 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP18]], float [[TMP21]], i64 3 ; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x float> [[TMP22]], <4 x float> poison, <8 x i32> ; CHECK-NEXT: ret <8 x float> [[TMP23]] ; - %a = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p7.i32.i1.i32.i32.i32(ptr addrspace(7) %ptr, i32 640, i1 false, i32 2, i32 1, i32 0) + %a = call <8 x float> (...) 
@lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) %ptr, i32 640, i1 false, i32 2, i32 1, i32 0, i32 16) ret <8 x float> %a } @@ -303,34 +259,30 @@ define <8 x i32> @test_i32_cd_layout(ptr addrspace(7) %ptr) !spirv.ExecutionMode ; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], 160 ; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], [[TMP3]] ; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr addrspace(7) [[PTR:%.*]], i32 [[TMP8]] -; CHECK-NEXT: [[A1:%.*]] = load i32, ptr addrspace(7) [[TMP9]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[A1]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr addrspace(7) [[PTR:%.*]], i32 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(7) [[TMP8]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP9]], i64 0 ; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP6]], 640 -; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr addrspace(7) [[PTR]], i32 [[TMP12]] -; CHECK-NEXT: [[A2:%.*]] = load i32, ptr addrspace(7) [[TMP13]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[A2]], i64 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr addrspace(7) [[PTR]], i32 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(7) [[TMP12]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP13]], i64 1 ; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], 1280 -; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 0 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr addrspace(7) [[PTR]], i32 [[TMP16]] -; CHECK-NEXT: [[A3:%.*]] = load i32, ptr addrspace(7) [[TMP17]], align 4 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[A3]], i64 2 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr addrspace(7) [[PTR]], i32 [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(7) [[TMP16]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP17]], i64 2 ; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP6]], 1920 -; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 0 -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr addrspace(7) [[PTR]], i32 [[TMP20]] -; CHECK-NEXT: [[A4:%.*]] = load i32, ptr addrspace(7) [[TMP21]], align 4 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[A4]], i64 3 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr addrspace(7) [[PTR]], i32 [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(7) [[TMP20]], align 4 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP21]], i64 3 ; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i32> [[TMP22]], <4 x i32> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP23]] ; - %a = call <8 x i32> @lgc.cooperative.matrix.load.v8i32.p7.i32.i1.i32.i32.i32(ptr addrspace(7) %ptr, i32 640, i1 false, i32 5, i32 1, i32 0) + %a = call <8 x i32> (...) @lgc.cooperative.matrix.load__v8i32(ptr addrspace(7) %ptr, i32 640, i1 false, i32 5, i32 1, i32 0, i32 16) ret <8 x i32> %a } -declare <8 x float> @lgc.cooperative.matrix.load.v8f32.p7.i32.i1.i32.i32.i32(ptr addrspace(7), i32, i1, i32, i32, i32) -declare <8 x i32> @lgc.cooperative.matrix.load.v8i32.p7.i32.i1.i32.i32.i32(ptr addrspace(7), i32, i1, i32, i32, i32) +declare <8 x float> @lgc.cooperative.matrix.load__v8f32(...) +declare <8 x i32> @lgc.cooperative.matrix.load__v8i32(...) 
!llpc.compute.mode = !{!0} !lgc.client = !{!1} diff --git a/lgc/test/Transforms/LowerCooperativeMatrix/packed-accumulators-wave64.lgc b/lgc/test/Transforms/LowerCooperativeMatrix/packed-accumulators-wave64.lgc index 3639dd82b6..fa424765bf 100644 --- a/lgc/test/Transforms/LowerCooperativeMatrix/packed-accumulators-wave64.lgc +++ b/lgc/test/Transforms/LowerCooperativeMatrix/packed-accumulators-wave64.lgc @@ -3,7 +3,7 @@ define <8 x float> @test_pack_f16(<8 x float> %a, <8 x float> %b) !spirv.ExecutionModel !8 !lgc.shaderstage !9 { ; CHECK-LABEL: define <8 x float> @test_pack_f16 -; CHECK-SAME: (<8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) !spirv.ExecutionModel !5 !lgc.shaderstage !6 { +; CHECK-SAME: (<8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) !spirv.ExecutionModel !5 !lgc.shaderstage [[META6:![0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x float> [[A]] to <16 x half> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float> [[B]] to <16 x half> @@ -12,13 +12,13 @@ define <8 x float> @test_pack_f16(<8 x float> %a, <8 x float> %b) !spirv.Executi ; CHECK-NEXT: ret <8 x float> [[TMP3]] ; entry: - %r = call <8 x float > @lgc.cooperative.matrix.pack.v8f32.v8f32(<8 x float> %a, <8 x float> %b) + %r = call <8 x float > (...) @lgc.cooperative.matrix.pack__v8f32(<8 x float> %a, <8 x float> %b) ret <8 x float> %r } define <8 x float> @test_unpack_lo(<8 x float> %packed) !spirv.ExecutionModel !8 !lgc.shaderstage !9 { ; CHECK-LABEL: define <8 x float> @test_unpack_lo -; CHECK-SAME: (<8 x float> [[PACKED:%.*]]) !spirv.ExecutionModel !5 !lgc.shaderstage !6 { +; CHECK-SAME: (<8 x float> [[PACKED:%.*]]) !spirv.ExecutionModel !5 !lgc.shaderstage [[META6]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x float> [[PACKED]] to <16 x half> ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x half> [[TMP0]], <16 x half> poison, <16 x i32> @@ -26,13 +26,13 @@ define <8 x float> @test_unpack_lo(<8 x float> %packed) !spirv.ExecutionModel !8 ; CHECK-NEXT: ret <8 x float> [[TMP2]] ; entry: - %r = call <8 x float >@lgc.cooperative.matrix.unpack.v8f32.i1(<8 x float> %packed, i1 false) + %r = call <8 x float > (...) @lgc.cooperative.matrix.unpack__v8f32(<8 x float> %packed, i1 false) ret <8 x float> %r } define <8 x float> @test_unpack_hi(<8 x float> %packed) !spirv.ExecutionModel !8 !lgc.shaderstage !9 { ; CHECK-LABEL: define <8 x float> @test_unpack_hi -; CHECK-SAME: (<8 x float> [[PACKED:%.*]]) !spirv.ExecutionModel !5 !lgc.shaderstage !6 { +; CHECK-SAME: (<8 x float> [[PACKED:%.*]]) !spirv.ExecutionModel !5 !lgc.shaderstage [[META6]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x float> [[PACKED]] to <16 x half> ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x half> [[TMP0]], <16 x half> poison, <16 x i32> @@ -40,13 +40,13 @@ define <8 x float> @test_unpack_hi(<8 x float> %packed) !spirv.ExecutionModel !8 ; CHECK-NEXT: ret <8 x float> [[TMP2]] ; entry: - %r = call <8 x float >@lgc.cooperative.matrix.unpack.v8f32.i1(<8 x float> %packed, i1 true) + %r = call <8 x float > (...) 
@lgc.cooperative.matrix.unpack__v8f32(<8 x float> %packed, i1 true) ret <8 x float> %r } define <8 x float> @test_packed_times_scalar(<8 x float> %packed, <2 x half> %scalar) !spirv.ExecutionModel !8 !lgc.shaderstage !9 { ; CHECK-LABEL: define <8 x float> @test_packed_times_scalar -; CHECK-SAME: (<8 x float> [[PACKED:%.*]], <2 x half> [[SCALAR:%.*]]) !spirv.ExecutionModel !5 !lgc.shaderstage !6 { +; CHECK-SAME: (<8 x float> [[PACKED:%.*]], <2 x half> [[SCALAR:%.*]]) !spirv.ExecutionModel !5 !lgc.shaderstage [[META6]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x float> [[PACKED]] to <16 x half> ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x half> [[TMP0]], <16 x half> poison, <8 x i32> @@ -57,13 +57,13 @@ define <8 x float> @test_packed_times_scalar(<8 x float> %packed, <2 x half> %sc ; CHECK-NEXT: ret <8 x float> [[TMP5]] ; entry: - %r = call <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.v2f16.i32.i32(<8 x float> %packed, <2 x half> %scalar, i32 6, i32 1) + %r = call <8 x float> (...) @lgc.cooperative.matrix.times.scalar__v8f32(<8 x float> %packed, <2 x half> %scalar, i32 6, i32 1) ret <8 x float> %r } -declare <8 x float> @lgc.cooperative.matrix.pack.v8f32.v8f32(<8 x float>, <8 x float>) -declare <8 x float> @lgc.cooperative.matrix.unpack.v8f32.i1(<8 x float>, i1) -declare <8 x float> @lgc.cooperative.matrix.times.scalar.v8f32.v8f32.v2f16.i32.i32(<8 x float>, <2 x half>, i32, i32) +declare <8 x float> @lgc.cooperative.matrix.pack__v8f32(...) +declare <8 x float> @lgc.cooperative.matrix.unpack__v8f32(...) +declare <8 x float> @lgc.cooperative.matrix.times.scalar__v8f32(...) !llpc.compute.mode = !{!0} !lgc.client = !{!1} diff --git a/lgc/test/Transforms/LowerCooperativeMatrix/store-wave64.lgc b/lgc/test/Transforms/LowerCooperativeMatrix/store-wave64.lgc index 798c807644..64c888a45a 100644 --- a/lgc/test/Transforms/LowerCooperativeMatrix/store-wave64.lgc +++ b/lgc/test/Transforms/LowerCooperativeMatrix/store-wave64.lgc @@ -9,88 +9,72 @@ define void @test_f16_ab_layout(ptr addrspace(7) %ptr, <8 x float> %a) !spirv.Ex ; CHECK-NEXT: [[TMP4:%.*]] = add i32 0, [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x float> [[A:%.*]] to <16 x half> ; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP4]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr half, ptr addrspace(7) [[PTR:%.*]], i32 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x half> [[TMP5]], i64 0 -; CHECK-NEXT: store half [[TMP9]], ptr addrspace(7) [[TMP8]], align 2 -; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP4]], 160 -; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x half> [[TMP5]], i64 1 -; CHECK-NEXT: store half [[TMP13]], ptr addrspace(7) [[TMP12]], align 2 -; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP4]], 320 -; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr half, ptr addrspace(7) [[PTR:%.*]], i32 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x half> [[TMP5]], i64 0 +; CHECK-NEXT: store half [[TMP8]], ptr addrspace(7) [[TMP7]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP4]], 160 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x half> [[TMP5]], i64 1 +; CHECK-NEXT: store half [[TMP11]], ptr addrspace(7) [[TMP10]], align 2 +; CHECK-NEXT: [[TMP12:%.*]] = add 
i32 [[TMP4]], 320 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x half> [[TMP5]], i64 2 +; CHECK-NEXT: store half [[TMP14]], ptr addrspace(7) [[TMP13]], align 2 +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP4]], 480 ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x half> [[TMP5]], i64 2 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x half> [[TMP5]], i64 3 ; CHECK-NEXT: store half [[TMP17]], ptr addrspace(7) [[TMP16]], align 2 -; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP4]], 480 -; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 0 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <16 x half> [[TMP5]], i64 3 -; CHECK-NEXT: store half [[TMP21]], ptr addrspace(7) [[TMP20]], align 2 -; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP4]], 640 -; CHECK-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 0 -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <16 x half> [[TMP5]], i64 4 -; CHECK-NEXT: store half [[TMP25]], ptr addrspace(7) [[TMP24]], align 2 -; CHECK-NEXT: [[TMP26:%.*]] = add i32 [[TMP4]], 800 -; CHECK-NEXT: [[TMP27:%.*]] = add i32 [[TMP26]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP4]], 640 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <16 x half> [[TMP5]], i64 4 +; CHECK-NEXT: store half [[TMP20]], ptr addrspace(7) [[TMP19]], align 2 +; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP4]], 800 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <16 x half> [[TMP5]], i64 5 +; CHECK-NEXT: store half [[TMP23]], ptr addrspace(7) [[TMP22]], align 2 +; CHECK-NEXT: [[TMP24:%.*]] = add i32 [[TMP4]], 960 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x half> [[TMP5]], i64 6 +; CHECK-NEXT: store half [[TMP26]], ptr addrspace(7) [[TMP25]], align 2 +; CHECK-NEXT: [[TMP27:%.*]] = add i32 [[TMP4]], 1120 ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP27]] -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x half> [[TMP5]], i64 5 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x half> [[TMP5]], i64 7 ; CHECK-NEXT: store half [[TMP29]], ptr addrspace(7) [[TMP28]], align 2 -; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP4]], 960 -; CHECK-NEXT: [[TMP31:%.*]] = add i32 [[TMP30]], 0 -; CHECK-NEXT: [[TMP32:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP31]] -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x half> [[TMP5]], i64 6 -; CHECK-NEXT: store half [[TMP33]], ptr addrspace(7) [[TMP32]], align 2 -; CHECK-NEXT: [[TMP34:%.*]] = add i32 [[TMP4]], 1120 -; CHECK-NEXT: [[TMP35:%.*]] = add i32 [[TMP34]], 0 -; CHECK-NEXT: [[TMP36:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP35]] -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x half> [[TMP5]], i64 7 -; CHECK-NEXT: store half [[TMP37]], ptr addrspace(7) [[TMP36]], align 2 -; CHECK-NEXT: [[TMP38:%.*]] = add i32 [[TMP4]], 1280 -; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP38]], 0 +; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP4]], 1280 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr half, ptr 
addrspace(7) [[PTR]], i32 [[TMP30]] +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x half> [[TMP5]], i64 8 +; CHECK-NEXT: store half [[TMP32]], ptr addrspace(7) [[TMP31]], align 2 +; CHECK-NEXT: [[TMP33:%.*]] = add i32 [[TMP4]], 1440 +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP33]] +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x half> [[TMP5]], i64 9 +; CHECK-NEXT: store half [[TMP35]], ptr addrspace(7) [[TMP34]], align 2 +; CHECK-NEXT: [[TMP36:%.*]] = add i32 [[TMP4]], 1600 +; CHECK-NEXT: [[TMP37:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP36]] +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x half> [[TMP5]], i64 10 +; CHECK-NEXT: store half [[TMP38]], ptr addrspace(7) [[TMP37]], align 2 +; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP4]], 1760 ; CHECK-NEXT: [[TMP40:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP39]] -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <16 x half> [[TMP5]], i64 8 +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <16 x half> [[TMP5]], i64 11 ; CHECK-NEXT: store half [[TMP41]], ptr addrspace(7) [[TMP40]], align 2 -; CHECK-NEXT: [[TMP42:%.*]] = add i32 [[TMP4]], 1440 -; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP42]], 0 -; CHECK-NEXT: [[TMP44:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP43]] -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <16 x half> [[TMP5]], i64 9 -; CHECK-NEXT: store half [[TMP45]], ptr addrspace(7) [[TMP44]], align 2 -; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP4]], 1600 -; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP46]], 0 -; CHECK-NEXT: [[TMP48:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP47]] -; CHECK-NEXT: [[TMP49:%.*]] = extractelement <16 x half> [[TMP5]], i64 10 -; CHECK-NEXT: store half [[TMP49]], ptr addrspace(7) [[TMP48]], align 2 -; CHECK-NEXT: [[TMP50:%.*]] = add i32 [[TMP4]], 1760 -; CHECK-NEXT: [[TMP51:%.*]] = add i32 [[TMP50]], 0 +; CHECK-NEXT: [[TMP42:%.*]] = add i32 [[TMP4]], 1920 +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP42]] +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <16 x half> [[TMP5]], i64 12 +; CHECK-NEXT: store half [[TMP44]], ptr addrspace(7) [[TMP43]], align 2 +; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP4]], 2080 +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP45]] +; CHECK-NEXT: [[TMP47:%.*]] = extractelement <16 x half> [[TMP5]], i64 13 +; CHECK-NEXT: store half [[TMP47]], ptr addrspace(7) [[TMP46]], align 2 +; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP4]], 2240 +; CHECK-NEXT: [[TMP49:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP48]] +; CHECK-NEXT: [[TMP50:%.*]] = extractelement <16 x half> [[TMP5]], i64 14 +; CHECK-NEXT: store half [[TMP50]], ptr addrspace(7) [[TMP49]], align 2 +; CHECK-NEXT: [[TMP51:%.*]] = add i32 [[TMP4]], 2400 ; CHECK-NEXT: [[TMP52:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP51]] -; CHECK-NEXT: [[TMP53:%.*]] = extractelement <16 x half> [[TMP5]], i64 11 +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <16 x half> [[TMP5]], i64 15 ; CHECK-NEXT: store half [[TMP53]], ptr addrspace(7) [[TMP52]], align 2 -; CHECK-NEXT: [[TMP54:%.*]] = add i32 [[TMP4]], 1920 -; CHECK-NEXT: [[TMP55:%.*]] = add i32 [[TMP54]], 0 -; CHECK-NEXT: [[TMP56:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP55]] -; CHECK-NEXT: [[TMP57:%.*]] = extractelement <16 x half> [[TMP5]], i64 12 -; CHECK-NEXT: store half [[TMP57]], ptr addrspace(7) [[TMP56]], align 2 -; CHECK-NEXT: [[TMP58:%.*]] = add 
i32 [[TMP4]], 2080 -; CHECK-NEXT: [[TMP59:%.*]] = add i32 [[TMP58]], 0 -; CHECK-NEXT: [[TMP60:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP59]] -; CHECK-NEXT: [[TMP61:%.*]] = extractelement <16 x half> [[TMP5]], i64 13 -; CHECK-NEXT: store half [[TMP61]], ptr addrspace(7) [[TMP60]], align 2 -; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP4]], 2240 -; CHECK-NEXT: [[TMP63:%.*]] = add i32 [[TMP62]], 0 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP63]] -; CHECK-NEXT: [[TMP65:%.*]] = extractelement <16 x half> [[TMP5]], i64 14 -; CHECK-NEXT: store half [[TMP65]], ptr addrspace(7) [[TMP64]], align 2 -; CHECK-NEXT: [[TMP66:%.*]] = add i32 [[TMP4]], 2400 -; CHECK-NEXT: [[TMP67:%.*]] = add i32 [[TMP66]], 0 -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP67]] -; CHECK-NEXT: [[TMP69:%.*]] = extractelement <16 x half> [[TMP5]], i64 15 -; CHECK-NEXT: store half [[TMP69]], ptr addrspace(7) [[TMP68]], align 2 ; CHECK-NEXT: ret void ; - call void @lgc.cooperative.matrix.store.p7.i32.i1.i32.i32.i32.v8f32(ptr addrspace(7) %ptr, i32 320, i1 false, i32 1, i32 0, i32 0, <8 x float> %a) + call void (...) @lgc.cooperative.matrix.store(ptr addrspace(7) %ptr, i32 320, i1 false, i32 1, i32 0, i32 0, i32 16, <8 x float> %a) ret void } @@ -105,28 +89,24 @@ define void @test_f16_cd_layout(ptr addrspace(7) %ptr, <8 x float> %a) !spirv.Ex ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x float> [[A:%.*]] to <16 x half> ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x half> [[TMP7]], <16 x half> poison, <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP6]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr half, ptr addrspace(7) [[PTR:%.*]], i32 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x half> [[TMP8]], i64 0 -; CHECK-NEXT: store half [[TMP12]], ptr addrspace(7) [[TMP11]], align 2 -; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP6]], 640 -; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x half> [[TMP8]], i64 1 -; CHECK-NEXT: store half [[TMP16]], ptr addrspace(7) [[TMP15]], align 2 -; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP6]], 1280 -; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP17]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr half, ptr addrspace(7) [[PTR:%.*]], i32 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x half> [[TMP8]], i64 0 +; CHECK-NEXT: store half [[TMP11]], ptr addrspace(7) [[TMP10]], align 2 +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP6]], 640 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x half> [[TMP8]], i64 1 +; CHECK-NEXT: store half [[TMP14]], ptr addrspace(7) [[TMP13]], align 2 +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], 1280 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x half> [[TMP8]], i64 2 +; CHECK-NEXT: store half [[TMP17]], ptr addrspace(7) [[TMP16]], align 2 +; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP6]], 1920 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x half> [[TMP8]], i64 2 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x half> [[TMP8]], i64 3 ; CHECK-NEXT: store half [[TMP20]], ptr addrspace(7) [[TMP19]], 
align 2 -; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP6]], 1920 -; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr half, ptr addrspace(7) [[PTR]], i32 [[TMP22]] -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x half> [[TMP8]], i64 3 -; CHECK-NEXT: store half [[TMP24]], ptr addrspace(7) [[TMP23]], align 2 ; CHECK-NEXT: ret void ; - call void @lgc.cooperative.matrix.store.p7.i32.i1.i32.i32.i32.v8f32(ptr addrspace(7) %ptr, i32 320, i1 false, i32 1, i32 1, i32 0, <8 x float> %a) + call void (...) @lgc.cooperative.matrix.store(ptr addrspace(7) %ptr, i32 320, i1 false, i32 1, i32 1, i32 0, i32 16, <8 x float> %a) ret void } @@ -138,88 +118,72 @@ define void @test_i16_ab_layout(ptr addrspace(7) %ptr, <8 x i32> %a) !spirv.Exec ; CHECK-NEXT: [[TMP4:%.*]] = add i32 0, [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[A:%.*]] to <16 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP4]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR:%.*]], i32 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i16> [[TMP5]], i64 0 -; CHECK-NEXT: store i16 [[TMP9]], ptr addrspace(7) [[TMP8]], align 2 -; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP4]], 160 -; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x i16> [[TMP5]], i64 1 -; CHECK-NEXT: store i16 [[TMP13]], ptr addrspace(7) [[TMP12]], align 2 -; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP4]], 320 -; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR:%.*]], i32 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x i16> [[TMP5]], i64 0 +; CHECK-NEXT: store i16 [[TMP8]], ptr addrspace(7) [[TMP7]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP4]], 160 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x i16> [[TMP5]], i64 1 +; CHECK-NEXT: store i16 [[TMP11]], ptr addrspace(7) [[TMP10]], align 2 +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP4]], 320 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i16> [[TMP5]], i64 2 +; CHECK-NEXT: store i16 [[TMP14]], ptr addrspace(7) [[TMP13]], align 2 +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP4]], 480 ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x i16> [[TMP5]], i64 2 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x i16> [[TMP5]], i64 3 ; CHECK-NEXT: store i16 [[TMP17]], ptr addrspace(7) [[TMP16]], align 2 -; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP4]], 480 -; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 0 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <16 x i16> [[TMP5]], i64 3 -; CHECK-NEXT: store i16 [[TMP21]], ptr addrspace(7) [[TMP20]], align 2 -; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP4]], 640 -; CHECK-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 0 -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <16 x i16> [[TMP5]], i64 4 -; CHECK-NEXT: store i16 [[TMP25]], ptr addrspace(7) [[TMP24]], align 2 -; CHECK-NEXT: [[TMP26:%.*]] = add 
i32 [[TMP4]], 800 -; CHECK-NEXT: [[TMP27:%.*]] = add i32 [[TMP26]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP4]], 640 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <16 x i16> [[TMP5]], i64 4 +; CHECK-NEXT: store i16 [[TMP20]], ptr addrspace(7) [[TMP19]], align 2 +; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP4]], 800 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <16 x i16> [[TMP5]], i64 5 +; CHECK-NEXT: store i16 [[TMP23]], ptr addrspace(7) [[TMP22]], align 2 +; CHECK-NEXT: [[TMP24:%.*]] = add i32 [[TMP4]], 960 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i16> [[TMP5]], i64 6 +; CHECK-NEXT: store i16 [[TMP26]], ptr addrspace(7) [[TMP25]], align 2 +; CHECK-NEXT: [[TMP27:%.*]] = add i32 [[TMP4]], 1120 ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP27]] -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x i16> [[TMP5]], i64 5 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x i16> [[TMP5]], i64 7 ; CHECK-NEXT: store i16 [[TMP29]], ptr addrspace(7) [[TMP28]], align 2 -; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP4]], 960 -; CHECK-NEXT: [[TMP31:%.*]] = add i32 [[TMP30]], 0 -; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP31]] -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x i16> [[TMP5]], i64 6 -; CHECK-NEXT: store i16 [[TMP33]], ptr addrspace(7) [[TMP32]], align 2 -; CHECK-NEXT: [[TMP34:%.*]] = add i32 [[TMP4]], 1120 -; CHECK-NEXT: [[TMP35:%.*]] = add i32 [[TMP34]], 0 -; CHECK-NEXT: [[TMP36:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP35]] -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x i16> [[TMP5]], i64 7 -; CHECK-NEXT: store i16 [[TMP37]], ptr addrspace(7) [[TMP36]], align 2 -; CHECK-NEXT: [[TMP38:%.*]] = add i32 [[TMP4]], 1280 -; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP38]], 0 +; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP4]], 1280 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP30]] +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i16> [[TMP5]], i64 8 +; CHECK-NEXT: store i16 [[TMP32]], ptr addrspace(7) [[TMP31]], align 2 +; CHECK-NEXT: [[TMP33:%.*]] = add i32 [[TMP4]], 1440 +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP33]] +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x i16> [[TMP5]], i64 9 +; CHECK-NEXT: store i16 [[TMP35]], ptr addrspace(7) [[TMP34]], align 2 +; CHECK-NEXT: [[TMP36:%.*]] = add i32 [[TMP4]], 1600 +; CHECK-NEXT: [[TMP37:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP36]] +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i16> [[TMP5]], i64 10 +; CHECK-NEXT: store i16 [[TMP38]], ptr addrspace(7) [[TMP37]], align 2 +; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP4]], 1760 ; CHECK-NEXT: [[TMP40:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP39]] -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <16 x i16> [[TMP5]], i64 8 +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <16 x i16> [[TMP5]], i64 11 ; CHECK-NEXT: store i16 [[TMP41]], ptr addrspace(7) [[TMP40]], align 2 -; CHECK-NEXT: [[TMP42:%.*]] = add i32 [[TMP4]], 1440 -; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP42]], 0 -; CHECK-NEXT: [[TMP44:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP43]] -; CHECK-NEXT: [[TMP45:%.*]] = extractelement 
<16 x i16> [[TMP5]], i64 9 -; CHECK-NEXT: store i16 [[TMP45]], ptr addrspace(7) [[TMP44]], align 2 -; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP4]], 1600 -; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP46]], 0 -; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP47]] -; CHECK-NEXT: [[TMP49:%.*]] = extractelement <16 x i16> [[TMP5]], i64 10 -; CHECK-NEXT: store i16 [[TMP49]], ptr addrspace(7) [[TMP48]], align 2 -; CHECK-NEXT: [[TMP50:%.*]] = add i32 [[TMP4]], 1760 -; CHECK-NEXT: [[TMP51:%.*]] = add i32 [[TMP50]], 0 +; CHECK-NEXT: [[TMP42:%.*]] = add i32 [[TMP4]], 1920 +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP42]] +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <16 x i16> [[TMP5]], i64 12 +; CHECK-NEXT: store i16 [[TMP44]], ptr addrspace(7) [[TMP43]], align 2 +; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP4]], 2080 +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP45]] +; CHECK-NEXT: [[TMP47:%.*]] = extractelement <16 x i16> [[TMP5]], i64 13 +; CHECK-NEXT: store i16 [[TMP47]], ptr addrspace(7) [[TMP46]], align 2 +; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP4]], 2240 +; CHECK-NEXT: [[TMP49:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP48]] +; CHECK-NEXT: [[TMP50:%.*]] = extractelement <16 x i16> [[TMP5]], i64 14 +; CHECK-NEXT: store i16 [[TMP50]], ptr addrspace(7) [[TMP49]], align 2 +; CHECK-NEXT: [[TMP51:%.*]] = add i32 [[TMP4]], 2400 ; CHECK-NEXT: [[TMP52:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP51]] -; CHECK-NEXT: [[TMP53:%.*]] = extractelement <16 x i16> [[TMP5]], i64 11 +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <16 x i16> [[TMP5]], i64 15 ; CHECK-NEXT: store i16 [[TMP53]], ptr addrspace(7) [[TMP52]], align 2 -; CHECK-NEXT: [[TMP54:%.*]] = add i32 [[TMP4]], 1920 -; CHECK-NEXT: [[TMP55:%.*]] = add i32 [[TMP54]], 0 -; CHECK-NEXT: [[TMP56:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP55]] -; CHECK-NEXT: [[TMP57:%.*]] = extractelement <16 x i16> [[TMP5]], i64 12 -; CHECK-NEXT: store i16 [[TMP57]], ptr addrspace(7) [[TMP56]], align 2 -; CHECK-NEXT: [[TMP58:%.*]] = add i32 [[TMP4]], 2080 -; CHECK-NEXT: [[TMP59:%.*]] = add i32 [[TMP58]], 0 -; CHECK-NEXT: [[TMP60:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP59]] -; CHECK-NEXT: [[TMP61:%.*]] = extractelement <16 x i16> [[TMP5]], i64 13 -; CHECK-NEXT: store i16 [[TMP61]], ptr addrspace(7) [[TMP60]], align 2 -; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP4]], 2240 -; CHECK-NEXT: [[TMP63:%.*]] = add i32 [[TMP62]], 0 -; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP63]] -; CHECK-NEXT: [[TMP65:%.*]] = extractelement <16 x i16> [[TMP5]], i64 14 -; CHECK-NEXT: store i16 [[TMP65]], ptr addrspace(7) [[TMP64]], align 2 -; CHECK-NEXT: [[TMP66:%.*]] = add i32 [[TMP4]], 2400 -; CHECK-NEXT: [[TMP67:%.*]] = add i32 [[TMP66]], 0 -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP67]] -; CHECK-NEXT: [[TMP69:%.*]] = extractelement <16 x i16> [[TMP5]], i64 15 -; CHECK-NEXT: store i16 [[TMP69]], ptr addrspace(7) [[TMP68]], align 2 ; CHECK-NEXT: ret void ; - call void @lgc.cooperative.matrix.store.p7.i32.i1.i32.i32.i32.v8i32(ptr addrspace(7) %ptr, i32 320, i1 false, i32 4, i32 0, i32 0, <8 x i32> %a) + call void (...) 
@lgc.cooperative.matrix.store(ptr addrspace(7) %ptr, i32 320, i1 false, i32 4, i32 0, i32 0, i32 16, <8 x i32> %a) ret void } @@ -234,28 +198,24 @@ define void @test_i16_cd_layout(ptr addrspace(7) %ptr, <8 x i32> %a) !spirv.Exec ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[A:%.*]] to <16 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i16> [[TMP7]], <16 x i16> poison, <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP6]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR:%.*]], i32 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[TMP8]], i64 0 -; CHECK-NEXT: store i16 [[TMP12]], ptr addrspace(7) [[TMP11]], align 2 -; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP6]], 640 -; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i16> [[TMP8]], i64 1 -; CHECK-NEXT: store i16 [[TMP16]], ptr addrspace(7) [[TMP15]], align 2 -; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP6]], 1280 -; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP17]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR:%.*]], i32 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[TMP8]], i64 0 +; CHECK-NEXT: store i16 [[TMP11]], ptr addrspace(7) [[TMP10]], align 2 +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP6]], 640 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i16> [[TMP8]], i64 1 +; CHECK-NEXT: store i16 [[TMP14]], ptr addrspace(7) [[TMP13]], align 2 +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], 1280 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i16> [[TMP8]], i64 2 +; CHECK-NEXT: store i16 [[TMP17]], ptr addrspace(7) [[TMP16]], align 2 +; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP6]], 1920 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i16> [[TMP8]], i64 2 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i16> [[TMP8]], i64 3 ; CHECK-NEXT: store i16 [[TMP20]], ptr addrspace(7) [[TMP19]], align 2 -; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP6]], 1920 -; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i16, ptr addrspace(7) [[PTR]], i32 [[TMP22]] -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[TMP8]], i64 3 -; CHECK-NEXT: store i16 [[TMP24]], ptr addrspace(7) [[TMP23]], align 2 ; CHECK-NEXT: ret void ; - call void @lgc.cooperative.matrix.store.p7.i32.i1.i32.i32.i32.v8i32(ptr addrspace(7) %ptr, i32 320, i1 false, i32 4, i32 1, i32 0, <8 x i32> %a) + call void (...) 
@lgc.cooperative.matrix.store(ptr addrspace(7) %ptr, i32 320, i1 false, i32 4, i32 1, i32 0, i32 16, <8 x i32> %a) ret void } @@ -269,28 +229,24 @@ define void @test_f32_cd_layout(ptr addrspace(7) %ptr, <8 x float> %a) !spirv.Ex ; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], [[TMP3]] ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, ptr addrspace(7) [[PTR:%.*]], i32 [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP7]], i64 0 -; CHECK-NEXT: store float [[TMP11]], ptr addrspace(7) [[TMP10]], align 4 -; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP6]], 640 -; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], 0 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr float, ptr addrspace(7) [[PTR]], i32 [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP7]], i64 1 -; CHECK-NEXT: store float [[TMP15]], ptr addrspace(7) [[TMP14]], align 4 -; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP6]], 1280 -; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP16]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr float, ptr addrspace(7) [[PTR:%.*]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP7]], i64 0 +; CHECK-NEXT: store float [[TMP10]], ptr addrspace(7) [[TMP9]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP6]], 640 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr float, ptr addrspace(7) [[PTR]], i32 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP7]], i64 1 +; CHECK-NEXT: store float [[TMP13]], ptr addrspace(7) [[TMP12]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP6]], 1280 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr float, ptr addrspace(7) [[PTR]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP7]], i64 2 +; CHECK-NEXT: store float [[TMP16]], ptr addrspace(7) [[TMP15]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP6]], 1920 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr float, ptr addrspace(7) [[PTR]], i32 [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP7]], i64 2 +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP7]], i64 3 ; CHECK-NEXT: store float [[TMP19]], ptr addrspace(7) [[TMP18]], align 4 -; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP6]], 1920 -; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP20]], 0 -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr float, ptr addrspace(7) [[PTR]], i32 [[TMP21]] -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[TMP7]], i64 3 -; CHECK-NEXT: store float [[TMP23]], ptr addrspace(7) [[TMP22]], align 4 ; CHECK-NEXT: ret void ; - call void @lgc.cooperative.matrix.store.p7.i32.i1.i32.i32.i32.v8f32(ptr addrspace(7) %ptr, i32 640, i1 false, i32 2, i32 1, i32 0, <8 x float> %a) + call void (...) 
@lgc.cooperative.matrix.store(ptr addrspace(7) %ptr, i32 640, i1 false, i32 2, i32 1, i32 0, i32 16, <8 x float> %a) ret void } @@ -304,33 +260,28 @@ define void @test_i32_cd_layout(ptr addrspace(7) %ptr, <8 x i32> %a) !spirv.Exec ; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], [[TMP3]] ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr addrspace(7) [[PTR:%.*]], i32 [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP7]], i64 0 -; CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(7) [[TMP10]], align 4 -; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP6]], 640 -; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], 0 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr addrspace(7) [[PTR]], i32 [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP7]], i64 1 -; CHECK-NEXT: store i32 [[TMP15]], ptr addrspace(7) [[TMP14]], align 4 -; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP6]], 1280 -; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP16]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr addrspace(7) [[PTR:%.*]], i32 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP7]], i64 0 +; CHECK-NEXT: store i32 [[TMP10]], ptr addrspace(7) [[TMP9]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP6]], 640 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr addrspace(7) [[PTR]], i32 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP7]], i64 1 +; CHECK-NEXT: store i32 [[TMP13]], ptr addrspace(7) [[TMP12]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP6]], 1280 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr addrspace(7) [[PTR]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP7]], i64 2 +; CHECK-NEXT: store i32 [[TMP16]], ptr addrspace(7) [[TMP15]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP6]], 1920 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr addrspace(7) [[PTR]], i32 [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[TMP7]], i64 2 +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[TMP7]], i64 3 ; CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(7) [[TMP18]], align 4 -; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP6]], 1920 -; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP20]], 0 -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr addrspace(7) [[PTR]], i32 [[TMP21]] -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP7]], i64 3 -; CHECK-NEXT: store i32 [[TMP23]], ptr addrspace(7) [[TMP22]], align 4 ; CHECK-NEXT: ret void ; - call void @lgc.cooperative.matrix.store.p7.i32.i1.i32.i32.i32.v8i32(ptr addrspace(7) %ptr, i32 640, i1 false, i32 5, i32 1, i32 0, <8 x i32> %a) + call void (...) @lgc.cooperative.matrix.store(ptr addrspace(7) %ptr, i32 640, i1 false, i32 5, i32 1, i32 0, i32 16, <8 x i32> %a) ret void } -declare void @lgc.cooperative.matrix.store.p7.i32.i1.i32.i32.i32.v8f32([4 x float] addrspace(7)*, i32, i1, i32, i32, i32, <8 x float>) -declare void @lgc.cooperative.matrix.store.p7.i32.i1.i32.i32.i32.v8i32([4 x float] addrspace(7)*, i32, i1, i32, i32, i32, <8 x i32>) +declare void @lgc.cooperative.matrix.store(...) 
!llpc.compute.mode = !{!0} !lgc.client = !{!1} diff --git a/lgc/test/Transforms/PatchBufferOp/strided-buffer-ops.lgc b/lgc/test/Transforms/PatchBufferOp/strided-buffer-ops.lgc index 843fdbf3ba..cae2e83a00 100644 --- a/lgc/test/Transforms/PatchBufferOp/strided-buffer-ops.lgc +++ b/lgc/test/Transforms/PatchBufferOp/strided-buffer-ops.lgc @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 2 -; RUN: lgc --mcpu=gfx1100 -o - -passes='require,function(lgc-patch-buffer-op)' %s | FileCheck --check-prefixes=GFX11 %s +; RUN: lgc --mcpu=gfx1100 -o - -passes='require,module(lgc-lower-desc),module(lgc-patch-entry-point-mutate),function(lgc-patch-buffer-op)' %s | FileCheck --check-prefixes=GFX11 %s define amdgpu_kernel void @strided_buffer_desc_to_ptr(<4 x i32> inreg %desc, ptr %out) { -; GFX11-LABEL: define amdgpu_kernel void @strided_buffer_desc_to_ptr -; GFX11-SAME: (<4 x i32> inreg [[DESC:%.*]], ptr [[OUT:%.*]]) { +; GFX11-LABEL: define amdgpu_gfx void @strided_buffer_desc_to_ptr +; GFX11-SAME: (<4 x i32> inreg [[DESC:%.*]], ptr [[OUT:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR0:[0-9]+]] { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> [[DESC]], i32 0, i32 0, i32 0, i32 0) ; GFX11-NEXT: [[TMP1:%.*]] = bitcast i32 [[TMP0]] to float @@ -18,8 +18,8 @@ entry: } define amdgpu_kernel void @strided_buffer_desc_to_ptr_index(<4 x i32> inreg %desc, i32 %index, ptr %out) { -; GFX11-LABEL: define amdgpu_kernel void @strided_buffer_desc_to_ptr_index -; GFX11-SAME: (<4 x i32> inreg [[DESC:%.*]], i32 [[INDEX:%.*]], ptr [[OUT:%.*]]) { +; GFX11-LABEL: define amdgpu_gfx void @strided_buffer_desc_to_ptr_index +; GFX11-SAME: (<4 x i32> inreg [[DESC:%.*]], i32 [[INDEX:%.*]], ptr [[OUT:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR0]] { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> [[DESC]], i32 [[INDEX]], i32 0, i32 0, i32 0) ; GFX11-NEXT: [[TMP1:%.*]] = bitcast i32 [[TMP0]] to float @@ -35,8 +35,8 @@ entry: } define amdgpu_kernel void @strided_buffer_desc_to_ptr_index_offset(<4 x i32> inreg %desc, i32 %index, ptr %out) { -; GFX11-LABEL: define amdgpu_kernel void 
@strided_buffer_desc_to_ptr_index_offset -; GFX11-SAME: (<4 x i32> inreg [[DESC:%.*]], i32 [[INDEX:%.*]], ptr [[OUT:%.*]]) { +; GFX11-LABEL: define amdgpu_gfx void @strided_buffer_desc_to_ptr_index_offset +; GFX11-SAME: (<4 x i32> inreg [[DESC:%.*]], i32 [[INDEX:%.*]], ptr [[OUT:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR0]] { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> [[DESC]], i32 [[INDEX]], i32 ptrtoint (ptr addrspace(6) getelementptr inbounds (i8, ptr addrspace(6) null, i32 8) to i32), i32 0, i32 0) ; GFX11-NEXT: [[TMP1:%.*]] = bitcast i32 [[TMP0]] to float @@ -53,8 +53,8 @@ entry: } define amdgpu_kernel void @strided_buffer_desc_to_ptr_index_add_zero(<4 x i32> inreg %desc, ptr %out) { -; GFX11-LABEL: define amdgpu_kernel void @strided_buffer_desc_to_ptr_index_add_zero -; GFX11-SAME: (<4 x i32> inreg [[DESC:%.*]], ptr [[OUT:%.*]]) { +; GFX11-LABEL: define amdgpu_gfx void @strided_buffer_desc_to_ptr_index_add_zero +; GFX11-SAME: (<4 x i32> inreg [[DESC:%.*]], ptr [[OUT:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR0]] { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> [[DESC]], i32 0, i32 0, i32 0, i32 0) ; GFX11-NEXT: [[TMP1:%.*]] = bitcast i32 [[TMP0]] to float @@ -70,8 +70,8 @@ entry: } define amdgpu_kernel void @strided_buffer_desc_to_ptr_index_add_twice(<4 x i32> inreg %desc, i32 %index, ptr %out) { -; GFX11-LABEL: define amdgpu_kernel void @strided_buffer_desc_to_ptr_index_add_twice -; GFX11-SAME: (<4 x i32> inreg [[DESC:%.*]], i32 [[INDEX:%.*]], ptr [[OUT:%.*]]) { +; GFX11-LABEL: define amdgpu_gfx void @strided_buffer_desc_to_ptr_index_add_twice +; GFX11-SAME: (<4 x i32> inreg [[DESC:%.*]], i32 [[INDEX:%.*]], ptr [[OUT:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef 
[[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR0]] { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], [[INDEX]] ; GFX11-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> [[DESC]], i32 [[TMP0]], i32 0, i32 0, i32 0) @@ -89,8 +89,8 @@ entry: } define amdgpu_kernel void @strided_buffer_desc_to_ptr_index_add_twice_constant_new(<4 x i32> inreg %desc, i32 %index, ptr %out) { -; GFX11-LABEL: define amdgpu_kernel void @strided_buffer_desc_to_ptr_index_add_twice_constant_new -; GFX11-SAME: (<4 x i32> inreg [[DESC:%.*]], i32 [[INDEX:%.*]], ptr [[OUT:%.*]]) { +; GFX11-LABEL: define amdgpu_gfx void @strided_buffer_desc_to_ptr_index_add_twice_constant_new +; GFX11-SAME: (<4 x i32> inreg [[DESC:%.*]], i32 [[INDEX:%.*]], ptr [[OUT:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR0]] { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 4 ; GFX11-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> [[DESC]], i32 [[TMP0]], i32 0, i32 0, i32 0) @@ -108,8 +108,8 @@ entry: } define amdgpu_kernel void @strided_buffer_desc_to_ptr_index_add_twice_constant_old(<4 x i32> inreg %desc, i32 %index, ptr %out) { -; GFX11-LABEL: define amdgpu_kernel void @strided_buffer_desc_to_ptr_index_add_twice_constant_old -; GFX11-SAME: (<4 x i32> inreg [[DESC:%.*]], i32 [[INDEX:%.*]], ptr [[OUT:%.*]]) { +; GFX11-LABEL: define amdgpu_gfx void @strided_buffer_desc_to_ptr_index_add_twice_constant_old +; GFX11-SAME: (<4 x i32> inreg [[DESC:%.*]], i32 [[INDEX:%.*]], ptr [[OUT:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR0]] { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = add i32 4, [[INDEX]] ; GFX11-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> [[DESC]], i32 [[TMP0]], i32 0, i32 0, i32 0) @@ -127,8 +127,8 @@ entry: } define amdgpu_kernel void @strided_buffer_desc_to_ptr_index_add_twice_constant_both(<4 x i32> inreg %desc, ptr %out) { -; GFX11-LABEL: define amdgpu_kernel void @strided_buffer_desc_to_ptr_index_add_twice_constant_both -; GFX11-SAME: (<4 x i32> inreg [[DESC:%.*]], 
ptr [[OUT:%.*]]) { +; GFX11-LABEL: define amdgpu_gfx void @strided_buffer_desc_to_ptr_index_add_twice_constant_both +; GFX11-SAME: (<4 x i32> inreg [[DESC:%.*]], ptr [[OUT:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR0]] { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> [[DESC]], i32 6, i32 0, i32 0, i32 0) ; GFX11-NEXT: [[TMP1:%.*]] = bitcast i32 [[TMP0]] to float @@ -145,8 +145,8 @@ entry: } define amdgpu_kernel void @strided_buffer_desc_to_ptr_offset_index(<4 x i32> inreg %desc, i32 %index, ptr %out) { -; GFX11-LABEL: define amdgpu_kernel void @strided_buffer_desc_to_ptr_offset_index -; GFX11-SAME: (<4 x i32> inreg [[DESC:%.*]], i32 [[INDEX:%.*]], ptr [[OUT:%.*]]) { +; GFX11-LABEL: define amdgpu_gfx void @strided_buffer_desc_to_ptr_offset_index +; GFX11-SAME: (<4 x i32> inreg [[DESC:%.*]], i32 [[INDEX:%.*]], ptr [[OUT:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR0]] { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.struct.buffer.load.i32(<4 x i32> [[DESC]], i32 [[INDEX]], i32 ptrtoint (ptr addrspace(6) getelementptr inbounds (i8, ptr addrspace(6) null, i32 8) to i32), i32 0, i32 0) ; GFX11-NEXT: [[TMP1:%.*]] = bitcast i32 [[TMP0]] to float @@ -163,8 +163,8 @@ entry: } define float @addr_and_stride_to_ptr(i64 %addr, i32 %stride) { -; GFX11-LABEL: define float @addr_and_stride_to_ptr -; GFX11-SAME: (i64 [[ADDR:%.*]], i32 [[STRIDE:%.*]]) { +; GFX11-LABEL: define amdgpu_gfx float @addr_and_stride_to_ptr +; GFX11-SAME: (i64 [[ADDR:%.*]], i32 [[STRIDE:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR0]] { ; 
GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = trunc i64 [[ADDR]] to i32 ; GFX11-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0 @@ -186,8 +186,8 @@ entry: } define float @addr_and_stride_to_ptr_index(i64 %addr, i32 %index, i32 %stride) { -; GFX11-LABEL: define float @addr_and_stride_to_ptr_index -; GFX11-SAME: (i64 [[ADDR:%.*]], i32 [[INDEX:%.*]], i32 [[STRIDE:%.*]]) { +; GFX11-LABEL: define amdgpu_gfx float @addr_and_stride_to_ptr_index +; GFX11-SAME: (i64 [[ADDR:%.*]], i32 [[INDEX:%.*]], i32 [[STRIDE:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR0]] { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = trunc i64 [[ADDR]] to i32 ; GFX11-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0 @@ -210,8 +210,8 @@ entry: } define float @addr_and_stride_to_ptr_index_offset(i64 %addr, i32 %index, i32 %stride) { -; GFX11-LABEL: define float @addr_and_stride_to_ptr_index_offset -; GFX11-SAME: (i64 [[ADDR:%.*]], i32 [[INDEX:%.*]], i32 [[STRIDE:%.*]]) { +; GFX11-LABEL: define amdgpu_gfx float @addr_and_stride_to_ptr_index_offset +; GFX11-SAME: (i64 [[ADDR:%.*]], i32 [[INDEX:%.*]], i32 [[STRIDE:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR0]] { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = trunc i64 [[ADDR]] to i32 ; GFX11-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0 @@ -235,8 +235,8 @@ entry: } define float @addr_and_stride_to_ptr_offset_index(i64 %addr, i32 %index, i32 %stride) { -; GFX11-LABEL: define float @addr_and_stride_to_ptr_offset_index -; GFX11-SAME: (i64 [[ADDR:%.*]], i32 [[INDEX:%.*]], i32 [[STRIDE:%.*]]) { +; GFX11-LABEL: define amdgpu_gfx float @addr_and_stride_to_ptr_offset_index +; GFX11-SAME: (i64 [[ADDR:%.*]], i32 [[INDEX:%.*]], i32 [[STRIDE:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef 
[[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR0]] { ; GFX11-NEXT: entry: ; GFX11-NEXT: [[TMP0:%.*]] = trunc i64 [[ADDR]] to i32 ; GFX11-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0 @@ -259,6 +259,44 @@ entry: ret float %res } +define amdgpu_kernel void @constant_strided_buffer_desc_to_ptr_index(<4 x i32> inreg %desc, i32 %index, ptr %out) #0 !lgc.shaderstage !4 { +; GFX11-LABEL: define amdgpu_gfx void @constant_strided_buffer_desc_to_ptr_index +; GFX11-SAME: (<4 x i32> inreg [[DESC:%.*]], i32 [[INDEX:%.*]], ptr [[OUT:%.*]], i32 inreg noundef [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg noundef [[NUMWORKGROUPSPTR:%.*]], i32 inreg noundef [[USERDATA0:%.*]], i32 inreg noundef [[USERDATA1:%.*]], i32 inreg noundef [[USERDATA2:%.*]], i32 inreg noundef [[USERDATA3:%.*]], i32 inreg noundef [[USERDATA4:%.*]], i32 inreg noundef [[USERDATA5:%.*]], i32 inreg noundef [[PAD6:%.*]], i32 inreg noundef [[PAD7:%.*]], i32 inreg noundef [[PAD8:%.*]], i32 inreg noundef [[PAD9:%.*]], i32 inreg noundef [[PAD10:%.*]], i32 inreg noundef [[PAD11:%.*]], i32 inreg noundef [[SPILLTABLE:%.*]], <3 x i32> inreg noundef [[WORKGROUPID:%.*]], i32 inreg noundef [[MULTIDISPATCHINFO:%.*]], i32 noundef [[LOCALINVOCATIONID:%.*]]) #[[ATTR1:[0-9]+]] !lgc.shaderstage [[META6:![0-9]+]] { +; GFX11-NEXT: entry: +; GFX11-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; GFX11-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; GFX11-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[PAD11]], i64 0 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to i64 +; GFX11-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(4) +; GFX11-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[USERDATA3]], i64 0 +; GFX11-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[USERDATA4]], i64 1 +; GFX11-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i64 0 +; GFX11-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP6]], i64 1 +; GFX11-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 +; GFX11-NEXT: [[TMP10:%.*]] = and i32 [[TMP8]], 65535 +; GFX11-NEXT: [[TMP11:%.*]] = or i32 [[TMP10]], 1048576 +; GFX11-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP11]], i64 1 +; GFX11-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 -1, i64 2 +; GFX11-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 805392300, i64 3 +; GFX11-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP14]], i64 1 +; GFX11-NEXT: [[TMP16:%.*]] = lshr i32 [[TMP15]], 16 +; GFX11-NEXT: [[TMP17:%.*]] = and i32 [[TMP16]], 16383 +; GFX11-NEXT: [[TMP18:%.*]] = mul i32 24, [[TMP17]] +; GFX11-NEXT: [[TMP19:%.*]] = add i32 0, [[TMP18]] +; GFX11-NEXT: [[TMP20:%.*]] = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> [[TMP14]], i32 [[TMP19]], i32 0), !invariant.load [[META7:![0-9]+]] +; GFX11-NEXT: [[TMP21:%.*]] = bitcast i32 [[TMP20]] to float +; GFX11-NEXT: store float [[TMP21]], ptr [[OUT]], align 4 +; GFX11-NEXT: ret void +; +entry: + %145 = call ptr addrspace(9) @lgc.load.strided.buffer.desc(i64 8589934592, i32 0, i32 0, i32 4, i32 16) + %146 = call ptr @llvm.invariant.start.p9(i64 -1, ptr addrspace(9) %145) + %147 = call ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9) %145, i32 24) + %res = load float, ptr addrspace(9) %147, align 16 + store float %res, ptr %out, align 4 + ret void +} + ; Function Attrs: 
nounwind willreturn memory(none) declare ptr addrspace(9) @lgc.strided.buffer.desc.to.ptr(<4 x i32>) #0 @@ -268,4 +306,22 @@ declare ptr addrspace(9) @lgc.strided.buffer.addr.and.stride.to.ptr(i64, i32) #0 ; Function Attrs: nounwind willreturn memory(none) declare ptr addrspace(9) @lgc.strided.index.add(ptr addrspace(9), i32) #0 +; Function Attrs: nounwind willreturn memory(none) +declare ptr addrspace(9) @lgc.load.strided.buffer.desc(i64, i32, i32, i32, i32) #0 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare ptr @llvm.invariant.start.p9(i64 immarg, ptr addrspace(9) nocapture) #1 + attributes #0 = { nounwind willreturn memory(none) } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } + +!llpc.compute.mode = !{!0} + +!lgc.user.data.nodes = !{!1, !2, !3} + +!0 = !{i32 16, i32 16, i32 1} + +!1 = !{!"DescriptorTableVaPtr", i32 7, i32 255, i32 3, i32 1, i32 1} +!2 = !{!"DescriptorMutable", i32 17, i32 0, i32 0, i32 40, i64 4294967296, i32 0, i32 8} +!3 = !{!"DescriptorConstBufferCompact", i32 15, i32 255, i32 4, i32 2, i64 8589934592, i32 0, i32 2} +!4 = !{i32 7} diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest1.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest1.lgc new file mode 100644 index 0000000000..cf01ecaead --- /dev/null +++ b/lgc/test/scalarizationOfDescriptorLoadsTest1.lgc @@ -0,0 +1,101 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 4 +; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o - %s 2>&1 | FileCheck --check-prefixes=CHECK %s +; ModuleID = 'lgcPipeline' +source_filename = "lgcPipeline" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8-p32:32:32" +target triple = "amdgcn--amdpal" + +; Function Attrs: nounwind +define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spirv.ExecutionModel !14 !lgc.shaderstage !15 { +; CHECK-LABEL: define dllexport spir_func void @lgc.shader.VS.main( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !14 !lgc.shaderstage [[META15:![0-9]+]] { +; CHECK-NEXT: .entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @lgc.input.import.generic__i32(i1 false, i32 0, i32 0, i32 0, i32 poison) +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], poison +; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP3]], align 32, !invariant.load [[META16:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP0]], poison +; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP7]], align 16, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP5]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP9]], i32 [[TMP5]]) +; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP12]], align 16, !invariant.load [[META16]] +; CHECK-NEXT: 
[[TMP14:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[TMP13]], i32 0, i32 0, i32 0, i32 0) +; CHECK-NEXT: [[TMP15:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP9]], <4 x float> [[TMP14]]) +; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP1]]) +; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP16]], i32 [[TMP1]]) +; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP19]], align 32, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP21:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.last.use.v4i32(i32 [[TMP16]], <4 x i32> [[TMP20]]) +; CHECK-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> [[TMP15]], <4 x i32> [[TMP21]], i32 1, i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; +.entry: + %0 = call i32 (...) @lgc.create.read.generic.input__i32(i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison) + %1 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 0) + %2 = call i32 (...) @lgc.create.get.desc.stride__i32(i32 1, i32 1, i64 0, i32 0) + %3 = mul i32 %0, %2 + %4 = sext i32 %3 to i64 + %5 = getelementptr i8, ptr addrspace(4) %1, i64 %4 + %6 = load <4 x i32>, ptr addrspace(4) %5, align 32, !invariant.load !16 + %7 = mul i32 %0, %2 + %8 = sext i32 %7 to i64 + %9 = getelementptr i8, ptr addrspace(4) %1, i64 %8 + %10 = load <4 x i32>, ptr addrspace(4) %9, align 16, !invariant.load !16 + %11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 0, i32 8, <4 x i32> %10, i32 0) + call void (...) @lgc.create.image.store(<4 x float> %11, i32 0, i32 8, <4 x i32> %6, i32 1) + ret void +} + +; Function Attrs: nounwind willreturn memory(read) +declare i32 @lgc.create.read.generic.input__i32(...) local_unnamed_addr #1 + +; Function Attrs: nounwind memory(none) +declare ptr addrspace(4) @lgc.create.get.desc.ptr.p4(...) local_unnamed_addr #2 + +; Function Attrs: nounwind memory(none) +declare i32 @lgc.create.get.desc.stride__i32(...) local_unnamed_addr #2 + +; Function Attrs: nounwind willreturn memory(read) +declare <4 x float> @lgc.create.image.load.v4f32(...) local_unnamed_addr #1 + +; Function Attrs: nounwind memory(write) +declare void @lgc.create.image.store(...) 
local_unnamed_addr #3 + +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" } +attributes #1 = { nounwind willreturn memory(read) } +attributes #2 = { nounwind memory(none) } +attributes #3 = { nounwind memory(write) } + +!lgc.client = !{!0} +!lgc.options = !{!1} +!lgc.options.VS = !{!2} +!lgc.options.FS = !{!3} +!lgc.user.data.nodes = !{!4, !5, !6, !7} +!lgc.vertex.inputs = !{!8, !9, !10} +!lgc.color.export.formats = !{!11} +!lgc.rasterizer.state = !{!12} +!amdgpu.pal.metadata.msgpack = !{!13} + +!0 = !{!"Vulkan"} +!1 = !{i32 -1055878566, i32 -1332805290, i32 1045905781, i32 -589165353, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 256, i32 256, i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16777216} +!2 = !{i32 1639417258, i32 -1495429105, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!3 = !{i32 -1409621709, i32 -171549995, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!4 = !{!"DescriptorTableVaPtr", i32 7, i32 66, i32 0, i32 1, i32 1} +!5 = !{!"DescriptorTexelBuffer", i32 4, i32 66, i32 0, i32 16384, i64 0, i32 0, i32 4} +!6 = !{!"StreamOutTableVaPtr", i32 11, i32 4, i32 1, i32 1, i32 0} +!7 = !{!"IndirectUserDataVaPtr", i32 8, i32 2, i32 2, i32 1, i32 4} +!8 = !{i32 0, i32 0, i32 0, i32 40, i32 14, i32 7} +!9 = !{i32 1, i32 0, i32 16, i32 40, i32 11, i32 7} +!10 = !{i32 2, i32 0, i32 24, i32 40, i32 14, i32 5} +!11 = !{i32 14, i32 7, i32 0, i32 0, i32 15} +!12 = !{i32 0, i32 0, i32 0, i32 1} +!13 = !{!"\82\B0amdpal.pipelines\91\83\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\D6\9C\B4\09\0A8A\DA\CF3\09\AF\FF\11\A9U\06\AD.llpc_version\A470.1\AEamdpal.version\92\03\00"} +!14 = !{i32 0} +!15 = !{i32 1} +!16 = !{} diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest10.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest10.lgc new file mode 100644 index 0000000000..7835bf62e7 --- /dev/null +++ b/lgc/test/scalarizationOfDescriptorLoadsTest10.lgc @@ -0,0 +1,149 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 4 +; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o - %s 2>&1 | FileCheck --check-prefixes=CHECK %s + +; This test checks how the scalarization of descriptor loads works if the values are defined in different basic blocks of a loop. 
+ +source_filename = "llpc_fragment_7" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8-p32:32:32" +target triple = "amdgcn--amdpal" + +; Function Attrs: nounwind +define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spirv.ExecutionModel !8 !lgc.shaderstage !9 { +; CHECK-LABEL: define dllexport spir_func void @lgc.shader.FS.main( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !8 !lgc.shaderstage [[META9:![0-9]+]] { +; CHECK-NEXT: .entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[I:%.*]] = call i32 (...) @lgc.input.import.interpolated__i32(i1 false, i32 3, i32 0, i32 0, i32 poison, i32 1, i32 poison) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP4]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr addrspace(4) +; CHECK-NEXT: [[I1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr addrspace(4) +; CHECK-NEXT: [[I2:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP11]], i32 32 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[PHI_IND:%.*]] = phi i32 [ 0, [[DOTENTRY:%.*]] ], [ [[IND:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[COND1:%.*]] = icmp ne i32 [[PHI_IND]], 1000 +; CHECK-NEXT: br i1 [[COND1]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: bb2: +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 48, [[BB1]] ], [ 48, [[BB2]] ] +; CHECK-NEXT: [[I3:%.*]] = mul i32 [[PHI_IND]], [[PHI]] +; CHECK-NEXT: [[I4:%.*]] = sext i32 [[I3]] to i64 +; CHECK-NEXT: [[I5:%.*]] = getelementptr i8, ptr addrspace(4) [[I1]], i64 [[I4]] +; CHECK-NEXT: [[I6:%.*]] = mul i32 [[PHI_IND]], [[PHI]] +; CHECK-NEXT: [[I7:%.*]] = sext i32 [[I6]] to i64 +; CHECK-NEXT: [[I8:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[I7]] +; CHECK-NEXT: [[I9:%.*]] = load <4 x i32>, ptr addrspace(4) [[I8]], align 16, !invariant.load [[META10:![0-9]+]] +; CHECK-NEXT: [[I10:%.*]] = load <8 x i32>, ptr addrspace(4) [[I5]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[I3]]) +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP12]], i32 [[I3]]) +; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr addrspace(4) [[I1]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP15]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i32>, 
ptr addrspace(4) [[TMP18]], align 16, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP20:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP16]], <4 x i32> [[TMP19]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[I11:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP12]], <4 x float> [[TMP20]]) +; CHECK-NEXT: call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[I11]]) #[[ATTR5:[0-9]+]] +; CHECK-NEXT: [[IND]] = add i32 [[PHI_IND]], 1 +; CHECK-NEXT: [[COND2:%.*]] = icmp ne i32 [[IND]], 1000 +; CHECK-NEXT: br i1 [[COND2]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +.entry: + %i = call i32 (...) @lgc.create.read.generic.input__i32(i32 3, i32 0, i32 0, i32 0, i32 17, i32 poison) + %i1 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 7) + %i2 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 2, i32 2, i64 0, i32 7) + br label %loop + +loop: ; preds = %loop.latch, %.entry + %phi.ind = phi i32 [ 0, %.entry ], [ %ind, %loop.latch ] + %cond1 = icmp ne i32 %phi.ind, 1000 + br i1 %cond1, label %bb1, label %bb2 + +bb1: ; preds = %loop + %a = call i32 (...) @lgc.create.get.desc.stride__i32(i32 1, i32 1, i64 0, i32 7) + br label %loop.latch + +bb2: ; preds = %loop + %b = call i32 (...) @lgc.create.get.desc.stride__i32(i32 2, i32 2, i64 0, i32 7) + br label %loop.latch + +loop.latch: ; preds = %bb2, %bb1 + %phi = phi i32 [ %a, %bb1 ], [ %b, %bb2 ] + %i3 = mul i32 %phi.ind, %phi + %i4 = sext i32 %i3 to i64 + %i5 = getelementptr i8, ptr addrspace(4) %i1, i64 %i4 + %i6 = mul i32 %phi.ind, %phi + %i7 = sext i32 %i6 to i64 + %i8 = getelementptr i8, ptr addrspace(4) %i2, i64 %i7 + %i9 = load <4 x i32>, ptr addrspace(4) %i8, align 16, !invariant.load !10 + %i10 = load <8 x i32>, ptr addrspace(4) %i5, align 32, !invariant.load !10 + %i11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %i10, <4 x i32> %i9, i32 1, <2 x float> zeroinitializer) + call void (...) @lgc.create.write.generic.output(<4 x float> %i11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison) + %ind = add i32 %phi.ind, 1 + %cond2 = icmp ne i32 %ind, 1000 + br i1 %cond2, label %loop, label %exit + +exit: ; preds = %loop.latch + ret void +} + +declare spir_func void @spirv.NonUniform.i32(i32) local_unnamed_addr + +; Function Attrs: nounwind memory(none) +declare ptr addrspace(4) @lgc.create.get.desc.ptr.p4(...) local_unnamed_addr #1 + +; Function Attrs: nounwind memory(none) +declare i32 @lgc.create.get.desc.stride__i32(...) local_unnamed_addr #1 + +declare spir_func void @"spirv.NonUniform.s[s[p4,i32,i32,i32],s[p4,i32,i32]]"({ { ptr addrspace(4), i32, i32, i32 }, { ptr addrspace(4), i32, i32 } }) local_unnamed_addr + +declare spir_func void @"spirv.NonUniform.s[a3v8i32,s[v4i32,i32]]"({ [3 x <8 x i32>], { <4 x i32>, i32 } }) local_unnamed_addr + +; Function Attrs: nounwind willreturn memory(read) +declare <4 x float> @lgc.create.image.sample.v4f32(...) local_unnamed_addr #2 + +; Function Attrs: nounwind willreturn memory(read) +declare i32 @lgc.create.read.generic.input__i32(...) local_unnamed_addr #2 + +; Function Attrs: nounwind +declare void @lgc.create.write.generic.output(...) 
local_unnamed_addr #3 + +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" } +attributes #1 = { nounwind memory(none) } +attributes #2 = { nounwind willreturn memory(read) } +attributes #3 = { nounwind } + +!lgc.client = !{!0} +!lgc.unlinked = !{!1} +!lgc.options = !{!2} +!lgc.options.FS = !{!3} +!lgc.user.data.nodes = !{!4, !5} +!lgc.color.export.formats = !{!6} +!amdgpu.pal.metadata.msgpack = !{!7} + +!0 = !{!"Vulkan"} +!1 = !{i32 1} +!2 = !{i32 -158725823, i32 1419665388, i32 -1015833383, i32 -491143713, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 256, i32 256, i32 2, i32 1} +!3 = !{i32 -1822594139, i32 1920663194, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!4 = !{!"DescriptorTableVaPtr", i32 7, i32 64, i32 0, i32 1, i32 1} +!5 = !{!"DescriptorCombinedTexture", i32 3, i32 64, i32 0, i32 192, i64 0, i32 7, i32 12} +!6 = !{i32 14, i32 7, i32 0, i32 0, i32 15} +!7 = !{!"\82\B0amdpal.pipelines\91\83\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\E8\D2\98>j\B9B\94\CF2\DEF\BF\9Fx\BC1\AD.llpc_version\A470.1\AEamdpal.version\92\03\00"} +!8 = !{i32 4} +!9 = !{i32 6} +!10 = !{} diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest11.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest11.lgc new file mode 100644 index 0000000000..2a2d450d1b --- /dev/null +++ b/lgc/test/scalarizationOfDescriptorLoadsTest11.lgc @@ -0,0 +1,145 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 4 +; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o - %s 2>&1 | FileCheck --check-prefixes=CHECK %s + +; This test checks how the scalarization of descriptor loads works if the values are defined in different basic blocks of a loop. + +source_filename = "llpc_fragment_7" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8-p32:32:32" +target triple = "amdgcn--amdpal" + +; Function Attrs: nounwind +define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spirv.ExecutionModel !8 !lgc.shaderstage !9 { +; CHECK-LABEL: define dllexport spir_func void @lgc.shader.FS.main( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !8 !lgc.shaderstage [[META9:![0-9]+]] { +; CHECK-NEXT: .entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[I:%.*]] = call i32 (...) 
@lgc.input.import.interpolated__i32(i1 false, i32 3, i32 0, i32 0, i32 poison, i32 1, i32 poison) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP4]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr addrspace(4) +; CHECK-NEXT: [[I1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr addrspace(4) +; CHECK-NEXT: [[I2:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP11]], i32 32 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[PHI_IND:%.*]] = phi i32 [ 0, [[DOTENTRY:%.*]] ], [ [[IND:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[COND1:%.*]] = icmp ne i32 [[PHI_IND]], 1000 +; CHECK-NEXT: br i1 [[COND1]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[I3:%.*]] = mul i32 [[I]], 48 +; CHECK-NEXT: [[I4:%.*]] = sext i32 [[I3]] to i64 +; CHECK-NEXT: [[I5:%.*]] = getelementptr i8, ptr addrspace(4) [[I1]], i64 [[I4]] +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: bb2: +; CHECK-NEXT: [[I6:%.*]] = mul i32 [[I]], 48 +; CHECK-NEXT: [[I7:%.*]] = sext i32 [[I6]] to i64 +; CHECK-NEXT: [[I8:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[I7]] +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[PHI:%.*]] = phi ptr addrspace(4) [ [[I5]], [[BB1]] ], [ [[I8]], [[BB2]] ] +; CHECK-NEXT: [[I9:%.*]] = load <4 x i32>, ptr addrspace(4) [[PHI]], align 16, !invariant.load [[META10:![0-9]+]] +; CHECK-NEXT: [[I10:%.*]] = load <8 x i32>, ptr addrspace(4) [[PHI]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.v8i32(i32 0, <8 x i32> [[I10]]) +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.v4i32(i32 [[TMP12]], <4 x i32> [[I9]]) +; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 [[TMP13]], <8 x i32> [[I10]]) +; CHECK-NEXT: [[TMP15:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 [[TMP13]], <4 x i32> [[I9]]) +; CHECK-NEXT: [[TMP16:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP14]], <4 x i32> [[TMP15]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[I11:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP13]], <4 x float> [[TMP16]]) +; CHECK-NEXT: call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[I11]]) #[[ATTR5:[0-9]+]] +; CHECK-NEXT: [[IND]] = add i32 [[PHI_IND]], 1 +; CHECK-NEXT: [[COND2:%.*]] = icmp ne i32 [[IND]], 1000 +; CHECK-NEXT: br i1 [[COND2]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +.entry: + %i = call i32 (...) @lgc.create.read.generic.input__i32(i32 3, i32 0, i32 0, i32 0, i32 17, i32 poison) + %i1 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 7) + %i2 = call ptr addrspace(4) (...) 
@lgc.create.get.desc.ptr.p4(i32 2, i32 2, i64 0, i32 7) + br label %loop + +loop: ; preds = %loop.latch, %.entry + %phi.ind = phi i32 [ 0, %.entry ], [ %ind, %loop.latch ] + %cond1 = icmp ne i32 %phi.ind, 1000 + br i1 %cond1, label %bb1, label %bb2 + +bb1: ; preds = %loop + %a = call i32 (...) @lgc.create.get.desc.stride__i32(i32 1, i32 1, i64 0, i32 7) + %i3 = mul i32 %i, %a + %i4 = sext i32 %i3 to i64 + %i5 = getelementptr i8, ptr addrspace(4) %i1, i64 %i4 + br label %loop.latch + +bb2: ; preds = %loop + %b = call i32 (...) @lgc.create.get.desc.stride__i32(i32 2, i32 2, i64 0, i32 7) + %i6 = mul i32 %i, %b + %i7 = sext i32 %i6 to i64 + %i8 = getelementptr i8, ptr addrspace(4) %i2, i64 %i7 + br label %loop.latch + +loop.latch: ; preds = %bb2, %bb1 + %phi = phi ptr addrspace(4) [ %i5, %bb1 ], [ %i8, %bb2 ] + %i9 = load <4 x i32>, ptr addrspace(4) %phi, align 16, !invariant.load !10 + %i10 = load <8 x i32>, ptr addrspace(4) %phi, align 32, !invariant.load !10 + %i11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %i10, <4 x i32> %i9, i32 1, <2 x float> zeroinitializer) + call void (...) @lgc.create.write.generic.output(<4 x float> %i11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison) + %ind = add i32 %phi.ind, 1 + %cond2 = icmp ne i32 %ind, 1000 + br i1 %cond2, label %loop, label %exit + +exit: ; preds = %loop.latch + ret void +} + +declare spir_func void @spirv.NonUniform.i32(i32) local_unnamed_addr + +; Function Attrs: nounwind memory(none) +declare ptr addrspace(4) @lgc.create.get.desc.ptr.p4(...) local_unnamed_addr #1 + +; Function Attrs: nounwind memory(none) +declare i32 @lgc.create.get.desc.stride__i32(...) local_unnamed_addr #1 + +declare spir_func void @"spirv.NonUniform.s[s[p4,i32,i32,i32],s[p4,i32,i32]]"({ { ptr addrspace(4), i32, i32, i32 }, { ptr addrspace(4), i32, i32 } }) local_unnamed_addr + +declare spir_func void @"spirv.NonUniform.s[a3v8i32,s[v4i32,i32]]"({ [3 x <8 x i32>], { <4 x i32>, i32 } }) local_unnamed_addr + +; Function Attrs: nounwind willreturn memory(read) +declare <4 x float> @lgc.create.image.sample.v4f32(...) local_unnamed_addr #2 + +; Function Attrs: nounwind willreturn memory(read) +declare i32 @lgc.create.read.generic.input__i32(...) local_unnamed_addr #2 + +; Function Attrs: nounwind +declare void @lgc.create.write.generic.output(...) 
local_unnamed_addr #3 + +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" } +attributes #1 = { nounwind memory(none) } +attributes #2 = { nounwind willreturn memory(read) } +attributes #3 = { nounwind } + +!lgc.client = !{!0} +!lgc.unlinked = !{!1} +!lgc.options = !{!2} +!lgc.options.FS = !{!3} +!lgc.user.data.nodes = !{!4, !5} +!lgc.color.export.formats = !{!6} +!amdgpu.pal.metadata.msgpack = !{!7} + +!0 = !{!"Vulkan"} +!1 = !{i32 1} +!2 = !{i32 -158725823, i32 1419665388, i32 -1015833383, i32 -491143713, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 256, i32 256, i32 2, i32 1} +!3 = !{i32 -1822594139, i32 1920663194, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!4 = !{!"DescriptorTableVaPtr", i32 7, i32 64, i32 0, i32 1, i32 1} +!5 = !{!"DescriptorCombinedTexture", i32 3, i32 64, i32 0, i32 192, i64 0, i32 7, i32 12} +!6 = !{i32 14, i32 7, i32 0, i32 0, i32 15} +!7 = !{!"\82\B0amdpal.pipelines\91\83\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\E8\D2\98>j\B9B\94\CF2\DEF\BF\9Fx\BC1\AD.llpc_version\A470.1\AEamdpal.version\92\03\00"} +!8 = !{i32 4} +!9 = !{i32 6} +!10 = !{} diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest12.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest12.lgc new file mode 100644 index 0000000000..afd0072c01 --- /dev/null +++ b/lgc/test/scalarizationOfDescriptorLoadsTest12.lgc @@ -0,0 +1,135 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 4 +; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o - %s 2>&1 | FileCheck --check-prefixes=CHECK %s + +; This test checks how the scalarization of descriptor loads works if there is a lgc.create.image.sample.v4f32 call +; inside a loop and its returned value is used in a lgc.create.write.generic.output call outside of the loop. + +source_filename = "llpc_fragment_7" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8-p32:32:32" +target triple = "amdgcn--amdpal" + +; Function Attrs: nounwind +define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spirv.ExecutionModel !8 !lgc.shaderstage !9 { +; CHECK-LABEL: define dllexport spir_func void @lgc.shader.FS.main( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !8 !lgc.shaderstage [[META9:![0-9]+]] { +; CHECK-NEXT: .entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[I:%.*]] = call i32 (...) 
@lgc.input.import.interpolated__i32(i1 false, i32 3, i32 0, i32 0, i32 poison, i32 1, i32 poison) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP4]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr addrspace(4) +; CHECK-NEXT: [[I1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr addrspace(4) +; CHECK-NEXT: [[I2:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP11]], i32 32 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[PHI_IND:%.*]] = phi i32 [ 0, [[DOTENTRY:%.*]] ], [ [[IND:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[PHI_IMG:%.*]] = phi <4 x float> [ , [[DOTENTRY]] ], [ [[I11:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I3:%.*]] = mul i32 [[PHI_IND]], 48 +; CHECK-NEXT: [[I4:%.*]] = sext i32 [[I3]] to i64 +; CHECK-NEXT: [[I5:%.*]] = getelementptr i8, ptr addrspace(4) [[I1]], i64 [[I4]] +; CHECK-NEXT: [[I6:%.*]] = mul i32 [[PHI_IND]], 48 +; CHECK-NEXT: [[I7:%.*]] = sext i32 [[I6]] to i64 +; CHECK-NEXT: [[I8:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[I7]] +; CHECK-NEXT: [[I9:%.*]] = load <4 x i32>, ptr addrspace(4) [[I8]], align 16, !invariant.load [[META10:![0-9]+]] +; CHECK-NEXT: [[I10:%.*]] = load <8 x i32>, ptr addrspace(4) [[I5]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[I3]]) +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP12]], i32 [[I3]]) +; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr addrspace(4) [[I1]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP15]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP18]], align 16, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP20:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP16]], <4 x i32> [[TMP19]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[I11]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP12]], <4 x float> [[TMP20]]) +; CHECK-NEXT: [[I12:%.*]] = fadd <4 x float> [[PHI_IMG]], +; CHECK-NEXT: [[IND]] = add i32 [[PHI_IND]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp ne i32 [[IND]], 1000 +; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[I12]]) #[[ATTR5:[0-9]+]] +; CHECK-NEXT: ret void +; +.entry: + %i = call i32 (...) @lgc.create.read.generic.input__i32(i32 3, i32 0, i32 0, i32 0, i32 17, i32 poison) + %i1 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 7) + %i2 = call ptr addrspace(4) (...) 
@lgc.create.get.desc.ptr.p4(i32 2, i32 2, i64 0, i32 7) + br label %loop + +loop: ; preds = %loop, %.entry + %phi.ind = phi i32 [ 0, %.entry ], [ %ind, %loop ] + %phi.img = phi <4 x float> [ , %.entry ], [ %i11, %loop ] + %a = call i32 (...) @lgc.create.get.desc.stride__i32(i32 1, i32 1, i64 0, i32 7) + %b = call i32 (...) @lgc.create.get.desc.stride__i32(i32 2, i32 2, i64 0, i32 7) + %i3 = mul i32 %phi.ind, %a + %i4 = sext i32 %i3 to i64 + %i5 = getelementptr i8, ptr addrspace(4) %i1, i64 %i4 + %i6 = mul i32 %phi.ind, %b + %i7 = sext i32 %i6 to i64 + %i8 = getelementptr i8, ptr addrspace(4) %i2, i64 %i7 + %i9 = load <4 x i32>, ptr addrspace(4) %i8, align 16, !invariant.load !10 + %i10 = load <8 x i32>, ptr addrspace(4) %i5, align 32, !invariant.load !10 + %i11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %i10, <4 x i32> %i9, i32 1, <2 x float> zeroinitializer) + %i12 = fadd <4 x float> %phi.img, + %ind = add i32 %phi.ind, 1 + %cond = icmp ne i32 %ind, 1000 + br i1 %cond, label %loop, label %exit + +exit: ; preds = %loop + call void (...) @lgc.create.write.generic.output(<4 x float> %i12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison) + ret void +} + +declare spir_func void @spirv.NonUniform.i32(i32) local_unnamed_addr + +; Function Attrs: nounwind memory(none) +declare ptr addrspace(4) @lgc.create.get.desc.ptr.p4(...) local_unnamed_addr #1 + +; Function Attrs: nounwind memory(none) +declare i32 @lgc.create.get.desc.stride__i32(...) local_unnamed_addr #1 + +declare spir_func void @"spirv.NonUniform.s[s[p4,i32,i32,i32],s[p4,i32,i32]]"({ { ptr addrspace(4), i32, i32, i32 }, { ptr addrspace(4), i32, i32 } }) local_unnamed_addr + +declare spir_func void @"spirv.NonUniform.s[a3v8i32,s[v4i32,i32]]"({ [3 x <8 x i32>], { <4 x i32>, i32 } }) local_unnamed_addr + +; Function Attrs: nounwind willreturn memory(read) +declare <4 x float> @lgc.create.image.sample.v4f32(...) local_unnamed_addr #2 + +; Function Attrs: nounwind willreturn memory(read) +declare i32 @lgc.create.read.generic.input__i32(...) local_unnamed_addr #2 + +; Function Attrs: nounwind +declare void @lgc.create.write.generic.output(...) 
local_unnamed_addr #3 + +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" } +attributes #1 = { nounwind memory(none) } +attributes #2 = { nounwind willreturn memory(read) } +attributes #3 = { nounwind } + +!lgc.client = !{!0} +!lgc.unlinked = !{!1} +!lgc.options = !{!2} +!lgc.options.FS = !{!3} +!lgc.user.data.nodes = !{!4, !5} +!lgc.color.export.formats = !{!6} +!amdgpu.pal.metadata.msgpack = !{!7} + +!0 = !{!"Vulkan"} +!1 = !{i32 1} +!2 = !{i32 -158725823, i32 1419665388, i32 -1015833383, i32 -491143713, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 256, i32 256, i32 2, i32 1} +!3 = !{i32 -1822594139, i32 1920663194, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!4 = !{!"DescriptorTableVaPtr", i32 7, i32 64, i32 0, i32 1, i32 1} +!5 = !{!"DescriptorCombinedTexture", i32 3, i32 64, i32 0, i32 192, i64 0, i32 7, i32 12} +!6 = !{i32 14, i32 7, i32 0, i32 0, i32 15} +!7 = !{!"\82\B0amdpal.pipelines\91\83\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\E8\D2\98>j\B9B\94\CF2\DEF\BF\9Fx\BC1\AD.llpc_version\A470.1\AEamdpal.version\92\03\00"} +!8 = !{i32 4} +!9 = !{i32 6} +!10 = !{} diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest13.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest13.lgc new file mode 100644 index 0000000000..cd41d3a2bf --- /dev/null +++ b/lgc/test/scalarizationOfDescriptorLoadsTest13.lgc @@ -0,0 +1,134 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 4 +; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o - %s 2>&1 | FileCheck --check-prefixes=CHECK %s + +; This test checks how the scalarization of descriptor loads works if some values are defined in the loop pre-header +; and others inside the loop. + +source_filename = "llpc_fragment_7" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8-p32:32:32" +target triple = "amdgcn--amdpal" + +; Function Attrs: nounwind +define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spirv.ExecutionModel !8 !lgc.shaderstage !9 { +; CHECK-LABEL: define dllexport spir_func void @lgc.shader.FS.main( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !8 !lgc.shaderstage [[META9:![0-9]+]] { +; CHECK-NEXT: .entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[I:%.*]] = call i32 (...) 
@lgc.input.import.interpolated__i32(i1 false, i32 3, i32 0, i32 0, i32 poison, i32 1, i32 poison) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP4]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr addrspace(4) +; CHECK-NEXT: [[I1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr addrspace(4) +; CHECK-NEXT: [[I2:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP11]], i32 32 +; CHECK-NEXT: [[I3:%.*]] = mul i32 [[I]], 48 +; CHECK-NEXT: [[I4:%.*]] = sext i32 [[I3]] to i64 +; CHECK-NEXT: [[I5:%.*]] = getelementptr i8, ptr addrspace(4) [[I1]], i64 [[I4]] +; CHECK-NEXT: [[L:%.*]] = load <8 x i32>, ptr addrspace(4) [[I5]], align 32, !invariant.load [[META10:![0-9]+]] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[PHI_IND:%.*]] = phi i32 [ 0, [[DOTENTRY:%.*]] ], [ [[IND:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[PHI_LOAD:%.*]] = phi <8 x i32> [ [[L]], [[DOTENTRY]] ], [ [[I10:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I6:%.*]] = mul i32 [[PHI_IND]], 48 +; CHECK-NEXT: [[I7:%.*]] = sext i32 [[I6]] to i64 +; CHECK-NEXT: [[I8:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[I7]] +; CHECK-NEXT: [[I9:%.*]] = load <4 x i32>, ptr addrspace(4) [[I8]], align 16, !invariant.load [[META10]] +; CHECK-NEXT: [[I10]] = load <8 x i32>, ptr addrspace(4) [[I8]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.v8i32(i32 0, <8 x i32> [[PHI_LOAD]]) +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 [[TMP12]], i32 [[I6]]) +; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 [[TMP13]], <8 x i32> [[PHI_LOAD]]) +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP13]], i32 [[I6]]) +; CHECK-NEXT: [[TMP16:%.*]] = sext i32 [[TMP15]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP17]], align 16, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP19:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP14]], <4 x i32> [[TMP18]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[I11:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP13]], <4 x float> [[TMP19]]) +; CHECK-NEXT: call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[I11]]) #[[ATTR5:[0-9]+]] +; CHECK-NEXT: [[IND]] = add i32 [[PHI_IND]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp ne i32 [[IND]], 1000 +; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +.entry: + %i = call i32 (...) @lgc.create.read.generic.input__i32(i32 3, i32 0, i32 0, i32 0, i32 17, i32 poison) + %i1 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 7) + %i2 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 2, i32 2, i64 0, i32 7) + %a = call i32 (...) 
@lgc.create.get.desc.stride__i32(i32 1, i32 1, i64 0, i32 7) + %i3 = mul i32 %i, %a + %i4 = sext i32 %i3 to i64 + %i5 = getelementptr i8, ptr addrspace(4) %i1, i64 %i4 + %l = load <8 x i32>, ptr addrspace(4) %i5, align 32, !invariant.load !10 + br label %loop + +loop: ; preds = %loop, %.entry + %phi.ind = phi i32 [ 0, %.entry ], [ %ind, %loop ] + %phi.load = phi <8 x i32> [ %l, %.entry ], [ %i10, %loop ] + %b = call i32 (...) @lgc.create.get.desc.stride__i32(i32 2, i32 2, i64 0, i32 7) + %i6 = mul i32 %phi.ind, %b + %i7 = sext i32 %i6 to i64 + %i8 = getelementptr i8, ptr addrspace(4) %i2, i64 %i7 + %i9 = load <4 x i32>, ptr addrspace(4) %i8, align 16, !invariant.load !10 + %i10 = load <8 x i32>, ptr addrspace(4) %i8, align 32, !invariant.load !10 + %i11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %phi.load, <4 x i32> %i9, i32 1, <2 x float> zeroinitializer) + call void (...) @lgc.create.write.generic.output(<4 x float> %i11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison) + %ind = add i32 %phi.ind, 1 + %cond = icmp ne i32 %ind, 1000 + br i1 %cond, label %loop, label %exit + +exit: ; preds = %loop + ret void +} + +declare spir_func void @spirv.NonUniform.i32(i32) local_unnamed_addr + +; Function Attrs: nounwind memory(none) +declare ptr addrspace(4) @lgc.create.get.desc.ptr.p4(...) local_unnamed_addr #1 + +; Function Attrs: nounwind memory(none) +declare i32 @lgc.create.get.desc.stride__i32(...) local_unnamed_addr #1 + +declare spir_func void @"spirv.NonUniform.s[s[p4,i32,i32,i32],s[p4,i32,i32]]"({ { ptr addrspace(4), i32, i32, i32 }, { ptr addrspace(4), i32, i32 } }) local_unnamed_addr + +declare spir_func void @"spirv.NonUniform.s[a3v8i32,s[v4i32,i32]]"({ [3 x <8 x i32>], { <4 x i32>, i32 } }) local_unnamed_addr + +; Function Attrs: nounwind willreturn memory(read) +declare <4 x float> @lgc.create.image.sample.v4f32(...) local_unnamed_addr #2 + +; Function Attrs: nounwind willreturn memory(read) +declare i32 @lgc.create.read.generic.input__i32(...) local_unnamed_addr #2 + +; Function Attrs: nounwind +declare void @lgc.create.write.generic.output(...) 
local_unnamed_addr #3 + +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" } +attributes #1 = { nounwind memory(none) } +attributes #2 = { nounwind willreturn memory(read) } +attributes #3 = { nounwind } + +!lgc.client = !{!0} +!lgc.unlinked = !{!1} +!lgc.options = !{!2} +!lgc.options.FS = !{!3} +!lgc.user.data.nodes = !{!4, !5} +!lgc.color.export.formats = !{!6} +!amdgpu.pal.metadata.msgpack = !{!7} + +!0 = !{!"Vulkan"} +!1 = !{i32 1} +!2 = !{i32 -158725823, i32 1419665388, i32 -1015833383, i32 -491143713, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 256, i32 256, i32 2, i32 1} +!3 = !{i32 -1822594139, i32 1920663194, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!4 = !{!"DescriptorTableVaPtr", i32 7, i32 64, i32 0, i32 1, i32 1} +!5 = !{!"DescriptorCombinedTexture", i32 3, i32 64, i32 0, i32 192, i64 0, i32 7, i32 12} +!6 = !{i32 14, i32 7, i32 0, i32 0, i32 15} +!7 = !{!"\82\B0amdpal.pipelines\91\83\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\E8\D2\98>j\B9B\94\CF2\DEF\BF\9Fx\BC1\AD.llpc_version\A470.1\AEamdpal.version\92\03\00"} +!8 = !{i32 4} +!9 = !{i32 6} +!10 = !{} diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest14.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest14.lgc new file mode 100644 index 0000000000..bba218fbc6 --- /dev/null +++ b/lgc/test/scalarizationOfDescriptorLoadsTest14.lgc @@ -0,0 +1,142 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 4 +; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o - %s 2>&1 | FileCheck --check-prefixes=CHECK %s + +; This test checks how the scalarization of descriptor loads works if some values are defined in the loop pre-header +; and others inside the loop and the returned value of lgc.create.image.sample.v4f32 call is used outside of the loop +; in lgc.create.image.store. + +source_filename = "llpc_fragment_7" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8-p32:32:32" +target triple = "amdgcn--amdpal" + +; Function Attrs: nounwind +define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spirv.ExecutionModel !8 !lgc.shaderstage !9 { +; CHECK-LABEL: define dllexport spir_func void @lgc.shader.FS.main( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !8 !lgc.shaderstage [[META9:![0-9]+]] { +; CHECK-NEXT: .entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[I:%.*]] = call i32 (...) 
@lgc.input.import.interpolated__i32(i1 false, i32 3, i32 0, i32 0, i32 poison, i32 1, i32 poison) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP4]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr addrspace(4) +; CHECK-NEXT: [[I1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr addrspace(4) +; CHECK-NEXT: [[I2:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP11]], i32 32 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[PHI_IND:%.*]] = phi i32 [ 0, [[DOTENTRY:%.*]] ], [ [[IND:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[PHI_IMG:%.*]] = phi <4 x float> [ , [[DOTENTRY]] ], [ [[I11:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I3:%.*]] = mul i32 [[PHI_IND]], 48 +; CHECK-NEXT: [[I4:%.*]] = sext i32 [[I3]] to i64 +; CHECK-NEXT: [[I5:%.*]] = getelementptr i8, ptr addrspace(4) [[I1]], i64 [[I4]] +; CHECK-NEXT: [[I6:%.*]] = mul i32 [[PHI_IND]], 48 +; CHECK-NEXT: [[I7:%.*]] = sext i32 [[I6]] to i64 +; CHECK-NEXT: [[I8:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[I7]] +; CHECK-NEXT: [[I9:%.*]] = load <4 x i32>, ptr addrspace(4) [[I8]], align 16, !invariant.load [[META10:![0-9]+]] +; CHECK-NEXT: [[I10:%.*]] = load <8 x i32>, ptr addrspace(4) [[I5]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[I3]]) +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP12]], i32 [[I3]]) +; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr addrspace(4) [[I1]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP15]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP18]], align 16, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP20:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP16]], <4 x i32> [[TMP19]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[I11]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP12]], <4 x float> [[TMP20]]) +; CHECK-NEXT: [[I12:%.*]] = fadd <4 x float> [[PHI_IMG]], +; CHECK-NEXT: [[IND]] = add i32 [[PHI_IND]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp ne i32 [[IND]], 1000 +; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[I6]]) +; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP21]], i32 [[I6]]) +; CHECK-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP24]], align 16, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP26:%.*]] = call <4 x i32> 
@llvm.amdgcn.waterfall.last.use.v4i32(i32 [[TMP21]], <4 x i32> [[TMP25]]) +; CHECK-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> [[I12]], <4 x i32> [[TMP26]], i32 1, i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; +.entry: + %i = call i32 (...) @lgc.create.read.generic.input__i32(i32 3, i32 0, i32 0, i32 0, i32 17, i32 poison) + %i1 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 7) + %i2 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 2, i32 2, i64 0, i32 7) + br label %loop + +loop: ; preds = %loop, %.entry + %phi.ind = phi i32 [ 0, %.entry ], [ %ind, %loop ] + %phi.img = phi <4 x float> [ , %.entry ], [ %i11, %loop ] + %a = call i32 (...) @lgc.create.get.desc.stride__i32(i32 1, i32 1, i64 0, i32 7) + %b = call i32 (...) @lgc.create.get.desc.stride__i32(i32 2, i32 2, i64 0, i32 7) + %i3 = mul i32 %phi.ind, %a + %i4 = sext i32 %i3 to i64 + %i5 = getelementptr i8, ptr addrspace(4) %i1, i64 %i4 + %i6 = mul i32 %phi.ind, %b + %i7 = sext i32 %i6 to i64 + %i8 = getelementptr i8, ptr addrspace(4) %i2, i64 %i7 + %i9 = load <4 x i32>, ptr addrspace(4) %i8, align 16, !invariant.load !10 + %i10 = load <8 x i32>, ptr addrspace(4) %i5, align 32, !invariant.load !10 + %i11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %i10, <4 x i32> %i9, i32 1, <2 x float> zeroinitializer) + %i12 = fadd <4 x float> %phi.img, + %ind = add i32 %phi.ind, 1 + %cond = icmp ne i32 %ind, 1000 + br i1 %cond, label %loop, label %exit + +exit: ; preds = %loop + call void (...) @lgc.create.image.store(<4 x float> %i12, i32 0, i32 8, <4 x i32> %i9, i32 1) + ret void +} + +declare spir_func void @spirv.NonUniform.i32(i32) local_unnamed_addr + +; Function Attrs: nounwind memory(none) +declare ptr addrspace(4) @lgc.create.get.desc.ptr.p4(...) local_unnamed_addr #1 + +; Function Attrs: nounwind memory(none) +declare i32 @lgc.create.get.desc.stride__i32(...) local_unnamed_addr #1 + +declare spir_func void @"spirv.NonUniform.s[s[p4,i32,i32,i32],s[p4,i32,i32]]"({ { ptr addrspace(4), i32, i32, i32 }, { ptr addrspace(4), i32, i32 } }) local_unnamed_addr + +declare spir_func void @"spirv.NonUniform.s[a3v8i32,s[v4i32,i32]]"({ [3 x <8 x i32>], { <4 x i32>, i32 } }) local_unnamed_addr + +; Function Attrs: nounwind willreturn memory(read) +declare <4 x float> @lgc.create.image.sample.v4f32(...) local_unnamed_addr #2 + +; Function Attrs: nounwind willreturn memory(read) +declare i32 @lgc.create.read.generic.input__i32(...) local_unnamed_addr #2 + +; Function Attrs: nounwind memory(write) +declare void @lgc.create.image.store(...) 
local_unnamed_addr #3 + +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" } +attributes #1 = { nounwind memory(none) } +attributes #2 = { nounwind willreturn memory(read) } +attributes #3 = { nounwind memory(write) } + +!lgc.client = !{!0} +!lgc.unlinked = !{!1} +!lgc.options = !{!2} +!lgc.options.FS = !{!3} +!lgc.user.data.nodes = !{!4, !5} +!lgc.color.export.formats = !{!6} +!amdgpu.pal.metadata.msgpack = !{!7} + +!0 = !{!"Vulkan"} +!1 = !{i32 1} +!2 = !{i32 -158725823, i32 1419665388, i32 -1015833383, i32 -491143713, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 256, i32 256, i32 2, i32 1} +!3 = !{i32 -1822594139, i32 1920663194, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!4 = !{!"DescriptorTableVaPtr", i32 7, i32 64, i32 0, i32 1, i32 1} +!5 = !{!"DescriptorCombinedTexture", i32 3, i32 64, i32 0, i32 192, i64 0, i32 7, i32 12} +!6 = !{i32 14, i32 7, i32 0, i32 0, i32 15} +!7 = !{!"\82\B0amdpal.pipelines\91\83\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\E8\D2\98>j\B9B\94\CF2\DEF\BF\9Fx\BC1\AD.llpc_version\A470.1\AEamdpal.version\92\03\00"} +!8 = !{i32 4} +!9 = !{i32 6} +!10 = !{} diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest15.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest15.lgc new file mode 100644 index 0000000000..c22cf5fc59 --- /dev/null +++ b/lgc/test/scalarizationOfDescriptorLoadsTest15.lgc @@ -0,0 +1,141 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 4 +; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o - %s 2>&1 | FileCheck --check-prefixes=CHECK %s + +; This test checks how the scalarization of descriptor loads works if the lgc.create.image.sample.v4f32 call and +; the lgc.create.image.store call are defined inside the loop. + +source_filename = "llpc_fragment_7" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8-p32:32:32" +target triple = "amdgcn--amdpal" + +; Function Attrs: nounwind +define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spirv.ExecutionModel !8 !lgc.shaderstage !9 { +; CHECK-LABEL: define dllexport spir_func void @lgc.shader.FS.main( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !8 !lgc.shaderstage [[META9:![0-9]+]] { +; CHECK-NEXT: .entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[I:%.*]] = call i32 (...) 
@lgc.input.import.interpolated__i32(i1 false, i32 3, i32 0, i32 0, i32 poison, i32 1, i32 poison) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP4]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr addrspace(4) +; CHECK-NEXT: [[I1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr addrspace(4) +; CHECK-NEXT: [[I2:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP11]], i32 32 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[PHI_IND:%.*]] = phi i32 [ 0, [[DOTENTRY:%.*]] ], [ [[IND:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[PHI_IMG:%.*]] = phi <4 x float> [ , [[DOTENTRY]] ], [ [[I11:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I3:%.*]] = mul i32 [[PHI_IND]], 48 +; CHECK-NEXT: [[I4:%.*]] = sext i32 [[I3]] to i64 +; CHECK-NEXT: [[I5:%.*]] = getelementptr i8, ptr addrspace(4) [[I1]], i64 [[I4]] +; CHECK-NEXT: [[I6:%.*]] = mul i32 [[PHI_IND]], 48 +; CHECK-NEXT: [[I7:%.*]] = sext i32 [[I6]] to i64 +; CHECK-NEXT: [[I8:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[I7]] +; CHECK-NEXT: [[I9:%.*]] = load <4 x i32>, ptr addrspace(4) [[I8]], align 16, !invariant.load [[META10:![0-9]+]] +; CHECK-NEXT: [[I10:%.*]] = load <8 x i32>, ptr addrspace(4) [[I5]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[I3]]) +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP12]], i32 [[I3]]) +; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr addrspace(4) [[I1]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP15]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP18]], align 16, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP20:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP16]], <4 x i32> [[TMP19]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[I11]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP12]], <4 x float> [[TMP20]]) +; CHECK-NEXT: [[I12:%.*]] = fadd <4 x float> [[PHI_IMG]], +; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[I6]]) +; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP21]], i32 [[I6]]) +; CHECK-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP24]], align 16, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP26:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.last.use.v4i32(i32 [[TMP21]], <4 x i32> [[TMP25]]) +; CHECK-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> [[I12]], <4 x i32> [[TMP26]], i32 1, i32 0, i32 0, i32 
0) +; CHECK-NEXT: [[IND]] = add i32 [[PHI_IND]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp ne i32 [[IND]], 1000 +; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +.entry: + %i = call i32 (...) @lgc.create.read.generic.input__i32(i32 3, i32 0, i32 0, i32 0, i32 17, i32 poison) + %i1 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 7) + %i2 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 2, i32 2, i64 0, i32 7) + br label %loop + +loop: ; preds = %loop, %.entry + %phi.ind = phi i32 [ 0, %.entry ], [ %ind, %loop ] + %phi.img = phi <4 x float> [ , %.entry ], [ %i11, %loop ] + %a = call i32 (...) @lgc.create.get.desc.stride__i32(i32 1, i32 1, i64 0, i32 7) + %b = call i32 (...) @lgc.create.get.desc.stride__i32(i32 2, i32 2, i64 0, i32 7) + %i3 = mul i32 %phi.ind, %a + %i4 = sext i32 %i3 to i64 + %i5 = getelementptr i8, ptr addrspace(4) %i1, i64 %i4 + %i6 = mul i32 %phi.ind, %b + %i7 = sext i32 %i6 to i64 + %i8 = getelementptr i8, ptr addrspace(4) %i2, i64 %i7 + %i9 = load <4 x i32>, ptr addrspace(4) %i8, align 16, !invariant.load !10 + %i10 = load <8 x i32>, ptr addrspace(4) %i5, align 32, !invariant.load !10 + %i11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %i10, <4 x i32> %i9, i32 1, <2 x float> zeroinitializer) + %i12 = fadd <4 x float> %phi.img, + call void (...) @lgc.create.image.store(<4 x float> %i12, i32 0, i32 8, <4 x i32> %i9, i32 1) + %ind = add i32 %phi.ind, 1 + %cond = icmp ne i32 %ind, 1000 + br i1 %cond, label %loop, label %exit + +exit: ; preds = %loop + ret void +} + +declare spir_func void @spirv.NonUniform.i32(i32) local_unnamed_addr + +; Function Attrs: nounwind memory(none) +declare ptr addrspace(4) @lgc.create.get.desc.ptr.p4(...) local_unnamed_addr #1 + +; Function Attrs: nounwind memory(none) +declare i32 @lgc.create.get.desc.stride__i32(...) local_unnamed_addr #1 + +declare spir_func void @"spirv.NonUniform.s[s[p4,i32,i32,i32],s[p4,i32,i32]]"({ { ptr addrspace(4), i32, i32, i32 }, { ptr addrspace(4), i32, i32 } }) local_unnamed_addr + +declare spir_func void @"spirv.NonUniform.s[a3v8i32,s[v4i32,i32]]"({ [3 x <8 x i32>], { <4 x i32>, i32 } }) local_unnamed_addr + +; Function Attrs: nounwind willreturn memory(read) +declare <4 x float> @lgc.create.image.sample.v4f32(...) local_unnamed_addr #2 + +; Function Attrs: nounwind willreturn memory(read) +declare i32 @lgc.create.read.generic.input__i32(...) local_unnamed_addr #2 + +; Function Attrs: nounwind memory(write) +declare void @lgc.create.image.store(...) 
local_unnamed_addr #3 + +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" } +attributes #1 = { nounwind memory(none) } +attributes #2 = { nounwind willreturn memory(read) } +attributes #3 = { nounwind memory(write) } + +!lgc.client = !{!0} +!lgc.unlinked = !{!1} +!lgc.options = !{!2} +!lgc.options.FS = !{!3} +!lgc.user.data.nodes = !{!4, !5} +!lgc.color.export.formats = !{!6} +!amdgpu.pal.metadata.msgpack = !{!7} + +!0 = !{!"Vulkan"} +!1 = !{i32 1} +!2 = !{i32 -158725823, i32 1419665388, i32 -1015833383, i32 -491143713, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 256, i32 256, i32 2, i32 1} +!3 = !{i32 -1822594139, i32 1920663194, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!4 = !{!"DescriptorTableVaPtr", i32 7, i32 64, i32 0, i32 1, i32 1} +!5 = !{!"DescriptorCombinedTexture", i32 3, i32 64, i32 0, i32 192, i64 0, i32 7, i32 12} +!6 = !{i32 14, i32 7, i32 0, i32 0, i32 15} +!7 = !{!"\82\B0amdpal.pipelines\91\83\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\E8\D2\98>j\B9B\94\CF2\DEF\BF\9Fx\BC1\AD.llpc_version\A470.1\AEamdpal.version\92\03\00"} +!8 = !{i32 4} +!9 = !{i32 6} +!10 = !{} diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest16.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest16.lgc new file mode 100644 index 0000000000..573c55b677 --- /dev/null +++ b/lgc/test/scalarizationOfDescriptorLoadsTest16.lgc @@ -0,0 +1,127 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 4 +; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o - %s 2>&1 | FileCheck --check-prefixes=CHECK %s + +; This test checks how the scalarization of descriptor loads works if the lgc.create.image.store call +; is defined inside the loop. + +source_filename = "llpc_fragment_7" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8-p32:32:32" +target triple = "amdgcn--amdpal" + +; Function Attrs: nounwind +define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spirv.ExecutionModel !8 !lgc.shaderstage !9 { +; CHECK-LABEL: define dllexport spir_func void @lgc.shader.FS.main( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !8 !lgc.shaderstage [[META9:![0-9]+]] { +; CHECK-NEXT: .entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[I:%.*]] = call i32 (...) 
@lgc.input.import.interpolated__i32(i1 false, i32 3, i32 0, i32 0, i32 poison, i32 1, i32 poison) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP4]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr addrspace(4) +; CHECK-NEXT: [[I1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr addrspace(4) +; CHECK-NEXT: [[I2:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP11]], i32 32 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[PHI_IND:%.*]] = phi i32 [ 0, [[DOTENTRY:%.*]] ], [ [[IND:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I3:%.*]] = mul i32 [[PHI_IND]], 48 +; CHECK-NEXT: [[I4:%.*]] = sext i32 [[I3]] to i64 +; CHECK-NEXT: [[I5:%.*]] = getelementptr i8, ptr addrspace(4) [[I1]], i64 [[I4]] +; CHECK-NEXT: [[I6:%.*]] = mul i32 [[PHI_IND]], 48 +; CHECK-NEXT: [[I7:%.*]] = sext i32 [[I6]] to i64 +; CHECK-NEXT: [[I8:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[I7]] +; CHECK-NEXT: [[I9:%.*]] = load <4 x i32>, ptr addrspace(4) [[I8]], align 16, !invariant.load [[META10:![0-9]+]] +; CHECK-NEXT: [[I10:%.*]] = load <4 x i32>, ptr addrspace(4) [[I5]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[I10]] to <4 x float> +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[I6]]) +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP13]], i32 [[I6]]) +; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP16]], align 16, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP18:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.last.use.v4i32(i32 [[TMP13]], <4 x i32> [[TMP17]]) +; CHECK-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> [[TMP12]], <4 x i32> [[TMP18]], i32 1, i32 0, i32 0, i32 0) +; CHECK-NEXT: [[IND]] = add i32 [[PHI_IND]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp ne i32 [[IND]], 1000 +; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +.entry: + %i = call i32 (...) @lgc.create.read.generic.input__i32(i32 3, i32 0, i32 0, i32 0, i32 17, i32 poison) + %i1 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 7) + %i2 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 2, i32 2, i64 0, i32 7) + br label %loop + +loop: ; preds = %loop, %.entry + %phi.ind = phi i32 [ 0, %.entry ], [ %ind, %loop ] + %a = call i32 (...) @lgc.create.get.desc.stride__i32(i32 1, i32 1, i64 0, i32 7) + %b = call i32 (...) @lgc.create.get.desc.stride__i32(i32 2, i32 2, i64 0, i32 7) + %i3 = mul i32 %phi.ind, %a + %i4 = sext i32 %i3 to i64 + %i5 = getelementptr i8, ptr addrspace(4) %i1, i64 %i4 + %i6 = mul i32 %phi.ind, %b + %i7 = sext i32 %i6 to i64 + %i8 = getelementptr i8, ptr addrspace(4) %i2, i64 %i7 + %i9 = load <4 x i32>, ptr addrspace(4) %i8, align 16, !invariant.load !10 + %i10 = load <4 x i32>, ptr addrspace(4) %i5, align 32, !invariant.load !10 + call void (...) 
@lgc.create.image.store(<4 x i32> %i10, i32 0, i32 8, <4 x i32> %i9, i32 1) + %ind = add i32 %phi.ind, 1 + %cond = icmp ne i32 %ind, 1000 + br i1 %cond, label %loop, label %exit + +exit: ; preds = %loop + ret void +} + +declare spir_func void @spirv.NonUniform.i32(i32) local_unnamed_addr + +; Function Attrs: nounwind memory(none) +declare ptr addrspace(4) @lgc.create.get.desc.ptr.p4(...) local_unnamed_addr #1 + +; Function Attrs: nounwind memory(none) +declare i32 @lgc.create.get.desc.stride__i32(...) local_unnamed_addr #1 + +declare spir_func void @"spirv.NonUniform.s[s[p4,i32,i32,i32],s[p4,i32,i32]]"({ { ptr addrspace(4), i32, i32, i32 }, { ptr addrspace(4), i32, i32 } }) local_unnamed_addr + +declare spir_func void @"spirv.NonUniform.s[a3v8i32,s[v4i32,i32]]"({ [3 x <8 x i32>], { <4 x i32>, i32 } }) local_unnamed_addr + +; Function Attrs: nounwind willreturn memory(read) +declare <4 x float> @lgc.create.image.sample.v4f32(...) local_unnamed_addr #2 + +; Function Attrs: nounwind willreturn memory(read) +declare i32 @lgc.create.read.generic.input__i32(...) local_unnamed_addr #2 + +; Function Attrs: nounwind memory(write) +declare void @lgc.create.image.store(...) local_unnamed_addr #3 + +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" } +attributes #1 = { nounwind memory(none) } +attributes #2 = { nounwind willreturn memory(read) } +attributes #3 = { nounwind memory(write) } + +!lgc.client = !{!0} +!lgc.unlinked = !{!1} +!lgc.options = !{!2} +!lgc.options.FS = !{!3} +!lgc.user.data.nodes = !{!4, !5} +!lgc.color.export.formats = !{!6} +!amdgpu.pal.metadata.msgpack = !{!7} + +!0 = !{!"Vulkan"} +!1 = !{i32 1} +!2 = !{i32 -158725823, i32 1419665388, i32 -1015833383, i32 -491143713, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 256, i32 256, i32 2, i32 1} +!3 = !{i32 -1822594139, i32 1920663194, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!4 = !{!"DescriptorTableVaPtr", i32 7, i32 64, i32 0, i32 1, i32 1} +!5 = !{!"DescriptorCombinedTexture", i32 3, i32 64, i32 0, i32 192, i64 0, i32 7, i32 12} +!6 = !{i32 14, i32 7, i32 0, i32 0, i32 15} +!7 = !{!"\82\B0amdpal.pipelines\91\83\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\E8\D2\98>j\B9B\94\CF2\DEF\BF\9Fx\BC1\AD.llpc_version\A470.1\AEamdpal.version\92\03\00"} +!8 = !{i32 4} +!9 = !{i32 6} +!10 = !{} diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest2.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest2.lgc new file mode 100644 index 0000000000..bca91884fb --- /dev/null +++ b/lgc/test/scalarizationOfDescriptorLoadsTest2.lgc @@ -0,0 +1,88 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 4 +; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o - %s 2>&1 | FileCheck --check-prefixes=CHECK %s +; ModuleID = 'lgcPipeline' +source_filename = "lgcPipeline" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8-p32:32:32" +target triple = "amdgcn--amdpal" + +; Function Attrs: nounwind +define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spirv.ExecutionModel !14 !lgc.shaderstage !15 { +; CHECK-LABEL: define dllexport spir_func void @lgc.shader.VS.main( +; 
CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !14 !lgc.shaderstage [[META15:![0-9]+]] { +; CHECK-NEXT: .entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @lgc.input.import.generic__i32(i1 false, i32 0, i32 0, i32 0, i32 poison) +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], poison +; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP3]], align 16, !invariant.load [[META16:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP5]], align 16, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <4 x float> +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP1]]) +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP8]], i32 [[TMP1]]) +; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP11]], align 16, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.last.use.v4i32(i32 [[TMP8]], <4 x i32> [[TMP12]]) +; CHECK-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> [[TMP7]], <4 x i32> [[TMP13]], i32 1, i32 0, i32 0, i32 0) +; CHECK-NEXT: ret void +; +.entry: + %0 = call i32 (...) @lgc.create.read.generic.input__i32(i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison) + %1 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 0) + %2 = call i32 (...) @lgc.create.get.desc.stride__i32(i32 1, i32 1, i64 0, i32 0) + %3 = mul i32 %0, %2 + %4 = sext i32 %3 to i64 + %5 = getelementptr i8, ptr addrspace(4) %1, i64 %4 + %6 = load <4 x i32>, ptr addrspace(4) %5, align 16, !invariant.load !16 + %7 = getelementptr i8, ptr addrspace(4) %1, i64 %4 + %8 = load <4 x i32>, ptr addrspace(4) %7, align 16, !invariant.load !16 + call void (...) @lgc.create.image.store(<4 x i32> %8, i32 0, i32 8, <4 x i32> %6, i32 1) + ret void +} + +; Function Attrs: nounwind willreturn memory(read) +declare i32 @lgc.create.read.generic.input__i32(...) local_unnamed_addr #1 + +; Function Attrs: nounwind memory(none) +declare ptr addrspace(4) @lgc.create.get.desc.ptr.p4(...) local_unnamed_addr #2 + +; Function Attrs: nounwind memory(none) +declare i32 @lgc.create.get.desc.stride__i32(...) local_unnamed_addr #2 + +; Function Attrs: nounwind memory(write) +declare void @lgc.create.image.store(...) 
local_unnamed_addr #3 + +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" } +attributes #1 = { nounwind willreturn memory(read) } +attributes #2 = { nounwind memory(none) } +attributes #3 = { nounwind memory(write) } +attributes #4 = { nounwind } + +!lgc.client = !{!0} +!lgc.options = !{!1} +!lgc.options.VS = !{!2} +!lgc.options.FS = !{!3} +!lgc.user.data.nodes = !{!4, !5, !6, !7} +!lgc.vertex.inputs = !{!8, !9, !10} +!lgc.color.export.formats = !{!11} +!lgc.rasterizer.state = !{!12} +!amdgpu.pal.metadata.msgpack = !{!13} + +!0 = !{!"Vulkan"} +!1 = !{i32 -1055878566, i32 -1332805290, i32 1045905781, i32 -589165353, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 256, i32 256, i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16777216} +!2 = !{i32 1639417258, i32 -1495429105, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!3 = !{i32 -1409621709, i32 -171549995, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!4 = !{!"DescriptorTableVaPtr", i32 7, i32 66, i32 0, i32 1, i32 1} +!5 = !{!"DescriptorTexelBuffer", i32 4, i32 66, i32 0, i32 16384, i64 0, i32 0, i32 4} +!6 = !{!"StreamOutTableVaPtr", i32 11, i32 4, i32 1, i32 1, i32 0} +!7 = !{!"IndirectUserDataVaPtr", i32 8, i32 2, i32 2, i32 1, i32 4} +!8 = !{i32 0, i32 0, i32 0, i32 40, i32 14, i32 7} +!9 = !{i32 1, i32 0, i32 16, i32 40, i32 11, i32 7} +!10 = !{i32 2, i32 0, i32 24, i32 40, i32 14, i32 5} +!11 = !{i32 14, i32 7, i32 0, i32 0, i32 15} +!12 = !{i32 0, i32 0, i32 0, i32 1} +!13 = !{!"\82\B0amdpal.pipelines\91\83\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\D6\9C\B4\09\0A8A\DA\CF3\09\AF\FF\11\A9U\06\AD.llpc_version\A470.1\AEamdpal.version\92\03\00"} +!14 = !{i32 0} +!15 = !{i32 1} +!16 = !{} diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest3.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest3.lgc new file mode 100644 index 0000000000..21d460b530 --- /dev/null +++ b/lgc/test/scalarizationOfDescriptorLoadsTest3.lgc @@ -0,0 +1,89 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 4 +; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o - %s 2>&1 | FileCheck --check-prefixes=CHECK %s +; ModuleID = 'lgcPipeline' +source_filename = "lgcPipeline" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8-p32:32:32" +target triple = "amdgcn--amdpal" + +; Function Attrs: nounwind +define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spirv.ExecutionModel !14 !lgc.shaderstage !15 { +; CHECK-LABEL: define dllexport spir_func void @lgc.shader.VS.main( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !14 !lgc.shaderstage [[META15:![0-9]+]] { +; CHECK-NEXT: .entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @lgc.input.import.generic__i32(i1 false, i32 0, i32 0, i32 0, i32 poison) +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], poison +; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 
[[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP3]], align 16, !invariant.load [[META16:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP0]], poison +; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP7]], align 16, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP5]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP9]], i32 [[TMP5]]) +; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP12]], align 16, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP14:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP13]], <4 x i32> , i1 false, i32 0, i32 0) +; CHECK-NEXT: [[TMP15:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP9]], <4 x float> [[TMP14]]) +; CHECK-NEXT: ret void +; +.entry: + %0 = call i32 (...) @lgc.create.read.generic.input__i32(i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison) + %1 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 0) + %2 = call i32 (...) @lgc.create.get.desc.stride__i32(i32 1, i32 1, i64 0, i32 0) + %3 = mul i32 %0, %2 + %4 = sext i32 %3 to i64 + %5 = getelementptr i8, ptr addrspace(4) %1, i64 %4 + %6 = load <4 x i32>, ptr addrspace(4) %5, align 16, !invariant.load !16 + %7 = mul i32 %0, %2 + %8 = sext i32 %7 to i64 + %9 = getelementptr i8, ptr addrspace(4) %1, i64 %8 + %10 = load <8 x i32>, ptr addrspace(4) %9, align 16, !invariant.load !16 + %11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %10, <4 x i32> , i32 1, <2 x float> zeroinitializer) + ret void +} + +; Function Attrs: nounwind willreturn memory(read) +declare i32 @lgc.create.read.generic.input__i32(...) local_unnamed_addr #1 + +; Function Attrs: nounwind memory(none) +declare ptr addrspace(4) @lgc.create.get.desc.ptr.p4(...) local_unnamed_addr #2 + +; Function Attrs: nounwind memory(none) +declare i32 @lgc.create.get.desc.stride__i32(...) local_unnamed_addr #2 + +; Function Attrs: nounwind willreturn memory(read) +declare <4 x float> @lgc.create.image.sample.v4f32(...) 
local_unnamed_addr #1 + +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" } +attributes #1 = { nounwind willreturn memory(read) } +attributes #2 = { nounwind memory(none) } + +!lgc.client = !{!0} +!lgc.options = !{!1} +!lgc.options.VS = !{!2} +!lgc.options.FS = !{!3} +!lgc.user.data.nodes = !{!4, !5, !6, !7} +!lgc.vertex.inputs = !{!8, !9, !10} +!lgc.color.export.formats = !{!11} +!lgc.rasterizer.state = !{!12} +!amdgpu.pal.metadata.msgpack = !{!13} + +!0 = !{!"Vulkan"} +!1 = !{i32 -1055878566, i32 -1332805290, i32 1045905781, i32 -589165353, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 256, i32 256, i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16777216} +!2 = !{i32 1639417258, i32 -1495429105, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!3 = !{i32 -1409621709, i32 -171549995, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!4 = !{!"DescriptorTableVaPtr", i32 7, i32 66, i32 0, i32 1, i32 1} +!5 = !{!"DescriptorTexelBuffer", i32 4, i32 66, i32 0, i32 16384, i64 0, i32 0, i32 4} +!6 = !{!"StreamOutTableVaPtr", i32 11, i32 4, i32 1, i32 1, i32 0} +!7 = !{!"IndirectUserDataVaPtr", i32 8, i32 2, i32 2, i32 1, i32 4} +!8 = !{i32 0, i32 0, i32 0, i32 40, i32 14, i32 7} +!9 = !{i32 1, i32 0, i32 16, i32 40, i32 11, i32 7} +!10 = !{i32 2, i32 0, i32 24, i32 40, i32 14, i32 5} +!11 = !{i32 14, i32 7, i32 0, i32 0, i32 15} +!12 = !{i32 0, i32 0, i32 0, i32 1} +!13 = !{!"\82\B0amdpal.pipelines\91\83\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\D6\9C\B4\09\0A8A\DA\CF3\09\AF\FF\11\A9U\06\AD.llpc_version\A470.1\AEamdpal.version\92\03\00"} +!14 = !{i32 0} +!15 = !{i32 1} +!16 = !{} diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest4.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest4.lgc new file mode 100644 index 0000000000..a6076a3787 --- /dev/null +++ b/lgc/test/scalarizationOfDescriptorLoadsTest4.lgc @@ -0,0 +1,95 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 4 +; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o - %s 2>&1 | FileCheck --check-prefixes=CHECK %s +; ModuleID = 'lgcPipeline' +source_filename = "lgcPipeline" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8-p32:32:32" +target triple = "amdgcn--amdpal" + +declare <4 x i32> @foo1(i32 %V) + +; Function Attrs: nounwind +define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spirv.ExecutionModel !14 !lgc.shaderstage !15 { +; CHECK-LABEL: define dllexport spir_func void @lgc.shader.VS.main( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !14 !lgc.shaderstage [[META15:![0-9]+]] { +; CHECK-NEXT: .entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @lgc.input.import.generic__i32(i1 false, i32 0, i32 0, i32 0, i32 poison) +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], poison +; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = 
load <4 x i32>, ptr addrspace(4) [[TMP3]], align 16, !invariant.load [[META16:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP0]], poison +; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP7]], align 16, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP9:%.*]] = call <4 x i32> @foo1(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP5]]) +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.v4i32(i32 [[TMP10]], <4 x i32> [[TMP9]]) +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP11]], i32 [[TMP5]]) +; CHECK-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP14]], align 16, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP16:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 [[TMP11]], <4 x i32> [[TMP9]]) +; CHECK-NEXT: [[TMP17:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP15]], <4 x i32> [[TMP16]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[TMP18:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP11]], <4 x float> [[TMP17]]) +; CHECK-NEXT: ret void +; +.entry: + %0 = call i32 (...) @lgc.create.read.generic.input__i32(i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison) + %1 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 0) + %2 = call i32 (...) @lgc.create.get.desc.stride__i32(i32 1, i32 1, i64 0, i32 0) + %3 = mul i32 %0, %2 + %4 = sext i32 %3 to i64 + %5 = getelementptr i8, ptr addrspace(4) %1, i64 %4 + %6 = load <4 x i32>, ptr addrspace(4) %5, align 16, !invariant.load !16 + %7 = mul i32 %0, %2 + %8 = sext i32 %7 to i64 + %9 = getelementptr i8, ptr addrspace(4) %1, i64 %8 + %10 = load <8 x i32>, ptr addrspace(4) %9, align 16, !invariant.load !16 + %11 = call <4 x i32> @foo1(i32 %0) + %12 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %10, <4 x i32> %11, i32 1, <2 x float> zeroinitializer) + ret void +} + +; Function Attrs: nounwind willreturn memory(read) +declare i32 @lgc.create.read.generic.input__i32(...) local_unnamed_addr #1 + +; Function Attrs: nounwind memory(none) +declare ptr addrspace(4) @lgc.create.get.desc.ptr.p4(...) local_unnamed_addr #2 + +; Function Attrs: nounwind memory(none) +declare i32 @lgc.create.get.desc.stride__i32(...) local_unnamed_addr #2 + +; Function Attrs: nounwind willreturn memory(read) +declare <4 x float> @lgc.create.image.sample.v4f32(...) 
local_unnamed_addr #1 + +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" } +attributes #1 = { nounwind willreturn memory(read) } +attributes #2 = { nounwind memory(none) } + +!lgc.client = !{!0} +!lgc.options = !{!1} +!lgc.options.VS = !{!2} +!lgc.options.FS = !{!3} +!lgc.user.data.nodes = !{!4, !5, !6, !7} +!lgc.vertex.inputs = !{!8, !9, !10} +!lgc.color.export.formats = !{!11} +!lgc.rasterizer.state = !{!12} +!amdgpu.pal.metadata.msgpack = !{!13} + +!0 = !{!"Vulkan"} +!1 = !{i32 -1055878566, i32 -1332805290, i32 1045905781, i32 -589165353, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 256, i32 256, i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16777216} +!2 = !{i32 1639417258, i32 -1495429105, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!3 = !{i32 -1409621709, i32 -171549995, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!4 = !{!"DescriptorTableVaPtr", i32 7, i32 66, i32 0, i32 1, i32 1} +!5 = !{!"DescriptorTexelBuffer", i32 4, i32 66, i32 0, i32 16384, i64 0, i32 0, i32 4} +!6 = !{!"StreamOutTableVaPtr", i32 11, i32 4, i32 1, i32 1, i32 0} +!7 = !{!"IndirectUserDataVaPtr", i32 8, i32 2, i32 2, i32 1, i32 4} +!8 = !{i32 0, i32 0, i32 0, i32 40, i32 14, i32 7} +!9 = !{i32 1, i32 0, i32 16, i32 40, i32 11, i32 7} +!10 = !{i32 2, i32 0, i32 24, i32 40, i32 14, i32 5} +!11 = !{i32 14, i32 7, i32 0, i32 0, i32 15} +!12 = !{i32 0, i32 0, i32 0, i32 1} +!13 = !{!"\82\B0amdpal.pipelines\91\83\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\D6\9C\B4\09\0A8A\DA\CF3\09\AF\FF\11\A9U\06\AD.llpc_version\A470.1\AEamdpal.version\92\03\00"} +!14 = !{i32 0} +!15 = !{i32 1} +!16 = !{} diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest5.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest5.lgc new file mode 100644 index 0000000000..6cb1fdfdcc --- /dev/null +++ b/lgc/test/scalarizationOfDescriptorLoadsTest5.lgc @@ -0,0 +1,113 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 4 +; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o - %s 2>&1 | FileCheck --check-prefixes=CHECK %s +; ModuleID = 'lgcPipeline' +source_filename = "lgcPipeline" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8-p32:32:32" +target triple = "amdgcn--amdpal" + +; Function Attrs: nounwind +define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spirv.ExecutionModel !14 !lgc.shaderstage !15 { +; CHECK-LABEL: define dllexport spir_func void @lgc.shader.VS.main( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !14 !lgc.shaderstage [[META15:![0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @lgc.input.import.generic__i32(i1 false, i32 0, i32 0, i32 0, i32 poison) +; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[DOTNOT]], label [[RET:%.*]], label [[BB:%.*]] +; CHECK: bb: +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], poison +; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64 +; 
CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP3]], align 16, !invariant.load [[META16:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP0]], poison +; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP7]], align 16, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP5]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP9]], i32 [[TMP5]]) +; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP12]], align 16, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP14:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[TMP13]], i32 0, i32 0, i32 0, i32 0) +; CHECK-NEXT: [[TMP15:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP9]], <4 x float> [[TMP14]]) +; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP1]]) +; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP16]], i32 [[TMP1]]) +; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP19]], align 16, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP21:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.last.use.v4i32(i32 [[TMP16]], <4 x i32> [[TMP20]]) +; CHECK-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> [[TMP15]], <4 x i32> [[TMP21]], i32 1, i32 0, i32 0, i32 0) +; CHECK-NEXT: br label [[RET]] +; CHECK: ret: +; CHECK-NEXT: ret void +; +entry: + %0 = call i32 (...) @lgc.create.read.generic.input__i32(i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison) + %.not = icmp eq i32 %0, 0 + br i1 %.not, label %ret, label %bb + +bb: ; preds = %entry + %1 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 0) + %2 = call i32 (...) @lgc.create.get.desc.stride__i32(i32 1, i32 1, i64 0, i32 0) + %3 = mul i32 %0, %2 + %4 = sext i32 %3 to i64 + %5 = getelementptr i8, ptr addrspace(4) %1, i64 %4 + %6 = load <4 x i32>, ptr addrspace(4) %5, align 16, !invariant.load !16 + %7 = mul i32 %0, %2 + %8 = sext i32 %7 to i64 + %9 = getelementptr i8, ptr addrspace(4) %1, i64 %8 + %10 = load <4 x i32>, ptr addrspace(4) %9, align 16, !invariant.load !16 + %11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 0, i32 8, <4 x i32> %10, i32 0) + call void (...) @lgc.create.image.store(<4 x float> %11, i32 0, i32 8, <4 x i32> %6, i32 1) + br label %ret + +ret: ; preds = %bb, %entry + ret void +} + +; Function Attrs: nounwind willreturn memory(read) +declare i32 @lgc.create.read.generic.input__i32(...) local_unnamed_addr #1 + +; Function Attrs: nounwind memory(none) +declare ptr addrspace(4) @lgc.create.get.desc.ptr.p4(...) local_unnamed_addr #2 + +; Function Attrs: nounwind memory(none) +declare i32 @lgc.create.get.desc.stride__i32(...) 
local_unnamed_addr #2 + +; Function Attrs: nounwind willreturn memory(read) +declare <4 x float> @lgc.create.image.load.v4f32(...) local_unnamed_addr #1 + +; Function Attrs: nounwind memory(write) +declare void @lgc.create.image.store(...) local_unnamed_addr #3 + +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" } +attributes #1 = { nounwind willreturn memory(read) } +attributes #2 = { nounwind memory(none) } +attributes #3 = { nounwind memory(write) } + +!lgc.client = !{!0} +!lgc.options = !{!1} +!lgc.options.VS = !{!2} +!lgc.options.FS = !{!3} +!lgc.user.data.nodes = !{!4, !5, !6, !7} +!lgc.vertex.inputs = !{!8, !9, !10} +!lgc.color.export.formats = !{!11} +!lgc.rasterizer.state = !{!12} +!amdgpu.pal.metadata.msgpack = !{!13} + +!0 = !{!"Vulkan"} +!1 = !{i32 -1055878566, i32 -1332805290, i32 1045905781, i32 -589165353, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 256, i32 256, i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16777216} +!2 = !{i32 1639417258, i32 -1495429105, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!3 = !{i32 -1409621709, i32 -171549995, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!4 = !{!"DescriptorTableVaPtr", i32 7, i32 66, i32 0, i32 1, i32 1} +!5 = !{!"DescriptorTexelBuffer", i32 4, i32 66, i32 0, i32 16384, i64 0, i32 0, i32 4} +!6 = !{!"StreamOutTableVaPtr", i32 11, i32 4, i32 1, i32 1, i32 0} +!7 = !{!"IndirectUserDataVaPtr", i32 8, i32 2, i32 2, i32 1, i32 4} +!8 = !{i32 0, i32 0, i32 0, i32 40, i32 14, i32 7} +!9 = !{i32 1, i32 0, i32 16, i32 40, i32 11, i32 7} +!10 = !{i32 2, i32 0, i32 24, i32 40, i32 14, i32 5} +!11 = !{i32 14, i32 7, i32 0, i32 0, i32 15} +!12 = !{i32 0, i32 0, i32 0, i32 1} +!13 = !{!"\82\B0amdpal.pipelines\91\83\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\D6\9C\B4\09\0A8A\DA\CF3\09\AF\FF\11\A9U\06\AD.llpc_version\A470.1\AEamdpal.version\92\03\00"} +!14 = !{i32 0} +!15 = !{i32 1} +!16 = !{} diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest6.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest6.lgc new file mode 100644 index 0000000000..cf57f85ba7 --- /dev/null +++ b/lgc/test/scalarizationOfDescriptorLoadsTest6.lgc @@ -0,0 +1,193 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 4 +; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o - %s 2>&1 | FileCheck --check-prefixes=CHECK %s +; ModuleID = 'lgcPipeline' +source_filename = "lgcPipeline" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8-p32:32:32" +target triple = "amdgcn--amdpal" + +; Function Attrs: nounwind +define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spirv.ExecutionModel !22 !lgc.shaderstage !23 { +; CHECK-LABEL: define dllexport spir_func void @lgc.shader.FS.main( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !22 !lgc.shaderstage [[META23:![0-9]+]] { +; CHECK-NEXT: .entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP1:%.*]] 
= bitcast i64 [[TMP0]] to <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64 [[TMP4]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @lgc.user.data(i32 28) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 (...) @lgc.input.import.interpolated__i32(i1 false, i32 3, i32 0, i32 0, i32 poison, i32 1, i32 poison) +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(4) [[TMP6]], align 4 +; CHECK-NEXT: br label [[TMP9:%.*]] +; CHECK: 9: +; CHECK-NEXT: [[DOT010:%.*]] = phi <4 x float> [ zeroinitializer, [[DOTENTRY:%.*]] ], [ [[TMP64:%.*]], [[TMP13:%.*]] ] +; CHECK-NEXT: [[DOT09:%.*]] = phi <4 x float> [ zeroinitializer, [[DOTENTRY]] ], [ [[TMP62:%.*]], [[TMP13]] ] +; CHECK-NEXT: [[DOT0:%.*]] = phi i32 [ [[TMP8]], [[DOTENTRY]] ], [ [[TMP65:%.*]], [[TMP13]] ] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds <{ i32, i32 }>, ptr addrspace(4) [[TMP6]], i64 0, i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(4) [[TMP10]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = icmp slt i32 [[DOT0]], [[TMP11]] +; CHECK-NEXT: [[COND_FREEZE:%.*]] = freeze i1 [[TMP12]] +; CHECK-NEXT: br i1 [[COND_FREEZE]], label [[TMP13]], label [[TMP66:%.*]] +; CHECK: 13: +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @lgc.load.user.data__i32(i32 40) +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP14]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP15]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr addrspace(4) +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP17]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP18]], align 16, !invariant.load [[META24:![0-9]+]] +; CHECK-NEXT: [[TMP20:%.*]] = call <4 x i32> @llvm.amdgcn.struct.buffer.load.format.v4i32(<4 x i32> [[TMP19]], i32 [[DOT0]], i32 0, i32 0, i32 0), !invariant.load [[META24]] +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP20]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = call i32 @lgc.load.user.data__i32(i32 36) +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP22]], i64 0 +; CHECK-NEXT: [[TMP24:%.*]] = bitcast <2 x i32> [[TMP23]] to i64 +; CHECK-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr addrspace(4) +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP25]], i32 16 +; CHECK-NEXT: [[TMP27:%.*]] = mul i32 [[TMP21]], 32 +; CHECK-NEXT: [[TMP28:%.*]] = sext i32 [[TMP27]] to i64 +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP26]], i64 [[TMP28]] +; CHECK-NEXT: [[TMP30:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP29]], align 32, !invariant.load [[META24]] +; CHECK-NEXT: [[TMP31:%.*]] = call i32 @lgc.load.user.data__i32(i32 36) +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP31]], i64 0 +; CHECK-NEXT: [[TMP33:%.*]] = bitcast <2 x i32> [[TMP32]] to i64 +; CHECK-NEXT: [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr addrspace(4) +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP34]], i32 0 +; CHECK-NEXT: [[TMP36:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP35]], align 16, !invariant.load [[META24]] +; CHECK-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP27]]) +; CHECK-NEXT: [[TMP38:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP37]], i32 [[TMP27]]) +; CHECK-NEXT: [[TMP39:%.*]] = sext i32 [[TMP38]] 
to i64 +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP26]], i64 [[TMP39]] +; CHECK-NEXT: [[TMP41:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP40]], align 32, !invariant.load [[META24]] +; CHECK-NEXT: [[TMP42:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP41]], <4 x i32> [[TMP36]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[TMP43:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP37]], <4 x float> [[TMP42]]) +; CHECK-NEXT: [[TMP44:%.*]] = mul i32 [[TMP7]], 32 +; CHECK-NEXT: [[TMP45:%.*]] = sext i32 [[TMP44]] to i64 +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP26]], i64 [[TMP45]] +; CHECK-NEXT: [[TMP47:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP46]], align 32, !invariant.load [[META24]] +; CHECK-NEXT: [[TMP48:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP44]]) +; CHECK-NEXT: [[TMP49:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP48]], i32 [[TMP44]]) +; CHECK-NEXT: [[TMP50:%.*]] = sext i32 [[TMP49]] to i64 +; CHECK-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP26]], i64 [[TMP50]] +; CHECK-NEXT: [[TMP52:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP51]], align 32, !invariant.load [[META24]] +; CHECK-NEXT: [[TMP53:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP52]], <4 x i32> [[TMP36]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[TMP54:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP48]], <4 x float> [[TMP53]]) +; CHECK-NEXT: [[TMP55:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP27]]) +; CHECK-NEXT: [[TMP56:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP55]], i32 [[TMP27]]) +; CHECK-NEXT: [[TMP57:%.*]] = sext i32 [[TMP56]] to i64 +; CHECK-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP26]], i64 [[TMP57]] +; CHECK-NEXT: [[TMP59:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP58]], align 32, !invariant.load [[META24]] +; CHECK-NEXT: [[TMP60:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP59]], <4 x i32> [[TMP36]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[TMP61:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP55]], <4 x float> [[TMP60]]) +; CHECK-NEXT: [[TMP62]] = fadd reassoc nnan nsz arcp contract afn <4 x float> [[DOT09]], [[TMP61]] +; CHECK-NEXT: [[TMP63:%.*]] = fadd reassoc nnan nsz arcp contract afn <4 x float> [[TMP43]], [[TMP54]] +; CHECK-NEXT: [[TMP64]] = fadd reassoc nnan nsz arcp contract afn <4 x float> [[DOT010]], [[TMP63]] +; CHECK-NEXT: [[TMP65]] = add i32 [[DOT0]], 1 +; CHECK-NEXT: br label [[TMP9]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK: 66: +; CHECK-NEXT: ret void +; +.entry: + %0 = call ptr addrspace(4) (...) @lgc.create.load.push.constants.ptr.p4() + %1 = call i32 (...) 
@lgc.create.read.generic.input__i32(i32 3, i32 0, i32 0, i32 0, i32 17, i32 poison) + %2 = load i32, ptr addrspace(4) %0, align 4 + br label %3 + +3: ; preds = %7, %.entry + %.010 = phi <4 x float> [ zeroinitializer, %.entry ], [ %30, %7 ] + %.09 = phi <4 x float> [ zeroinitializer, %.entry ], [ %28, %7 ] + %.0 = phi i32 [ %2, %.entry ], [ %31, %7 ] + %4 = getelementptr inbounds <{ i32, i32 }>, ptr addrspace(4) %0, i64 0, i32 1 + %5 = load i32, ptr addrspace(4) %4, align 4 + %6 = icmp slt i32 %.0, %5 + %cond.freeze = freeze i1 %6 + br i1 %cond.freeze, label %7, label %32 + +7: ; preds = %3 + %8 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 4, i32 4, i64 1, i32 12) + %9 = call i32 (...) @lgc.create.get.desc.stride__i32(i32 4, i32 4, i64 1, i32 12) + %10 = load <4 x i32>, ptr addrspace(4) %8, align 16, !invariant.load !24 + %11 = call <4 x i32> (...) @lgc.create.image.load.v4i32(i32 0, i32 1536, <4 x i32> %10, i32 %.0) + %12 = extractelement <4 x i32> %11, i64 0 + %13 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 6) + %14 = call i32 (...) @lgc.create.get.desc.stride__i32(i32 1, i32 1, i64 0, i32 6) + %15 = mul i32 %12, %14 + %16 = sext i32 %15 to i64 + %17 = getelementptr i8, ptr addrspace(4) %13, i64 %16 + %18 = load <8 x i32>, ptr addrspace(4) %17, align 32, !invariant.load !24 + %19 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 2, i32 2, i64 0, i32 5) + %20 = load <4 x i32>, ptr addrspace(4) %19, align 16, !invariant.load !24 + %21 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %18, <4 x i32> %20, i32 1, <2 x float> zeroinitializer) + %22 = mul i32 %1, %14 + %23 = sext i32 %22 to i64 + %24 = getelementptr i8, ptr addrspace(4) %13, i64 %23 + %25 = load <8 x i32>, ptr addrspace(4) %24, align 32, !invariant.load !24 + %26 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %25, <4 x i32> %20, i32 1, <2 x float> zeroinitializer) + %27 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %18, <4 x i32> %20, i32 1, <2 x float> zeroinitializer) + %28 = fadd reassoc nnan nsz arcp contract afn <4 x float> %.09, %27 + %29 = fadd reassoc nnan nsz arcp contract afn <4 x float> %21, %26 + %30 = fadd reassoc nnan nsz arcp contract afn <4 x float> %.010, %29 + %31 = add i32 %.0, 1 + br label %3, !llvm.loop !25 + +32: ; preds = %3 + ret void +} + +; Function Attrs: nounwind willreturn memory(read) +declare ptr addrspace(4) @lgc.create.load.push.constants.ptr.p4(...) local_unnamed_addr #1 + +; Function Attrs: nounwind willreturn memory(read) +declare i32 @lgc.create.read.generic.input__i32(...) local_unnamed_addr #1 + +; Function Attrs: nounwind memory(none) +declare ptr addrspace(4) @lgc.create.get.desc.ptr.p4(...) local_unnamed_addr #2 + +; Function Attrs: nounwind memory(none) +declare i32 @lgc.create.get.desc.stride__i32(...) local_unnamed_addr #2 + +; Function Attrs: nounwind willreturn memory(read) +declare <4 x i32> @lgc.create.image.load.v4i32(...) local_unnamed_addr #1 + +; Function Attrs: nounwind willreturn memory(read) +declare <4 x float> @lgc.create.image.sample.v4f32(...) 
local_unnamed_addr #1 + +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" } +attributes #1 = { nounwind willreturn memory(read) } +attributes #2 = { nounwind memory(none) } + +!lgc.client = !{!0} +!lgc.options = !{!1} +!lgc.options.VS = !{!2} +!lgc.options.FS = !{!3} +!lgc.user.data.nodes = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15} +!lgc.vertex.inputs = !{!16, !17, !18} +!lgc.color.export.formats = !{!19} +!lgc.rasterizer.state = !{!20} +!amdgpu.pal.metadata.msgpack = !{!21} + +!0 = !{!"Vulkan"} +!1 = !{i32 1397006593, i32 1762399868, i32 679484448, i32 1745956893, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 272, i32 0, i32 0, i32 1, i32 256, i32 256, i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16777216} +!2 = !{i32 1156202838, i32 -1602642692, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!3 = !{i32 -1603553139, i32 446675175, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!4 = !{!"IndirectUserDataVaPtr", i32 8, i32 2, i32 0, i32 1, i32 4} +!5 = !{!"DescriptorTableVaPtr", i32 7, i32 66, i32 11, i32 1, i32 3} +!6 = !{!"DescriptorBufferCompact", i32 10, i32 66, i32 0, i32 2, i64 93, i32 17, i32 2} +!7 = !{!"DescriptorBuffer", i32 6, i32 66, i32 2, i32 4, i64 93, i32 0, i32 4} +!8 = !{!"DescriptorBuffer", i32 6, i32 66, i32 6, i32 4, i64 93, i32 1, i32 4} +!9 = !{!"StreamOutTableVaPtr", i32 11, i32 4, i32 3, i32 1, i32 0} +!10 = !{!"PushConst", i32 9, i32 66, i32 7, i32 2, i64 4294967295, i32 0, i32 4} +!11 = !{!"DescriptorTableVaPtr", i32 7, i32 66, i32 9, i32 1, i32 2} +!12 = !{!"DescriptorSampler", i32 2, i32 66, i32 0, i32 4, i64 0, i32 5, i32 4} +!13 = !{!"DescriptorResource", i32 1, i32 66, i32 4, i32 32768, i64 0, i32 6, i32 8} +!14 = !{!"DescriptorTableVaPtr", i32 7, i32 66, i32 10, i32 1, i32 1} +!15 = !{!"DescriptorTexelBuffer", i32 4, i32 66, i32 0, i32 4, i64 1, i32 12, i32 4} +!16 = !{i32 0, i32 0, i32 0, i32 40, i32 14, i32 7} +!17 = !{i32 1, i32 0, i32 16, i32 40, i32 11, i32 7} +!18 = !{i32 2, i32 0, i32 24, i32 40, i32 14, i32 5} +!19 = !{i32 14, i32 7, i32 0, i32 0, i32 15} +!20 = !{i32 0, i32 0, i32 0, i32 1} +!21 = !{!"\82\B0amdpal.pipelines\91\83\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\9BN7\81A[\8A\DB\CF\9Daz\E2A\8F\88\AD\AD.llpc_version\A470.1\AEamdpal.version\92\03\00"} +!22 = !{i32 4} +!23 = !{i32 6} +!24 = !{} +!25 = distinct !{!25} diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest7.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest7.lgc new file mode 100644 index 0000000000..445a355b8d --- /dev/null +++ b/lgc/test/scalarizationOfDescriptorLoadsTest7.lgc @@ -0,0 +1,129 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 4 +; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o - %s 2>&1 | FileCheck --check-prefixes=CHECK %s + +; This test checks how the scalarization of descriptor loads works if the values are defined in different basic blocks. 
+ +source_filename = "llpc_fragment_7" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8-p32:32:32" +target triple = "amdgcn--amdpal" + +; Function Attrs: nounwind +define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spirv.ExecutionModel !8 !lgc.shaderstage !9 { +; CHECK-LABEL: define dllexport spir_func void @lgc.shader.FS.main( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !8 !lgc.shaderstage [[META9:![0-9]+]] { +; CHECK-NEXT: .entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[I:%.*]] = call i32 (...) @lgc.input.import.interpolated__i32(i1 false, i32 3, i32 0, i32 0, i32 poison, i32 1, i32 poison) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP4]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr addrspace(4) +; CHECK-NEXT: [[I1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr addrspace(4) +; CHECK-NEXT: [[I3:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP11]], i32 32 +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[I5:%.*]] = mul i32 [[I]], 48 +; CHECK-NEXT: [[I6:%.*]] = sext i32 [[I5]] to i64 +; CHECK-NEXT: [[I7:%.*]] = getelementptr i8, ptr addrspace(4) [[I1]], i64 [[I6]] +; CHECK-NEXT: br label [[BB2:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[I8:%.*]] = mul i32 [[I]], 48 +; CHECK-NEXT: [[I9:%.*]] = sext i32 [[I8]] to i64 +; CHECK-NEXT: [[I10:%.*]] = getelementptr i8, ptr addrspace(4) [[I3]], i64 [[I9]] +; CHECK-NEXT: [[I11:%.*]] = load <4 x i32>, ptr addrspace(4) [[I10]], align 16, !invariant.load [[META10:![0-9]+]] +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb3: +; CHECK-NEXT: [[I12:%.*]] = load <8 x i32>, ptr addrspace(4) [[I7]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[I5]]) +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP12]], i32 [[I5]]) +; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr addrspace(4) [[I1]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP15]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[I3]], i64 [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP18]], align 16, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP20:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP16]], <4 x i32> [[TMP19]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[I13:%.*]] = call reassoc nnan nsz 
arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP12]], <4 x float> [[TMP20]]) +; CHECK-NEXT: call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[I13]]) #[[ATTR5:[0-9]+]] +; CHECK-NEXT: ret void +; +.entry: + %i = call i32 (...) @lgc.create.read.generic.input__i32(i32 3, i32 0, i32 0, i32 0, i32 17, i32 poison) + %i1 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 7) + %i2 = call i32 (...) @lgc.create.get.desc.stride__i32(i32 1, i32 1, i64 0, i32 7) + %i3 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 2, i32 2, i64 0, i32 7) + %i4 = call i32 (...) @lgc.create.get.desc.stride__i32(i32 2, i32 2, i64 0, i32 7) + br label %bb1 + +bb1: ; preds = %.entry + %i5 = mul i32 %i, %i2 + %i6 = sext i32 %i5 to i64 + %i7 = getelementptr i8, ptr addrspace(4) %i1, i64 %i6 + br label %bb2 + +bb2: ; preds = %bb1 + %i8 = mul i32 %i, %i4 + %i9 = sext i32 %i8 to i64 + %i10 = getelementptr i8, ptr addrspace(4) %i3, i64 %i9 + %i11 = load <4 x i32>, ptr addrspace(4) %i10, align 16, !invariant.load !10 + br label %bb3 + +bb3: ; preds = %bb2 + %i12 = load <8 x i32>, ptr addrspace(4) %i7, align 32, !invariant.load !10 + %i13 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %i12, <4 x i32> %i11, i32 1, <2 x float> zeroinitializer) + call void (...) @lgc.create.write.generic.output(<4 x float> %i13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison) + ret void +} + +declare spir_func void @spirv.NonUniform.i32(i32) local_unnamed_addr + +; Function Attrs: nounwind memory(none) +declare ptr addrspace(4) @lgc.create.get.desc.ptr.p4(...) local_unnamed_addr #1 + +; Function Attrs: nounwind memory(none) +declare i32 @lgc.create.get.desc.stride__i32(...) local_unnamed_addr #1 + +declare spir_func void @"spirv.NonUniform.s[s[p4,i32,i32,i32],s[p4,i32,i32]]"({ { ptr addrspace(4), i32, i32, i32 }, { ptr addrspace(4), i32, i32 } }) local_unnamed_addr + +declare spir_func void @"spirv.NonUniform.s[a3v8i32,s[v4i32,i32]]"({ [3 x <8 x i32>], { <4 x i32>, i32 } }) local_unnamed_addr + +; Function Attrs: nounwind willreturn memory(read) +declare <4 x float> @lgc.create.image.sample.v4f32(...) local_unnamed_addr #2 + +; Function Attrs: nounwind willreturn memory(read) +declare i32 @lgc.create.read.generic.input__i32(...) local_unnamed_addr #2 + +; Function Attrs: nounwind +declare void @lgc.create.write.generic.output(...) 
local_unnamed_addr #3 + +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" } +attributes #1 = { nounwind memory(none) } +attributes #2 = { nounwind willreturn memory(read) } +attributes #3 = { nounwind } + +!lgc.client = !{!0} +!lgc.unlinked = !{!1} +!lgc.options = !{!2} +!lgc.options.FS = !{!3} +!lgc.user.data.nodes = !{!4, !5} +!lgc.color.export.formats = !{!6} +!amdgpu.pal.metadata.msgpack = !{!7} + +!0 = !{!"Vulkan"} +!1 = !{i32 1} +!2 = !{i32 -158725823, i32 1419665388, i32 -1015833383, i32 -491143713, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 256, i32 256, i32 2, i32 1} +!3 = !{i32 -1822594139, i32 1920663194, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!4 = !{!"DescriptorTableVaPtr", i32 7, i32 64, i32 0, i32 1, i32 1} +!5 = !{!"DescriptorCombinedTexture", i32 3, i32 64, i32 0, i32 192, i64 0, i32 7, i32 12} +!6 = !{i32 14, i32 7, i32 0, i32 0, i32 15} +!7 = !{!"\82\B0amdpal.pipelines\91\83\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\E8\D2\98>j\B9B\94\CF2\DEF\BF\9Fx\BC1\AD.llpc_version\A470.1\AEamdpal.version\92\03\00"} +!8 = !{i32 4} +!9 = !{i32 6} +!10 = !{} diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest8.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest8.lgc new file mode 100644 index 0000000000..03b16464d2 --- /dev/null +++ b/lgc/test/scalarizationOfDescriptorLoadsTest8.lgc @@ -0,0 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 4 +; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o - %s 2>&1 | FileCheck --check-prefixes=CHECK %s + +; This test checks how the scalarization of descriptor loads works if the values are defined in different basic blocks. + +source_filename = "llpc_fragment_7" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8-p32:32:32" +target triple = "amdgcn--amdpal" + +; Function Attrs: nounwind +define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spirv.ExecutionModel !8 !lgc.shaderstage !9 { +; CHECK-LABEL: define dllexport spir_func void @lgc.shader.FS.main( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !8 !lgc.shaderstage [[META9:![0-9]+]] { +; CHECK-NEXT: .entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[I:%.*]] = call i32 (...) 
@lgc.input.import.interpolated__i32(i1 false, i32 3, i32 0, i32 0, i32 poison, i32 1, i32 poison) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP4]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr addrspace(4) +; CHECK-NEXT: [[I1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr addrspace(4) +; CHECK-NEXT: [[I2:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP11]], i32 32 +; CHECK-NEXT: [[COND:%.*]] = icmp ne i32 [[I]], 0 +; CHECK-NEXT: br i1 [[COND]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 48, [[BB1]] ], [ 48, [[BB2]] ] +; CHECK-NEXT: [[I3:%.*]] = mul i32 [[I]], [[PHI]] +; CHECK-NEXT: [[I4:%.*]] = sext i32 [[I3]] to i64 +; CHECK-NEXT: [[I5:%.*]] = getelementptr i8, ptr addrspace(4) [[I1]], i64 [[I4]] +; CHECK-NEXT: [[I6:%.*]] = mul i32 [[I]], [[PHI]] +; CHECK-NEXT: [[I7:%.*]] = sext i32 [[I6]] to i64 +; CHECK-NEXT: [[I8:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[I7]] +; CHECK-NEXT: [[I9:%.*]] = load <4 x i32>, ptr addrspace(4) [[I8]], align 16, !invariant.load [[META10:![0-9]+]] +; CHECK-NEXT: [[I10:%.*]] = load <8 x i32>, ptr addrspace(4) [[I5]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[I3]]) +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP12]], i32 [[I3]]) +; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr addrspace(4) [[I1]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP15]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP18]], align 16, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP20:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP16]], <4 x i32> [[TMP19]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[I11:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP12]], <4 x float> [[TMP20]]) +; CHECK-NEXT: call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[I11]]) #[[ATTR5:[0-9]+]] +; CHECK-NEXT: ret void +; +.entry: + %i = call i32 (...) @lgc.create.read.generic.input__i32(i32 3, i32 0, i32 0, i32 0, i32 17, i32 poison) + %i1 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 7) + %i2 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 2, i32 2, i64 0, i32 7) + %cond = icmp ne i32 %i, 0 + br i1 %cond, label %bb1, label %bb2 + +bb1: ; preds = %.entry + %a = call i32 (...) @lgc.create.get.desc.stride__i32(i32 1, i32 1, i64 0, i32 7) + br label %bb3 + +bb2: ; preds = %.entry + %b = call i32 (...) 
@lgc.create.get.desc.stride__i32(i32 2, i32 2, i64 0, i32 7) + br label %bb3 + +bb3: ; preds = %bb2, %bb1 + %phi = phi i32 [ %a, %bb1 ], [ %b, %bb2 ] + %i3 = mul i32 %i, %phi + %i4 = sext i32 %i3 to i64 + %i5 = getelementptr i8, ptr addrspace(4) %i1, i64 %i4 + %i6 = mul i32 %i, %phi + %i7 = sext i32 %i6 to i64 + %i8 = getelementptr i8, ptr addrspace(4) %i2, i64 %i7 + %i9 = load <4 x i32>, ptr addrspace(4) %i8, align 16, !invariant.load !10 + %i10 = load <8 x i32>, ptr addrspace(4) %i5, align 32, !invariant.load !10 + %i11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %i10, <4 x i32> %i9, i32 1, <2 x float> zeroinitializer) + call void (...) @lgc.create.write.generic.output(<4 x float> %i11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison) + ret void +} + +declare spir_func void @spirv.NonUniform.i32(i32) local_unnamed_addr + +; Function Attrs: nounwind memory(none) +declare ptr addrspace(4) @lgc.create.get.desc.ptr.p4(...) local_unnamed_addr #1 + +; Function Attrs: nounwind memory(none) +declare i32 @lgc.create.get.desc.stride__i32(...) local_unnamed_addr #1 + +declare spir_func void @"spirv.NonUniform.s[s[p4,i32,i32,i32],s[p4,i32,i32]]"({ { ptr addrspace(4), i32, i32, i32 }, { ptr addrspace(4), i32, i32 } }) local_unnamed_addr + +declare spir_func void @"spirv.NonUniform.s[a3v8i32,s[v4i32,i32]]"({ [3 x <8 x i32>], { <4 x i32>, i32 } }) local_unnamed_addr + +; Function Attrs: nounwind willreturn memory(read) +declare <4 x float> @lgc.create.image.sample.v4f32(...) local_unnamed_addr #2 + +; Function Attrs: nounwind willreturn memory(read) +declare i32 @lgc.create.read.generic.input__i32(...) local_unnamed_addr #2 + +; Function Attrs: nounwind +declare void @lgc.create.write.generic.output(...) 
local_unnamed_addr #3 + +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" } +attributes #1 = { nounwind memory(none) } +attributes #2 = { nounwind willreturn memory(read) } +attributes #3 = { nounwind } + +!lgc.client = !{!0} +!lgc.unlinked = !{!1} +!lgc.options = !{!2} +!lgc.options.FS = !{!3} +!lgc.user.data.nodes = !{!4, !5} +!lgc.color.export.formats = !{!6} +!amdgpu.pal.metadata.msgpack = !{!7} + +!0 = !{!"Vulkan"} +!1 = !{i32 1} +!2 = !{i32 -158725823, i32 1419665388, i32 -1015833383, i32 -491143713, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 256, i32 256, i32 2, i32 1} +!3 = !{i32 -1822594139, i32 1920663194, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!4 = !{!"DescriptorTableVaPtr", i32 7, i32 64, i32 0, i32 1, i32 1} +!5 = !{!"DescriptorCombinedTexture", i32 3, i32 64, i32 0, i32 192, i64 0, i32 7, i32 12} +!6 = !{i32 14, i32 7, i32 0, i32 0, i32 15} +!7 = !{!"\82\B0amdpal.pipelines\91\83\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\E8\D2\98>j\B9B\94\CF2\DEF\BF\9Fx\BC1\AD.llpc_version\A470.1\AEamdpal.version\92\03\00"} +!8 = !{i32 4} +!9 = !{i32 6} +!10 = !{} diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest9.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest9.lgc new file mode 100644 index 0000000000..97e7f1777c --- /dev/null +++ b/lgc/test/scalarizationOfDescriptorLoadsTest9.lgc @@ -0,0 +1,129 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool lgc --version 4 +; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o - %s 2>&1 | FileCheck --check-prefixes=CHECK %s + +; This test checks how the scalarization of descriptor loads works if the values are defined in a loop. +source_filename = "llpc_fragment_7" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8-p32:32:32" +target triple = "amdgcn--amdpal" + +; Function Attrs: nounwind +define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spirv.ExecutionModel !8 !lgc.shaderstage !9 { +; CHECK-LABEL: define dllexport spir_func void @lgc.shader.FS.main( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !8 !lgc.shaderstage [[META9:![0-9]+]] { +; CHECK-NEXT: .entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[I:%.*]] = call i32 (...) 
@lgc.input.import.interpolated__i32(i1 false, i32 3, i32 0, i32 0, i32 poison, i32 1, i32 poison) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP4]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr addrspace(4) +; CHECK-NEXT: [[I1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr addrspace(4) +; CHECK-NEXT: [[I2:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP11]], i32 32 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[PHI_IND:%.*]] = phi i32 [ 0, [[DOTENTRY:%.*]] ], [ [[IND:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I3:%.*]] = mul i32 [[PHI_IND]], 48 +; CHECK-NEXT: [[I4:%.*]] = sext i32 [[I3]] to i64 +; CHECK-NEXT: [[I5:%.*]] = getelementptr i8, ptr addrspace(4) [[I1]], i64 [[I4]] +; CHECK-NEXT: [[I6:%.*]] = mul i32 [[PHI_IND]], 48 +; CHECK-NEXT: [[I7:%.*]] = sext i32 [[I6]] to i64 +; CHECK-NEXT: [[I8:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[I7]] +; CHECK-NEXT: [[I9:%.*]] = load <4 x i32>, ptr addrspace(4) [[I8]], align 16, !invariant.load [[META10:![0-9]+]] +; CHECK-NEXT: [[I10:%.*]] = load <8 x i32>, ptr addrspace(4) [[I5]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[I3]]) +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP12]], i32 [[I3]]) +; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr addrspace(4) [[I1]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP15]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP18]], align 16, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP20:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP16]], <4 x i32> [[TMP19]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[I11:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP12]], <4 x float> [[TMP20]]) +; CHECK-NEXT: call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[I11]]) #[[ATTR5:[0-9]+]] +; CHECK-NEXT: [[IND]] = add i32 [[PHI_IND]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp ne i32 [[IND]], 1000 +; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +.entry: + %i = call i32 (...) @lgc.create.read.generic.input__i32(i32 3, i32 0, i32 0, i32 0, i32 17, i32 poison) + %i1 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 7) + %i2 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 2, i32 2, i64 0, i32 7) + br label %loop + +loop: ; preds = %loop, %.entry + %phi.ind = phi i32 [ 0, %.entry ], [ %ind, %loop ] + %a = call i32 (...) @lgc.create.get.desc.stride__i32(i32 1, i32 1, i64 0, i32 7) + %b = call i32 (...) 
@lgc.create.get.desc.stride__i32(i32 2, i32 2, i64 0, i32 7) + %i3 = mul i32 %phi.ind, %a + %i4 = sext i32 %i3 to i64 + %i5 = getelementptr i8, ptr addrspace(4) %i1, i64 %i4 + %i6 = mul i32 %phi.ind, %b + %i7 = sext i32 %i6 to i64 + %i8 = getelementptr i8, ptr addrspace(4) %i2, i64 %i7 + %i9 = load <4 x i32>, ptr addrspace(4) %i8, align 16, !invariant.load !10 + %i10 = load <8 x i32>, ptr addrspace(4) %i5, align 32, !invariant.load !10 + %i11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %i10, <4 x i32> %i9, i32 1, <2 x float> zeroinitializer) + call void (...) @lgc.create.write.generic.output(<4 x float> %i11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison) + %ind = add i32 %phi.ind, 1 + %cond = icmp ne i32 %ind, 1000 + br i1 %cond, label %loop, label %exit + +exit: ; preds = %loop + ret void +} + +declare spir_func void @spirv.NonUniform.i32(i32) local_unnamed_addr + +; Function Attrs: nounwind memory(none) +declare ptr addrspace(4) @lgc.create.get.desc.ptr.p4(...) local_unnamed_addr #1 + +; Function Attrs: nounwind memory(none) +declare i32 @lgc.create.get.desc.stride__i32(...) local_unnamed_addr #1 + +declare spir_func void @"spirv.NonUniform.s[s[p4,i32,i32,i32],s[p4,i32,i32]]"({ { ptr addrspace(4), i32, i32, i32 }, { ptr addrspace(4), i32, i32 } }) local_unnamed_addr + +declare spir_func void @"spirv.NonUniform.s[a3v8i32,s[v4i32,i32]]"({ [3 x <8 x i32>], { <4 x i32>, i32 } }) local_unnamed_addr + +; Function Attrs: nounwind willreturn memory(read) +declare <4 x float> @lgc.create.image.sample.v4f32(...) local_unnamed_addr #2 + +; Function Attrs: nounwind willreturn memory(read) +declare i32 @lgc.create.read.generic.input__i32(...) local_unnamed_addr #2 + +; Function Attrs: nounwind +declare void @lgc.create.write.generic.output(...) 
local_unnamed_addr #3 + +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" } +attributes #1 = { nounwind memory(none) } +attributes #2 = { nounwind willreturn memory(read) } +attributes #3 = { nounwind } + +!lgc.client = !{!0} +!lgc.unlinked = !{!1} +!lgc.options = !{!2} +!lgc.options.FS = !{!3} +!lgc.user.data.nodes = !{!4, !5} +!lgc.color.export.formats = !{!6} +!amdgpu.pal.metadata.msgpack = !{!7} + +!0 = !{!"Vulkan"} +!1 = !{i32 1} +!2 = !{i32 -158725823, i32 1419665388, i32 -1015833383, i32 -491143713, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 256, i32 256, i32 2, i32 1} +!3 = !{i32 -1822594139, i32 1920663194, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} +!4 = !{!"DescriptorTableVaPtr", i32 7, i32 64, i32 0, i32 1, i32 1} +!5 = !{!"DescriptorCombinedTexture", i32 3, i32 64, i32 0, i32 192, i64 0, i32 7, i32 12} +!6 = !{i32 14, i32 7, i32 0, i32 0, i32 15} +!7 = !{!"\82\B0amdpal.pipelines\91\83\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\E8\D2\98>j\B9B\94\CF2\DEF\BF\9Fx\BC1\AD.llpc_version\A470.1\AEamdpal.version\92\03\00"} +!8 = !{i32 4} +!9 = !{i32 6} +!10 = !{} diff --git a/lgc/util/AddressExtender.cpp b/lgc/util/AddressExtender.cpp index c9480353d8..450fee8b15 100644 --- a/lgc/util/AddressExtender.cpp +++ b/lgc/util/AddressExtender.cpp @@ -51,7 +51,7 @@ Instruction *AddressExtender::getFirstInsertionPt() { // // @param addr32 : Address as 32-bit value // @param highHalf : Value to use for high half; The constant HighAddrPc to use PC -// @param ptrTy : Type to cast pointer to +// @param ptrTy : Type to cast pointer to; nullptr to return as i64 // @param builder : IRBuilder to use, already set to the required insert point // @returns : 64-bit pointer value Instruction *AddressExtender::extend(Value *addr32, Value *highHalf, Type *ptrTy, IRBuilder<> &builder) { @@ -67,6 +67,8 @@ Instruction *AddressExtender::extend(Value *addr32, Value *highHalf, Type *ptrTy ptr = builder.CreateInsertElement(ptr, highHalf, 1); } ptr = builder.CreateBitCast(ptr, builder.getInt64Ty()); + if (!ptrTy) + return cast(ptr); return cast(builder.CreateIntToPtr(ptr, ptrTy)); } @@ -75,12 +77,14 @@ Instruction *AddressExtender::extend(Value *addr32, Value *highHalf, Type *ptrTy // // @param addr32 : Address as 32-bit value // @param highHalf : Value to use for high half; The constant HighAddrPc to use PC -// @param ptrTy : Type to cast pointer to +// @param ptrTy : Type to cast pointer to; nullptr to return as i64 // @param builder : IRBuilder to use, already set to the required insert point // @returns : 64-bit pointer value Instruction *AddressExtender::extendWithPc(Value *addr32, Type *ptrTy, IRBuilder<> &builder) { Value *ptr = builder.CreateInsertElement(getPc(), addr32, uint64_t(0)); ptr = builder.CreateBitCast(ptr, builder.getInt64Ty()); + if (!ptrTy) + return cast(ptr); return cast(builder.CreateIntToPtr(ptr, ptrTy)); } diff --git a/llpc/context/llpcCompiler.cpp b/llpc/context/llpcCompiler.cpp index cc8207067f..6544139aa1 100644 --- a/llpc/context/llpcCompiler.cpp +++ b/llpc/context/llpcCompiler.cpp @@ -1560,20 +1560,11 @@ Result Compiler::buildUnlinkedShaderInternal(Context *context, ArrayRef(shaderInfo[ShaderStageFragment]->pModuleData); - if (moduleData->usage.useGenericBuiltIn || moduleData->usage.useBarycentric) { + if (moduleData->usage.useGenericBuiltIn) { // 
TODO: We have added semantic to support generic builtIn, however, there seems to be some errors. We need to // add more info to sync inputs and outputs. return Result::RequireFullPipeline; } - } else if (stage == UnlinkedStageVertexProcess) { - bool hasVs = shaderInfo[ShaderStageVertex]->pModuleData != nullptr; - bool hasTes = (shaderInfo[ShaderStageTessControl]->pModuleData != nullptr) || - (shaderInfo[ShaderStageTessControlBit]->pModuleData != nullptr); - bool hasGs = shaderInfo[ShaderStageGeometry]->pModuleData != nullptr; - if (m_gfxIp.major >= 11 && hasVs && !hasGs && !hasTes && - static_cast(shaderInfo[ShaderStageVertex]->pModuleData)->usage.enableXfb) { - return Result::RequireFullPipeline; - } } unsigned originalShaderStageMask = context->getPipelineContext()->getShaderStageMask(); @@ -2852,7 +2843,7 @@ Result Compiler::buildRayTracingPipelineElf(Context *context, std::unique_ptrgetIndirectStageMask() == 0) { options.rtIndirectMode = lgc::RayTracingIndirectMode::NotIndirect; - } else if (rtContext->isContinuationsMode()) { + } else if (rtContext->isContinuationsMode() && !LgcContext::getEmitLgc()) { // For continuations mode, we need to run LowerRaytracingPipelinePass here first separately because we need to // collect metadata added by the pass std::unique_ptr passMgr(lgc::PassManager::Create(context->getLgcContext())); diff --git a/llpc/context/llpcGraphicsContext.cpp b/llpc/context/llpcGraphicsContext.cpp index 708c773c50..453253d363 100644 --- a/llpc/context/llpcGraphicsContext.cpp +++ b/llpc/context/llpcGraphicsContext.cpp @@ -254,6 +254,7 @@ Options GraphicsContext::computePipelineOptions() const { options.enableColorExportShader = pipelineInfo->enableColorExportShader; options.useSoftwareVertexBufferDescriptors = pipelineInfo->useSoftwareVertexBufferDescriptors; options.vbAddressLowBitsKnown = pipelineInfo->getGlState().vbAddressLowBitsKnown; + options.dynamicTopology = pipelineInfo->dynamicTopology; // Only set NGG options for a GFX10+ graphics pipeline. const auto &nggState = pipelineInfo->nggState; if (!nggState.enableNgg && getGfxIpVersion().major < 11) // GFX11+ must enable NGG diff --git a/llpc/context/llpcPipelineContext.cpp b/llpc/context/llpcPipelineContext.cpp index 5349096c8e..808cf24cdd 100644 --- a/llpc/context/llpcPipelineContext.cpp +++ b/llpc/context/llpcPipelineContext.cpp @@ -593,10 +593,9 @@ ShaderOptions PipelineContext::computeShaderOptions(const PipelineShaderInfo &sh if (ScalarizeWaterfallDescriptorLoads.getNumOccurrences() > 0) { shaderOptions.scalarizeWaterfallLoads = ScalarizeWaterfallDescriptorLoads; } else { - shaderOptions.scalarizeWaterfallLoads = shaderInfo.options.scalarizeWaterfallLoads; - // Enable waterfall load scalarization when vgpr limit is set. 
- if (shaderOptions.vgprLimit != 0 && shaderOptions.vgprLimit != UINT_MAX) - shaderOptions.scalarizeWaterfallLoads = true; + shaderOptions.scalarizeWaterfallLoads = true; + if (shaderInfo.options.scalarizeWaterfallLoads.has_value()) + shaderOptions.scalarizeWaterfallLoads = *shaderInfo.options.scalarizeWaterfallLoads; } shaderOptions.sgprLimit = shaderInfo.options.sgprLimit; @@ -1079,6 +1078,9 @@ uint32_t PipelineContext::getGlResourceNodeSetFromType(Vkgc::ResourceMappingNode case ResourceMappingNodeType::DescriptorSampler: resourceSet = GlResourceMappingSet::DescriptorSampler; break; + case ResourceMappingNodeType::DescriptorYCbCrSampler: + resourceSet = GlResourceMappingSet::DescriptorResource; + break; case ResourceMappingNodeType::DescriptorFmask: resourceSet = GlResourceMappingSet::DescriptorFmask; break; diff --git a/llpc/docs/DdnBindlessTexture.md b/llpc/docs/DdnBindlessTexture.md new file mode 100644 index 0000000000..da974d90bf --- /dev/null +++ b/llpc/docs/DdnBindlessTexture.md @@ -0,0 +1,387 @@ +# Support OpenGL bindless texture in LLPC + +## Introduction +[Bindless texture extension](http://www.opengl.org/registry/specs/ARB/bindless_texture.txt) allows OpenGL applications to access texture objects in shaders without first binding each texture to one of a limited number of texture image units. Using this extension, an application can query a 64-bit unsigned integer texture handle for each texture that it wants to access and then use that handle directly in GLSL or assembly-based shaders. The ability to access textures without having to bind and/or re-bind them can significantly reduce the amount of API calls and internal GL driver overhead which needed to manage resource bindings. + +### The following pseudo code shows the differences between bound texture and bindless texture: +1). Bound texture +``` c++ +#version 450 +layout(location = 0) in vec2 vs_texCoord; +layout(location = 0) uniform sampler2D tex0; +void main() +{ + gl_FragColor = texture(tex0, vs_texCoord); +} +// Pseudo code for API calls +// Create texture object, bind the object to given target, upload the texture data +glGenTextures(1, &tex); +glBindTexture(GL_TEXTURE_2D, tex); +glTexImage2D(GL_TEXTURE_2D,0,GL_RGBA,16,16,0,GL_RGBA,GL_UNSIGNED_BYTE, texture); +… +glUseProgram(po); + +// Set the texture unit +int location = glGetUniformLocation(po, "tex0"); +glUniform1i(location, 0); +… +glDrawElements(GL_TRIANGLES, 12, GL_UNSIGNED_INT, 0); + +``` + +2). Bindless texture +``` c++ +#version 450 +#extension GL_ARB_bindless_texture : enable +layout(location = 0) in vec2 vs_texCoord; +layout(bindless_sampler) uniform sampler2D tex0; +void main() +{ + gl_FragColor = texture(tex0, vs_texCoord); +} + +// Pseudo code for API calls + +// Create texture object, bind the object to given target, upload the texture data +glGenTextures(1, &tex); +glBindTexture(GL_TEXTURE_2D, tex); +glTexImage2D(GL_TEXTURE_2D,0,GL_RGBA,16,16,0,GL_RGBA,GL_UNSIGNED_BYTE, texture); + +// To access texture or image resources using handles, the handles must first be made resident. + +GLuint64 texHandle = glGetTextureHandleARB(tex); +glMakeTextureHandleResidentARB(texHandle); + +glUseProgram(po); + +// Upload texture handle by default uniform +int location = glGetUniformLocation(po, "tex0"); +glUniformHandleui64ARB(location, texHandle); +… +glDrawElements(GL_TRIANGLES, 12, GL_UNSIGNED_INT, 0); + +``` +### Changes in GLSL + +This extension adds no new data types to GLSL. 
Instead, it uses existing sampler and image data types and allows them to be populated with texture and image handles. sampler and image types may be used as shader inputs/outputs, temporary variables, uniform block members, buffer members and structure members, and may be assigned to by shader code. Constructors are provided to convert unsigned integer values to and from sampler and image data types. For example, the following shader code is valid for bindless textures, but causes compile-time errors for bound textures:
+
+- **Declare bindless texture/image as vertex shader input:**
+
+The OGLP driver currently doesn't support a bindless handle as a VS input, because this depends on the `NV_vertex_attrib_integer_64bit` extension, which is not supported in our driver.
+``` glsl
+#version 450
+#extension GL_ARB_bindless_texture : enable
+layout(location = 0) in vec4 pos;
+layout(location = 0) out vec4 vsColor;
+layout(location = 1) in sampler2D tex;
+void main() {
+    gl_Position = pos;
+    vsColor = texture(tex, vec2(0.0));
+}
+
+```
+- **Declare bindless texture/image as interface symbols:**
+``` glsl
+// Vertex shader
+#version 450
+#extension GL_ARB_bindless_texture : enable
+layout(location = 0) in vec4 pos;
+layout(location = 0) out sampler2D vsTex;
+layout(bindless_sampler) uniform sampler2D tex;
+void main(){
+    gl_Position = pos;
+    vsTex = tex; // Pass texture handle to fragment shader
+}
+```
+
+``` glsl
+// Fragment shader
+#version 450
+#extension GL_ARB_bindless_texture : enable
+layout(location = 0) in sampler2D fsTex;
+layout(location = 0) out vec4 fragColor;
+void main() {
+    fragColor = texture(fsTex, vec2(0.0f));
+}
+
+```
+- **Declare bindless texture/image as uniform block member:**
+``` glsl
+#version 450
+#extension GL_ARB_bindless_texture : enable
+layout(bindless_sampler, std140, binding = 0) uniform ubo {
+    sampler2D tex;
+};
+layout(location = 0) out vec4 fragColor;
+void main() {
+    fragColor = texture(tex, vec2(0.0f));
+}
+
+```
+- **Declare bindless texture/image as shader storage block member:**
+``` glsl
+#version 450
+#extension GL_ARB_bindless_texture : enable
+layout(bindless_sampler, std430, binding = 0) buffer ssbo {
+    sampler2D tex;
+};
+layout(location = 0) out vec4 fragColor;
+void main() {
+    fragColor = texture(tex, vec2(0.0f));
+}
+
+```
+
+- **Declare bindless texture/image as a temp variable:**
+``` glsl
+#version 450
+#extension GL_ARB_bindless_texture : enable
+layout(bindless_sampler) uniform sampler2D tex;
+layout(location = 0) out vec4 fragColor;
+void main() {
+    sampler2D tempTex = tex;
+    fragColor = texture(tempTex, vec2(0.0f));
+}
+
+```
+
+Besides declaring a bindless texture/image handle as a samplerXX type, the ARB_bindless_texture extension also allows a 64-bit texture/image handle to be declared as a uvec2. If a handle is declared as a uvec2, an explicit constructor must be called to convert the type before calling texture/image functions.
+
+``` glsl
+#version 450
+#extension GL_ARB_bindless_texture : enable
+layout(location = 0) uniform uvec2 tex;
+layout(location = 0) out vec4 fragColor;
+void main() {
+    fragColor = texture(sampler2D(tex), vec2(0.0f));
+}
+```
+
+In the following four constructors, the low 32 bits of the sampler type correspond to the .x component of the uvec2 and the high 32 bits correspond to the .y component.
+``` glsl
+uvec2(any sampler type)     // Converts a sampler type to a pair of 32-bit unsigned integers
+any sampler type(uvec2)     // Converts a pair of 32-bit unsigned integers to a sampler type
+uvec2(any image type)       // Converts an image type to a pair of 32-bit unsigned integers
+any image type(uvec2)       // Converts a pair of 32-bit unsigned integers to an image type
+
+```
+
+All of the special grammar rules and constructors above are already supported in glslang, so now let's look at the SPIR-V binary. Two sample cases are provided: in the first, the texture handle is declared as a sampler2D; in the second, it is declared as a uvec2. These two declarations are handled differently in LLPC:
+
+1). **Declare bindless handle as sampler2D**
+``` glsl
+#version 450
+#extension GL_ARB_bindless_texture : enable
+layout(bindless_sampler) uniform sampler2D s1;
+layout(location = 1) in vec2 coord;
+layout(location = 0) out vec4 FragColor;
+
+void main()
+{
+    FragColor = texture(s1, coord);
+}
+
+... ...
+               OpSourceExtension "GL_ARB_bindless_texture"
+               OpName %4 "main"
+               OpName %9 "FragColor"
+               OpName %13 "s1"
+               ... ...
+               ... ...
+%10 = OpTypeImage %6 2D 0 0 0 1 Unknown
+%11 = OpTypeSampledImage %10
+%12 = OpTypePointer UniformConstant %11
+%13 = OpVariable %12 UniformConstant
+%15 = OpTypeVector %6 2
+%16 = OpTypePointer Input %15
+%17 = OpVariable %16 Input
+%4 = OpFunction %2 None %3
+%5 = OpLabel
+%14 = OpLoad %11 %13
+%18 = OpLoad %15 %17
+%19 = OpImageSampleImplicitLod %7 %14 %18
+       OpStore %9 %19
+```
+
+The above sample shows that if the bindless texture handle is declared as a sampler2D, the shader's SPIR-V binary is exactly the same as the SPIR-V binary generated for a regular (bound) texture. However, we need to remember that variable %13 is a 64-bit texture handle, not a sampledImage, so when the OpLoad instruction `%14 = OpLoad %11 %13` executes, we don't get an image pointer; we get a 64-bit texture handle instead.
+
+2). **Declare bindless handle as uvec2**
+``` glsl
+#version 450
+#extension GL_ARB_bindless_texture : enable
+layout(location = 0) uniform uvec2 s1;
+layout(location = 1) in vec2 coord;
+layout(location = 0) out vec4 FragColor;
+
+void main()
+{
+    FragColor = texture(sampler2D(s1), coord);
+}
+
+... ...
+               OpSourceExtension "GL_ARB_bindless_texture"
+               OpName %4 "main"
+               OpName %9 "FragColor"
+               OpName %13 "s1"
+... ...
+... ...
+%12 = OpTypePointer UniformConstant %11
+%13 = OpVariable %12 UniformConstant
+%15 = OpTypeImage %6 2D 0 0 0 1 Unknown
+%16 = OpTypeSampledImage %15
+%18 = OpTypeVector %6 2
+%19 = OpTypePointer Input %18
+%20 = OpVariable %19 Input
+%4 = OpFunction %2 None %3
+%5 = OpLabel
+%14 = OpLoad %11 %13
+%17 = OpBitcast %16 %14
+%21 = OpLoad %18 %20
+%22 = OpImageSampleImplicitLod %7 %17 %21
+       OpStore %9 %22
+       OpReturn
+       OpFunctionEnd
+```
+
+If a bindless texture is declared as a uvec2, it behaves like a normal vector variable, and we need an explicit type conversion, sampler2D(s1), before calling texture functions. In the SPIR-V binary we can see that after the 64-bit texture handle is loaded (%14 = OpLoad %11 %13), an OpBitcast instruction is inserted to convert the 64-bit handle to a sampledImage value; the next step is to see how these two cases are supported in LLPC.
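+
+Before moving to the implementation, note the API side of the uvec2 declaration shown earlier: the application feeds the handle through an ordinary uvec2 uniform. The following pseudo code is only an illustrative sketch in the style of the earlier API examples, matching the shader that declares `layout(location = 0) uniform uvec2 tex;`; it is not taken from the driver or the tests:
+
+``` c++
+// Query the 64-bit handle and make it resident, as before.
+GLuint64 texHandle = glGetTextureHandleARB(tex);
+glMakeTextureHandleResidentARB(texHandle);
+
+glUseProgram(po);
+
+// A uvec2 handle uniform is set like any other uvec2 uniform:
+// the low 32 bits go to .x and the high 32 bits go to .y.
+GLuint lowBits  = (GLuint)(texHandle & 0xFFFFFFFFull);
+GLuint highBits = (GLuint)(texHandle >> 32);
+int location = glGetUniformLocation(po, "tex");
+glUniform2ui(location, lowBits, highBits);
+```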
+
+## Implementation in LLPC
+### Interface changes
+
+The ARB_bindless_texture extension was published in 2013. When we implemented it in the OGLP driver there was no SPIR-V opcode or extension supporting it, so we had to add two flags to indicate whether bindless textures/images are used in the program; we get this state from glslang. When one texture/image in a shader is declared as bindless, all textures/images in the given program are handled in bindless mode, which simplifies our driver's implementation, so the LLPC implementation continues to follow this approach.
+
+Two pipeline options are added to indicate whether bindless textures or images are used. These flags are set at program link time, so that when Llpc::Compiler::buildShaderModuleResourceUsage() is called, the texture variables can be recognized as their real types (e.g. a variable declared as `layout(bindless_sampler) uniform sampler2D s1;` is recognized as a 64-bit uint default uniform instead of a texture), and we can create the correct resourceMappingNode table for each kind of resource. These two flags are also checked at pipeline compile time, so that we can generate the correct LLVM IR for bindless textures.
+
+``` c++
+struct PipelineOptions {
+…
+  bool bindlessTextureMode;  ///< For OGL only, true if bindless textures are used
+  bool bindlessImageMode;    ///< For OGL only, true if bindless images are used
+};
+```
+
+### LLPC changes
+Before implementing this in LLPC, we need to clarify what a bindless texture/image handle is in our driver and what the relationship is between the bindless texture/image handle and the texture/image descriptor. Here is the implementation of the API `glGetTextureHandleARB` in the OGL driver:
+
+``` c++
+gsl::GpuMemAllocRef descMemory;
+Pal::Result result = pGslContext->AcquireImmutableImgSrd(dataSize, descMemory);
+*pHandle = descMemory->GetGpuVirtAddr();
+```
+
+The above implementation shows that the bindless texture/image handle is the GPU memory address of its descriptor, which makes it easy to create a pointer equivalent to the one used for a bound texture. In the following dumped pipeline for a regular (bound) texture, the first pass, which translates the SPIR-V binary to LLVM IR, starts by getting the texture descriptor's pointer: `%0 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 4, i32 0)`. For a bindless texture we can get the same result by converting the 64-bit handle to a pointer.
+
+``` glsl
+----------------glsl ----------------
+#version 450
+layout(location = 0) uniform sampler2D s1;
+layout(location = 1) in vec2 coord;
+layout(location = 0) out vec4 FragColor;
+
+void main()
+{
+    FragColor = texture(s1, coord);
+}
+------------- SPIR-V -----------------
+...
+%10 = OpTypeImage %6 2D 0 0 0 1 Unknown
+%11 = OpTypeSampledImage %10
+%12 = OpTypePointer UniformConstant %11
+%13 = OpVariable %12 UniformConstant
+%15 = OpTypeVector %6 2
+%16 = OpTypePointer Input %15
+%17 = OpVariable %16 Input
+%4 = OpFunction %2 None %3
+%5 = OpLabel
+%14 = OpLoad %11 %13
+%18 = OpLoad %15 %17
+%19 = OpImageSampleImplicitLod %7 %14 %18
+...
+*** IR Dump After LLPC translate SPIR-V binary to LLVM IR on [module] ***
+  %0 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 4, i32 0)
+  %1 = call i32 (...) 
@lgc.create.get.desc.stride.i32(i32 1, i32 1, i64 4, i32 0)
+  %2 = insertvalue { ptr addrspace(4), i32, i32, i32 } poison, ptr addrspace(4) %0, 0
+  %3 = insertvalue { ptr addrspace(4), i32, i32, i32 } %2, i32 %1, 1
+  %4 = insertvalue { ptr addrspace(4), i32, i32, i32 } %3, i32 32, 2
+  %5 = insertvalue { ptr addrspace(4), i32, i32, i32 } %4, i32 1, 3
+  %6 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 2, i32 2, i64 5, i32 0)
+```
+
+The implementation for bindless textures declared as a samplerXX type differs from the implementation for textures declared as uvec2, so the two cases are described separately in more detail below.
+
+#### 1. Declare bindless texture handle as samplerXX type
+If a bindless texture handle is declared as a samplerXX type, it appears as an `OpTypeSampledImage` variable in the SPIR-V binary:
+- At program link time, when calling `Llpc::Compiler::buildShaderModuleResourceUsage()`, we need to recognize the `OpTypeSampledImage` variable as a 64-bit unsigned integer default uniform, so that we generate a default uniform instead of a resource mapping node for a texture;
+- At pipeline compile time, we only need to add two patches in the SPIR-V reader:
+
+  1). When `SPIRVToLLVM::transVariable()` is called to translate variable `%13`, we need to force the variable type from `OpTypeSampledImage` to int64, so that we can generate the uniform variable's declaration and handle the `OpLoad` instruction correctly;
+
+  ```
+  %11 = OpTypeSampledImage %10
+  %12 = OpTypePointer UniformConstant %11
+  %13 = OpVariable %12 UniformConstant
+  %15 = OpTypeVector %6 2
+  %16 = OpTypePointer Input %15
+  %17 = OpVariable %16 Input
+  %4 = OpFunction %2 None %3
+  %5 = OpLabel
+  %14 = OpLoad %11 %13
+  %18 = OpLoad %15 %17
+  %19 = OpImageSampleImplicitLod %7 %14 %18
+  ```
+  2). When calling `SPIRVToLLVM::transValueWithOpcode()` to load the bindless texture handle, we need to do two things:
+     i). Load the 64-bit image descriptor address, then convert it to a pointer with the correct address space;
+
+     ii). Currently the image descriptor, sampler descriptor and fmask descriptor are stored in a structure, so after loading the image descriptor address we need to obtain each descriptor and insert all the descriptors into the structure;
+
+After the above changes, the pipeline dump for the above shader (the "LLPC translate SPIR-V binary to LLVM IR" pass) and the ISA code dump look as follows, and cases that declare the bindless texture handle as sampler2D run correctly.
+
+![](./DdnBindlessTexturePipelineDumpDeclSamplerType.PNG)
+
+#### 2. Declare bindless texture handle as uvec2 type
+If a bindless texture is declared as a uniform uvec2, the solution is much simpler: we don't need to change the variable's data type at program link time or when `SPIRVToLLVM::transVariable()` is called. An `OpBitcast` instruction is added by the SPIR-V builder to convert the 64-bit handle to a sampler, and that instruction needs special handling for bindless textures, as sketched below.
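+
+Both cases therefore reduce to the same conversion: treat the 64-bit handle as the descriptor's GPU address and load the descriptors from it. The following C++ sketch, written against LLVM's IRBuilder, only illustrates that idea; the descriptor sizes, the 32-byte sampler offset and the helper name are assumptions made for the sketch, not the actual LLPC implementation.
+
+``` c++
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include <utility>
+
+using namespace llvm;
+
+// Illustrative only: turn a 64-bit bindless handle (the descriptor's GPU virtual
+// address) into the image and sampler descriptor values that image ops consume.
+static std::pair<Value *, Value *> loadDescriptorsFromHandle(IRBuilder<> &builder, Value *handle64) {
+  // Treat the handle as a pointer in the constant address space (4).
+  PointerType *descPtrTy = PointerType::get(builder.getContext(), 4);
+  Value *descPtr = builder.CreateIntToPtr(handle64, descPtrTy);
+
+  // Load the image descriptor (eight dwords in this sketch) at the handle address.
+  Type *imageDescTy = FixedVectorType::get(builder.getInt32Ty(), 8);
+  Value *imageDesc = builder.CreateLoad(imageDescTy, descPtr);
+
+  // Load the sampler descriptor (four dwords) at an assumed 32-byte offset.
+  Value *samplerPtr = builder.CreateConstGEP1_32(builder.getInt8Ty(), descPtr, 32);
+  Type *samplerDescTy = FixedVectorType::get(builder.getInt32Ty(), 4);
+  Value *samplerDesc = builder.CreateLoad(samplerDescTy, samplerPtr);
+
+  return {imageDesc, samplerDesc};
+}
+```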
+
+Because the bindless handle is a native 64-bit value, the result of the instruction `%14 = OpLoad %11 %13` is a 64-bit texture handle. When translating the following instruction
+`%17 = OpBitcast %16 %14`, we need to do the same things as in the previous case (where the handle is declared as sampler2D):
+
+- Load the 64-bit image descriptor address, then convert it to a pointer with the correct address space;
+- Obtain each descriptor's pointer after the image descriptor address is loaded, then insert all the descriptors into the structure;
+
+``` glsl
+#version 450
+#extension GL_ARB_bindless_texture : enable
+layout(location = 0) uniform uvec2 s1;
+layout(location = 1) in vec2 coord;
+layout(location = 0) out vec4 FragColor;
+
+void main()
+{
+    FragColor = texture(sampler2D(s1), coord);
+}
+
+...
+%12 = OpTypePointer UniformConstant %11
+%13 = OpVariable %12 UniformConstant
+%15 = OpTypeImage %6 2D 0 0 0 1 Unknown
+%16 = OpTypeSampledImage %15
+%18 = OpTypeVector %6 2
+%19 = OpTypePointer Input %18
+%20 = OpVariable %19 Input
+%4 = OpFunction %2 None %3
+%5 = OpLabel
+%14 = OpLoad %11 %13
+%17 = OpBitcast %16 %14
+%21 = OpLoad %18 %20
+%22 = OpImageSampleImplicitLod %7 %17 %21
+...
+```
+
+After the above changes, the pipeline dump for the above shader (the "LLPC translate SPIR-V binary to LLVM IR" pass) and the ISA code dump look as follows, and cases that declare the bindless texture handle as a uniform uvec2 run correctly.
+
+![](./DdnBindlessTexturePipelineDumpDeclUvec2Type.PNG)
+
+In theory, any uvec2 value can be used as a bindless texture/image handle, so we also need to consider cases where a uvec2 bindless texture/image handle is declared as a shader input/output, temporary variable, uniform block member, buffer member, structure member, function parameter or function return value.
+
+Besides the above cases, we also need to consider multisampled textures/images and texture/image buffers, because these kinds of textures/images have different resource descriptors.
+
+## Summary
+
+GL_ARB_bindless_texture is a big and flexible feature. We have supported it before (in the closed-source compiler), so the driver and glslang implementations can be reused, and fortunately LLPC's architecture is very friendly to a bindless texture/image implementation: we can support this feature completely in LLPC's front-end. Although the above solutions only cover the simplest cases, these two cases are representative, so the implementations for other cases, such as declaring a bindless handle as an interface symbol or a block member, are not demonstrated here; we will handle them by following the same approach.
diff --git a/llpc/docs/DdnBindlessTexturePipelineDumpDeclSamplerType.PNG b/llpc/docs/DdnBindlessTexturePipelineDumpDeclSamplerType.PNG new file mode 100644 index 0000000000..eeb1a2a4b5 Binary files /dev/null and b/llpc/docs/DdnBindlessTexturePipelineDumpDeclSamplerType.PNG differ diff --git a/llpc/docs/DdnBindlessTexturePipelineDumpDeclUvec2Type.PNG b/llpc/docs/DdnBindlessTexturePipelineDumpDeclUvec2Type.PNG new file mode 100644 index 0000000000..5c2eef2925 Binary files /dev/null and b/llpc/docs/DdnBindlessTexturePipelineDumpDeclUvec2Type.PNG differ diff --git a/llpc/lower/LowerGLCompatibility.cpp b/llpc/lower/LowerGLCompatibility.cpp index cb2d972495..62c82a0697 100644 --- a/llpc/lower/LowerGLCompatibility.cpp +++ b/llpc/lower/LowerGLCompatibility.cpp @@ -46,8 +46,10 @@ namespace Llpc { // ===================================================================================================================== LowerGLCompatibility::LowerGLCompatibility() - : m_retInst(nullptr), m_out(nullptr), m_clipVertex(nullptr), m_clipDistance(nullptr), m_clipPlane(nullptr), - m_frontColor(nullptr), m_backColor(nullptr), m_frontSecondaryColor(nullptr), m_backSecondaryColor(nullptr) { + : m_retInst(nullptr), m_entryPointEnd(nullptr), m_originalEntryBlock(nullptr), m_out(nullptr), + m_clipVertex(nullptr), m_clipDistance(nullptr), m_clipPlane(nullptr), m_frontColor(nullptr), m_backColor(nullptr), + m_frontSecondaryColor(nullptr), m_backSecondaryColor(nullptr), m_color(nullptr), m_secondaryColor(nullptr), + m_frontFacing(nullptr), m_patchTexCoord(nullptr), m_fragColor(nullptr), m_fragDepth(), m_fragStencilRef() { } // ===================================================================================================================== @@ -65,7 +67,8 @@ PreservedAnalyses LowerGLCompatibility::run(Module &module, ModuleAnalysisManage collectEmulationResource(); if (!needLowerClipVertex() && !needLowerFrontColor() && !needLowerBackColor() && !needLowerFrontSecondaryColor() && - !needLowerBackSecondaryColor()) + !needLowerBackSecondaryColor() && !needEmulateDrawPixels() && !needEmulateTwoSideLighting() && + !needEmulateBitmap() && !needLowerFragColor()) return PreservedAnalyses::all(); buildPatchPositionInfo(); @@ -85,6 +88,20 @@ PreservedAnalyses LowerGLCompatibility::run(Module &module, ModuleAnalysisManage if (needLowerBackSecondaryColor()) lowerBackSecondaryColor(); + if (needLowerFragColor()) + lowerFragColor(); + + if (needEmulateDrawPixels()) + emulateDrawPixels(); + + // Two side lighting patch should place just before bitmap patch. + if (needEmulateTwoSideLighting()) + emulateTwoSideLighting(); + + // Bit map patch should be the last patch in the pass. 
+ if (needEmulateBitmap()) + emulateBitmap(); + return PreservedAnalyses::none(); } @@ -97,11 +114,17 @@ bool LowerGLCompatibility::needRun() { static_cast(static_cast(m_context->getPipelineContext()) ->getPipelineShaderInfo(m_shaderStage) ->pModuleData); + auto *buildInfo = static_cast(m_context->getPipelineBuildInfo()); result |= moduleData->usage.useClipVertex; result |= moduleData->usage.useFrontColor; result |= moduleData->usage.useBackColor; result |= moduleData->usage.useFrontSecondaryColor; result |= moduleData->usage.useBackSecondaryColor; + result |= buildInfo->glState.drawPixelsType != Vkgc::DrawPixelsTypeNone; + result |= buildInfo->glState.enableTwoSideLighting; + result |= buildInfo->glState.enableBitmap; + result |= buildInfo->glState.enableBitmapLsb; + result |= buildInfo->glState.enableColorClampFs; } return result; } @@ -198,6 +221,8 @@ void LowerGLCompatibility::decodeInOutMetaRecursively(llvm::Type *valueTy, llvm: // ===================================================================================================================== // Collect "Return" instructions and replace those instructions with a branch instruction point to "ReturnBlock". +// +// @param [in] func : The entry function of the shader module. void LowerGLCompatibility::unifyFunctionReturn(Function *func) { SmallVector retInsts; for (BasicBlock &block : *func) { @@ -211,7 +236,7 @@ void LowerGLCompatibility::unifyFunctionReturn(Function *func) { if (retInsts.size() > 1) { // Only create unify return block when the function's return instruction more then one. - auto retBlock = BasicBlock::Create(*m_context, "", m_entryPoint); + auto retBlock = BasicBlock::Create(*m_context, ".gl.compatibility.ret", m_entryPoint); m_retInst = ReturnInst::Create(*m_context, retBlock); for (auto inst : retInsts) { BranchInst::Create(retBlock, inst->getParent()); @@ -252,7 +277,35 @@ void LowerGLCompatibility::collectEmulationResource() { m_clipPlane = &global; } } else if (global.getType()->getAddressSpace() == SPIRAS_Input) { - continue; + llvm::SmallVector mds; + MDNode *metaNode = global.getMetadata(gSPIRVMD::InOut); + assert(metaNode); + auto inOutMetaConst = mdconst::dyn_extract(metaNode->getOperand(0)); + auto valueType = global.getValueType(); + bool isStructureOrArrayOfStructure = + (valueType->isStructTy() || (valueType->isArrayTy() && valueType->getArrayElementType()->isStructTy())); + decodeInOutMetaRecursively(valueType, inOutMetaConst, mds); + if (m_shaderStage == ShaderStageFragment) { + // In fragment shader, gl_Color have same location with gl_FrontColor in pre-stage outputs. + // gl_SecondaryColor have same location with gl_FrontSecondaryColor in pre-stage outputs. 
+ // So we can use location of gl_FrontColor and gl_FrontSecondaryColor to find gl_Color and gl_FrontColor + for (auto md : mds) { + if (md.IsLoc) { + if (md.Value == Vkgc::GlCompatibilityInOutLocation::FrontColor) { + if (isStructureOrArrayOfStructure) + m_out = &global; + else + m_color = &global; + } + if (md.Value == Vkgc::GlCompatibilityInOutLocation::FrontSecondaryColor) { + if (isStructureOrArrayOfStructure) + m_out = &global; + else + m_secondaryColor = &global; + } + } + } + } } else if (global.getType()->getAddressSpace() == SPIRAS_Output) { llvm::SmallVector mds; MDNode *metaNode = global.getMetadata(gSPIRVMD::InOut); @@ -262,6 +315,21 @@ void LowerGLCompatibility::collectEmulationResource() { bool isStructureOrArrayOfStructure = (valueType->isStructTy() || (valueType->isArrayTy() && valueType->getArrayElementType()->isStructTy())); decodeInOutMetaRecursively(valueType, inOutMetaConst, mds); + if (m_shaderStage == ShaderStageFragment) { + for (auto md : mds) { + if (md.IsBuiltIn) { + if (md.Value == spv::BuiltInFragDepth) { + m_fragDepth = &global; + } + if (md.Value == spv::BuiltInFragStencilRefEXT) { + m_fragStencilRef = &global; + } + } else { + assert(m_fragColor == nullptr); + m_fragColor = &global; + } + } + } for (auto md : mds) { if (md.IsLoc) { if (md.Value == Vkgc::GlCompatibilityInOutLocation::ClipVertex) { @@ -294,11 +362,15 @@ void LowerGLCompatibility::collectEmulationResource() { else m_backSecondaryColor = &global; } - } else if (md.IsBuiltIn && md.Value == spv::BuiltInClipDistance) { - if (isStructureOrArrayOfStructure) - m_out = &global; - else - m_clipDistance = &global; + } else if (md.IsBuiltIn) { + if (md.Value == spv::BuiltInClipDistance) { + if (isStructureOrArrayOfStructure) + m_out = &global; + else + m_clipDistance = &global; + } + if (md.Value == spv::BuiltInFrontFacing) + m_frontFacing = &global; } } } @@ -375,6 +447,18 @@ void LowerGLCompatibility::buildPatchPositionInfo() { collectEmitInst(); else unifyFunctionReturn(m_entryPoint); + + // Create early kill block for bitmap, bitmap require a early return in masked thread. + if (needEmulateBitmap()) { + m_originalEntryBlock = &(m_entryPoint->getEntryBlock()); + m_originalEntryBlock->splitBasicBlockBefore(m_originalEntryBlock->getFirstInsertionPt(), ".gl.compatibility.entry"); + m_entryPointEnd = m_originalEntryBlock->splitBasicBlockBefore(m_originalEntryBlock->getFirstInsertionPt(), + ".gl.compatibility.kill"); + m_builder->SetInsertPoint(m_entryPointEnd->begin()); + m_builder->CreateKill(); + ReturnInst::Create(*m_context, m_entryPointEnd); + m_entryPointEnd->back().eraseFromParent(); + } } // ===================================================================================================================== @@ -390,25 +474,88 @@ bool LowerGLCompatibility::needLowerFrontColor() { } // ===================================================================================================================== -// Check whether need do lower for FrontColor. +// Check whether need do lower for BackColor. bool LowerGLCompatibility::needLowerBackColor() { return (m_backColor != nullptr && !m_backColor->user_empty()); } // ===================================================================================================================== -// Check whether need do lower for FrontColor. +// Check whether need do lower for FrontSecondaryColor. 
bool LowerGLCompatibility::needLowerFrontSecondaryColor() { return (m_frontSecondaryColor != nullptr && !m_frontSecondaryColor->user_empty()); } // ===================================================================================================================== -// Check whether need do lower for FrontColor. +// Check whether need do lower for BackSecondaryColor. bool LowerGLCompatibility::needLowerBackSecondaryColor() { return (m_backSecondaryColor != nullptr && !m_backSecondaryColor->user_empty()); } // ===================================================================================================================== -// Create the SPIR-V output builtin variable "ClipDistance". +// Check whether need do emulate for draw pixels. +bool LowerGLCompatibility::needEmulateDrawPixels() { + auto *buildInfo = static_cast(m_context->getPipelineBuildInfo()); + return (m_shaderStage == ShaderStageFragment) && (buildInfo->glState.drawPixelsType != Vkgc::DrawPixelsTypeNone); +} + +// ===================================================================================================================== +// Check whether need do emulate for two-side lighting. +bool LowerGLCompatibility::needEmulateTwoSideLighting() { + auto *buildInfo = static_cast(m_context->getPipelineBuildInfo()); + return (m_shaderStage == ShaderStageFragment) && buildInfo->glState.enableTwoSideLighting && + (m_color != nullptr || m_secondaryColor != nullptr); +} + +// ===================================================================================================================== +// Check whether need do emulate for bitmap. +bool LowerGLCompatibility::needEmulateBitmap() { + auto *buildInfo = static_cast(m_context->getPipelineBuildInfo()); + return (m_shaderStage == ShaderStageFragment) && + (buildInfo->glState.enableBitmap || buildInfo->glState.enableBitmapLsb); +} + +// ===================================================================================================================== +// Check whether need do clamp fs +bool LowerGLCompatibility::needLowerFragColor() { + auto buildInfo = static_cast(m_context->getPipelineBuildInfo()); + return m_fragColor && (m_shaderStage == ShaderStageFragment) && (buildInfo->glState.enableColorClampFs); +} + +// ===================================================================================================================== +// Create InOut global variable Metadata. +// +// @param [in] md : The base information of the in/out meta date. +MDTuple *LowerGLCompatibility::createInOutMd(const ShaderInOutMetadata &md) { + auto int64Type = m_builder->getInt64Ty(); + // Built metadata for the array element + std::vector mdValues; + // int64Type : Content of "ShaderInOutMetadata.U64All[0]" + // int64Type : Content of "ShaderInOutMetadata.U64All[1]" + auto elmdTy = StructType::get(*m_context, {int64Type, int64Type}); + assert(elmdTy != nullptr); + mdValues.push_back(ConstantInt::get(int64Type, md.U64All[0])); + mdValues.push_back(ConstantInt::get(int64Type, md.U64All[1])); + auto mdVariable = ConstantStruct::get(elmdTy, mdValues); + + // Setup input/output metadata + std::vector mDs; + mDs.push_back(ConstantAsMetadata::get(mdVariable)); + return MDNode::get(*m_context, mDs); +} + +// ===================================================================================================================== +// Create builtin InOut global variable Metadata. +// +// @param [in] builtIn : The built-in kind of the in/out meta date. 
+MDTuple *LowerGLCompatibility::createBuiltInInOutMd(lgc::BuiltInKind builtIn) { + ShaderInOutMetadata inOutMd = {}; + inOutMd.IsBuiltIn = true; + inOutMd.Value = builtIn; + return createInOutMd(inOutMd); +} + +// ===================================================================================================================== +// Create the SPIR-V output builtin variable "gl_ClipDistance". void LowerGLCompatibility::createClipDistance() { assert(m_clipDistance == nullptr); auto *buildInfo = static_cast(m_context->getPipelineBuildInfo()); @@ -490,6 +637,86 @@ void LowerGLCompatibility::createClipPlane() { m_clipPlane = clipPlane; } +// ===================================================================================================================== +// Create the GLSL builtin variable "gl_BackColor". +void LowerGLCompatibility::createBackColor() { + auto vec4Type = FixedVectorType::get(m_builder->getFloatTy(), 4); + auto backColor = new GlobalVariable(*m_module, vec4Type, false, GlobalValue::ExternalLinkage, nullptr, "gl_BackColor", + nullptr, GlobalVariable::GeneralDynamicTLSModel, SPIRV::SPIRAS_Input); + ShaderInOutMetadata inOutMd = {}; + inOutMd.IsLoc = true; + inOutMd.Value = Vkgc::GlCompatibilityInOutLocation::BackColor; + inOutMd.InterpMode = InterpModeSmooth; + inOutMd.InterpLoc = InterpLocCenter; + backColor->addMetadata(gSPIRVMD::InOut, *createInOutMd(inOutMd)); + m_backColor = backColor; +} + +// ===================================================================================================================== +// Create the GLSL builtin variable "gl_BackSecondaryColor". +void LowerGLCompatibility::createBackSecondaryColor() { + auto vec4Type = FixedVectorType::get(m_builder->getFloatTy(), 4); + auto backSecondaryColor = + new GlobalVariable(*m_module, vec4Type, false, GlobalValue::ExternalLinkage, nullptr, "gl_BackSecondaryColor", + nullptr, GlobalVariable::GeneralDynamicTLSModel, SPIRV::SPIRAS_Input); + ShaderInOutMetadata inOutMd = {}; + inOutMd.IsLoc = true; + inOutMd.Value = Vkgc::GlCompatibilityInOutLocation::BackSecondaryColor; + inOutMd.InterpMode = InterpModeSmooth; + inOutMd.InterpLoc = InterpLocCenter; + backSecondaryColor->addMetadata(gSPIRVMD::InOut, *createInOutMd(inOutMd)); + m_backSecondaryColor = backSecondaryColor; +} + +// ===================================================================================================================== +// Create the GLSL builtin variable "gl_FrontFacing". +void LowerGLCompatibility::createFrontFacing() { + assert(m_frontFacing == nullptr); + auto frontFacing = + new GlobalVariable(*m_module, m_builder->getInt1Ty(), false, GlobalValue::ExternalLinkage, nullptr, + "gl_FrontFacing", nullptr, GlobalVariable::GeneralDynamicTLSModel, SPIRV::SPIRAS_Input); + frontFacing->addMetadata(gSPIRVMD::InOut, *createBuiltInInOutMd(lgc::BuiltInKind::BuiltInFrontFacing)); + m_frontFacing = frontFacing; +} + +// ===================================================================================================================== +// Create the ARB builtin variable "patchTexCoord". 
+void LowerGLCompatibility::createPatchTexCoord() { + auto vec2Type = FixedVectorType::get(m_builder->getFloatTy(), 2); + auto patchTexCoord = + new GlobalVariable(*m_module, vec2Type, false, GlobalValue::ExternalLinkage, nullptr, "patchTexCoord", nullptr, + GlobalVariable::NotThreadLocal, SPIRV::SPIRAS_Input); + ShaderInOutMetadata inOutMd = {}; + inOutMd.IsLoc = true; + inOutMd.Value = Vkgc::GlCompatibilityInOutLocation::PatchTexCoord; + inOutMd.InterpMode = InterpModeSmooth; + inOutMd.InterpLoc = InterpLocCenter; + patchTexCoord->addMetadata(gSPIRVMD::InOut, *createInOutMd(inOutMd)); + m_patchTexCoord = patchTexCoord; +} + +// ===================================================================================================================== +// Create the GLSL builtin variable "gl_FragDepth". +void LowerGLCompatibility::createFragDepth() { + assert(m_fragDepth == nullptr); + auto fragDepth = + new GlobalVariable(*m_module, m_builder->getFloatTy(), false, GlobalValue::ExternalLinkage, nullptr, + "gl_FragDepth", nullptr, GlobalVariable::GeneralDynamicTLSModel, SPIRV::SPIRAS_Output); + fragDepth->addMetadata(gSPIRVMD::InOut, *createBuiltInInOutMd(lgc::BuiltInKind::BuiltInFragDepth)); + m_fragDepth = fragDepth; +} + +// ===================================================================================================================== +// Create the GLSL builtin variable "gl_fragStencilRef". +void LowerGLCompatibility::createFragStencilRef() { + assert(m_fragStencilRef == nullptr); + auto fragStencilRef = + new GlobalVariable(*m_module, m_builder->getInt32Ty(), false, GlobalValue::ExternalLinkage, nullptr, + "gl_FragStencilRef", nullptr, GlobalVariable::GeneralDynamicTLSModel, SPIRV::SPIRAS_Output); + fragStencilRef->addMetadata(gSPIRVMD::InOut, *createBuiltInInOutMd(lgc::BuiltInKind::BuiltInFragStencilRef)); + m_fragStencilRef = fragStencilRef; +} + // ===================================================================================================================== // Inline the emulation instruction of clip vertex. void LowerGLCompatibility::emulateStoreClipVertex() { @@ -519,15 +746,153 @@ void LowerGLCompatibility::emulateStoreClipVertex() { // ===================================================================================================================== // Inline the emulation instruction of front/back/front secondary/back secondary color. +// +// @param [in] color : One of front/back/front secondary/back secondary color. 
void LowerGLCompatibility::emulationOutputColor(llvm::User *color) { auto floatType = m_builder->getFloatTy(); Type *vec4Type = VectorType::get(floatType, 4, false); // Load frontColor - Value *colorOperand = m_builder->CreateLoad(vec4Type, color); - Value *clampedColor = - m_builder->CreateFClamp(colorOperand, ConstantFP::get(vec4Type, 0.0), ConstantFP::get(vec4Type, 1.0)); - // Store frontColor - m_builder->CreateStore(clampedColor, color); + auto info = static_cast(m_context->getPipelineBuildInfo()); + if ((m_shaderStage == ShaderStageVertex && info->glState.enableColorClampVs) || + (m_shaderStage == ShaderStageFragment && info->glState.enableColorClampFs)) { + Value *colorOperand = m_builder->CreateLoad(vec4Type, color); + Value *clampedColor = + m_builder->CreateFClamp(colorOperand, ConstantFP::get(vec4Type, 0.0), ConstantFP::get(vec4Type, 1.0)); + // Store color + m_builder->CreateStore(clampedColor, color); + } +} + +// ===================================================================================================================== +// Emulate for draw pixels emulation. +void LowerGLCompatibility::emulateDrawPixels() { + m_builder->SetInsertPoint(m_entryPoint->getEntryBlock().begin()); + auto *buildInfo = static_cast(m_context->getPipelineBuildInfo()); + auto floatType = m_builder->getFloatTy(); + auto int32Type = m_builder->getInt32Ty(); + auto vec2Type = FixedVectorType::get(floatType, 2); + auto vec4Type = FixedVectorType::get(floatType, 4); + auto ivec2Type = FixedVectorType::get(int32Type, 2); + auto ivec8Type = FixedVectorType::get(int32Type, 8); + if (m_patchTexCoord == nullptr) { + createPatchTexCoord(); + } + Value *patchTexcoord = m_builder->CreateLoad(vec2Type, m_patchTexCoord); + Value *texcoord = m_builder->CreateFPToUI(patchTexcoord, ivec2Type); + auto imageDesc = m_builder->CreateGetDescPtr( + lgc::ResourceNodeType::DescriptorResource, lgc::ResourceNodeType::DescriptorResource, + PipelineContext::getGlResourceNodeSetFromType(Vkgc::ResourceMappingNodeType::DescriptorResource), + Vkgc::InternalBinding::PixelOpInternalBinding); + auto descriptor = m_builder->CreateLoad(ivec8Type, imageDesc); + descriptor->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(*m_context, {})); + Value *texel = m_builder->CreateImageLoad(vec4Type, Dim2D, 0, descriptor, texcoord, nullptr); + + // Write Color + if (buildInfo->glState.drawPixelsType == Vkgc::DrawPixelsTypeColor) { + if (m_color != nullptr) { + // replace scale and bias constant with real value + std::vector vals; + vals.push_back(ConstantFP::get(floatType, buildInfo->glState.pixelTransferScale[0])); + vals.push_back(ConstantFP::get(floatType, buildInfo->glState.pixelTransferScale[1])); + vals.push_back(ConstantFP::get(floatType, buildInfo->glState.pixelTransferScale[2])); + vals.push_back(ConstantFP::get(floatType, buildInfo->glState.pixelTransferScale[3])); + auto scale = ConstantVector::get(vals); + + vals.clear(); + vals.push_back(ConstantFP::get(floatType, buildInfo->glState.pixelTransferBias[0])); + vals.push_back(ConstantFP::get(floatType, buildInfo->glState.pixelTransferBias[1])); + vals.push_back(ConstantFP::get(floatType, buildInfo->glState.pixelTransferBias[2])); + vals.push_back(ConstantFP::get(floatType, buildInfo->glState.pixelTransferBias[3])); + auto bias = ConstantVector::get(vals); + auto color = m_builder->CreateFma(texel, scale, bias); + m_builder->CreateStore(color, m_color); + } + } + + // Write Depth + if (buildInfo->glState.drawPixelsType == Vkgc::DrawPixelsTypeDepth) { + if (m_fragDepth == 
nullptr) + createFragDepth(); + auto depth = m_builder->CreateExtractElement(texel, ConstantInt::get(int32Type, 0)); + m_builder->CreateStore(depth, m_fragDepth); + } + + // Write Stencil + if (buildInfo->glState.drawPixelsType == Vkgc::DrawPixelsTypeStencil) { + if (m_fragStencilRef == nullptr) + createFragStencilRef(); + auto stencil = m_builder->CreateExtractElement(texel, ConstantInt::get(int32Type, 0)); + auto stencilInt = m_builder->CreateBitCast(stencil, int32Type); + m_builder->CreateStore(stencilInt, m_fragStencilRef); + } +} + +// ===================================================================================================================== +// Emulate for two-side lighting. +void LowerGLCompatibility::emulateTwoSideLighting() { + auto vec4Type = FixedVectorType::get(m_builder->getFloatTy(), 4); + if (m_shaderStage == ShaderStageFragment) { + m_builder->SetInsertPoint(m_entryPoint->getEntryBlock().begin()); + if (m_color != nullptr || m_secondaryColor != nullptr) { + if (m_frontFacing == nullptr) { + createFrontFacing(); + } + if (m_color != nullptr) { + assert(m_backColor == nullptr); + createBackColor(); + auto frontColorLoad = m_builder->CreateLoad(vec4Type, m_color); + auto backColorLoad = m_builder->CreateLoad(vec4Type, m_backColor); + auto frontFacingLoad = m_builder->CreateLoad(m_builder->getInt1Ty(), m_frontFacing); + auto color = m_builder->CreateSelect(frontFacingLoad, frontColorLoad, backColorLoad); + m_builder->CreateStore(color, m_color); + } + if (m_secondaryColor != nullptr) { + assert(m_backSecondaryColor == nullptr); + createBackSecondaryColor(); + auto frontSecondaryColorLoad = m_builder->CreateLoad(vec4Type, m_secondaryColor); + auto backSecondaryColorLoad = m_builder->CreateLoad(vec4Type, m_backSecondaryColor); + auto frontFacingLoad = m_builder->CreateLoad(m_builder->getInt1Ty(), m_frontFacing); + auto secondaryColor = m_builder->CreateSelect(frontFacingLoad, frontSecondaryColorLoad, backSecondaryColorLoad); + m_builder->CreateStore(secondaryColor, m_secondaryColor); + } + } + } +} + +// ===================================================================================================================== +// Emulate for bitmap emulation. 
+void LowerGLCompatibility::emulateBitmap() { + auto *buildInfo = static_cast(m_context->getPipelineBuildInfo()); + m_builder->SetInsertPoint(m_entryPoint->getEntryBlock().begin()); + auto floatType = m_builder->getFloatTy(); + auto int32Type = m_builder->getInt32Ty(); + auto vec2Type = FixedVectorType::get(floatType, 2); + auto ivec2Type = FixedVectorType::get(int32Type, 2); + auto ivec8Type = FixedVectorType::get(int32Type, 8); + if (!m_patchTexCoord) { + createPatchTexCoord(); + } + Value *constInt0x7 = ConstantInt::get(ivec2Type, 0x7); + Value *constInt0x3 = ConstantInt::get(ivec2Type, 0x3); + Value *patchTexcoord = m_builder->CreateLoad(vec2Type, m_patchTexCoord); + Value *texcoord = m_builder->CreateFPToUI(patchTexcoord, ivec2Type); + Value *mask = m_builder->CreateAnd(texcoord, constInt0x7); + if (buildInfo->glState.enableBitmapLsb) { + mask = m_builder->CreateSub(mask, constInt0x7); + } + mask = m_builder->CreateShl(ConstantInt::get(ivec2Type, 1), mask); + Value *texCoordSrc = m_builder->CreateLShr(constInt0x3, texcoord); + auto imageDesc = m_builder->CreateGetDescPtr( + lgc::ResourceNodeType::DescriptorResource, lgc::ResourceNodeType::DescriptorResource, + PipelineContext::getGlResourceNodeSetFromType(Vkgc::ResourceMappingNodeType::DescriptorResource), + Vkgc::InternalBinding::PixelOpInternalBinding); + auto descriptor = m_builder->CreateLoad(ivec8Type, imageDesc); + descriptor->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(*m_context, {})); + Value *texel = m_builder->CreateImageLoad(ivec2Type, Dim2D, 0, descriptor, texCoordSrc, nullptr); + Value *val = m_builder->CreateAnd(mask, texel); + val = m_builder->CreateExtractElement(val, ConstantInt::get(int32Type, 0)); + auto cmp = m_builder->CreateICmpEQ(val, ConstantInt::get(int32Type, 0)); + m_builder->CreateCondBr(cmp, m_entryPointEnd, m_originalEntryBlock); } // ===================================================================================================================== @@ -554,9 +919,11 @@ void LowerGLCompatibility::lowerClipVertex() { // ===================================================================================================================== // Does lowering operations for GLSL variable "gl_FrontColor" or "gl_BackColor" or "gl_FrontSecondaryColor" or // "gl_BackSecondaryColor". +// +// @param [in] color : One of gl_FrontColor/gl_BackColor/gl_FrontSecondaryColor/gl_BackSecondaryColor. 
void LowerGLCompatibility::lowerColor(llvm::User *color) { if (m_shaderStage == ShaderStageVertex || m_shaderStage == ShaderStageTessControl || - m_shaderStage == ShaderStageTessEval) { + m_shaderStage == ShaderStageTessEval || m_shaderStage == ShaderStageFragment) { assert(m_retInst != nullptr); m_builder->SetInsertPoint(m_retInst); emulationOutputColor(color); @@ -592,4 +959,10 @@ void LowerGLCompatibility::lowerBackSecondaryColor() { lowerColor(m_backSecondaryColor); } +// ===================================================================================================================== +// Does clamp fragment color +void LowerGLCompatibility::lowerFragColor() { + lowerColor(m_fragColor); +} + } // namespace Llpc diff --git a/llpc/lower/LowerGLCompatibility.h b/llpc/lower/LowerGLCompatibility.h index 6ab393a41b..cda79dbc91 100644 --- a/llpc/lower/LowerGLCompatibility.h +++ b/llpc/lower/LowerGLCompatibility.h @@ -32,6 +32,7 @@ #include "SPIRVInternal.h" #include "llpcSpirvLower.h" +#include "lgc/Builder.h" #include "llvm/IR/PassManager.h" namespace Llpc { @@ -62,19 +63,37 @@ class LowerGLCompatibility : public SpirvLower, public llvm::PassInfoMixin m_emitCalls; // "Call" instructions to emit vertex (geometry shader). llvm::ReturnInst *m_retInst; // "Return" of the entry point. + llvm::BasicBlock *m_entryPointEnd; // The end block of the entry point, use for early return. + llvm::BasicBlock *m_originalEntryBlock; // The original entry block of entry point. // The resource use to lower gl_ClipVertex llvm::User *m_out; // The global variable of gl_out[] @@ -85,6 +104,14 @@ class LowerGLCompatibility : public SpirvLower, public llvm::PassInfoMixingetAttributeDataSizeInBytes()); + setMaxHitAttributeSize(&module, rtContext->getAttributeDataSizeInBytes()); + setMaxPayloadSize(&module, rtContext->getPayloadSizeInBytes()); } return PreservedAnalyses::none(); diff --git a/llpc/lower/llpcSpirvLowerCooperativeMatrix.cpp b/llpc/lower/llpcSpirvLowerCooperativeMatrix.cpp index 8fbb0ec3df..a4524ee56c 100644 --- a/llpc/lower/llpcSpirvLowerCooperativeMatrix.cpp +++ b/llpc/lower/llpcSpirvLowerCooperativeMatrix.cpp @@ -127,7 +127,8 @@ void LowerCooperativeMatrix::visitPointerUsers(Value *ptr, CooperativeMatrixElem Type *matrixType = m_builder.getCooperativeMatrixTy(elemTypeEnum, layout); Value *matrix = m_builder.CreateLoad(matrixType, matrixPtr); - Value *element = m_builder.CreateCooperativeMatrixExtract(matrix, index, elemTypeEnum, layout); + Type *elemTy = m_builder.transCooperativeMatrixElementType(elemTypeEnum); + Value *element = m_builder.create(elemTy, matrix, index, elemTypeEnum, layout); load->replaceAllUsesWith(element); } else if (auto *store = dyn_cast(inst)) { assert(store->getPointerOperand() == ptr); @@ -135,7 +136,8 @@ void LowerCooperativeMatrix::visitPointerUsers(Value *ptr, CooperativeMatrixElem Type *matrixType = m_builder.getCooperativeMatrixTy(elemTypeEnum, layout); Value *matrix = m_builder.CreateLoad(matrixType, matrixPtr); - matrix = m_builder.CreateCooperativeMatrixInsert(matrix, store->getValueOperand(), index, elemTypeEnum, layout); + matrix = m_builder.create(matrix->getType(), matrix, store->getValueOperand(), index, + elemTypeEnum, layout); m_builder.CreateStore(matrix, matrixPtr); } else if (auto *gep = dyn_cast(inst)) { assert(gep->getPointerOperand() == ptr); diff --git a/llpc/lower/llpcSpirvLowerMath.cpp b/llpc/lower/llpcSpirvLowerMath.cpp index 95876db8be..6edff999b6 100644 --- a/llpc/lower/llpcSpirvLowerMath.cpp +++ b/llpc/lower/llpcSpirvLowerMath.cpp @@ -660,7 
+660,7 @@ void SpirvLowerMathFloatOp::visitCallInst(CallInst &callInst) { // Replace fma with amdgcn_fma_legacy intrinsic when detect patterns like: // fma((b==0.0 ? 0.0 : a), (a==0.0 ? 0.0 : b), c) auto mangledName = callee->getName(); - if (mangledName.startswith("lgc.create.fma")) { + if (mangledName.starts_with("lgc.create.fma")) { emitFFmazInst(&callInst); } } diff --git a/llpc/lower/llpcSpirvLowerRayQuery.cpp b/llpc/lower/llpcSpirvLowerRayQuery.cpp index 48eded8142..651c5be43b 100644 --- a/llpc/lower/llpcSpirvLowerRayQuery.cpp +++ b/llpc/lower/llpcSpirvLowerRayQuery.cpp @@ -37,6 +37,7 @@ #include "llvmraytracing/GpurtContext.h" #include "lgc/Builder.h" #include "lgc/GpurtDialect.h" +#include "llvm-dialects/Dialect/Visitor.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/IntrinsicsAMDGPU.h" @@ -479,14 +480,31 @@ template <> void SpirvLowerRayQuery::createRayQueryFunc // 7, Dispatch Id m_builder->CreateStore(getDispatchId(), traceRaysArgs[7]); - if (m_context->getPipelineContext()->getRayTracingState()->enableRayTracingCounters) - generateTraceRayStaticId(); - StringRef rayQueryInitialize = m_context->getPipelineContext()->getRayTracingFunctionName(Vkgc::RT_ENTRY_TRACE_RAY_INLINE); - m_builder->CreateNamedCall(rayQueryInitialize, m_builder->getVoidTy(), traceRaysArgs, - {Attribute::NoUnwind, Attribute::AlwaysInline}); + m_crossModuleInliner.value().inlineCall(*m_builder, getGpurtFunction(rayQueryInitialize), traceRaysArgs); m_builder->CreateRetVoid(); + + if (m_context->getPipelineContext()->getRayTracingState()->enableRayTracingCounters) { + SmallVector tobeErased; + struct Payload { + SmallVectorImpl &tobeErased; + SpirvLowerRayQuery *self; + }; + Payload payload = {tobeErased, this}; + static auto visitor = llvm_dialects::VisitorBuilder() + .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) + .add([](auto &payload, auto &op) { + auto builder = payload.self->m_builder; + builder->SetInsertPoint(&op); + payload.tobeErased.push_back(&op); + op.replaceAllUsesWith(builder->getInt32(payload.self->generateTraceRayStaticId())); + }) + .build(); + visitor.visit(payload, *func); + for (auto *call : tobeErased) + call->eraseFromParent(); + } } // ===================================================================================================================== diff --git a/llpc/lower/llpcSpirvLowerRayTracing.cpp b/llpc/lower/llpcSpirvLowerRayTracing.cpp index b054559c20..0d0f9af1a0 100644 --- a/llpc/lower/llpcSpirvLowerRayTracing.cpp +++ b/llpc/lower/llpcSpirvLowerRayTracing.cpp @@ -516,6 +516,7 @@ PreservedAnalyses SpirvLowerRayTracing::run(Module &module, ModuleAnalysisManage .add(&SpirvLowerRayTracing::visitShaderRecordBufferOp) .add(&SpirvLowerRayTracing::visitStackReadOp) .add(&SpirvLowerRayTracing::visitStackWriteOp) + .add(&SpirvLowerRayTracing::visitLdsStackInitOp) .build(); visitor.visit(*this, *m_module); @@ -2525,6 +2526,17 @@ void SpirvLowerRayTracing::visitStackWriteOp(lgc::GpurtStackWriteOp &inst) { inst.setUseExtraStack(true); } +// ===================================================================================================================== +// Visits "lgc.gpurt.stack.init" instructions +// +// @param inst : The instruction +void SpirvLowerRayTracing::visitLdsStackInitOp(lgc::GpurtLdsStackInitOp &inst) { + // NOTE: If RayQuery is used inside any-hit shaders, where we already holding a traversal stack for + // TraceRay, perform the stack operations for this RayQuery in an extra stack space. 
+ if (m_shaderStage == ShaderStageRayTracingAnyHit) + inst.setUseExtraStack(true); +} + // ===================================================================================================================== // Visits "lgc.gpurt.get.parent.id" instructions // diff --git a/llpc/lower/llpcSpirvLowerRayTracing.h b/llpc/lower/llpcSpirvLowerRayTracing.h index c3c4f84ead..d5c90b4744 100644 --- a/llpc/lower/llpcSpirvLowerRayTracing.h +++ b/llpc/lower/llpcSpirvLowerRayTracing.h @@ -79,6 +79,7 @@ class GpurtSetParentIdOp; class GpurtGetRayStaticIdOp; class GpurtStackReadOp; class GpurtStackWriteOp; +class GpurtLdsStackInitOp; } // namespace lgc namespace Llpc { @@ -250,6 +251,7 @@ class SpirvLowerRayTracing : public SpirvLowerRayQuery { void visitGetRayStaticId(lgc::GpurtGetRayStaticIdOp &inst); void visitStackReadOp(lgc::GpurtStackReadOp &inst); void visitStackWriteOp(lgc::GpurtStackWriteOp &inst); + void visitLdsStackInitOp(lgc::GpurtLdsStackInitOp &inst); void visitDispatchRayIndex(lgc::rt::DispatchRaysIndexOp &inst); void visitDispatchRaysDimensionsOp(lgc::rt::DispatchRaysDimensionsOp &inst); void visitWorldRayOriginOp(lgc::rt::WorldRayOriginOp &inst); diff --git a/llpc/lower/llpcSpirvLowerTranslator.cpp b/llpc/lower/llpcSpirvLowerTranslator.cpp index ba507f9c9f..575f1ed0ac 100644 --- a/llpc/lower/llpcSpirvLowerTranslator.cpp +++ b/llpc/lower/llpcSpirvLowerTranslator.cpp @@ -100,8 +100,12 @@ void SpirvLowerTranslator::translateSpirvToLlvm(const PipelineShaderInfo *shader SmallVector convertingSamplers; for (const auto &range : descriptorRangeValues) { if (range.type == ResourceMappingNodeType::DescriptorYCbCrSampler) { + uint32_t rangeSet = range.set; + if (context->getPipelineContext()->getPipelineOptions()->replaceSetWithResourceType && range.set == 0) { + rangeSet = PipelineContext::getGlResourceNodeSetFromType(range.type); + } convertingSamplers.push_back( - {range.set, range.binding, + {rangeSet, range.binding, ArrayRef(range.pValue, range.arraySize * SPIRV::ConvertingSamplerDwordCount)}); } } diff --git a/llpc/lower/llpcSpirvProcessGpuRtLibrary.cpp b/llpc/lower/llpcSpirvProcessGpuRtLibrary.cpp index a94af2b46a..817db5929c 100644 --- a/llpc/lower/llpcSpirvProcessGpuRtLibrary.cpp +++ b/llpc/lower/llpcSpirvProcessGpuRtLibrary.cpp @@ -102,6 +102,7 @@ SpirvProcessGpuRtLibrary::LibraryFunctionTable::LibraryFunctionTable() { m_libFuncPtrs["AmdExtD3DShaderIntrinsics_IntersectInternal"] = &SpirvProcessGpuRtLibrary::createIntersectBvh; #endif m_libFuncPtrs["AmdExtD3DShaderIntrinsics_ShaderMarker"] = &SpirvProcessGpuRtLibrary::createShaderMarker; + m_libFuncPtrs["AmdExtD3DShaderIntrinsics_WaveScan"] = &SpirvProcessGpuRtLibrary::createWaveScan; m_libFuncPtrs["AmdExtD3DShaderIntrinsics_FloatOpWithRoundMode"] = &SpirvProcessGpuRtLibrary::createFloatOpWithRoundMode; m_libFuncPtrs["AmdExtDispatchThreadIdFlat"] = &SpirvProcessGpuRtLibrary::createDispatchThreadIdFlat; @@ -217,6 +218,17 @@ void SpirvProcessGpuRtLibrary::processLibraryFunction(Function *&func) { } else if (funcName.starts_with("_AmdRestoreSystemData")) { // We don't need this, leave it as dummy function so that it does nothing. 
return; + } else if (funcName.starts_with("_AmdGetSetting")) { + auto rtContext = static_cast(m_context->getPipelineContext()); + SmallVector contSettings; + for (unsigned i = 0; i < rtContext->getRayTracingPipelineBuildInfo()->gpurtOptionCount; i++) { + ContSetting setting; + setting.NameHash = rtContext->getRayTracingPipelineBuildInfo()->pGpurtOptions[i].nameHash; + setting.Value = rtContext->getRayTracingPipelineBuildInfo()->pGpurtOptions[i].value; + contSettings.push_back(setting); + } + ContHelper::handleGetSetting(*func, contSettings); + return; } // Create implementation for intrinsic functions. @@ -239,7 +251,11 @@ void SpirvProcessGpuRtLibrary::processLibraryFunction(Function *&func) { } bool isAmdAwaitLike = funcName.starts_with("_AmdAwait") || funcName.starts_with("_AmdWaitAwait"); - if (funcName.starts_with("_cont_") || funcName.starts_with("_Amd")) { + // NOTE: GPURT now preserves all function names started with "_Amd", but some of them are not intrinsics, e.g., + // "_AmdSystemData.IsTraversal", which are methods of system data structs. Skip those to let them be inlined + // automatically. + bool isAmdIntrinsic = funcName.starts_with("_Amd") && !funcName.contains("."); + if (funcName.starts_with("_cont_") || isAmdIntrinsic) { func->setLinkage(GlobalValue::WeakAnyLinkage); // Skip _AmdAwaitTraversal function resulting from calls to _AmdWaitAwaitTraversal. @@ -341,7 +357,7 @@ void SpirvProcessGpuRtLibrary::createGetStackStride(Function *func) { // // @param func : The function to process void SpirvProcessGpuRtLibrary::createLdsStackInit(Function *func) { - m_builder->CreateRet(m_builder->create()); + m_builder->CreateRet(m_builder->create(false)); } // ===================================================================================================================== @@ -363,10 +379,14 @@ void SpirvProcessGpuRtLibrary::createFloatOpWithRoundMode(llvm::Function *func) void SpirvProcessGpuRtLibrary::createLdsStackStore(Function *func) { auto argIt = func->arg_begin(); Value *stackAddr = argIt++; + Value *stackAddrPos = m_builder->CreateLoad(m_builder->getInt32Ty(), stackAddr); Value *lastVisited = m_builder->CreateLoad(m_builder->getInt32Ty(), argIt++); auto int32x4Ty = FixedVectorType::get(m_builder->getInt32Ty(), 4); Value *data = m_builder->CreateLoad(int32x4Ty, argIt); - m_builder->CreateRet(m_builder->create(stackAddr, lastVisited, data)); + auto ret = m_builder->create(stackAddrPos, lastVisited, data); + Value *newStackPos = m_builder->CreateExtractValue(ret, 1); + m_builder->CreateStore(newStackPos, stackAddr); + m_builder->CreateRet(m_builder->CreateExtractValue(ret, 0)); } // ===================================================================================================================== @@ -909,18 +929,12 @@ void SpirvProcessGpuRtLibrary::createEnqueue(Function *func) { Value *addr = m_builder->CreateLoad(m_builder->getInt32Ty(), func->getArg(0)); SmallVector tailArgs; - // _AmdEnqueueTraversal and _AmdWaitEnqueueRayGen do not have return-address. - bool hasRetAddrArg = !funcName.contains("RayGen") && !funcName.contains("Traversal"); bool hasWaitMaskArg = funcName.contains("Wait"); - if (hasRetAddrArg) { - // Skip waitMask - unsigned retAddrArgIdx = hasWaitMaskArg ? 2 : 1; - tailArgs.push_back(m_builder->CreateLoad(m_builder->getInt32Ty(), func->getArg(retAddrArgIdx))); - } else { - tailArgs.push_back(PoisonValue::get(m_builder->getInt32Ty())); - } + // Skip waitMask + unsigned retAddrArgIdx = hasWaitMaskArg ? 
2 : 1; + tailArgs.push_back(m_builder->CreateLoad(m_builder->getInt32Ty(), func->getArg(retAddrArgIdx))); // Get shader-index from system-data. - unsigned systemDataArgIdx = 1 + (hasRetAddrArg ? 1 : 0) + (hasWaitMaskArg ? 1 : 0); + unsigned systemDataArgIdx = retAddrArgIdx + 1; tailArgs.push_back(m_builder->CreateNamedCall("_cont_GetLocalRootIndex", m_builder->getInt32Ty(), {func->getArg(systemDataArgIdx)}, {})); // Process system-data and arguments after. @@ -962,4 +976,18 @@ void SpirvProcessGpuRtLibrary::createShaderMarker(llvm::Function *func) { m_builder->CreateRetVoid(); } +// ===================================================================================================================== +// Fill in function to write wave scan +// +// @param func : The function to create +void SpirvProcessGpuRtLibrary::createWaveScan(llvm::Function *func) { + auto argIt = func->arg_begin(); + auto retType = cast(func->getReturnType()); + auto int32Ty = m_builder->getInt32Ty(); + Value *waveOp = m_builder->CreateLoad(int32Ty, argIt++); + Value *flags = m_builder->CreateLoad(int32Ty, argIt++); + Value *src0 = m_builder->CreateLoad(retType, argIt); + m_builder->CreateRet(m_builder->create(waveOp, flags, src0)); +} + } // namespace Llpc diff --git a/llpc/lower/llpcSpirvProcessGpuRtLibrary.h b/llpc/lower/llpcSpirvProcessGpuRtLibrary.h index fa5d864134..7b0725c0b5 100644 --- a/llpc/lower/llpcSpirvProcessGpuRtLibrary.h +++ b/llpc/lower/llpcSpirvProcessGpuRtLibrary.h @@ -101,6 +101,7 @@ class SpirvProcessGpuRtLibrary : public SpirvLower, public llvm::PassInfoMixin %{{[0-9]*}}, i32 0, i32 0, i32 0) @@ -119,17 +120,17 @@ void main () ; SHADERTEST: call i64 @llvm.amdgcn.raw.buffer.atomic.xor.i64(i64 %{{[0-9]*}}, <4 x i32> %{{[0-9]*}}, i32 72, i32 0, i32 0) ; SHADERTEST: call i64 @llvm.amdgcn.raw.buffer.atomic.add.i64(i64 %{{[0-9]*}}, <4 x i32> %{{[0-9]*}}, i32 80, i32 0, i32 0) ; SHADERTEST: atomicrmw umin ptr addrspace(3) @{{.*}} i64 %{{[0-9]*}} monotonic -; SHADERTEST: atomicrmw umax ptr addrspace(3) getelementptr inbounds ({ i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 }, ptr addrspace(3) @{{.*}}, i32 0, i32 1), i64 %{{[0-9]*}} monotonic -; SHADERTEST: atomicrmw and ptr addrspace(3) getelementptr inbounds ({ i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 }, ptr addrspace(3) @{{.*}}, i32 0, i32 2), i64 %{{[0-9]*}} monotonic -; SHADERTEST: atomicrmw or ptr addrspace(3) getelementptr inbounds ({ i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 }, ptr addrspace(3) @{{.*}}, i32 0, i32 3), i64 %{{[0-9]*}} monotonic -; SHADERTEST: atomicrmw xor ptr addrspace(3) getelementptr inbounds ({ i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 }, ptr addrspace(3) @{{.*}}, i32 0, i32 4), i64 %{{[0-9]*}} monotonic -; SHADERTEST: atomicrmw min ptr addrspace(3) getelementptr inbounds ({ i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 }, ptr addrspace(3) @{{.*}}, i32 0, i32 5), i64 %{{[0-9]*}} monotonic -; SHADERTEST: atomicrmw max ptr addrspace(3) getelementptr inbounds ({ i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 }, ptr addrspace(3) @{{.*}}, i32 0, i32 6), i64 %{{[0-9]*}} monotonic -; SHADERTEST: atomicrmw and ptr addrspace(3) getelementptr inbounds ({ i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 }, ptr addrspace(3) @{{.*}}, i32 0, i32 7), i64 %{{[0-9]*}} monotonic -; SHADERTEST: atomicrmw or ptr addrspace(3) getelementptr inbounds ({ i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 }, 
ptr addrspace(3) @{{.*}}, i32 0, i32 8), i64 %{{[0-9]*}} monotonic -; SHADERTEST: atomicrmw xor ptr addrspace(3) getelementptr inbounds ({ i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 }, ptr addrspace(3) @{{.*}}, i32 0, i32 9), i64 %{{[0-9]*}} monotonic -; SHADERTEST: atomicrmw add ptr addrspace(3) getelementptr inbounds ({ i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 }, ptr addrspace(3) @{{.*}}, i32 0, i32 10), i64 %{{[0-9]*}} monotonic -; SHADERTEST: cmpxchg ptr addrspace(3) getelementptr inbounds ({ i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 }, ptr addrspace(3) @{{.*}}, i32 0, i32 12), i64 78187493520, i64 %{{[0-9]*}} monotonic +; SHADERTEST: atomicrmw umax ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @{{.*}}, i32 8), i64 %{{[0-9]*}} monotonic +; SHADERTEST: atomicrmw and ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @{{.*}}, i32 16), i64 %{{[0-9]*}} monotonic +; SHADERTEST: atomicrmw or ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @{{.*}}, i32 24), i64 %{{[0-9]*}} monotonic +; SHADERTEST: atomicrmw xor ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @{{.*}}, i32 32), i64 %{{[0-9]*}} monotonic +; SHADERTEST: atomicrmw min ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @{{.*}}, i32 40), i64 %{{[0-9]*}} monotonic +; SHADERTEST: atomicrmw max ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @{{.*}}, i32 48), i64 %{{[0-9]*}} monotonic +; SHADERTEST: atomicrmw and ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @{{.*}}, i32 56), i64 %{{[0-9]*}} monotonic +; SHADERTEST: atomicrmw or ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @{{.*}}, i32 64), i64 %{{[0-9]*}} monotonic +; SHADERTEST: atomicrmw xor ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @{{.*}}, i32 72), i64 %{{[0-9]*}} monotonic +; SHADERTEST: atomicrmw add ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @{{.*}}, i32 80), i64 %{{[0-9]*}} monotonic +; SHADERTEST: cmpxchg ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @{{.*}}, i32 96), i64 78187493520, i64 %{{[0-9]*}} monotonic ; SHADERTEST: AMDLLPC SUCCESS */ diff --git a/llpc/test/shaderdb/core/OpBitCount_TestIntConst_lit.frag b/llpc/test/shaderdb/core/OpBitCount_TestIntConst_lit.frag index a6096ce356..07685ccd9f 100644 --- a/llpc/test/shaderdb/core/OpBitCount_TestIntConst_lit.frag +++ b/llpc/test/shaderdb/core/OpBitCount_TestIntConst_lit.frag @@ -11,7 +11,7 @@ void main() /* ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC.*}} SPIR-V lowering results -; SHADERTEST: call i32 @llvm.ctpop.i32 +; SHADERTEST: call {{(range.*)?}}i32 @llvm.ctpop.i32 ; SHADERTEST: add nuw nsw i32 %{{[0-9*]}}, 2 ; SHADERTEST: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/core/OpBitCount_TestInt_lit.frag b/llpc/test/shaderdb/core/OpBitCount_TestInt_lit.frag index 0c6789c59f..41345ce4c0 100644 --- a/llpc/test/shaderdb/core/OpBitCount_TestInt_lit.frag +++ b/llpc/test/shaderdb/core/OpBitCount_TestInt_lit.frag @@ -20,7 +20,7 @@ void main() /* ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC.*}} SPIR-V lowering results -; SHADERTEST-COUNT-2: call i32 @llvm.ctpop.i32 +; SHADERTEST-COUNT-2: call {{(range.*)?}}i32 @llvm.ctpop.i32 ; SHADERTEST: AMDLLPC SUCCESS */ diff --git a/llpc/test/shaderdb/core/OpBitCount_TestIvec4_lit.frag b/llpc/test/shaderdb/core/OpBitCount_TestIvec4_lit.frag index 
b11df75a01..fa37a08a8b 100644 --- a/llpc/test/shaderdb/core/OpBitCount_TestIvec4_lit.frag +++ b/llpc/test/shaderdb/core/OpBitCount_TestIvec4_lit.frag @@ -11,7 +11,7 @@ void main() /* ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC.*}} SPIR-V lowering results -; SHADERTEST-COUNT-4: call i32 @llvm.ctpop.i32 +; SHADERTEST-COUNT-4: call {{(range.*)?}}i32 @llvm.ctpop.i32 ; SHADERTEST: AMDLLPC SUCCESS */ diff --git a/llpc/test/shaderdb/core/OpBitCount_TestUint_lit.frag b/llpc/test/shaderdb/core/OpBitCount_TestUint_lit.frag index c31918b8c4..09b228517c 100644 --- a/llpc/test/shaderdb/core/OpBitCount_TestUint_lit.frag +++ b/llpc/test/shaderdb/core/OpBitCount_TestUint_lit.frag @@ -20,7 +20,7 @@ void main() /* ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC.*}} SPIR-V lowering results -; SHADERTEST-COUNT-2: call i32 @llvm.ctpop.i32 +; SHADERTEST-COUNT-2: call {{(range.*)?}}i32 @llvm.ctpop.i32 ; SHADERTEST: AMDLLPC SUCCESS */ diff --git a/llpc/test/shaderdb/core/OpGroupNonUniformMax.comp b/llpc/test/shaderdb/core/OpGroupNonUniformMax.comp index 309b69b649..2ec5ad0d1a 100644 --- a/llpc/test/shaderdb/core/OpGroupNonUniformMax.comp +++ b/llpc/test/shaderdb/core/OpGroupNonUniformMax.comp @@ -54,17 +54,15 @@ void main() { // CHECK-NEXT: v_mov_b32_e32 v2, 0xff800000 // CHECK-NEXT: s_not_b64 exec, exec // CHECK-NEXT: s_or_saveexec_b64 s[2:3], -1 -// CHECK-NEXT: v_mov_b32_e32 v3, 0xff800000 -// CHECK-NEXT: v_mov_b32_e32 v4, 0xff800000 // CHECK-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +// CHECK-NEXT: v_mov_b32_e32 v3, 0xff800000 +// CHECK-NEXT: v_lshlrev_b64 v[4:5], v0, 1 // CHECK-NEXT: v_cmp_gt_u32_e64 s[0:1], 32, v0 -// CHECK-NEXT: v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf -// CHECK-NEXT: v_mov_b32_dpp v4, v2 row_shr:3 row_mask:0xf bank_mask:0xf // CHECK-NEXT: v_max_f32_e32 v1, v2, v1 // CHECK-NEXT: v_mov_b32_e32 v2, 0xff800000 -// CHECK-NEXT: v_max3_f32 v1, v1, v3, v4 +// CHECK-NEXT: v_mov_b32_dpp v3, v1 row_shr:2 row_mask:0xf bank_mask:0xf +// CHECK-NEXT: v_max_f32_e32 v1, v1, v3 // CHECK-NEXT: v_mov_b32_e32 v3, 0xff800000 -// CHECK-NEXT: v_lshlrev_b64 v[4:5], v0, 1 // CHECK-NEXT: v_mov_b32_dpp v2, v1 row_shr:4 row_mask:0xf bank_mask:0xe // CHECK-NEXT: v_max_f32_e32 v1, v1, v2 // CHECK-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 diff --git a/llpc/test/shaderdb/core/OpMemoryBarrier_TestMemoryBarrierBuffer_lit.frag b/llpc/test/shaderdb/core/OpMemoryBarrier_TestMemoryBarrierBuffer_lit.frag index f33ad561ae..1a3d964e63 100644 --- a/llpc/test/shaderdb/core/OpMemoryBarrier_TestMemoryBarrierBuffer_lit.frag +++ b/llpc/test/shaderdb/core/OpMemoryBarrier_TestMemoryBarrierBuffer_lit.frag @@ -17,7 +17,7 @@ void main() /* ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results -; SHADERTEST: fence acq_rel +; SHADERTEST: fence syncscope("agent") acq_rel ; SHADERTEST: AMDLLPC SUCCESS */ diff --git a/llpc/test/shaderdb/core/OpMemoryBarrier_TestMemoryBarrierImage_lit.frag b/llpc/test/shaderdb/core/OpMemoryBarrier_TestMemoryBarrierImage_lit.frag index db0c26ab99..648121dcf5 100644 --- a/llpc/test/shaderdb/core/OpMemoryBarrier_TestMemoryBarrierImage_lit.frag +++ b/llpc/test/shaderdb/core/OpMemoryBarrier_TestMemoryBarrierImage_lit.frag @@ -13,7 +13,7 @@ void main() /* ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results -; 
SHADERTEST: fence acq_rel +; SHADERTEST: fence syncscope("agent") acq_rel ; SHADERTEST: AMDLLPC SUCCESS */ diff --git a/llpc/test/shaderdb/core/OpMemoryBarrier_TestMemoryBarrierShared_lit.comp b/llpc/test/shaderdb/core/OpMemoryBarrier_TestMemoryBarrierShared_lit.comp index 016659e08b..c839780637 100644 --- a/llpc/test/shaderdb/core/OpMemoryBarrier_TestMemoryBarrierShared_lit.comp +++ b/llpc/test/shaderdb/core/OpMemoryBarrier_TestMemoryBarrierShared_lit.comp @@ -15,7 +15,7 @@ void main() /* ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results -; SHADERTEST: fence acq_rel +; SHADERTEST: fence syncscope("agent") acq_rel ; SHADERTEST: AMDLLPC SUCCESS */ diff --git a/llpc/test/shaderdb/core/OpMemoryBarrier_TestMemoryBarrier_lit.comp b/llpc/test/shaderdb/core/OpMemoryBarrier_TestMemoryBarrier_lit.comp index 49a7a3347d..7f0d1806dd 100644 --- a/llpc/test/shaderdb/core/OpMemoryBarrier_TestMemoryBarrier_lit.comp +++ b/llpc/test/shaderdb/core/OpMemoryBarrier_TestMemoryBarrier_lit.comp @@ -19,7 +19,7 @@ void main() /* ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results -; SHADERTEST: fence acq_rel +; SHADERTEST: fence syncscope("agent") acq_rel ; SHADERTEST: AMDLLPC SUCCESS */ diff --git a/llpc/test/shaderdb/core/OpSwitch_Test64BitCaseLabel_lit.spvasm b/llpc/test/shaderdb/core/OpSwitch_Test64BitCaseLabel_lit.spvasm index 6f9d974bc8..7e7bc9e56f 100644 --- a/llpc/test/shaderdb/core/OpSwitch_Test64BitCaseLabel_lit.spvasm +++ b/llpc/test/shaderdb/core/OpSwitch_Test64BitCaseLabel_lit.spvasm @@ -1,7 +1,7 @@ ; BEGIN_SHADERTEST ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results -; SHADERTEST: switch i64 %{{[0-9]+}}, label %{{[0-9]+}} [ +; SHADERTEST: switch i64 %{{.+}}, label %{{[0-9]+}} [ ; SHADERTEST: i64 0, label %{{[0-9]*}} ; SHADERTEST: i64 1, label %{{[0-9]*}} ; SHADERTEST: i64 2, label %{{[0-9]*}} diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag index 6845f3f011..74975a3767 100644 --- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag +++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallInsertion.frag @@ -18,16 +18,24 @@ void main() _3 = texture(_11[nonuniformEXT(_12)], vec2(0.0)); } -// BEGIN_SHADERTEST -/* -; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s -; Make sure that the begin indices chosen are the non-uniform offsets rather than the whole resource desc -; Make sure that there's a waterfall.readfirstlane for both the image resource desc and sample desc -; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -; SHADERTEST-DAG: call i32 @llvm.amdgcn.waterfall.begin.i32 -; SHADERTEST-DAG: call i32 @llvm.amdgcn.waterfall.begin.i32 -; SHADERTEST-DAG: call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32 -; SHADERTEST-DAG: call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32 -; SHADERTEST: AMDLLPC SUCCESS -*/ -// END_SHADERTEST +// RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +// Make sure that the begin indices chosen are the non-uniform offsets rather than the whole resource desc +// Make sure that there's a waterfall.readfirstlane for both the image resource desc and sample desc +// SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results +// 
SHADERTEST: %[[mul:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-NEXT: %[[begin:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul]]) +// SHADERTEST-NEXT: %[[readfirstlane:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin]], i32 %[[mul]]) +// SHADERTEST-NEXT: %[[sext:[0-9]+]] = sext i32 %[[readfirstlane]] to i64 +// SHADERTEST-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 3 +// SHADERTEST-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], 268435455 +// SHADERTEST-NEXT: %[[cmp:[0-9]+]] = icmp slt i32 %[[extract]], 0 +// SHADERTEST-NEXT: %[[select:[0-9]+]] = select i1 %[[cmp]], i32 %[[extract]], i32 %[[and]] +// SHADERTEST-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[select]], i64 3 +// SHADERTEST-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x i32> +// SHADERTEST-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half 0xH0000, half 0xH0000, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]]) +// SHADERTEST: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag index fbf9c25c0f..283422ccc1 100644 --- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag +++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize.frag @@ -1,6 +1,3 @@ -// Make sure that there is a single begin index -// Make sure that there is a single waterfall.readfirstlane for the offset - #version 450 #extension GL_EXT_nonuniform_qualifier : require @@ -16,18 +13,58 @@ void main() _3 = texture(_11[nonuniformEXT(_12)], _6); } -// BEGIN_SHADERTEST -// -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST-GFX %s // Explicitly check GFX10.3 ASIC variants: -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST %s -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST %s -// SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST: AMDLLPC SUCCESS +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST-GFX_10_3_0 %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck 
-check-prefix=SHADERTEST-GFX_10_3_2 %s +// Make sure that both the image resource desc and sample desc have the same index and there is only one +// waterfall.readfirstlane for both of them + +// SHADERTEST-GFX-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX: %[[mul:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX-NEXT: %[[begin:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul]]) +// SHADERTEST-GFX-NEXT: %[[readfirstlane:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin]], i32 %[[mul]]) +// SHADERTEST-GFX-NEXT: %[[sext:[0-9]+]] = sext i32 %[[readfirstlane]] to i64 +// SHADERTEST-GFX-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-GFX-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 3 +// SHADERTEST-GFX-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], 268435455 +// SHADERTEST-GFX-NEXT: %[[cmp:[0-9]+]] = icmp slt i32 %[[extract]], 0 +// SHADERTEST-GFX-NEXT: %[[select:[0-9]+]] = select i1 %[[cmp]], i32 %[[extract]], i32 %[[and]] +// SHADERTEST-GFX-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[select]], i64 3 +// SHADERTEST-GFX-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x i32> +// SHADERTEST-GFX-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-GFX-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]]) +// SHADERTEST-GFX: AMDLLPC SUCCESS +// +// SHADERTEST-GFX_10_3_0-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX_10_3_0: %[[mul:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX_10_3_0-NEXT: %[[begin:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[readfirstlane:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin]], i32 %[[mul]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[sext:[0-9]+]] = sext i32 %[[readfirstlane]] to i64 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_0-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]]) +// SHADERTEST-GFX_10_3_0: AMDLLPC SUCCESS // -// END_SHADERTEST +// SHADERTEST-GFX_10_3_2-LABEL: {{^// LLPC}} pipeline patching 
results +// SHADERTEST-GFX_10_3_2: %[[mul:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX_10_3_2-NEXT: %[[begin:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[readfirstlane:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin]], i32 %[[mul]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[sext:[0-9]+]] = sext i32 %[[readfirstlane]] to i64 +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX_10_3_2-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], -1048577 +// SHADERTEST-GFX_10_3_2-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[and]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x i32> +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_2-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]]) +// SHADERTEST-GFX_10_3_2: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag index 82cd87a930..a4b12e33a0 100644 --- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag +++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_MultiBlock.frag @@ -1,6 +1,3 @@ -// Make sure that there are two non-overlapping waterfall loops -// First is scalarized and second is vector type - #version 450 #extension GL_EXT_nonuniform_qualifier : require @@ -25,24 +22,139 @@ void main() _3 = samp0 + samp1; } -// BEGIN_SHADERTEST -// -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST-GFX %s // Explicitly check GFX10.3 ASIC variants: -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST %s -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST %s -// SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call <8 x i32> 
@llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32 -// SHADERTEST: call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST: AMDLLPC SUCCESS -// -// END_SHADERTEST +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST-GFX_10_3_0 %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST-GFX_10_3_2 %s +// Make sure that there are two non-overlapping waterfall loops +// Make sure that both the image resource desc and sample desc have the same index and there is only one +// waterfall.readfirstlane for both of them + +// SHADERTEST-GFX-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// SHADERTEST-GFX-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX-NEXT: %[[extract1:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 3 +// SHADERTEST-GFX-NEXT: %[[and1:[0-9]+]] = and i32 %[[extract1]], 268435455 +// SHADERTEST-GFX-NEXT: %[[cmp1:[0-9]+]] = icmp slt i32 %[[extract1]], 0 +// SHADERTEST-GFX-NEXT: %[[select1:[0-9]+]] = select i1 %[[cmp1]], i32 %[[extract1]], i32 %[[and1]] +// SHADERTEST-GFX-NEXT: %[[insert1:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[select1]], i64 3 +// SHADERTEST-GFX-NEXT: %[[shufflevector1:[0-9]+]] = shufflevector <8 x i32> %[[insert1]], <8 x i32> %[[load1]], <8 x i32> +// SHADERTEST-GFX-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) +// +// SHADERTEST-GFX-NEXT: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// SHADERTEST-GFX-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// SHADERTEST-GFX-NEXT: %[[extract2:[.a-z0-9]+]] = extractelement <8 x i32> %[[load3]], i64 3 +// SHADERTEST-GFX-NEXT: %[[and2:[0-9]+]] = and i32 %[[extract2]], 268435455 +// SHADERTEST-GFX-NEXT: %[[cmp2:[0-9]+]] = icmp slt i32 %[[extract2]], 0 +// SHADERTEST-GFX-NEXT: %[[select2:[0-9]+]] = select i1 %[[cmp2]], i32 %[[extract2]], i32 %[[and2]] +// SHADERTEST-GFX-NEXT: %[[insert2:[.a-z0-9]+]] = insertelement <8 x 
i32> %[[load3]], i32 %[[select2]], i64 3 +// SHADERTEST-GFX-NEXT: %[[shufflevector2:[0-9]+]] = shufflevector <8 x i32> %[[insert2]], <8 x i32> %[[load3]], <8 x i32> +// SHADERTEST-GFX-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// SHADERTEST-GFX-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector2]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// +// SHADERTEST-GFX: %[[begin3:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[readfirstlane3:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin3]], i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[sext3:[0-9]+]] = sext i32 %[[readfirstlane3]] to i64 +// SHADERTEST-GFX-NEXT: %[[gep5:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// SHADERTEST-GFX-NEXT: %[[load5:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep5]], align 32 +// SHADERTEST-GFX-NEXT: %[[extract3:[.a-z0-9]+]] = extractelement <8 x i32> %[[load5]], i64 3 +// SHADERTEST-GFX-NEXT: %[[and3:[0-9]+]] = and i32 %[[extract3]], 268435455 +// SHADERTEST-GFX-NEXT: %[[cmp3:[0-9]+]] = icmp slt i32 %[[extract3]], 0 +// SHADERTEST-GFX-NEXT: %[[select3:[0-9]+]] = select i1 %[[cmp3]], i32 %[[extract3]], i32 %[[and3]] +// SHADERTEST-GFX-NEXT: %[[insert3:[.a-z0-9]+]] = insertelement <8 x i32> %[[load5]], i32 %[[select3]], i64 3 +// SHADERTEST-GFX-NEXT: %[[shufflevector3:[0-9]+]] = shufflevector <8 x i32> %[[insert3]], <8 x i32> %[[load5]], <8 x i32> +// SHADERTEST-GFX-NEXT: %[[gep6:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// SHADERTEST-GFX-NEXT: %[[load6:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep6]], align 16 +// SHADERTEST-GFX-NEXT: [[image_call3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector3]], <4 x i32> %[[load6]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX-NEXT: %[[end3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin3]], <4 x float> %[[image_call3]]) +// SHADERTEST-GFX: AMDLLPC SUCCESS +// +// SHADERTEST-GFX_10_3_0-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX_10_3_0: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX_10_3_0-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX_10_3_0-NEXT: 
%[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_0-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) +// +// SHADERTEST-GFX_10_3_0-NEXT: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load3]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_0-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// +// SHADERTEST-GFX_10_3_0: %[[begin3:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[readfirstlane3:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin3]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[sext3:[0-9]+]] = sext i32 %[[readfirstlane3]] to i64 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep5:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load5:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep5]], align 32 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep6:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load6:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep6]], align 16 +// SHADERTEST-GFX_10_3_0-NEXT: [[image_call3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load5]], <4 x i32> %[[load6]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_0-NEXT: %[[end3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin3]], <4 x float> %[[image_call3]]) +// SHADERTEST-GFX_10_3_0: AMDLLPC SUCCESS +// +// SHADERTEST-GFX_10_3_2-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX_10_3_2: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX_10_3_2-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load1:[0-9]+]] = load <8 
x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX_10_3_2-NEXT: %[[extract1:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[and1:[0-9]+]] = and i32 %[[extract1]], -1048577 +// SHADERTEST-GFX_10_3_2-NEXT: %[[insert1:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[and1]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[shufflevector1:[0-9]+]] = shufflevector <8 x i32> %[[insert1]], <8 x i32> %[[load1]], <8 x i32> +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_2-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) +// +// SHADERTEST-GFX_10_3_2-NEXT: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// SHADERTEST-GFX_10_3_2-NEXT: %[[extract2:[.a-z0-9]+]] = extractelement <8 x i32> %[[load3]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[and2:[0-9]+]] = and i32 %[[extract2]], -1048577 +// SHADERTEST-GFX_10_3_2-NEXT: %[[insert2:[.a-z0-9]+]] = insertelement <8 x i32> %[[load3]], i32 %[[and2]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[shufflevector2:[0-9]+]] = shufflevector <8 x i32> %[[insert2]], <8 x i32> %[[load3]], <8 x i32> +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector2]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_2-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// +// SHADERTEST-GFX_10_3_2: %[[begin3:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[readfirstlane3:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin3]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[sext3:[0-9]+]] = sext i32 %[[readfirstlane3]] to i64 +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep5:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load5:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep5]], align 32 +// SHADERTEST-GFX_10_3_2-NEXT: %[[extract3:[.a-z0-9]+]] = extractelement <8 x i32> %[[load5]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[and3:[0-9]+]] = and i32 %[[extract3]], -1048577 
+// SHADERTEST-GFX_10_3_2-NEXT: %[[insert3:[.a-z0-9]+]] = insertelement <8 x i32> %[[load5]], i32 %[[and3]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[shufflevector3:[0-9]+]] = shufflevector <8 x i32> %[[insert3]], <8 x i32> %[[load5]], <8 x i32> +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep6:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext3]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load6:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep6]], align 16 +// SHADERTEST-GFX_10_3_2-NEXT: [[image_call3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector3]], <4 x i32> %[[load6]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_2-NEXT: %[[end3:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin3]], <4 x float> %[[image_call3]]) +// SHADERTEST-GFX_10_3_2: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag index 123a2bc917..0b52c6e83a 100644 --- a/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag +++ b/llpc/test/shaderdb/core/OpTypeSampledImage_TestWaterfallScalarize_SharedDesc.frag @@ -1,7 +1,3 @@ -// Make sure that there is a single begin index -// Make sure that there is a single waterfall.readfirstlane for the offset -// Make sure that there are two waterfall.end operations for the samples - #version 450 #extension GL_EXT_nonuniform_qualifier : require @@ -20,21 +16,98 @@ void main() _3 = samp0 + samp1; } -// BEGIN_SHADERTEST -// -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST-GFX %s // Explicitly check GFX10.3 ASIC variants: -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST %s -// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST %s -// SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32 -// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32 -// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32 -// SHADERTEST: AMDLLPC SUCCESS +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST-GFX_10_3_0 %s +// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST-GFX_10_3_2 %s +// Make sure that both the image resource desc and sample desc have the same index and there is only one +// waterfall.readfirstlane for both of them + +// SHADERTEST-GFX-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 
@llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// SHADERTEST-GFX-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX-NEXT: %[[extract1:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 3 +// SHADERTEST-GFX-NEXT: %[[and1:[0-9]+]] = and i32 %[[extract1]], 268435455 +// SHADERTEST-GFX-NEXT: %[[cmp1:[0-9]+]] = icmp slt i32 %[[extract1]], 0 +// SHADERTEST-GFX-NEXT: %[[select1:[0-9]+]] = select i1 %[[cmp1]], i32 %[[extract1]], i32 %[[and1]] +// SHADERTEST-GFX-NEXT: %[[insert1:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[select1]], i64 3 +// SHADERTEST-GFX-NEXT: %[[shufflevector1:[0-9]+]] = shufflevector <8 x i32> %[[insert1]], <8 x i32> %[[load1]], <8 x i32> +// SHADERTEST-GFX-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) +// +// SHADERTEST-GFX: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// SHADERTEST-GFX-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// SHADERTEST-GFX-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// SHADERTEST-GFX-NEXT: %[[extract2:[.a-z0-9]+]] = extractelement <8 x i32> %[[load3]], i64 3 +// SHADERTEST-GFX-NEXT: %[[and2:[0-9]+]] = and i32 %[[extract2]], 268435455 +// SHADERTEST-GFX-NEXT: %[[cmp2:[0-9]+]] = icmp slt i32 %[[extract2]], 0 +// SHADERTEST-GFX-NEXT: %[[select2:[0-9]+]] = select i1 %[[cmp2]], i32 %[[extract2]], i32 %[[and2]] +// SHADERTEST-GFX-NEXT: %[[insert2:[.a-z0-9]+]] = insertelement <8 x i32> %[[load3]], i32 %[[select2]], i64 3 +// SHADERTEST-GFX-NEXT: %[[shufflevector2:[0-9]+]] = shufflevector <8 x i32> %[[insert2]], <8 x i32> %[[load3]], <8 x i32> +// SHADERTEST-GFX-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// SHADERTEST-GFX-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector2]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// SHADERTEST-GFX: AMDLLPC SUCCESS + +// SHADERTEST-GFX_10_3_0-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX_10_3_0: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX_10_3_0-NEXT: %[[begin1:[0-9]+]] = 
call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_0-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) +// +// SHADERTEST-GFX_10_3_0: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_0-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// SHADERTEST-GFX_10_3_0-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_0-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// SHADERTEST-GFX_10_3_0-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load3]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_0-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// SHADERTEST-GFX_10_3_0: AMDLLPC SUCCESS + +// SHADERTEST-GFX_10_3_2-LABEL: {{^// LLPC}} pipeline patching results +// SHADERTEST-GFX_10_3_2: %[[mul1:[0-9]+]] = mul i32 %{{.*}}, 48 +// SHADERTEST-GFX_10_3_2-NEXT: %[[begin1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[readfirstlane1:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin1]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[sext1:[0-9]+]] = sext i32 %[[readfirstlane1]] to i64 +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32 +// SHADERTEST-GFX_10_3_2-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], -1048577 +// SHADERTEST-GFX_10_3_2-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[and]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x 
i32> +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext1]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16 +// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_2-NEXT: %[[end1:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin1]], <4 x float> %[[image_call1]]) // -// END_SHADERTEST +// SHADERTEST-GFX_10_3_2: %[[begin2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[readfirstlane2:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin2]], i32 %[[mul1]]) +// SHADERTEST-GFX_10_3_2-NEXT: %[[sext2:[0-9]+]] = sext i32 %[[readfirstlane2]] to i64 +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep3:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load3:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep3]], align 32 +// SHADERTEST-GFX_10_3_2-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load3]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], -1048577 +// SHADERTEST-GFX_10_3_2-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load3]], i32 %[[and]], i64 6 +// SHADERTEST-GFX_10_3_2-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load3]], <8 x i32> +// SHADERTEST-GFX_10_3_2-NEXT: %[[gep4:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext2]] +// SHADERTEST-GFX_10_3_2-NEXT: %[[load4:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep4]], align 16 +// SHADERTEST-GFX_10_3_2-NEXT: %[[image_call2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load4]], i1 false, i32 0, i32 0) +// SHADERTEST-GFX_10_3_2-NEXT: %[[end2:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin2]], <4 x float> %[[image_call2]]) +// SHADERTEST-GFX_10_3_2: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/extensions/ExtShaderInt8_TestSharedVarLoadStore_lit.comp b/llpc/test/shaderdb/extensions/ExtShaderInt8_TestSharedVarLoadStore_lit.comp index f20cdccf47..48598a48f4 100644 --- a/llpc/test/shaderdb/extensions/ExtShaderInt8_TestSharedVarLoadStore_lit.comp +++ b/llpc/test/shaderdb/extensions/ExtShaderInt8_TestSharedVarLoadStore_lit.comp @@ -42,14 +42,15 @@ void main() // BEGIN_SHADERTEST /* ; RUN: amdllpc -enable-load-scalarizer=false -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +; REQUIRES: do-not-run-me ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results ; SHADERTEST-COUNT-4: getelementptr {{.*}}[4 x { i8, <2 x i8>, <3 x i8>, <4 x i8> }], ptr addrspace(3) @{{.*}}, i32 0, i32 {{%?[0-9]+}}, i32 {{[0-3]}} ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results ; SHADERTEST: load i8, ptr addrspace(3) @{{.*}}, align {{1|4}} ; SHADERTEST: store i8 %{{[0-9]*}}, ptr addrspace(3) @{{.*}}, align {{1|4}} -; SHADERTEST-COUNT-3: load <{{[2-4]}} x i8>, ptr addrspace(3) getelementptr inbounds ([4 x { i8, <2 x i8>, <3 x i8>, <4 x i8> }], ptr addrspace(3) @{{.*}}, 
i32 0, i32 {{[1-3]}}, i32 {{[1-3]}}), align {{[2|4]}} -; SHADERTEST-COUNT-3: store <{{[2-4]}} x i8> %{{[0-9]*}}, ptr addrspace(3) getelementptr inbounds ([4 x { i8, <2 x i8>, <3 x i8>, <4 x i8> }], ptr addrspace(3) @{{.*}}, i32 0, i32 {{[1-3]}}, i32 {{[1-3]}}), align {{[2|4]}} +; SHADERTEST-COUNT-3: load <{{[2-4]}} x i8>, ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @{{.*}}, i32 {{14|28|44}}), align {{[2|4]}} +; SHADERTEST-COUNT-3: store <{{[2-4]}} x i8> %{{[0-9]*}}, ptr addrspace(3) getelementptr inbounds (i8, ptr addrspace(3) @{{.*}}, i32 {{14|28|44}}), align {{[2|4]}} ; SHADERTEST: AMDLLPC SUCCESS */ // END_SHADERTEST diff --git a/llpc/test/shaderdb/extensions/ExtShaderVote_TestGeneral_lit.frag b/llpc/test/shaderdb/extensions/ExtShaderVote_TestGeneral_lit.frag index 3a79067deb..2c0917fc4a 100644 --- a/llpc/test/shaderdb/extensions/ExtShaderVote_TestGeneral_lit.frag +++ b/llpc/test/shaderdb/extensions/ExtShaderVote_TestGeneral_lit.frag @@ -35,8 +35,8 @@ void main(void) ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results ; SHADERTEST: call i1 @lgc.subgroup.any( -; SHADERTEST: call i1 (...) @lgc.create.subgroup.all.i1( -; SHADERTEST: call i1 (...) @lgc.create.subgroup.all.equal.i1( +; SHADERTEST: call i1 @lgc.subgroup.all( +; SHADERTEST: call i1 (...) @lgc.subgroup.all.equal( ; SHADERTEST: call void @lgc.output.export.generic{{.*}}v2f32 ; SHADERTEST: AMDLLPC SUCCESS */ diff --git a/llpc/test/shaderdb/general/PipelineMesh_OutputPackingInLds.pipe b/llpc/test/shaderdb/general/PipelineMesh_OutputPackingInLds.pipe deleted file mode 100644 index c0f7a2fdba..0000000000 --- a/llpc/test/shaderdb/general/PipelineMesh_OutputPackingInLds.pipe +++ /dev/null @@ -1,107 +0,0 @@ -; This test is to check output packing of mesh shader in LDS space. If we treat each location -; of a mesh shader output as vec4, the LDS usage will exceed HW limitation. But if we pack -; those outputs tightly in LDS space, the LDS usage will be reduced greatly. -; -; In this test, we have 25 vertex outputs and 2 primitive outputs. Further, the mesh shader -; uses 2500 dwords shared variable. All consumes LDS space. If output packing in LDS space -; is not performed, the outputs will consume 4 * (25 + 2) * 128 = 13824 dwords. If packing -; is enabled, the LDS consumption will be reduced to (25 + 2) * 128 = 3456. The effect is -; noticeable. 
- -; BEGIN_SHADERTEST -; RUN: amdllpc -v -gfxip=10.3 %s | FileCheck -check-prefix=SHADERTEST %s - -; SHADERTEST-LABEL: // LLPC mesh shader LDS region info (in dwords) and general info - -; SHADERTEST-LABEL: Per-vertex Output : offset = 0x0083, size = 0x0C80 -; SHADERTEST-LABEL: Per-primitive Output : offset = 0x0D03, size = 0x0100 - -; SHADERTEST-LABEL: Vertex Outputs Layout (stride = 25, exports = 25): -; SHADERTEST-LABEL: -- location = 0, components = 1, offset = 0 -; SHADERTEST-LABEL: -- location = 1, components = 1, offset = 1 -; SHADERTEST-LABEL: -- location = 2, components = 1, offset = 2 -; SHADERTEST-LABEL: -- location = 3, components = 1, offset = 3 -; SHADERTEST-LABEL: -- location = 4, components = 1, offset = 4 -; SHADERTEST-LABEL: -- location = 5, components = 1, offset = 5 -; SHADERTEST-LABEL: -- location = 6, components = 1, offset = 6 -; SHADERTEST-LABEL: -- location = 7, components = 1, offset = 7 -; SHADERTEST-LABEL: -- location = 8, components = 1, offset = 8 -; SHADERTEST-LABEL: -- location = 9, components = 1, offset = 9 -; SHADERTEST-LABEL: -- location = 10, components = 1, offset = 10 -; SHADERTEST-LABEL: -- location = 11, components = 1, offset = 11 -; SHADERTEST-LABEL: -- location = 12, components = 1, offset = 12 -; SHADERTEST-LABEL: -- location = 13, components = 1, offset = 13 -; SHADERTEST-LABEL: -- location = 14, components = 1, offset = 14 -; SHADERTEST-LABEL: -- location = 15, components = 1, offset = 15 -; SHADERTEST-LABEL: -- location = 16, components = 1, offset = 16 -; SHADERTEST-LABEL: -- location = 17, components = 1, offset = 17 -; SHADERTEST-LABEL: -- location = 18, components = 1, offset = 18 -; SHADERTEST-LABEL: -- location = 19, components = 1, offset = 19 -; SHADERTEST-LABEL: -- location = 20, components = 1, offset = 20 -; SHADERTEST-LABEL: -- location = 21, components = 1, offset = 21 -; SHADERTEST-LABEL: -- location = 22, components = 1, offset = 22 -; SHADERTEST-LABEL: -- location = 23, components = 1, offset = 23 -; SHADERTEST-LABEL: -- location = 24, components = 1, offset = 24 - -; SHADERTEST-LABEL: Primitive outputs layout (stride = 2, exports = 2): -; SHADERTEST-LABEL: -- location = 0, components = 1, offset = 0 -; SHADERTEST-LABEL: -- location = 1, components = 1, offset = 1 - -; SHADERTEST: AMDLLPC SUCCESS -; END_SHADERTEST - -[MeshGlsl] -#version 460 core - -#extension GL_EXT_mesh_shader: enable -#extension GL_EXT_shader_explicit_arithmetic_types: enable - -layout(local_size_x=128, local_size_y=1, local_size_z=1) in; -layout(points, max_vertices = 128, max_primitives = 128) out; - -layout(location = 0) out float vertex[][25]; - -layout(location = 25) out perprimitiveEXT float primitive[][2]; - -shared float sharedVar[2500]; - -void main() { - SetMeshOutputsEXT(128, 128); - - for (int i = 0; i < 25; i++) - vertex[gl_LocalInvocationIndex][i] = float(i / 25.0); - - primitive[gl_LocalInvocationIndex][0] = 0.0; - primitive[gl_LocalInvocationIndex][1] = 0.5; - - sharedVar[gl_LocalInvocationIndex] = float(gl_LocalInvocationIndex); -} - -[MeshInfo] -entryPoint = main - -[FsGlsl] -#version 460 core - -#extension GL_EXT_mesh_shader: enable - -layout(location = 0) in float vertex[25]; -layout(location = 25) in perprimitiveEXT float primitive[2]; - -layout(location = 0) out vec4 outColor; - -void main() { - outColor = vec4(0.0); - - for (int i = 0; i < 25; i++) - outColor.x += vertex[i]; - - outColor.y += primitive[0]; - outColor.y += primitive[1]; -} - -[FsInfo] -entryPoint = main - -[GraphicsPipelineState] -patchControlPoints = 3 diff --git 
a/llpc/test/shaderdb/general/PipelineVsFs_TestNullFs.pipe b/llpc/test/shaderdb/general/PipelineVsFs_TestNullFs.pipe index 2b902e0b5c..65b4eb5fe6 100644 --- a/llpc/test/shaderdb/general/PipelineVsFs_TestNullFs.pipe +++ b/llpc/test/shaderdb/general/PipelineVsFs_TestNullFs.pipe @@ -169,7 +169,7 @@ colorBuffer[0].blendSrcAlphaToColor = 0 ; CHECK-NEXT: .entry_point: _amdgpu_ps_main ; CHECK-NEXT: .float_mode: 0xc0 ; CHECK-NEXT: .ieee_mode: false -; CHECK-NEXT: .mem_ordered: true +; CHECK: .mem_ordered: true ; CHECK-NEXT: .scratch_en: false ; CHECK-NEXT: .scratch_memory_size: 0 ; CHECK-NEXT: .sgpr_count: 0x2 @@ -222,7 +222,7 @@ colorBuffer[0].blendSrcAlphaToColor = 0 ; CHECK-NEXT: .entry_point: _amdgpu_vs_main ; CHECK-NEXT: .float_mode: 0xc0 ; CHECK-NEXT: .ieee_mode: false -; CHECK-NEXT: .mem_ordered: true +; CHECK: .mem_ordered: true ; CHECK-NEXT: .scratch_en: false ; CHECK-NEXT: .scratch_memory_size: 0 ; CHECK-NEXT: .sgpr_count: 0x3 diff --git a/llpc/test/shaderdb/general/TestWorkgroupMemoryLayout.spvasm b/llpc/test/shaderdb/general/TestWorkgroupMemoryLayout.spvasm index b1316ec147..a4c8ce1f79 100644 --- a/llpc/test/shaderdb/general/TestWorkgroupMemoryLayout.spvasm +++ b/llpc/test/shaderdb/general/TestWorkgroupMemoryLayout.spvasm @@ -4,6 +4,7 @@ ; BEGIN_SHADERTEST ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +; REQUIRES: do-not-run-me ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST: @[[LDS0:[^ ]*]] = addrspace(3) global <{ [8 x i32] }> poison, align 4 ; SHADERTEST: @[[LDS1:[^ ]*]] = addrspace(3) global <{ [4 x i32] }> poison, align 4 @@ -28,20 +29,20 @@ ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results ; SHADERTEST: @[[LDS:[^ ]*]] = local_unnamed_addr addrspace(3) global <{ [8 x i32] }> poison, align 4 ; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) @[[LDS]], align 4 -; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}(<{ [8 x i32] }>, ptr addrspace(3) @[[LDS]], i32 0, i32 0, i32 1), align 4 -; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}(<{ [8 x i32] }>, ptr addrspace(3) @[[LDS]], i32 0, i32 0, i32 2), align 4 -; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}(<{ [8 x i32] }>, ptr addrspace(3) @[[LDS]], i32 0, i32 0, i32 3), align 4 -; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}(<{ [8 x i32] }>, ptr addrspace(3) @[[LDS]], i32 0, i32 0, i32 4), align 4 -; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}(<{ [8 x i32] }>, ptr addrspace(3) @[[LDS]], i32 0, i32 0, i32 5), align 4 -; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}(<{ [8 x i32] }>, ptr addrspace(3) @[[LDS]], i32 0, i32 0, i32 6), align 4 -; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}(<{ [8 x i32] }>, ptr addrspace(3) @[[LDS]], i32 0, i32 0, i32 7), align 4 +; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}(i8, ptr addrspace(3) @[[LDS]], i32 4), align 4 +; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}(i8, ptr addrspace(3) @[[LDS]], i32 8), align 4 +; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}(i8, ptr addrspace(3) @[[LDS]], i32 12), align 4 +; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}(i8, ptr addrspace(3) @[[LDS]], i32 16), align 4 +; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}(i8, ptr addrspace(3) @[[LDS]], i32 20), align 4 +; SHADERTEST: store i32 
%{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}(i8, ptr addrspace(3) @[[LDS]], i32 24), align 4 +; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{.*}}(i8, ptr addrspace(3) @[[LDS]], i32 28), align 4 ; SHADERTEST: load i32, ptr addrspace(3) @[[LDS]], align 4 -; SHADERTEST: load i32, ptr addrspace(3) getelementptr {{.*}}(<{ [8 x i32] }>, ptr addrspace(3) @[[LDS]], i32 0, i32 0, i32 1), align 4 -; SHADERTEST: load i32, ptr addrspace(3) getelementptr {{.*}}(<{ [8 x i32] }>, ptr addrspace(3) @[[LDS]], i32 0, i32 0, i32 2), align 4 -; SHADERTEST: load i32, ptr addrspace(3) getelementptr {{.*}}(<{ [8 x i32] }>, ptr addrspace(3) @[[LDS]], i32 0, i32 0, i32 3), align 4 -; SHADERTEST: load i32, ptr addrspace(3) getelementptr {{.*}}(<{ [8 x i32] }>, ptr addrspace(3) @[[LDS]], i32 0, i32 0, i32 4), align 4 -; SHADERTEST: load i32, ptr addrspace(3) getelementptr {{.*}}(<{ [8 x i32] }>, ptr addrspace(3) @[[LDS]], i32 0, i32 0, i32 5), align 4 -; SHADERTEST: load i32, ptr addrspace(3) getelementptr {{.*}}(<{ [8 x i32] }>, ptr addrspace(3) @[[LDS]], i32 0, i32 0, i32 6), align 4 +; SHADERTEST: load i32, ptr addrspace(3) getelementptr {{.*}}(i8, ptr addrspace(3) @[[LDS]], i32 4), align 4 +; SHADERTEST: load i32, ptr addrspace(3) getelementptr {{.*}}(i8, ptr addrspace(3) @[[LDS]], i32 8), align 4 +; SHADERTEST: load i32, ptr addrspace(3) getelementptr {{.*}}(i8, ptr addrspace(3) @[[LDS]], i32 12), align 4 +; SHADERTEST: load i32, ptr addrspace(3) getelementptr {{.*}}(i8, ptr addrspace(3) @[[LDS]], i32 16), align 4 +; SHADERTEST: load i32, ptr addrspace(3) getelementptr {{.*}}(i8, ptr addrspace(3) @[[LDS]], i32 20), align 4 +; SHADERTEST: load i32, ptr addrspace(3) getelementptr {{.*}}(i8, ptr addrspace(3) @[[LDS]], i32 24), align 4 ; index = 7 is optimized. ; SHADERTEST: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/gfx10/PipelineVsFs_TestVsOutMiscSideBusEna.pipe b/llpc/test/shaderdb/gfx10/PipelineVsFs_TestVsOutMiscSideBusEna.pipe index a2d8a6b716..074a293e19 100644 --- a/llpc/test/shaderdb/gfx10/PipelineVsFs_TestVsOutMiscSideBusEna.pipe +++ b/llpc/test/shaderdb/gfx10/PipelineVsFs_TestVsOutMiscSideBusEna.pipe @@ -193,7 +193,7 @@ entryPoint = main ; SHADERTEST-NEXT: .entry_point: _amdgpu_ps_main ; SHADERTEST-NEXT: .float_mode: 0xc0 ; SHADERTEST-NEXT: .ieee_mode: false -; SHADERTEST-NEXT: .mem_ordered: true +; SHADERTEST: .mem_ordered: true ; SHADERTEST-NEXT: .scratch_en: false ; SHADERTEST-NEXT: .scratch_memory_size: 0 ; SHADERTEST-NEXT: .sgpr_count: 0x2 @@ -246,7 +246,7 @@ entryPoint = main ; SHADERTEST-NEXT: .entry_point: _amdgpu_vs_main ; SHADERTEST-NEXT: .float_mode: 0xc0 ; SHADERTEST-NEXT: .ieee_mode: false -; SHADERTEST-NEXT: .mem_ordered: true +; SHADERTEST: .mem_ordered: true ; SHADERTEST-NEXT: .scratch_en: false ; SHADERTEST-NEXT: .scratch_memory_size: 0 ; SHADERTEST-NEXT: .sgpr_count: 0x3 diff --git a/llpc/test/shaderdb/gfx11/HalfAttribute.frag b/llpc/test/shaderdb/gfx11/HalfAttribute.frag new file mode 100644 index 0000000000..1f34547b47 --- /dev/null +++ b/llpc/test/shaderdb/gfx11/HalfAttribute.frag @@ -0,0 +1,22 @@ +// Check that f16 attribute was interpolated using rtz intrinsic. 
+ +// RUN: amdllpc %gfxip --v %s |\ +// RUN: FileCheck %s --check-prefix=CHECK +// +// CHECK-LABEL: {{^}}// LLPC pipeline patching results +// CHECK: [[P:%.*]] = call float @llvm.amdgcn.lds.param.load(i32 immarg 0, i32 immarg 0, i32 %PrimMask) #1 +// CHECK: [[P1:%.*]] = call float @llvm.amdgcn.interp.p10.rtz.f16(float [[P]], float %PerspInterpCenter.i0, float [[P]], i1 false) +// CHECK: [[P2:%.*]] = call half @llvm.amdgcn.interp.p2.rtz.f16(float [[P]], float %PerspInterpCenter.i1, float [[P1]], i1 false) +// CHECK-LABEL: {{^}}===== AMDLLPC SUCCESS ===== + + +#version 450 +#extension GL_AMD_gpu_shader_half_float: enable + +layout (location = 0) in f16vec2 texCoordIn; +layout (binding = 0) uniform sampler2D image1; +layout (location = 0) out vec4 fragColor; + +void main() { + fragColor = texture(image1, texCoordIn); +} diff --git a/llpc/test/shaderdb/gfx11/SgprUserDataInit_Fs.pipe b/llpc/test/shaderdb/gfx11/SgprUserDataInit_Fs.pipe index 30a256a175..32b699566e 100644 --- a/llpc/test/shaderdb/gfx11/SgprUserDataInit_Fs.pipe +++ b/llpc/test/shaderdb/gfx11/SgprUserDataInit_Fs.pipe @@ -298,7 +298,7 @@ colorBuffer[0].blendSrcAlphaToColor = 0 ; CHECK-NEXT: .ieee_mode: false ; CHECK-NEXT: .image_op: false ; CHECK-NEXT: .lds_size: 0 -; CHECK-NEXT: .mem_ordered: true +; CHECK: .mem_ordered: true ; CHECK-NEXT: .offchip_lds_en: false ; CHECK-NEXT: .scratch_en: false ; CHECK-NEXT: .scratch_memory_size: 0 @@ -350,7 +350,7 @@ colorBuffer[0].blendSrcAlphaToColor = 0 ; CHECK-NEXT: .float_mode: 0xc0 ; CHECK-NEXT: .ieee_mode: false ; CHECK-NEXT: .image_op: true -; CHECK-NEXT: .mem_ordered: true +; CHECK: .mem_ordered: true ; CHECK-NEXT: .scratch_en: false ; CHECK-NEXT: .scratch_memory_size: 0 ; CHECK-NEXT: .sgpr_count: 0x11 @@ -359,7 +359,7 @@ colorBuffer[0].blendSrcAlphaToColor = 0 ; CHECK-NEXT: .user_data_reg_map: ; CHECK-NEXT: - 0x10000000 ; CHECK-NEXT: - 0x12 -; CHECK-NEXT: - 0x10000022 +; CHECK-NEXT: - 0x10000023 ; CHECK-NEXT: - 0xffffffff ; CHECK-NEXT: - 0xffffffff ; CHECK-NEXT: - 0xffffffff diff --git a/llpc/test/shaderdb/gfx11/cooperativeMatrix/array-of-matrices.comp b/llpc/test/shaderdb/gfx11/cooperativeMatrix/array-of-matrices.comp index f483594686..4ca900bf0d 100644 --- a/llpc/test/shaderdb/gfx11/cooperativeMatrix/array-of-matrices.comp +++ b/llpc/test/shaderdb/gfx11/cooperativeMatrix/array-of-matrices.comp @@ -30,32 +30,32 @@ void main() { coopMatStore(A[idx2], buf.x, 64, 4, 0); } -// CHECK-LABEL: @lgc.shader.CS.main( -// CHECK-LABEL: .entry: +// CHECK-LABEL: @lgc.shader.CS.main( +// CHECK-NEXT: .entry: // CHECK-NEXT: [[TMP0:%.*]] = call ptr addrspace(4) (...) 
@lgc.create.load.push.constants.ptr.p4() // CHECK-NEXT: [[TMP1:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 0, i32 0, i32 2) -// CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p7.i32.i1.i32.i32.i32(ptr addrspace(7) [[TMP1]], i32 32, i1 true, i32 1, i32 0, i32 0) #[[ATTR1:[0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds <{ [4294967295 x [4 x i32]] }>, ptr addrspace(7) [[TMP1]], i32 0, i32 0, i32 32 -// CHECK-NEXT: [[TMP4:%.*]] = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p7.i32.i1.i32.i32.i32(ptr addrspace(7) [[TMP3]], i32 32, i1 true, i32 1, i32 0, i32 0) #[[ATTR1]] -// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[TMP0]], align 4 -// CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 -// CHECK-NEXT: br i1 [[TMP6]], label %[[LABEL7:.*]], label %[[LABEL12:.*]] -// CHECK: [[LABEL7]]: -// CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP5]], 1 -// CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], <8 x float> [[TMP4]], <8 x float> [[TMP2]] -// CHECK-NEXT: [[TMP10:%.*]] = call half @lgc.cooperative.matrix.extract.f16.v8f32.i32.i32.i32(<8 x float> [[TMP9]], i32 3, i32 1, i32 0) #[[ATTR3:[0-9]+]] -// CHECK-NEXT: [[TMP11:%.*]] = fptoui half [[TMP10]] to i32 -// CHECK-NEXT: br label %[[LABEL12]] -// CHECK: [[LABEL12]]: -// CHECK-NEXT: [[TMP13:%.*]] = phi i32 [ 0, [[DOTENTRY:%.*]] ], [ [[TMP11]], %[[LABEL7]] ] -// CHECK-NEXT: store i32 [[TMP13]], ptr addrspace(7) [[TMP1]], align 4 -// CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds <{ i32, i32 }>, ptr addrspace(4) [[TMP0]], i64 0, i32 1 -// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(4) [[TMP14]], align 4 -// CHECK-NEXT: [[TMP16:%.*]] = icmp ult i32 [[TMP15]], 2 -// CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP15]], 1 -// CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], <8 x float> [[TMP4]], <8 x float> [[TMP2]] -// CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP16]], <8 x float> [[TMP18]], <8 x float> zeroinitializer -// CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds <{ [4294967295 x [4 x i32]] }>, ptr addrspace(7) [[TMP1]], i32 0, i32 0, i32 64 -// CHECK-NEXT: call void @lgc.cooperative.matrix.store.p7.i32.i1.i32.i32.i32.v8f32(ptr addrspace(7) [[TMP20]], i32 64, i1 true, i32 1, i32 0, i32 0, <8 x float> [[TMP19]]) #[[ATTR2:[0-9]+]] +// CHECK-NEXT: [[LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) [[TMP1]], i32 32, i1 true, i32 1, i32 0, i32 0, i32 16) +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP1]], i32 512 +// CHECK-NEXT: [[LOAD2:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) [[TMP2]], i32 32, i1 true, i32 1, i32 0, i32 0, i32 16) +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP0]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 2 +// CHECK-NEXT: br i1 [[TMP4]], label [[TMP5:%.*]], label [[TMP10:%.*]] +// CHECK: 5: +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP3]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], <8 x float> [[LOAD2]], <8 x float> [[LOAD]] +// CHECK-NEXT: [[TMP8:%.*]] = call half (...) 
@lgc.cooperative.matrix.extract__f16(<8 x float> [[TMP7]], i32 3, i32 1, i32 0) +// CHECK-NEXT: [[TMP9:%.*]] = fptoui half [[TMP8]] to i32 +// CHECK-NEXT: br label [[TMP10]] +// CHECK: 10: +// CHECK-NEXT: [[TMP11:%.*]] = phi i32 [ 0, [[DOTENTRY:%.*]] ], [ [[TMP9]], [[TMP5]] ] +// CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(7) [[TMP1]], align 4 +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TMP0]], i64 4 +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(4) [[TMP12]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = icmp ult i32 [[TMP13]], 2 +// CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP13]], 1 +// CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], <8 x float> [[LOAD2]], <8 x float> [[LOAD]] +// CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP14]], <8 x float> [[TMP16]], <8 x float> zeroinitializer +// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP1]], i32 1024 +// CHECK-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr addrspace(7) [[TMP18]], i32 64, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP17]]) // CHECK-NEXT: ret void // diff --git a/llpc/test/shaderdb/gfx11/cooperativeMatrix/extract-insert.spvasm b/llpc/test/shaderdb/gfx11/cooperativeMatrix/extract-insert.spvasm index 422ae6f48e..bd0816b5e8 100644 --- a/llpc/test/shaderdb/gfx11/cooperativeMatrix/extract-insert.spvasm +++ b/llpc/test/shaderdb/gfx11/cooperativeMatrix/extract-insert.spvasm @@ -119,29 +119,29 @@ OpReturn OpFunctionEnd -; CHECK-LABEL: @lgc.shader.CS.main( -; CHECK-LABEL: .entry: -; CHECK-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 2, i32 0, i32 2) -; CHECK-NEXT: [[TMP1:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 1, i32 0, i32 2) -; CHECK-NEXT: [[TMP2:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 0, i32 0, i32 2) -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p7.i32.i1.i32.i32.i32(ptr addrspace(7) [[TMP2]], i32 32, i1 true, i32 1, i32 0, i32 0) #[[ATTR1:[0-9]+]] -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p7.i32.i1.i32.i32.i32(ptr addrspace(7) [[TMP1]], i32 32, i1 true, i32 1, i32 0, i32 0) #[[ATTR1]] -; CHECK-NEXT: br label [[TMP5:%.*]] +; CHECK-LABEL: @lgc.shader.CS.main( +; CHECK-NEXT: .entry: +; CHECK-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 1, i32 0, i32 2) +; CHECK-NEXT: [[TMP1:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 0, i32 0, i32 2) +; CHECK-NEXT: [[LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) [[TMP1]], i32 32, i1 true, i32 1, i32 0, i32 0, i32 16) +; CHECK-NEXT: [[LOAD1:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) [[TMP0]], i32 32, i1 true, i32 1, i32 0, i32 0, i32 16) +; CHECK-NEXT: br label [[TMP2:%.*]] +; CHECK: 2: +; CHECK-NEXT: [[DOT012:%.*]] = phi i32 [ 0, [[DOTENTRY:%.*]] ], [ [[TMP10:%.*]], [[TMP5:%.*]] ] +; CHECK-NEXT: [[DOT0:%.*]] = phi <8 x float> [ undef, [[DOTENTRY]] ], [ [[TMP9:%.*]], [[TMP5]] ] +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @lgc.cooperative.matrix.length(i32 0) +; CHECK-NEXT: [[TMP4:%.*]] = icmp slt i32 [[DOT012]], [[TMP3]] +; CHECK-NEXT: [[COND_FREEZE:%.*]] = freeze i1 [[TMP4]] +; CHECK-NEXT: br i1 [[COND_FREEZE]], label [[TMP5]], label [[TMP11:%.*]] ; CHECK: 5: -; CHECK-NEXT: [[DOT011:%.*]] = phi i32 [ 0, [[DOTENTRY:%.*]] ], [ [[TMP13:%.*]], [[TMP8:%.*]] ] -; CHECK-NEXT: [[DOT0:%.*]] = phi <8 x float> [ undef, [[DOTENTRY]] ], [ [[TMP12:%.*]], [[TMP8]] ] -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @lgc.cooperative.matrix.length.i32.i32.i32(i32 1, i32 0) #[[ATTR2:[0-9]+]] -; CHECK-NEXT: [[TMP7:%.*]] = icmp slt i32 [[DOT011]], [[TMP6]] -; CHECK-NEXT: [[FR:%.*]] = freeze i1 [[TMP7]] -; CHECK-NEXT: br i1 [[FR]], label [[TMP8]], label [[TMP14:%.*]] -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call half @lgc.cooperative.matrix.extract.f16.v8f32.i32.i32.i32(<8 x float> [[TMP3]], i32 [[DOT011]], i32 1, i32 0) #[[ATTR2]] -; CHECK-NEXT: [[TMP10:%.*]] = call half @lgc.cooperative.matrix.extract.f16.v8f32.i32.i32.i32(<8 x float> [[TMP4]], i32 [[DOT011]], i32 1, i32 0) #[[ATTR2]] -; CHECK-NEXT: [[TMP11:%.*]] = fmul reassoc nnan nsz arcp contract afn half [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP12]] = call <8 x float> @lgc.cooperative.matrix.insert.v8f32.v8f32.f16.i32.i32.i32(<8 x float> [[DOT0]], half [[TMP11]], i32 [[DOT011]], i32 1, i32 0) #[[ATTR2]] -; CHECK-NEXT: [[TMP13]] = add i32 [[DOT011]], 1 -; CHECK-NEXT: br label [[TMP5]], !llvm.loop [[LOOP8:![0-9]+]] -; CHECK: 14: -; CHECK-NEXT: call void @lgc.cooperative.matrix.store.p7.i32.i1.i32.i32.i32.v8f32(ptr addrspace(7) [[TMP0]], i32 64, i1 true, i32 1, i32 0, i32 0, <8 x float> [[DOT0]]) #[[ATTR3:[0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = call half (...) @lgc.cooperative.matrix.extract__f16(<8 x float> [[LOAD]], i32 [[DOT012]], i32 1, i32 0) +; CHECK-NEXT: [[TMP7:%.*]] = call half (...) @lgc.cooperative.matrix.extract__f16(<8 x float> [[LOAD1]], i32 [[DOT012]], i32 1, i32 0) +; CHECK-NEXT: [[TMP8:%.*]] = fmul reassoc nnan nsz arcp contract afn half [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9]] = call <8 x float> (...) @lgc.cooperative.matrix.insert__v8f32(<8 x float> [[DOT0]], half [[TMP8]], i32 [[DOT012]], i32 1, i32 0) +; CHECK-NEXT: [[TMP10]] = add i32 [[DOT012]], 1 +; CHECK-NEXT: br label [[TMP2]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: 11: +; CHECK-NEXT: [[TMP12:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 2, i32 0, i32 2) +; CHECK-NEXT: call void (...) 
@lgc.cooperative.matrix.store(ptr addrspace(7) [[TMP12]], i32 64, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[DOT0]]) ; CHECK-NEXT: ret void ; diff --git a/llpc/test/shaderdb/gfx11/cooperativeMatrix/lit.local.cfg b/llpc/test/shaderdb/gfx11/cooperativeMatrix/lit.local.cfg deleted file mode 100644 index a4266bc874..0000000000 --- a/llpc/test/shaderdb/gfx11/cooperativeMatrix/lit.local.cfg +++ /dev/null @@ -1,2 +0,0 @@ -if "vki_cooperative_matrix" not in config.available_features: - config.unsupported = True diff --git a/llpc/test/shaderdb/gfx11/cooperativeMatrix/loadstore-uvec4.comp b/llpc/test/shaderdb/gfx11/cooperativeMatrix/loadstore-uvec4.comp index df715b16a1..7b5e948de7 100644 --- a/llpc/test/shaderdb/gfx11/cooperativeMatrix/loadstore-uvec4.comp +++ b/llpc/test/shaderdb/gfx11/cooperativeMatrix/loadstore-uvec4.comp @@ -20,11 +20,11 @@ void main() { coopMatStore(matrix, bufOut.x, 0, 4, 0); } -// CHECK-LABEL: @lgc.shader.CS.main( -// CHECK-LABEL: .entry: -// CHECK-NEXT: [[TMP0:%[0-9]*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 1, i32 0, i32 2) -// CHECK-NEXT: [[TMP1:%[0-9]*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 0, i32 0, i32 2) -// CHECK-NEXT: [[TMP2:%[0-9]*]] = call <8 x float> @lgc.cooperative.matrix.load.v8f32.p7.i32.i1.i32.i32.i32(ptr addrspace(7) [[TMP1]], i32 64, i1 true, i32 1, i32 0, i32 0) #[[ATTR1:[0-9]+]] -// CHECK-NEXT: call void @lgc.cooperative.matrix.store.p7.i32.i1.i32.i32.i32.v8f32(ptr addrspace(7) [[TMP0]], i32 64, i1 true, i32 1, i32 0, i32 0, <8 x float> [[TMP2]]) #[[ATTR2:[0-9]+]] +// CHECK-LABEL: @lgc.shader.CS.main( +// CHECK-NEXT: .entry: +// CHECK-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 1, i32 0, i32 2) +// CHECK-NEXT: [[TMP1:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 0, i32 0, i32 2) +// CHECK-NEXT: [[LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) [[TMP1]], i32 64, i1 true, i32 1, i32 0, i32 0, i32 16) +// CHECK-NEXT: call void (...) @lgc.cooperative.matrix.store(ptr addrspace(7) [[TMP0]], i32 64, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[LOAD]]) // CHECK-NEXT: ret void // diff --git a/llpc/test/shaderdb/ray_tracing/PipelineRays_Continufy.pipe b/llpc/test/shaderdb/ray_tracing/PipelineRays_Continufy.pipe deleted file mode 100644 index 586e8ec6bc..0000000000 --- a/llpc/test/shaderdb/ray_tracing/PipelineRays_Continufy.pipe +++ /dev/null @@ -1,153 +0,0 @@ -; Check that the ray tracing continufy mode is working. -; Generating the instruction 'image_bvh64_intersect_ray' indicates the trace ray library is linked correctly. - -; TODO: Change this to ISA / assembly output checks once the LLVM backend has settled - -; RUN: amdllpc -gfxip 11.0 -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK %s - -; CHECK-LABEL: @_amdgpu_cs_main( -; CHECK: call void {{.*}} @llvm.amdgcn.cs.chain. - -; CHECK-LABEL: @_rgen_1( -; CHECK-NOT: br i1 -; CHECK: call void {{.*}} @llvm.amdgcn.cs.chain. - -; CHECK-LABEL: @_rgen_1.resume.0( -; CHECK: call void {{.*}} @llvm.amdgcn.cs.chain. -; CHECK: unreachable -; CHECK: ret void - -; CHECK-LABEL: @_chit_2( -; CHECK-NOT: br i1 -; CHECK: call void {{.*}} @llvm.amdgcn.cs.chain. - -; CHECK-LABEL: @_cs_( -; CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray. -; CHECK-NOT: ret void -; CHECK: call void {{.*}} @llvm.amdgcn.cs.chain. 
-; CHECK-NOT: ret void - -[Version] -version = 69 - -[rgenGlsl] -#version 460 -#extension GL_EXT_ray_tracing : require - -struct RayPayload { - vec3 color; -}; - -layout(binding = 0, set = 0) uniform accelerationStructureEXT g_bvh; -layout(binding = 1, set = 0, rgba32f) uniform image2D g_dst; - -layout(location = 14) rayPayloadEXT RayPayload g_ray; - -void main() { - vec3 origin; - origin.x = gl_LaunchIDEXT.x; - origin.y = gl_LaunchIDEXT.y; - origin.z = 0; - - traceRayEXT(g_bvh, /* ray flags */ 0, /* cull mask */ 0xff, - /* sbt offset */ 0, /* sbt stride */ 1, /* miss index */ 0, - origin.xyz, /* tmin */ 0.0, /* direction */ vec3(1, 0, 0), - /* tmax */ 48.0, /* payload location */ 14); - - imageStore(g_dst, ivec2(gl_LaunchIDEXT.xy), vec4(g_ray.color, 0)); -} - -[rgenInfo] -entryPoint = main - -[chitGlsl] -#version 460 -#extension GL_EXT_ray_tracing : require - -struct RayPayload { - vec3 color; -}; - -layout(shaderRecordEXT, std430) buffer sbt { - float z; -}; - -hitAttributeEXT vec2 g_hit; -rayPayloadInEXT RayPayload g_ray; - -void main() { - g_ray.color.xy = g_hit; - g_ray.color.z = z; -} - -[chitInfo] -entryPoint = main - -[ResourceMapping] -userDataNode[0].visibility = 0xffffffff -userDataNode[0].type = DescriptorTableVaPtr -userDataNode[0].offsetInDwords = 0 -userDataNode[0].sizeInDwords = 1 -userDataNode[0].next[0].type = DescriptorConstBuffer -userDataNode[0].next[0].offsetInDwords = 0 -userDataNode[0].next[0].sizeInDwords = 4 -userDataNode[0].next[0].set = 0x00000000 -userDataNode[0].next[0].binding = 0 -userDataNode[0].next[1].type = DescriptorImage -userDataNode[0].next[1].offsetInDwords = 4 -userDataNode[0].next[1].sizeInDwords = 8 -userDataNode[0].next[1].set = 0x00000000 -userDataNode[0].next[1].binding = 1 -userDataNode[1].visibility = 0xffffffff -userDataNode[1].type = DescriptorTableVaPtr -userDataNode[1].offsetInDwords = 1 -userDataNode[1].sizeInDwords = 1 -userDataNode[1].next[0].type = DescriptorConstBufferCompact -userDataNode[1].next[0].offsetInDwords = 0 -userDataNode[1].next[0].sizeInDwords = 2 -userDataNode[1].next[0].set = 0x0000005D -userDataNode[1].next[0].binding = 17 -userDataNode[1].next[1].type = DescriptorConstBuffer -userDataNode[1].next[1].offsetInDwords = 2 -userDataNode[1].next[1].sizeInDwords = 4 -userDataNode[1].next[1].set = 0x0000005D -userDataNode[1].next[1].binding = 0 -userDataNode[1].next[2].type = DescriptorBuffer -userDataNode[1].next[2].offsetInDwords = 6 -userDataNode[1].next[2].sizeInDwords = 4 -userDataNode[1].next[2].set = 0x0000005D -userDataNode[1].next[2].binding = 1 - -[RayTracingPipelineState] -groups[0].type = VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR -groups[0].generalShader = 0 -groups[0].closestHitShader = -1 -groups[0].anyHitShader = -1 -groups[0].intersectionShader = -1 -groups[1].type = VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR -groups[1].closestHitShader = 1 -maxRecursionDepth = 1 -indirectStageMask = 0xffffffff -mode = 2 -rtState.bvhResDescSize = 4 -rtState.bvhResDesc[0] = 0 -rtState.bvhResDesc[1] = 2197815296 -rtState.bvhResDesc[2] = 4294967295 -rtState.bvhResDesc[3] = 2164261887 -rtState.nodeStrideShift = 7 -rtState.threadGroupSizeX = 8 -rtState.threadGroupSizeY = 4 -rtState.threadGroupSizeZ = 1 -rtState.rayQueryCsSwizzle = 1 -rtState.ldsStackSize = 16 -rtState.dispatchRaysThreadGroupSize = 32 -rtState.ldsSizePerThreadGroup = 65536 -rtState.outerTileSize = 4 -rtState.dispatchDimSwizzleMode = 0 -rtState.enableDispatchRaysInnerSwizzle = 1 -rtState.enableDispatchRaysOuterSwizzle = 1 
-rtState.enableOptimalLdsStackSizeForIndirect = 1 -rtState.enableOptimalLdsStackSizeForUnified = 1 -payloadSizeMaxInLib = 12 -attributeSizeMaxInLib = 8 -hasPipelineLibrary = 1 diff --git a/llpc/test/shaderdb/ray_tracing/PipelineRays_TestLaunchKernel.pipe b/llpc/test/shaderdb/ray_tracing/PipelineRays_TestLaunchKernel.pipe index 733bbcb060..7d6b67f683 100644 --- a/llpc/test/shaderdb/ray_tracing/PipelineRays_TestLaunchKernel.pipe +++ b/llpc/test/shaderdb/ray_tracing/PipelineRays_TestLaunchKernel.pipe @@ -1,4 +1,4 @@ -; RUN: amdllpc -o - -print-after=prepare-continuations -gpurt-use-dumped=true -llpc-raytracing-mode=continuations -emit-lgc %s | FileCheck -check-prefixes=CHECK %s +; RUN: amdllpc -gfxip 11.0 -o - -print-after=prepare-continuations -gpurt-use-dumped=true -llpc-raytracing-mode=continuations -emit-lgc %s | FileCheck -check-prefixes=CHECK %s [Version] version = 70 @@ -160,7 +160,7 @@ rtState.traceRayWaveDensityThreshold[10] = 1 rtState.traceRayWaveDensityThreshold[11] = 1 rtState.traceRayWaveDensityThreshold[12] = 1 rtState.gpurtFeatureFlags = 0 -rtState.gpurtShaderLibrary = Shader_0xAC2A9C902883FD2A.spv +rtState.gpurtShaderLibrary = Shader_0x6E9B9DD1ADBD5A1D.spv rtState.gpurtFuncTable.pFunc[0] = TraceRay2_0 rtState.gpurtFuncTable.pFunc[1] = TraceRayInline2_0 rtState.gpurtFuncTable.pFunc[2] = TraceRayUsingHitToken2_0 @@ -185,4 +185,4 @@ pipelineLibStageMask = 0 ; CHECK-LABEL: ; ModuleID = 'lgcPipeline' ; CHECK-NEXT: source_filename = "main" ; CHECK: define dllexport void @lgc.shader.CS.main() !lgc.shaderstage !{{[0-9]+}} !lgc.rt.shaderstage !{{[0-9]+}} { -; CHECK: call void (...) @lgc.cps.jump(i32 %{{[0-9]+}}, i32 -1, {} poison, i32 poison, i32 %{{[0-9]+}}, { <3 x i32>, i32 } %{{[0-9]+}}) +; CHECK: call void (...) @lgc.cps.jump( diff --git a/llpc/test/shaderdb/ray_tracing/Shader_0x6E9B9DD1ADBD5A1D.spv b/llpc/test/shaderdb/ray_tracing/Shader_0x6E9B9DD1ADBD5A1D.spv new file mode 100644 index 0000000000..c926dc0d1b Binary files /dev/null and b/llpc/test/shaderdb/ray_tracing/Shader_0x6E9B9DD1ADBD5A1D.spv differ diff --git a/llpc/test/shaderdb/ray_tracing/Shader_0xAC2A9C902883FD2A.spv b/llpc/test/shaderdb/ray_tracing/Shader_0xAC2A9C902883FD2A.spv deleted file mode 100644 index 2aec42d200..0000000000 Binary files a/llpc/test/shaderdb/ray_tracing/Shader_0xAC2A9C902883FD2A.spv and /dev/null differ diff --git a/llpc/test/shaderdb/ray_tracing/Shader_0xE4BF4BB5EC6FAB41.spv b/llpc/test/shaderdb/ray_tracing/Shader_0xE4BF4BB5EC6FAB41.spv deleted file mode 100644 index 7851b171db..0000000000 Binary files a/llpc/test/shaderdb/ray_tracing/Shader_0xE4BF4BB5EC6FAB41.spv and /dev/null differ diff --git a/llpc/test/shaderdb/ray_tracing/TestProcessGpuRtLibrary.rgen b/llpc/test/shaderdb/ray_tracing/TestProcessGpuRtLibrary.rgen deleted file mode 100644 index 9ab6e81397..0000000000 --- a/llpc/test/shaderdb/ray_tracing/TestProcessGpuRtLibrary.rgen +++ /dev/null @@ -1,13 +0,0 @@ -// RUN: amdllpc %gfxip --print-after=llpc-spirv-lower-gpurt-library 2>&1 %s | FileCheck -check-prefix=CHECK %s -// Disable this test for now as continuations part of GPURT is not opensourced yet. -// REQUIRES: do-not-run-me -#version 460 -#extension GL_EXT_ray_tracing : enable - -void main() -{ -} -// Check these _Amd intrinsics's bodies are deleted. 
-// CHECK: declare dso_local spir_func i32 @_AmdGetShaderKind() -// CHECK: declare dso_local spir_func i64 @_AmdGetResumePointAddr() -// CHECK: declare dso_local spir_func {{.*}} @_AmdAwait{{.*}}( diff --git a/llpc/translator/lib/SPIRV/SPIRVReader.cpp b/llpc/translator/lib/SPIRV/SPIRVReader.cpp index f65b03279d..9d55f8e7c5 100644 --- a/llpc/translator/lib/SPIRV/SPIRVReader.cpp +++ b/llpc/translator/lib/SPIRV/SPIRVReader.cpp @@ -1,4 +1,4 @@ -//===- SPIRVReader.cpp - Converts SPIR-V to LLVM ----------------*- C++ -*-===// +//===- SPIRVReader.cpp - Converts SPIR-V to LLVM ----------------*- C++ -*-===// // // The LLVM/SPIR-V Translator // @@ -1339,7 +1339,9 @@ Value *SPIRVToLLVM::transConvertInst(SPIRVValue *bv, Function *f, BasicBlock *bb assert(CastInst::isCast(co) && "Invalid cast op code"); if (bb) { if (bv->getType()->isTypeCooperativeMatrixKHR()) { - return getBuilder()->CreateCooperativeMatrixConvert(co, src, srcElemTy, dstElemTy, srcLayout, dstLayout); + Type *matrixType = getBuilder()->getCooperativeMatrixTy(dstElemTy, dstLayout); + return getBuilder()->create(matrixType, co, src, srcElemTy, dstElemTy, srcLayout, + dstLayout, "convert"); } bool srcIsPtr = srcType->isPtrOrPtrVectorTy(); bool dstIsPtr = dstType->isPtrOrPtrVectorTy(); @@ -2971,7 +2973,7 @@ Value *SPIRVToLLVM::transLoadImage(SPIRVValue *spvImageLoadPtr) { SPIRVType *spvElementTy = spvImageLoadPtr->getType()->getPointerElementType(); Type *elementTy = transType(spvElementTy, 0, false, false, LayoutMode::Native); BasicBlock *bb = getBuilder()->GetInsertBlock(); - Value *base = transValue(spvImageLoadPtr, bb->getParent(), bb); + Value *base = transValueMulti(spvImageLoadPtr, bb->getParent(), bb)[0]; return loadImageSampler(elementTy, base); } @@ -3052,7 +3054,8 @@ Value *SPIRVToLLVM::transImagePointer(SPIRVValue *spvImagePtr, SPIRVType *baseTy if (spvImagePtr->getOpCode() != OpVariable || static_cast(spvImagePtr->getType())->getStorageClass() != StorageClassUniformConstant) { - Value *v = transValue(spvImagePtr, getBuilder()->GetInsertBlock()->getParent(), getBuilder()->GetInsertBlock()); + Value *v = + transValueMulti(spvImagePtr, getBuilder()->GetInsertBlock()->getParent(), getBuilder()->GetInsertBlock())[0]; // For function parameter, if it translated to an pointer type, then, it's a struct with sampler type, we should not // return it. @@ -3072,8 +3075,19 @@ Value *SPIRVToLLVM::transImagePointer(SPIRVValue *spvImagePtr, SPIRVType *baseTy (void)hasDescriptorSet; SPIRVType *spvTy = spvImagePtr->getType()->getPointerElementType(); - while (spvTy->getOpCode() == OpTypeArray || spvTy->getOpCode() == OpTypeRuntimeArray) - spvTy = spvTy->getArrayElementType(); + SmallVector elementWorklist; + elementWorklist.push_back(spvTy); + while (!elementWorklist.empty()) { + spvTy = elementWorklist.pop_back_val(); + if (spvTy->getOpCode() == OpTypeImage || spvTy->getOpCode() == OpTypeSampler || + spvTy->getOpCode() == OpTypeSampledImage) + break; + else if (spvTy->getOpCode() == OpTypeArray || spvTy->getOpCode() == OpTypeRuntimeArray) + elementWorklist.push_back(spvTy->getArrayElementType()); + else if (spvTy->getOpCode() == OpTypeStruct) + for (int i = 0, e = spvTy->getStructMemberCount(); i < e; i++) + elementWorklist.push_back(spvTy->getStructMemberType(i)); + } Value *imageDescPtr = nullptr; Value *samplerDescPtr = nullptr; @@ -3196,8 +3210,18 @@ Value *SPIRVToLLVM::getDescPointerAndStride(ResourceNodeType resType, unsigned d // to detect whether it is a converting sampler, and set up the converting sampler index. 
unsigned convertingSamplerIdx = 0; unsigned nextIdx = 1; + unsigned convertingSamplerDescriptorSet = descriptorSet; + if (getPipelineOptions()->replaceSetWithResourceType && + descriptorSet == + PipelineContext::getGlResourceNodeSetFromType(Vkgc::ResourceMappingNodeType::DescriptorSampler)) { + // When using 'replaceSetWithResourceType' option (OGL default) it's not possible to match converting samplers + // for 'DescriptorResource' and 'DescriptorSampler' at the same time, which is needed to handle YCbCr formats. + // Converting sampler with YCbCr metadata has 'DescriptorResource' set assigned, hence looking for it instead. + convertingSamplerDescriptorSet = + PipelineContext::getGlResourceNodeSetFromType(Vkgc::ResourceMappingNodeType::DescriptorResource); + } for (const ConvertingSampler &convertingSampler : m_convertingSamplers) { - if (convertingSampler.set == descriptorSet && convertingSampler.binding == binding) { + if (convertingSampler.set == convertingSamplerDescriptorSet && convertingSampler.binding == binding) { convertingSamplerIdx = nextIdx; break; } @@ -3417,48 +3441,42 @@ SmallVector SPIRVToLLVM::transAccessChain(SPIRVValue *const spvValue) { } } - // Special for UniformConstant: determine whether result/base is the mixed image/non-image case - bool baseHasImage = false; - bool baseHasNonImage = false; - bool resultHasImage = false; - bool resultHasNonImage = false; - if (storageClass == StorageClassUniformConstant) { - auto detectImage = [](SPIRVType *type) -> std::pair { - bool hasImage = false; - bool hasNonImage = false; - SmallVector elementWorklist; - elementWorklist.push_back(type); - while (!elementWorklist.empty()) { - SPIRVType *spvTy = elementWorklist.pop_back_val(); - switch (spvTy->getOpCode()) { - case OpTypeImage: - case OpTypeSampler: - case OpTypeSampledImage: - hasImage = true; - break; - case OpTypeArray: - case OpTypeRuntimeArray: - elementWorklist.push_back(spvTy->getArrayElementType()); - break; - case OpTypeStruct: - for (int i = 0, e = spvTy->getStructMemberCount(); i < e; i++) - elementWorklist.push_back(spvTy->getStructMemberType(i)); - hasNonImage = true; - break; - default: - hasNonImage = true; - } + // Determine whether result/base is the mixed image/non-image case + auto detectImage = [](SPIRVType *type) -> std::pair { + bool hasImage = false; + bool hasNonImage = false; + SmallVector elementWorklist; + elementWorklist.push_back(type); + while (!elementWorklist.empty()) { + SPIRVType *spvTy = elementWorklist.pop_back_val(); + switch (spvTy->getOpCode()) { + case OpTypeImage: + case OpTypeSampler: + case OpTypeSampledImage: + hasImage = true; + break; + case OpTypeArray: + case OpTypeRuntimeArray: + elementWorklist.push_back(spvTy->getArrayElementType()); + break; + case OpTypeStruct: + for (int i = 0, e = spvTy->getStructMemberCount(); i < e; i++) + elementWorklist.push_back(spvTy->getStructMemberType(i)); + hasNonImage = true; + break; + default: + hasNonImage = true; } - return std::make_pair(hasImage, hasNonImage); - }; + } + return std::make_pair(hasImage, hasNonImage); + }; - auto pair = detectImage(spvResultType); - resultHasImage = pair.first; - resultHasNonImage = pair.second; - pair = detectImage(spvAccessType); - baseHasImage = pair.first; - baseHasNonImage = pair.second; - } + auto pair = detectImage(spvResultType); + bool resultHasImage = pair.first; + bool resultHasNonImage = pair.second; + pair = detectImage(spvAccessType); + bool baseHasImage = pair.first; + bool baseHasNonImage = pair.second; // Translate the base variable and 
indices auto fullBase = transValueMulti(spvBase, f, bb); @@ -3471,8 +3489,9 @@ SmallVector SPIRVToLLVM::transAccessChain(SPIRVValue *const spvValue) { SmallVector result; // First, translate the access chain for any non-image parts. - if (!resultHasImage) { + if (resultHasNonImage) { Value *base = fullBase[0]; + const SPIRVStorageClassKind pointerStorageClass = spvBaseType->getPointerStorageClass(); const bool typeMaybeRemapped = isStorageClassExplicitlyLaidOut(m_bm, pointerStorageClass) || @@ -3489,9 +3508,6 @@ SmallVector SPIRVToLLVM::transAccessChain(SPIRVValue *const spvValue) { assert(spvBaseType->isTypePointer()); gepIndices.push_back(srcIndices[0]); - if (spvBaseType->isTypePointer()) - spvAccessType = spvBaseType->getPointerElementType(); - auto flushGep = [&]() { if (gepIndices.size() == 1) { if (auto *constant = dyn_cast(gepIndices[0])) { @@ -3512,21 +3528,23 @@ SmallVector SPIRVToLLVM::transAccessChain(SPIRVValue *const spvValue) { // Run over the indices and map the SPIR-V level indices to LLVM indices, which may be different because the LLVM // types may contain manual padding fields to model the power of Vulkan's layout options. // Additionally, break up the GEP sequence to handle some special cases like row major matrices. - for (Value *index : ArrayRef(srcIndices).drop_front()) { - switch (spvAccessType->getOpCode()) { + SPIRVType *spvAccessElementType = spvAccessType; + ArrayRef srcIndicesArray = srcIndices; + for (Value *index : srcIndicesArray.drop_front()) { + switch (spvAccessElementType->getOpCode()) { case OpTypeStruct: { ConstantInt *constIndex = cast(index); const uint64_t origMemberIndex = constIndex->getZExtValue(); Type *castType = nullptr; if (typeMaybeRemapped) { - if (isRemappedTypeElements(spvAccessType)) { - const uint64_t remappedMemberIndex = lookupRemappedTypeElements(spvAccessType, origMemberIndex); + if (isRemappedTypeElements(spvAccessElementType)) { + const uint64_t remappedMemberIndex = lookupRemappedTypeElements(spvAccessElementType, origMemberIndex); constIndex = getBuilder()->getInt32(remappedMemberIndex); } // If the struct member was actually overlapping another struct member, we need a split here. - const auto structIndexPair = std::make_pair(spvAccessType, origMemberIndex); + const auto structIndexPair = std::make_pair(spvAccessElementType, origMemberIndex); if (m_overlappingStructTypeWorkaroundMap.count(structIndexPair) > 0) castType = m_overlappingStructTypeWorkaroundMap[structIndexPair]; @@ -3541,20 +3559,20 @@ SmallVector SPIRVToLLVM::transAccessChain(SPIRVValue *const spvValue) { basePointeeType->getPointerTo(base->getType()->getPointerAddressSpace())); } - spvAccessType = spvAccessType->getStructMemberType(origMemberIndex); + spvAccessElementType = spvAccessElementType->getStructMemberType(origMemberIndex); break; } case OpTypeArray: case OpTypeRuntimeArray: { gepIndices.push_back(index); - if (typeMaybeRemapped && isRemappedTypeElements(spvAccessType)) { + if (typeMaybeRemapped && isRemappedTypeElements(spvAccessElementType)) { // If we have padding in an array, we inserted a struct to add that // padding, and so we need an extra constant 0 index. 
gepIndices.push_back(getBuilder()->getInt32(0)); } - spvAccessType = spvAccessType->getArrayElementType(); + spvAccessElementType = spvAccessElementType->getArrayElementType(); break; } case OpTypeMatrix: { @@ -3579,22 +3597,22 @@ SmallVector SPIRVToLLVM::transAccessChain(SPIRVValue *const spvValue) { } } - spvAccessType = spvAccessType->getMatrixColumnType(); + spvAccessElementType = spvAccessElementType->getMatrixColumnType(); break; } case OpTypeVector: { gepIndices.push_back(index); - spvAccessType = spvAccessType->getVectorComponentType(); + spvAccessElementType = spvAccessElementType->getVectorComponentType(); break; } case OpTypeCooperativeMatrixKHR: { flushGep(); - auto use = spvAccessType->getCooperativeMatrixKHRUse(); - unsigned rows = spvAccessType->getCooperativeMatrixKHRRows(); - unsigned columns = spvAccessType->getCooperativeMatrixKHRColumns(); - spvAccessType = spvAccessType->getCooperativeMatrixKHRComponentType(); - basePointeeType = transType(spvAccessType); - lgc::CooperativeMatrixElementType elemType = mapToBasicType(spvAccessType); + auto use = spvAccessElementType->getCooperativeMatrixKHRUse(); + unsigned rows = spvAccessElementType->getCooperativeMatrixKHRRows(); + unsigned columns = spvAccessElementType->getCooperativeMatrixKHRColumns(); + spvAccessElementType = spvAccessElementType->getCooperativeMatrixKHRComponentType(); + basePointeeType = transType(spvAccessElementType); + lgc::CooperativeMatrixElementType elemType = mapToBasicType(spvAccessElementType); lgc::CooperativeMatrixLayout layout = getCooperativeMatrixKHRLayout(static_cast(use), elemType, rows, columns); @@ -3623,8 +3641,10 @@ SmallVector SPIRVToLLVM::transAccessChain(SPIRVValue *const spvValue) { result.push_back(base); } - // Second, translate the access chain for any mixed-image parts. + // Second, translate the access chain for any image parts. if (resultHasImage) { + assert(baseHasImage); + (void)baseHasImage; Value *base = fullBase[baseHasNonImage ? 1 : 0]; if (spvIndices.empty()) { @@ -3632,25 +3652,19 @@ SmallVector SPIRVToLLVM::transAccessChain(SPIRVValue *const spvValue) { } else { // 'proxyType' is the replaced type for struct/array type with image/sampler member. // In which, image/sampler member is replaced by int8 type, and non-image member is replaced by empty sturct. - Type *proxyType = transType(spvAccessType, 0, true, true, layout); + Type *proxyType = nullptr; SPIRVTypeContext ctx(spvAccessType, 0, true, true, layout); auto it = m_imageTypeMap.find(ctx.asTuple()); if (it != m_imageTypeMap.end()) proxyType = it->second; + assert(proxyType != nullptr); - // Calculate the offset: - // 1. Calculate the current accessed member in the proxyType: offset = offsetof(proxyType, proxyType[srcIndices]); - // 2. Correct offset if 'proxyType' is sturct: - // a. If it has pre OpAccessChain, add the pre offset value from pre OpAccessChain - // b. If not have pre OpAccessChain, the 'offset' value won't be modified. - // Because during transValueMultiWithOpcode(spvBase), we append a constant value 0, so this - // always add 0. - // c. If 'proxyType' is array type(which means the `baseHasNonImage` is false), we didn't append 0 for base - // variable, so should NOT correct the offset value. 
+ // Calculate the current accessed member in the proxyType: offset = offsetof(proxyType, proxyType[srcIndices]); Value *proxy = getBuilder()->CreateGEP(proxyType, ConstantPointerNull::get(getBuilder()->getPtrTy()), srcIndices); Value *offset = getBuilder()->CreatePtrToInt(proxy, getBuilder()->getInt32Ty()); - // Translate the image/sampler member pointer. + // If result type NOT have non-image type, it does not have pre OpAccessChain, so look up the image member type, + // and translate the image pointer. SPIRVType *spvElementType = spvAccessType; SmallVector elementWorklist; @@ -3860,7 +3874,7 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIRVValue *con BasicBlock *const block = getBuilder()->GetInsertBlock(); Function *const func = getBuilder()->GetInsertBlock()->getParent(); Value *const predicate = transValue(spvOperands[1], func, block); - return getBuilder()->CreateSubgroupAll(predicate); + return getBuilder()->create(predicate); } // ===================================================================================================================== @@ -3977,7 +3991,7 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIRV BasicBlock *const block = getBuilder()->GetInsertBlock(); Function *const func = getBuilder()->GetInsertBlock()->getParent(); Value *const predicate = transValue(spvOperands[1], func, block); - return getBuilder()->CreateSubgroupAll(predicate); + return getBuilder()->create(predicate); } // ===================================================================================================================== @@ -4005,7 +4019,7 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode( BasicBlock *const block = getBuilder()->GetInsertBlock(); Function *const func = getBuilder()->GetInsertBlock()->getParent(); Value *const value = transValue(spvOperands[1], func, block); - return getBuilder()->CreateSubgroupAllEqual(value); + return getBuilder()->create(value); } // ===================================================================================================================== @@ -4023,7 +4037,7 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode Value *const delta = transValue(spvOperands[2], func, block); Value *const clusterSize = spvOperands.size() > 3 ? 
transValue(spvOperands[3], func, block) : PoisonValue::get(Type::getInt32Ty(*m_context)); - return getBuilder()->CreateSubgroupRotate(value, delta, clusterSize); + return getBuilder()->create(value, delta, clusterSize); } // ===================================================================================================================== @@ -4649,7 +4663,7 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIRVValu BasicBlock *const block = getBuilder()->GetInsertBlock(); Function *const func = getBuilder()->GetInsertBlock()->getParent(); Value *const predicate = transValue(spvOperands[0], func, block); - return getBuilder()->CreateSubgroupAll(predicate); + return getBuilder()->create(predicate); } // ===================================================================================================================== @@ -4677,7 +4691,7 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIR BasicBlock *const block = getBuilder()->GetInsertBlock(); Function *const func = getBuilder()->GetInsertBlock()->getParent(); Value *const value = transValue(spvOperands[0], func, block); - return getBuilder()->CreateSubgroupAllEqual(value); + return getBuilder()->create(value); } // ===================================================================================================================== @@ -4979,11 +4993,20 @@ SmallVector SPIRVToLLVM::transValueMultiWithOpcode(SPIRVVal bool variableHasImage = false; if (storageClass == StorageClassUniformConstant) { SPIRVType *spvElementType = spvVar->getType()->getPointerElementType(); - while (spvElementType->getOpCode() == OpTypeArray || spvElementType->getOpCode() == OpTypeRuntimeArray) - spvElementType = spvElementType->getArrayElementType(); - if (spvElementType->getOpCode() == OpTypeImage || spvElementType->getOpCode() == OpTypeSampler || - spvElementType->getOpCode() == OpTypeSampledImage) { - variableHasImage = true; + + SmallVector elementWorklist; + elementWorklist.push_back(spvElementType); + while (!elementWorklist.empty()) { + SPIRVType *spvTy = elementWorklist.pop_back_val(); + if (spvTy->getOpCode() == OpTypeImage || spvTy->getOpCode() == OpTypeSampler || + spvTy->getOpCode() == OpTypeSampledImage) { + variableHasImage = true; + break; + } else if (spvTy->getOpCode() == OpTypeArray || spvTy->getOpCode() == OpTypeRuntimeArray) + elementWorklist.push_back(spvTy->getArrayElementType()); + else if (spvTy->getOpCode() == OpTypeStruct) + for (int i = 0, e = spvTy->getStructMemberCount(); i < e; i++) + elementWorklist.push_back(spvTy->getStructMemberType(i)); } } @@ -4992,6 +5015,10 @@ SmallVector SPIRVToLLVM::transValueMultiWithOpcode(SPIRVVal IRBuilderBase::InsertPointGuard ipg(*getBuilder()); getBuilder()->SetInsertPointPastAllocas(f); values.push_back(transImagePointer(spvVar, spvVar->getMemObjType())); + + // Append const value 0 as default offset if this variable is struct type with image member. + if (itNonImage->second) + values.push_back(getBuilder()->getInt32(0)); } m_variableMap.try_emplace({spvValue, f}, values); @@ -5016,7 +5043,8 @@ Value *SPIRVToLLVM::transVariableNonImage(SPIRVValue *const spvValue) { case OpTypeImage: case OpTypeSampler: case OpTypeSampledImage: - // Do nothing for image/sampler/sampledimage. 
+ // Only translate image/sampler array type to record the m_imageTypeMap + transType(spvVarType, 0, true, true, layout); return nullptr; default: if (!isAccelerationStructureType(spvElementType)) @@ -5222,7 +5250,7 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIRVV lgc::CooperativeMatrixLayout layout = getCooperativeMatrixKHRLayout( static_cast(spvOperands[0]->getType()->getCooperativeMatrixKHRUse()), elemType, rows, columns); - return getBuilder()->CreateCoopMatrixTimesScalar(matrix, scalar, elemType, layout); + return getBuilder()->create(matrix->getType(), matrix, scalar, elemType, layout); } else { return getBuilder()->CreateMatrixTimesScalar(matrix, scalar); } @@ -5419,7 +5447,7 @@ template <> Value *SPIRVToLLVM::transValueWithOpcodegetCooperativeMatrixKHRRows(); unsigned columns = matrixType->getCooperativeMatrixKHRColumns(); auto layout = getCooperativeMatrixKHRLayout(matrixUse, elemType, rows, columns); - return getBuilder()->CreateCooperativeMatrixLength(elemType, layout); + return getBuilder()->create(layout); } // ===================================================================================================================== @@ -5507,8 +5535,9 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode unsigned alignmentInRowCol = (isColMajor ? rows : columns) * elementSize; unsigned loadAlignment = std::min((unsigned)16, alignmentInRowCol); lgc::CooperativeMatrixLayout layout = getCooperativeMatrixKHRLayout(use, elemType, rows, columns); - auto CoopMatLoadInst = getBuilder()->CreateCooperativeMatrixLoad(pointer, stride, isColMajor, elemType, layout, - memoryAccess, Align(loadAlignment)); + Type *coopMatrixTy = getBuilder()->getCooperativeMatrixTy(elemType, layout); + auto CoopMatLoadInst = getBuilder()->create( + coopMatrixTy, pointer, stride, isColMajor, elemType, layout, memoryAccess, loadAlignment, "load"); return CoopMatLoadInst; } @@ -5603,8 +5632,8 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(m_m->getDataLayout().getTypeSizeInBits(elemltType) / 8); unsigned alignmentInRowCol = (isColMajor ? 
rows : columns) * elementSize; unsigned storeAlignment = std::min((unsigned)16, alignmentInRowCol); - getBuilder()->CreateCooperativeMatrixStore(pointer, matrix, stride, isColMajor, elemType, layout, memoryAccess, - Align(storeAlignment)); + getBuilder()->create(pointer, stride, isColMajor, elemType, layout, memoryAccess, + storeAlignment, matrix); return nullptr; } @@ -5630,8 +5659,9 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(static_cast(spvInst)->getMatrixBSigned()); bool isSat = static_cast(static_cast(spvInst)->getMatrixSatAccumulation()); - Value *coopMatrixD = getBuilder()->CreateCooperativeMatrixMulAdd(coopMatrixA, coopMatrixB, coopMatrixC, isSignedA, - isSignedB, isSat, 0, elemBasicTypeC, elemBasicTypeA); + Value *coopMatrixD = getBuilder()->create(coopMatrixC->getType(), coopMatrixA, coopMatrixB, + coopMatrixC, isSignedA, isSignedB, isSat, 0, + elemBasicTypeC, elemBasicTypeA, "mulAdd"); return coopMatrixD; } @@ -6115,9 +6145,11 @@ SmallVector SPIRVToLLVM::transValueWithoutDecoration(SPIRVValue *bv, Fu unsigned columns = matrixType->getCooperativeMatrixKHRColumns(); auto layout = getCooperativeMatrixKHRLayout( static_cast(matrixType->getCooperativeMatrixKHRUse()), elemType, rows, columns); + Type *extractElementType = getBuilder()->transCooperativeMatrixElementType(elemType); Value *matrix = transValue(ce->getComposite(), f, bb); Value *index = getBuilder()->getInt32(ce->getIndices()[0]); - return mapValue(bv, getBuilder()->CreateCooperativeMatrixExtract(matrix, index, elemType, layout)); + return mapValue( + bv, getBuilder()->create(extractElementType, matrix, index, elemType, layout)); } auto cv = transValue(ce->getComposite(), f, bb); @@ -6168,7 +6200,8 @@ SmallVector SPIRVToLLVM::transValueWithoutDecoration(SPIRVValue *bv, Fu Value *matrix = transValue(ci->getComposite(), f, bb); Value *value = transValue(ci->getObject(), f, bb); Value *index = getBuilder()->getInt32(ci->getIndices()[0]); - return mapValue(bv, getBuilder()->CreateCooperativeMatrixInsert(matrix, value, index, elemType, layout)); + return mapValue(bv, getBuilder()->create(matrix->getType(), matrix, value, index, + elemType, layout)); } auto cv = transValue(ci->getComposite(), f, bb); @@ -6318,8 +6351,10 @@ SmallVector SPIRVToLLVM::transValueWithoutDecoration(SPIRVValue *bv, Fu lgc::CooperativeMatrixLayout dstLayout = getCooperativeMatrixKHRLayout( static_cast(bc->getType()->getCooperativeMatrixKHRUse()), basicDstElemTy, bc->getType()->getCooperativeMatrixKHRRows(), bc->getType()->getCooperativeMatrixKHRColumns()); - return mapValue(bv, getBuilder()->CreateCooperativeMatrixConvert(co, val, basicSrcElemTy, basicDstElemTy, - srcLayout, dstLayout)); + + Type *matrixType = getBuilder()->getCooperativeMatrixTy(basicDstElemTy, dstLayout); + return mapValue(bv, getBuilder()->create( + matrixType, co, val, basicSrcElemTy, basicDstElemTy, srcLayout, dstLayout, "fConvert")); } if (val->getType()->getScalarType()->getPrimitiveSizeInBits() <= destTy->getScalarType()->getPrimitiveSizeInBits()) return mapValue(bv, getBuilder()->CreateFPExt(val, destTy)); @@ -6648,7 +6683,7 @@ SmallVector SPIRVToLLVM::transValueWithoutDecoration(SPIRVValue *bv, Fu case OpEndStreamPrimitive: return mapValue(bv, transValueWithOpcode(bv)); case OpAccessChain: - return mapValue(bv, transValueMultiWithOpcode(bv)); + return {mapValue(bv, transValueMultiWithOpcode(bv))[0]}; case OpArrayLength: return mapValue(bv, transValueWithOpcode(bv)); case OpInBoundsAccessChain: @@ -8540,6 +8575,7 @@ bool SPIRVToLLVM::transMetadata() { execModel == 
ExecutionModelTaskEXT || execModel == ExecutionModelMeshEXT) { // Give the shader modes to middle-end if (execModel == ExecutionModelVertex) { + // Nothing to do. } else if (execModel == ExecutionModelTessellationControl || execModel == ExecutionModelTessellationEvaluation) { TessellationMode tessellationMode = {}; @@ -10409,10 +10445,12 @@ Instruction *SPIRVToLLVM::transMemFence(BasicBlock *bb, SPIRVWord memSema, SPIRV switch (memScope) { case ScopeCrossDevice: - case ScopeDevice: + scope = SyncScope::System; + break; case ScopeQueueFamilyKHR: case ScopeShaderCallKHR: - scope = SyncScope::System; + case ScopeDevice: + scope = m_context->getOrInsertSyncScopeID("agent"); break; case ScopeInvocation: scope = SyncScope::SingleThread; @@ -10978,8 +11016,9 @@ Value *SPIRVToLLVM::transCooperativeMatrixArithInst(SPIRVValue *spvVal, BasicBlo static_cast(unary->getOperand(0)->getType()->getCooperativeMatrixKHRUse()), elemType, rows, columns); } - return getBuilder()->CreateCooperativeMatrixBinaryOp(arithOp, Constant::getNullValue(srcVal->getType()), srcVal, - elemType, layout); + Type *resultTy = getBuilder()->getCooperativeMatrixTy(elemType, layout); + return getBuilder()->create(resultTy, arithOp, Constant::getNullValue(srcVal->getType()), + srcVal, elemType, layout); } else { auto binary = static_cast(spvVal); Value *lhs = transValue(binary->getOperand(0), func, bb); @@ -10993,7 +11032,8 @@ Value *SPIRVToLLVM::transCooperativeMatrixArithInst(SPIRVValue *spvVal, BasicBlo static_cast(binary->getOperand(0)->getType()->getCooperativeMatrixKHRUse()), elemType, rows, columns); } - return getBuilder()->CreateCooperativeMatrixBinaryOp(arithOp, lhs, rhs, elemType, layout); + Type *resultTy = getBuilder()->getCooperativeMatrixTy(elemType, layout); + return getBuilder()->create(resultTy, arithOp, lhs, rhs, elemType, layout); } } @@ -11005,7 +11045,8 @@ Value *SPIRVToLLVM::transCooperativeMatrixKHRFromConstruct(SPIRVType *spvCoopMat lgc::CooperativeMatrixLayout layout = getCooperativeMatrixKHRLayout( static_cast(spvCoopMatTy->getCooperativeMatrixKHRUse()), elemType, spvCoopMatTy->getCooperativeMatrixKHRRows(), spvCoopMatTy->getCooperativeMatrixKHRColumns()); - return getBuilder()->CreateCooperativeMatrixFill(constituents[0], elemType, layout); + Type *coopMatrixTy = getBuilder()->getCooperativeMatrixTy(elemType, layout); + return getBuilder()->create(coopMatrixTy, constituents[0], elemType, layout); } } // namespace SPIRV diff --git a/llpc/util/llpcElfWriter.cpp b/llpc/util/llpcElfWriter.cpp index 0aba94213f..00d754f0a5 100644 --- a/llpc/util/llpcElfWriter.cpp +++ b/llpc/util/llpcElfWriter.cpp @@ -251,20 +251,13 @@ void ElfWriter::mergeMetaNote(Context *pContext, const ElfNote *pNote1, con // 3. 
Then it will merge non-fs generated in noia.2 and fs which is saved in cached after noia.0 // There will be a merge on threshold here, just merge fs_threshold@fs_in_cache not // Pipelinelevel_threshold@fs_in_cache + auto hwPsStageName = HwStageNames[static_cast(Util::Abi::HardwareStage::Ps)]; unsigned srcSpillValue = USHRT_MAX; - if (pContext->getGfxIpVersion().major > 10) { - auto srcPsHwStage = srcPipeline.getMap(true)[PalAbi::PipelineMetadataKey::HardwareStages] - .getMap(true)[static_cast(Util::Abi::HardwareStage::Ps)] - .getMap(true); - auto srcSpillThreshold = &srcPsHwStage[ShaderSpillThreshold]; - if (!srcSpillThreshold->isEmpty()) { - srcSpillValue = srcPsHwStage[ShaderSpillThreshold].getUInt(); - } - } else { - // This is to revert and keep legacy behavior on gfx10 as to fix the block issue: hang on PAL for - // gfx_bench. Todo: Needs to keep same with gfx10+ - srcSpillValue = srcPipeline.getMap(true)[PalAbi::PipelineMetadataKey::SpillThreshold].getUInt(); - } + auto srcPsHwStage = + srcPipeline.getMap(true)[PalAbi::PipelineMetadataKey::HardwareStages].getMap(true)[hwPsStageName].getMap(true); + auto srcSpillThresholdIt = srcPsHwStage.find(ShaderSpillThreshold); + if (srcSpillThresholdIt != srcPsHwStage.end()) + srcSpillValue = srcSpillThresholdIt->second.getUInt(); unsigned destSpillThreshold = destPipeline.getMap(true)[PalAbi::PipelineMetadataKey::SpillThreshold].getUInt(); destPipeline.getMap(true)[PalAbi::PipelineMetadataKey::SpillThreshold] = @@ -279,7 +272,6 @@ void ElfWriter::mergeMetaNote(Context *pContext, const ElfNote *pNote1, con // Copy whole .ps hw stage auto destHwStages = destPipeline.getMap(true)[PalAbi::PipelineMetadataKey::HardwareStages].getMap(true); auto srcHwStages = srcPipeline.getMap(true)[PalAbi::PipelineMetadataKey::HardwareStages].getMap(true); - auto hwPsStageName = HwStageNames[static_cast(Util::Abi::HardwareStage::Ps)]; destHwStages[hwPsStageName] = srcHwStages[hwPsStageName]; // Copy whole .pixel shader diff --git a/llpc/util/llpcShaderModuleHelper.cpp b/llpc/util/llpcShaderModuleHelper.cpp index 981afbfcc2..9f9959f940 100644 --- a/llpc/util/llpcShaderModuleHelper.cpp +++ b/llpc/util/llpcShaderModuleHelper.cpp @@ -158,16 +158,10 @@ ShaderModuleUsage ShaderModuleHelper::getShaderModuleUsageInfo(const BinaryData shaderModuleUsage.useGenericBuiltIn = true; break; } - case BuiltInBaryCoordKHR: - case BuiltInBaryCoordNoPerspKHR: - shaderModuleUsage.useBarycentric = true; - break; default: { break; } } - } else if (decoration == DecorationPerVertexKHR) { - shaderModuleUsage.useBarycentric = true; } else if (decoration == DecorationIndex) { hasIndexDecoration = true; } diff --git a/llvmraytracing/CMakeLists.txt b/llvmraytracing/CMakeLists.txt index 780f763a20..db81197eb4 100644 --- a/llvmraytracing/CMakeLists.txt +++ b/llvmraytracing/CMakeLists.txt @@ -16,7 +16,6 @@ option(LLVMRAYTRACING_BUILD_TESTS "Build raytracing tests") add_llvm_library(LLVMRaytracing lib/CleanupContinuations.cpp lib/Continuations.cpp - lib/ContinuationsDialect.cpp lib/CpsStackLowering.cpp lib/DXILContIntrinsicPrepare.cpp lib/DXILContLgcRtOpConverter.cpp @@ -26,6 +25,7 @@ add_llvm_library(LLVMRaytracing lib/GpurtDialect.cpp lib/LegacyCleanupContinuations.cpp lib/LgcCpsDialect.cpp + lib/LgcIlCpsDialect.cpp lib/LgcRtDialect.cpp lib/LgcRtqDialect.cpp lib/LowerAwait.cpp @@ -82,7 +82,7 @@ macro(raytracing_tablegen DIALECTNAME FILE OUTPUT_FILENAME) target_sources(LLVMRaytracing PRIVATE ${FILE}) endmacro() -raytracing_tablegen(continuations include/llvmraytracing/ContinuationsDialect.td 
ContinuationsDialect) +raytracing_tablegen(lgc.ilcps include/lgc/LgcIlCpsDialect.td LgcIlCpsDialect) raytracing_tablegen(lgc.cps include/lgc/LgcCpsDialect.td LgcCpsDialect) raytracing_tablegen(lgc.rt include/lgc/LgcRtDialect.td LgcRtDialect) raytracing_tablegen(lgc.rtq include/lgc/LgcRtqDialect.td LgcRtqDialect) @@ -94,7 +94,7 @@ set_target_properties(LLVMRaytracing PROPERTIES CXX_EXTENSIONS OFF) add_subdirectory(plugin) if(CONTINUATIONS_BUILD_TESTS) - message(WARNING "Deprecated flag CONTINUATIONS_BUILD_TEST used; use LLVMRAYTRACING_BUILD_TESTS instead") + message(WARNING "Deprecated flag CONTINUATIONS_BUILD_TESTS used; use LLVMRAYTRACING_BUILD_TESTS instead") set(LLVMRAYTRACING_BUILD_TESTS ON) endif() if(LLVMRAYTRACING_BUILD_TESTS) diff --git a/llvmraytracing/include/continuations/ContinuationsDialect.h b/llvmraytracing/include/continuations/ContinuationsDialect.h deleted file mode 100644 index 7e3bb63cf4..0000000000 --- a/llvmraytracing/include/continuations/ContinuationsDialect.h +++ /dev/null @@ -1,2 +0,0 @@ -// Transition header -- to be removed -#include "llvmraytracing/ContinuationsDialect.h" diff --git a/llvmraytracing/include/lgc/GpurtDialect.td b/llvmraytracing/include/lgc/GpurtDialect.td index 20ef2a410a..ac76f07dd0 100644 --- a/llvmraytracing/include/lgc/GpurtDialect.td +++ b/llvmraytracing/include/lgc/GpurtDialect.td @@ -38,6 +38,15 @@ def PrivatePointer : TgConstant<(PointerType 5)>, Type; def V2I32 : TgConstant<(FixedVectorType I32, 2)>, Type; def V4I32 : TgConstant<(FixedVectorType I32, 4)>, Type; +def PairStructType : BuiltinType { + let arguments = (args type:$self, type:$ele_type); + let evaluate = "::llvm::StructType::get($_context, {$ele_type, $ele_type})"; + let check = "$self->isStructTy()"; + let capture = []; +} + +def StoreStackTy : TgConstant<(PairStructType I32)>, Type; + def GpurtGetStackSizeOp : GpurtOp<"get.stack.size", [Memory<[]>, WillReturn]> { let arguments = (ins); let results = (outs I32:$result); @@ -82,17 +91,24 @@ def GpurtGetStackStrideOp : GpurtOp<"get.stack.stride", [Memory<[]>, WillReturn] } def GpurtLdsStackInitOp : GpurtOp<"lds.stack.init", [Memory<[]>, WillReturn]> { - let arguments = (ins); + let arguments = (ins AttrI1:$use_extra_stack); let results = (outs I32:$result); let summary = "return the initial stack dword position for use with lds.stack.store"; + let description = [{ + Initialize the lds/(scratch buffer) stack at the index position. + + Setting use_extra_stack to true means this initialization is performed on an extra piece of stack to avoid traversal stack conflicts. + }]; } def GpurtLdsStackStoreOp : GpurtOp<"lds.stack.store", [Memory<[(write)]>, WillReturn]> { - let arguments = (ins PrivatePointer:$new_pos, I32:$old_pos, V4I32:$data); - let results = (outs I32:$result); + let arguments = (ins I32:$old_pos, I32:$last_node, V4I32:$data); + let results = (outs StoreStackTy:$result); let summary = "perform a combined lds stack push and pop operation."; let description = [{ - this pushes $data and pops a dword from the stack, and data and positions are interpreted according to the ds_bvh_stack_rtn instruction. + lds.stack.store pushes $data onto the stack and returns a struct of a popped nodePtr and + the new stackAddress (stack position). + The nodePtr and stackAddress are interpreted as they are by the ds_bvh_stack_rtn instruction.
}]; } @@ -102,12 +118,30 @@ def GpurtFloatWithRoundModeOp : GpurtOp<"rt.floatop.roundmode", [Memory<[]>, Wil let summary = "return result of floatOp with roundmode"; } +def GpurtWaveScanOp : GpurtOp<"rt.scanop.wavescan", [Memory<[]>, WillReturn]> { + let arguments = (ins I32:$operation, I32:$flags, (ScalarOrFixedVector F32):$src0); + let results = (outs (eq $src0):$result); + let summary = "return result of wavescan with waveop"; +} + def GpurtGetBoxSortHeuristicModeOp : GpurtOp<"get.box.sort.heuristic.mode", [Memory<[]>, WillReturn]> { let arguments = (ins); let results = (outs I32:$result); let summary = "return the box sort heuristic mode"; } +def GpurtGetKnownSetRayFlagsOp : GpurtOp<"get.known.set.ray.flags", [Memory<[]>, WillReturn]> { + let arguments = (ins); + let results = (outs I32:$result); + let summary = "return the known-set bits of the traceRay specialization flags"; +} + +def GpurtGetKnownUnsetRayFlagsOp : GpurtOp<"get.known.unset.ray.flags", [Memory<[]>, WillReturn]> { + let arguments = (ins); + let results = (outs I32:$result); + let summary = "return the known-unset bits of the traceRay specialization flags"; +} + def GpurtGetStaticFlagsOp : GpurtOp<"get.static.flags", [Memory<[]>, WillReturn]> { let arguments = (ins); let results = (outs I32:$result); @@ -267,6 +301,16 @@ def GpurtCallIntersectionShaderOp : GpurtOp<"call.intersection.shader", [Memory< }]; } +def GpurtSetHitTokenDataOp : GpurtOp<"set.hit.token.data", [Memory<[(write InaccessibleMem)]>, WillReturn]> { + let arguments = (ins I32:$arg0, I32:$arg1); + let results = (outs); + + let summary = "Set hit token data"; + let description = [{ + Set hit token data. + }]; +} + def GpurtSetTriangleIntersectionAttributesOp : GpurtOp<"set.triangle.intersection.attributes", [Memory<[(write InaccessibleMem)]>, WillReturn]> { let arguments = (ins V2F32:$barycentrics); diff --git a/llvmraytracing/include/lgc/LgcCpsDialect.h b/llvmraytracing/include/lgc/LgcCpsDialect.h index 5fe96b5a5e..9528c8730c 100644 --- a/llvmraytracing/include/lgc/LgcCpsDialect.h +++ b/llvmraytracing/include/lgc/LgcCpsDialect.h @@ -48,7 +48,7 @@ enum class RayTracingShaderStage; namespace lgc::cps { enum class CpsLevel : uint8_t { - RayGen = 0, + RayGen = 1, ClosestHit_Miss_Callable, Traversal, AnyHit_CombinedIntersection_AnyHit, diff --git a/llvmraytracing/include/lgc/LgcIlCpsDialect.h b/llvmraytracing/include/lgc/LgcIlCpsDialect.h new file mode 100644 index 0000000000..58af07bb70 --- /dev/null +++ b/llvmraytracing/include/lgc/LgcIlCpsDialect.h @@ -0,0 +1,32 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + **********************************************************************************************************************/ + +//===- LgcIlCpsDialect.h - Dialect definitions -----------------------===// + +#pragma once + +#define GET_INCLUDES +#define GET_DIALECT_DECLS +#include "LgcIlCpsDialect.h.inc" diff --git a/llvmraytracing/include/llvmraytracing/ContinuationsDialect.td b/llvmraytracing/include/lgc/LgcIlCpsDialect.td similarity index 68% rename from llvmraytracing/include/llvmraytracing/ContinuationsDialect.td rename to llvmraytracing/include/lgc/LgcIlCpsDialect.td index d4173142eb..75c9fbf8ed 100644 --- a/llvmraytracing/include/llvmraytracing/ContinuationsDialect.td +++ b/llvmraytracing/include/lgc/LgcIlCpsDialect.td @@ -23,19 +23,19 @@ * **********************************************************************************************************************/ -//===- ContinuationsDialect.td - Dialect definitions -------*- tablegen -*-===// +//===- LgcIlCpsDialect.td - Dialect definitions -------*- tablegen -*-===// include "llvm-dialects/Dialect/Dialect.td" -def ContinuationsDialect : Dialect { - let name = "continuations"; - let cppNamespace = "continuations"; +def LgcIlCpsDialect : Dialect { + let name = "lgc.ilcps"; + let cppNamespace = "lgc::ilcps"; } -class ContinuationsOp traits_ = []> - : Op; +class LgcIlCpsOp traits_ = []> + : Op; -def GetReturnValueOp : ContinuationsOp<"getReturnValue", [NoUnwind, WillReturn]> { +def GetReturnValueOp : LgcIlCpsOp<"getReturnValue", [NoUnwind, WillReturn]> { let arguments = (ins); let results = (outs value:$result); @@ -52,3 +52,21 @@ def GetReturnValueOp : ContinuationsOp<"getReturnValue", [NoUnwind, WillReturn]> coroutine. }]; } + +def ReturnOp : LgcIlCpsOp<"return", [NoReturn]> { + let arguments = (ins value:$returnAddr, varargs:$args); + let results = (outs); + + let summary = + "represents the return from a shader"; + + let description = [{ + Describes the return operation for a continuation shader. + + In non-lgc.cps mode, this is used to jump to the incoming return address + for non-RGS, and optionally passing return values in the varargs list. + + For RGS, this is used to terminate the shader after coroutine passes + by passing an undef (non-lgc.cps mode)/poison (lgc.cps mode) address. + }]; +} diff --git a/llvmraytracing/include/lgc/LgcRtDialect.h b/llvmraytracing/include/lgc/LgcRtDialect.h index 0a8b4d5863..67caf9abaa 100644 --- a/llvmraytracing/include/lgc/LgcRtDialect.h +++ b/llvmraytracing/include/lgc/LgcRtDialect.h @@ -36,6 +36,7 @@ namespace llvm { class Constant; class Function; +class Module; } // namespace llvm namespace lgc { @@ -56,13 +57,19 @@ enum class RayTracingShaderStage { // Set shader stage metadata on a LLVM function and erase it by setting // std::nullopt. -void setLgcRtShaderStage(llvm::Function *func, +// func can instead be a GlobalVariable, allowing a front-end to use a +// GlobalVariable to represent a shader retrieved from the cache, and wants to +// mark it with a shader stage. 
+void setLgcRtShaderStage(llvm::GlobalObject *func, std::optional stage); // Gets the shader stage from the specified LLVM function or std::nullopt // if no metadata is apparent. +// func can instead be a GlobalVariable, allowing a front-end to use a +// GlobalVariable to represent a shader retrieved from the cache, and wants to +// mark it with a shader stage. std::optional -getLgcRtShaderStage(const llvm::Function *func); +getLgcRtShaderStage(const llvm::GlobalObject *func); // Get the metadata IDs associated with the lgc.rt dialect, so the caller knows // which ones can be removed when the dialect is processed. @@ -95,5 +102,21 @@ std::optional getShaderHitAttributeSize(const llvm::Function *func); // function. void setShaderHitAttributeSize(llvm::Function *func, size_t size); +// Get max hit attribute size (in bytes) metadata for a ray-tracing module. +// This is a pipeline-wide upper bound on the per-function hit attribute sizes. +std::optional getMaxHitAttributeSize(const llvm::Module *module); + +// Set max hit attribute size (in bytes) metadata for a ray-tracing module. +// This is a pipeline-wide upper bound on the per-function hit attribute sizes. +void setMaxHitAttributeSize(llvm::Module *module, size_t size); + +// Get max payload size (in bytes) metadata for a ray-tracing module. +// This is a pipeline-wide upper bound on the per-function payload sizes. +std::optional getMaxPayloadSize(const llvm::Module *module); + +// Set max payload size (in bytes) metadata for a ray-tracing module. +// This is a pipeline-wide upper bound on the per-function payload sizes. +void setMaxPayloadSize(llvm::Module *module, size_t size); + } // namespace rt } // namespace lgc diff --git a/llvmraytracing/include/lgc/LgcRtDialect.td b/llvmraytracing/include/lgc/LgcRtDialect.td index c0da863447..29b10f859d 100644 --- a/llvmraytracing/include/lgc/LgcRtDialect.td +++ b/llvmraytracing/include/lgc/LgcRtDialect.td @@ -44,10 +44,15 @@ // get that with setShaderPaq() and getShaderPaq() functions in LgcRtDialect.h. A PAQ is a constant array // of ints; currently the only supported form is a single-entry array where the value is the size in bytes // of the payload. This is the same form of PAQ that is passed as an arg to TraceRayOp. +// A pipeline-wide upper bound on the payload size in bytes can be accessed via setMaxPayloadSize() and +// getMaxPayloadSize(). // -// * A shader with a hit attribute arg has a metadata item giving the hit attribute size in bytes, which -// is the same for all shaders in the pipeline. That is set and got using setShaderHitAttributeSize() and +// * A shader with a hit attribute arg has a metadata item giving the hit attribute size in bytes, +// which is guaranteed to be not larger than the pipeline-wide max hit attribute size. +// The per-shader hit attribute size is accessed using setShaderHitAttributeSize() and // getHitShaderAttributeSize() in LgcRtDialect.h. +// The pipeline-wide maximum sizes can be accessed using getMaxHitAttributeSize() +// and setMaxHitAttributeSize(). // // * A callable shader has a metadata item giving the callable data size in bytes; that is set and got using // setShaderArgSize() and getShaderArgSize() in LgcRtDialect.h.
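Note on the new lgc.rt accessors above: the following is a minimal, hypothetical C++ sketch (not part of this patch) of how a front-end might combine the existing per-function setShaderHitAttributeSize() with the new module-level setMaxHitAttributeSize()/setMaxPayloadSize(), and how setLgcRtShaderStage() can now be applied to a GlobalVariable. The helper names recordRtSizes/tagCachedShader are illustrative, and the std::optional element types (size_t, RayTracingShaderStage) are inferred from the corresponding setters, since the diff above does not show the template arguments.

// Hypothetical usage sketch; only the lgc::rt entry points come from the header diff above.
#include "lgc/LgcRtDialect.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"
#include <algorithm>
#include <cstddef>
#include <utility>

// Record per-shader hit attribute sizes and the pipeline-wide upper bounds.
static void recordRtSizes(llvm::Module &module,
                          llvm::ArrayRef<std::pair<llvm::Function *, size_t>> hitAttrSizes,
                          size_t maxPayloadBytes) {
  size_t maxHitAttrBytes = 0;
  for (const auto &[func, bytes] : hitAttrSizes) {
    lgc::rt::setShaderHitAttributeSize(func, bytes); // existing per-function metadata
    maxHitAttrBytes = std::max(maxHitAttrBytes, bytes);
  }
  // New pipeline-wide upper bounds, stored as module-level metadata.
  lgc::rt::setMaxHitAttributeSize(&module, maxHitAttrBytes);
  lgc::rt::setMaxPayloadSize(&module, maxPayloadBytes);
}

// setLgcRtShaderStage() now accepts any GlobalObject, so a GlobalVariable standing in
// for a shader retrieved from a cache can carry the stage metadata as well.
static void tagCachedShader(llvm::GlobalVariable &cachedShader,
                            lgc::rt::RayTracingShaderStage stage) {
  lgc::rt::setLgcRtShaderStage(&cachedShader, stage);
}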
diff --git a/llvmraytracing/include/llvmraytracing/ContinuationsDialect.h b/llvmraytracing/include/llvmraytracing/ContinuationsDialect.h index fa74b7710f..d1ca5f0959 100644 --- a/llvmraytracing/include/llvmraytracing/ContinuationsDialect.h +++ b/llvmraytracing/include/llvmraytracing/ContinuationsDialect.h @@ -1,32 +1,6 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - *all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - **********************************************************************************************************************/ +// Transition header -- to be removed +#include "lgc/LgcIlCpsDialect.h" -//===- ContinuationsDialect.h - Dialect definitions -----------------------===// - -#pragma once - -#define GET_INCLUDES -#define GET_DIALECT_DECLS -#include "ContinuationsDialect.h.inc" +namespace continuations { +using ContinuationsDialect = lgc::ilcps::LgcIlCpsDialect; +} // namespace continuations diff --git a/llvmraytracing/include/llvmraytracing/ContinuationsUtil.h b/llvmraytracing/include/llvmraytracing/ContinuationsUtil.h index 620435a544..8fe18562f6 100644 --- a/llvmraytracing/include/llvmraytracing/ContinuationsUtil.h +++ b/llvmraytracing/include/llvmraytracing/ContinuationsUtil.h @@ -272,13 +272,6 @@ class ContHelper { // Marks an await as a waiting one with a wait mask. static constexpr const char *MDIsWaitAwaitName = "continuation.wait.await"; - // Function-scope metadata for payload and hit attribute size limits, - // referring to the app-defined structs only. - static constexpr const char *MDMaxHitAttributeBytesName = - "continuation.maxHitAttributeBytes"; - static constexpr const char *MDMaxPayloadBytesName = - "continuation.maxPayloadBytes"; - // Whether this is a load instruction that should translate to a last_use // load. 
static constexpr const char *MDIsLastUseName = "amdgpu.last.use"; @@ -484,21 +477,6 @@ class ContHelper { MD->addOperand(getI32MDConstant(M.getContext(), MaxPayloadRegisterCount)); } - static void setMaxHitAttributeByteCount(Function &F, - uint32_t MaxHitAttributeByteCount) { - lgc::rt::setShaderHitAttributeSize(&F, MaxHitAttributeByteCount); - } - - static void setMaxPayloadByteCount(Function &F, - uint32_t MaxPayloadByteCount) { - F.setMetadata(MDMaxPayloadBytesName, - getI32MDConstant(F.getContext(), MaxPayloadByteCount)); - } - - static std::optional tryGetMaxPayloadByteCount(const Function &F) { - return extractZExtI32Constant(F.getMetadata(MDMaxPayloadBytesName)); - } - static void setStackSize(Function *F, uint32_t StackSize) { F->setMetadata(MDStackSizeName, getI32MDConstant(F->getContext(), StackSize)); @@ -657,6 +635,9 @@ class ContHelper { // using the GpuRt version flags intrinsic. If the intrinsic is not found, // returns true, enabling new behavior (e.g. for tests). static bool getGpurtVersionFlag(Module &GpurtModule, GpuRtVersionFlag Flag); + + // Handles _AmdGetSetting_* intrinsics. + static void handleGetSetting(Function &F, ArrayRef Settings); }; class ShaderStageHelper final { @@ -718,6 +699,7 @@ DRIVER_FUNC_NAME(GetCandidateState) DRIVER_FUNC_NAME(GetCommittedState) DRIVER_FUNC_NAME(GetContinuationStackAddr) DRIVER_FUNC_NAME(SetupRayGen) +DRIVER_FUNC_NAME(ExitRayGen) DRIVER_FUNC_NAME(IsEndSearch) DRIVER_FUNC_NAME(GetLocalRootIndex) DRIVER_FUNC_NAME(SetLocalRootIndex) @@ -731,6 +713,7 @@ DRIVER_FUNC_NAME(HitKind) DRIVER_FUNC_NAME(Traversal) DRIVER_FUNC_NAME(KernelEntry) DRIVER_FUNC_NAME(GpurtVersionFlags) +DRIVER_FUNC_NAME(ShaderStart) #undef DRIVER_FUNC_NAME } // namespace ContDriverFunc @@ -750,6 +733,10 @@ void forEachCall(Function &F, CallbackTy Callback) { } } +// Replace all calls to a given function with some value. +// Removes the original call. +void replaceCallsToFunction(llvm::Function &F, llvm::Value &Replacement); + bool isLgcRtOp(const llvm::Function *F); // Move all basic blocks of OldFunc to NewFunc. diff --git a/llvmraytracing/include/llvmraytracing/PayloadAccessQualifiers.h b/llvmraytracing/include/llvmraytracing/PayloadAccessQualifiers.h index 0acfac6051..640a9ba5ec 100644 --- a/llvmraytracing/include/llvmraytracing/PayloadAccessQualifiers.h +++ b/llvmraytracing/include/llvmraytracing/PayloadAccessQualifiers.h @@ -370,6 +370,8 @@ class PAQAccessMask { return AccessMask == RHS.AccessMask; } + bool operator!=(const PAQAccessMask &RHS) const { return !(*this == RHS); } + // Prints HLSL-like qualifier string as in "write(..) : read(..)" // If AccessKind is set, only prints the part corresponding to that kind. void print(llvm::raw_ostream &, @@ -607,7 +609,7 @@ struct PAQNode { // Collect a set of PAQNodes representing the tree rooted at this node, // and append it to Result. - void collectLeafNodes(SmallVectorImpl &Result) const; + void collectNodes(SmallVectorImpl &Result) const; }; inline raw_ostream &operator<<(raw_ostream &Stream, const PAQNode &NodeInfo) { @@ -762,7 +764,7 @@ struct PAQSerializationInfoBase { if (PayloadMemPointerNode) { Result.push_back(PayloadMemPointerNode.get()); } - PayloadRootNode->collectLeafNodes(Result); + PayloadRootNode->collectNodes(Result); } }; diff --git a/llvmraytracing/lib/CleanupContinuations.cpp b/llvmraytracing/lib/CleanupContinuations.cpp index faf8030575..34d7414674 100644 --- a/llvmraytracing/lib/CleanupContinuations.cpp +++ b/llvmraytracing/lib/CleanupContinuations.cpp @@ -49,7 +49,7 @@ // the compiler backend. 
// 1. Replace returning handle with lgc.cps.jump() with the right continuation // reference. -// 2. Replace @continuation.return with simple `ret`, which means thread +// 2. Replace @lgc.ilcps.return with simple `ret`, which means thread // termination. // 3. Edit function signatures, like removing coroutine frame pointer argument, // adding needed ones (state, rcr, returned_values) for resume function. @@ -58,9 +58,9 @@ #include "compilerutils/CompilerUtils.h" #include "lgc/LgcCpsDialect.h" +#include "lgc/LgcIlCpsDialect.h" #include "llvm-dialects/Dialect/Visitor.h" #include "llvmraytracing/Continuations.h" -#include "llvmraytracing/ContinuationsDialect.h" #include "llvmraytracing/ContinuationsUtil.h" #include "llvmraytracing/GpurtContext.h" #include "llvm/ADT/STLExtras.h" @@ -250,9 +250,9 @@ static void buildCpsArgInfos(Function *F, bool IsStart, AllArgTypes.push_back(IntegerType::get(Context, 32)); AllArgValues.push_back(nullptr); - // Find arguments from continuation.returnvalue calls + // Find arguments from lgc.ilcps.getreturnvalue calls for (auto &I : F->getEntryBlock()) { - if (auto *Intr = dyn_cast(&I)) { + if (auto *Intr = dyn_cast(&I)) { AllArgTypes.push_back(Intr->getType()); AllArgValues.push_back(Intr); InstsToRemove.push_back(Intr); @@ -297,8 +297,9 @@ void CleanupContinuationsPass::removeContFreeCall(Function *F, } /// Insert cps.free() before the original function exits. -/// Note: we skip the cps.free() insertion before calls to @continuation.return. -/// Because this is not useful any more as it means the thread termination. +/// Note: we skip the cps.free() insertion before calls to +/// @lgc.ilcps.return. Because this is not useful any more as it means the +/// thread termination. void CleanupContinuationsPass::freeCpsStack(Function *F, ContinuationData &CpsInfo) { struct VisitState { @@ -390,11 +391,10 @@ void CleanupContinuationsPass::processContinuations() { if (isa(I)) { handleContinue(FuncData.second, I); } else if (I->getOpcode() == Instruction::Unreachable) { - // We should only possibly have 'continuation.return' or + // We should only possibly have 'lgc.ilcps.return' or // 'lgc.cps.jump' call before unreachable. auto *Call = cast(--I->getIterator()); - auto *Called = Call->getCalledFunction(); - if (Called->getName() == "continuation.return") { + if (auto *ContRet = dyn_cast(Call)) { Builder->SetInsertPoint(Call); Builder->CreateRetVoid(); Call->eraseFromParent(); diff --git a/llvmraytracing/lib/Continuations.cpp b/llvmraytracing/lib/Continuations.cpp index 5a476813dc..5ecf797ef2 100644 --- a/llvmraytracing/lib/Continuations.cpp +++ b/llvmraytracing/lib/Continuations.cpp @@ -32,10 +32,11 @@ #include "llvmraytracing/Continuations.h" #include "compilerutils/CompilerUtils.h" #include "lgc/LgcCpsDialect.h" +#include "lgc/LgcIlCpsDialect.h" #include "lgc/LgcRtDialect.h" +#include "llvm-dialects/Dialect/Builder.h" #include "llvm-dialects/Dialect/Dialect.h" #include "llvm-dialects/Dialect/OpSet.h" -#include "llvmraytracing/ContinuationsDialect.h" #include "llvmraytracing/ContinuationsUtil.h" #include "llvmraytracing/GpurtContext.h" #include "llvm/ADT/IntervalTree.h" @@ -100,6 +101,19 @@ const llvm_dialects::OpMap llvm::LgcRtGpuRtMap = {{ #undef GPURTMAP_ENTRY +void llvm::replaceCallsToFunction(Function &F, Value &Replacement) { + llvm::forEachCall(F, [&](CallInst &CInst) { + // Basic sanity check. We should also check for dominance. 
+ assert((!isa(&Replacement) || + cast(&Replacement)->getFunction() == + CInst.getFunction()) && + "llvm::replaceCallsToFunction: Replacement should " + "reside in the same function as CallInst to replace!"); + CInst.replaceAllUsesWith(&Replacement); + CInst.eraseFromParent(); + }); +} + bool llvm::isLgcRtOp(const llvm::Function *F) { return F && F->getName().starts_with("lgc.rt"); } @@ -180,12 +194,10 @@ Type *ContHelper::getPaddingType(const DataLayout &DL, LLVMContext &Context, assert(DwordsOccupied <= TargetNumDwords); unsigned DwordsRemaining = TargetNumDwords - DwordsOccupied; - if (DwordsRemaining > 0) { - auto I32 = Type::getInt32Ty(Context); - return ArrayType::get(I32, DwordsRemaining); - } else { - return StructType::get(Context); - } + if (DwordsRemaining > 0) + return ArrayType::get(Type::getInt32Ty(Context), DwordsRemaining); + + return StructType::get(Context); } void ContHelper::addPaddingType(const DataLayout &DL, LLVMContext &Context, @@ -608,11 +620,10 @@ DialectContextAnalysis::Result DialectContextAnalysis::run(llvm::Module &M, llvm::ModuleAnalysisManager &AnalysisManager) { if (NeedDialectContext) { - Context = - llvm_dialects::DialectContext::make( - M.getContext()); + Context = llvm_dialects::DialectContext::make( + M.getContext()); } return DialectContextAnalysis::Result(); } @@ -955,15 +966,15 @@ static void replaceEnqueueIntrinsic(Function &F, Function *NewFunc) { for (auto &Use : make_early_inc_range(F.uses())) { if (auto *CInst = dyn_cast(Use.getUser())) { if (CInst->isCallee(&Use)) { - IRBuilder<> B(CInst); + llvm_dialects::Builder B(CInst); SmallVector Args(CInst->args()); bool IsEnqueue = F.getName().contains("Enqueue"); // Add the current function as return address to the call. // Used when Traversal calls AnyHit or Intersection. if (IsEnqueue && F.getName().contains("EnqueueCall")) { bool HasWaitMask = F.getName().contains("WaitEnqueue"); - auto *RetAddr = - B.CreatePtrToInt(CInst->getFunction(), B.getInt64Ty()); + auto *RetAddr = B.create( + B.getInt64Ty(), CInst->getFunction()); Args.insert(Args.begin() + (HasWaitMask ? 
3 : 2), RetAddr); } @@ -983,10 +994,7 @@ static void handleContinuationStackIsGlobal(Function &Func, auto *IsGlobal = ConstantInt::getBool( Func.getContext(), StackAddrspace == ContStackAddrspace::Global); - llvm::forEachCall(Func, [&](llvm::CallInst &CInst) { - CInst.replaceAllUsesWith(IsGlobal); - CInst.eraseFromParent(); - }); + llvm::replaceCallsToFunction(Func, *IsGlobal); } static void handleContinuationsGetFlags(Function &Func, uint32_t Flags) { @@ -997,10 +1005,7 @@ static void handleContinuationsGetFlags(Function &Func, uint32_t Flags) { auto *FlagsConst = ConstantInt::get(IntegerType::get(Func.getContext(), 32), Flags); - llvm::forEachCall(Func, [&](llvm::CallInst &CInst) { - CInst.replaceAllUsesWith(FlagsConst); - CInst.eraseFromParent(); - }); + llvm::replaceCallsToFunction(Func, *FlagsConst); } static void handleGetRtip(Function &Func, uint32_t RtipLevel) { @@ -1034,7 +1039,7 @@ static void handleGetUninitialized(Function &Func) { }); } -static void handleGetSetting(Function &F, ArrayRef Settings) { +void ContHelper::handleGetSetting(Function &F, ArrayRef Settings) { auto *Ty = dyn_cast(F.getReturnType()); if (!Ty) report_fatal_error(Twine("Only integer settings are supported but '") + @@ -1071,10 +1076,7 @@ static void handleGetSetting(Function &F, ArrayRef Settings) { auto *Val = ConstantInt::get(Ty, Value); - forEachCall(F, [&](CallInst &Call) { - Call.replaceAllUsesWith(Val); - Call.eraseFromParent(); - }); + replaceCallsToFunction(F, *Val); } void llvm::terminateShader(IRBuilder<> &Builder, CallInst *CompleteCall) { @@ -1098,7 +1100,7 @@ void llvm::terminateShader(IRBuilder<> &Builder, CallInst *CompleteCall) { "terminateShader: Invalid terminator instruction provided!"); // If there is some code after the call to _AmdComplete or the intended - // continuation.return that aborts the shader, do the following: + // lgc.ilcps.return that aborts the shader, do the following: // - Split everything after the completion call into a separate block // - Remove the newly inserted unconditional branch to the split block // - Remove the complete call. 
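The new llvm::replaceCallsToFunction helper folds a recurring pattern in this patch: walk every call to a declaration, replace its uses with a fixed value, and erase the call. A minimal standalone sketch of that pattern (not the exact helper from the patch, which additionally asserts that an Instruction replacement lives in the same function as the call it replaces):

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"

// Replace every direct call to F with the constant C and erase the call,
// mirroring how handleContinuationStackIsGlobal and handleContinuationsGetFlags
// now share llvm::replaceCallsToFunction.
static void replaceCallsWithConstant(llvm::Function &F, llvm::Constant &C) {
  for (llvm::User *U : llvm::make_early_inc_range(F.users())) {
    auto *Call = llvm::dyn_cast<llvm::CallInst>(U);
    if (!Call || Call->getCalledOperand() != &F)
      continue; // Skip non-call uses and uses of F as an argument.
    Call->replaceAllUsesWith(&C);
    Call->eraseFromParent();
  }
}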
@@ -1158,7 +1160,7 @@ bool llvm::earlyDriverTransform(Module &M) { handleGetUninitialized(F); } else if (Name.starts_with("_AmdGetSetting")) { Changed = true; - handleGetSetting(F, GpurtSettings); + ContHelper::handleGetSetting(F, GpurtSettings); } } diff --git a/llvmraytracing/lib/CpsStackLowering.cpp b/llvmraytracing/lib/CpsStackLowering.cpp index ccb9a7c903..398e14abf4 100644 --- a/llvmraytracing/lib/CpsStackLowering.cpp +++ b/llvmraytracing/lib/CpsStackLowering.cpp @@ -191,21 +191,22 @@ void CpsStackLowering::visitStore(llvm::StoreInst &Store) { // @param Func: the function where stack pointers should be added to continue // calls void CpsStackLowering::visitContinueCalls(llvm::Function *Func) { - llvm::forEachTerminator(Func, {Instruction::Unreachable, Instruction::Ret}, - [&](Instruction &Terminator) { - auto *BB = Terminator.getParent(); - if (&Terminator != &*BB->begin()) { - auto Before = --Terminator.getIterator(); - if (auto *CInst = dyn_cast(Before)) { - if (auto *Func = CInst->getCalledFunction()) { - auto Name = Func->getName(); - if (Name == "continuation.continue" || - Name == "continuation.waitContinue") - visitContinueCall(*CInst); - } - } - } - }); + llvm::forEachTerminator( + Func, {Instruction::Unreachable, Instruction::Ret}, + [&](Instruction &Terminator) { + auto *BB = Terminator.getParent(); + if (&Terminator != &*BB->begin()) { + auto Before = --Terminator.getIterator(); + if (auto *CInst = dyn_cast(Before)) { + if (auto *Func = CInst->getCalledFunction()) { + auto Name = Func->getName(); + if (Name.starts_with("continuation.continue") || + Name.starts_with("continuation.waitContinue")) + visitContinueCall(*CInst); + } + } + } + }); } // ===================================================================================================================== diff --git a/llvmraytracing/lib/DXILContIntrinsicPrepare.cpp b/llvmraytracing/lib/DXILContIntrinsicPrepare.cpp index 98ba8a98fe..b2af28f724 100644 --- a/llvmraytracing/lib/DXILContIntrinsicPrepare.cpp +++ b/llvmraytracing/lib/DXILContIntrinsicPrepare.cpp @@ -144,13 +144,14 @@ static bool isUtilFunction(StringRef Name) { "ContinuationStackIsGlobal", "ContStack", "Enqueue", // To detect the mangled name of a declaration - "GetI32", + "ExitRayGen", "GetCandidateState", "GetCommittedState", "GetContinuationStackAddr", "GetContinuationStackGlobalMemBase", "GetCurrentFuncAddr", "GetFuncAddr", + "GetI32", "GetLocalRootIndex", "GetResumePointAddr", "GetRtip", @@ -169,6 +170,7 @@ static bool isUtilFunction(StringRef Name) { "SetupRayGen", "TraceRay", "Traversal", + "ShaderStart", }; for (const char *UtilName : UtilNames) { @@ -183,6 +185,8 @@ llvm::PreservedAnalyses DXILContIntrinsicPreparePass::run( llvm::Module &M, llvm::ModuleAnalysisManager &AnalysisManager) { LLVM_DEBUG(dbgs() << "Run the dxil-cont-intrinsic-prepare pass\n"); + AnalysisManager.getResult(M); + SmallVector Funcs(make_pointer_range(M.functions())); for (auto *F : Funcs) { diff --git a/llvmraytracing/lib/DXILContPostProcess.cpp b/llvmraytracing/lib/DXILContPostProcess.cpp index ffce304fdc..56c3c4f05b 100644 --- a/llvmraytracing/lib/DXILContPostProcess.cpp +++ b/llvmraytracing/lib/DXILContPostProcess.cpp @@ -38,11 +38,11 @@ #include "compilerutils/CompilerUtils.h" #include "lgc/LgcCpsDialect.h" +#include "lgc/LgcIlCpsDialect.h" #include "lgc/LgcRtDialect.h" #include "llpc/GpurtEnums.h" #include "llvm-dialects/Dialect/Builder.h" #include "llvmraytracing/Continuations.h" -#include "llvmraytracing/ContinuationsDialect.h" #include 
"llvmraytracing/ContinuationsUtil.h" #include "llvmraytracing/CpsStackLowering.h" #include "llvmraytracing/GpurtContext.h" @@ -90,12 +90,9 @@ namespace { class DXILContPostProcessPassImpl final { public: DXILContPostProcessPassImpl(Module &M, Module &GpurtLibrary); - bool run(llvm::ModuleAnalysisManager &AnalysisManager); - - static constexpr unsigned SystemDataArgumentIndexStartWithoutCsp = 1; - static constexpr unsigned SystemDataArgumentIndexContinuationWithoutCsp = 0; - static constexpr unsigned SystemDataArgumentIndexRayGen = 0; + PreservedAnalyses run(ModuleAnalysisManager &AnalysisManager); + static constexpr unsigned SystemDataArgumentIndex = 1; struct FunctionData { DXILShaderKind Kind = DXILShaderKind::Invalid; /// Calls to hlsl intrinsics @@ -114,18 +111,12 @@ class DXILContPostProcessPassImpl final { void handleValueGetI32(Function &F); void handleValueSetI32(Function &F); - void handleContPayloadRegisterI32Count(Function &F); - void handleContPayloadRegistersGetI32(Function &F); - void handleContPayloadRegistersSetI32(Function &F); void handleContStackIntrinsic(FunctionAnalysisManager &FAM, Function &F); void initializeProcessableFunctionData(); bool handleRegisterBufferCalls(); bool replaceIntrinsicCalls(Function &F, const FunctionData &Data); - [[nodiscard]] std::pair - insertSetupRayGen(Function &F, const FunctionData &Data); bool handleIntrinsicCalls(llvm::ModuleAnalysisManager &AnalysisManager); - bool replaceIntrinsicCallsAndSetupRayGen(); bool lowerCpsOps(); void lowerJumpOp(lgc::cps::JumpOp &JumpOp); bool unfoldGlobals(); @@ -135,7 +126,6 @@ class DXILContPostProcessPassImpl final { Module *GpurtLibrary; GlobalVariable *Registers; MapVector ToProcess; - Function *SetupRayGen; llvm_dialects::Builder Builder; std::optional StackAddrspace; std::optional StackLowering; @@ -165,7 +155,7 @@ static void reportContStateSizes(Module &M) { // Determine the set of entry functions which have a continuation function // We cannot rely on the state size for this, because functions without a // continuation (e.g. a non-recursive CHS) have a state size of 0 in metadata. - DenseSet EntriesWithContinuationFunctions; + SetVector EntriesWithContinuationFunctions; for (auto &F : M.functions()) { if (F.isDeclaration()) continue; @@ -177,21 +167,18 @@ static void reportContStateSizes(Module &M) { } } - for (auto &F : M) { - auto Stage = lgc::rt::getLgcRtShaderStage(&F); - if (!Stage || F.isDeclaration()) - continue; - - if (!EntriesWithContinuationFunctions.contains(&F)) + for (auto *F : EntriesWithContinuationFunctions) { + auto Stage = lgc::rt::getLgcRtShaderStage(F); + if (!Stage || F->isDeclaration()) continue; - auto OptStateSize = ContHelper::tryGetContinuationStateByteCount(F); + auto OptStateSize = ContHelper::tryGetContinuationStateByteCount(*F); if (!OptStateSize.has_value()) continue; DXILShaderKind ShaderKind = ShaderStageHelper::rtShaderStageToDxilShaderKind(*Stage); - dbgs() << "Continuation state size of \"" << F.getName() << "\" (" + dbgs() << "Continuation state size of \"" << F->getName() << "\" (" << ShaderKind << "): " << OptStateSize.value() << " bytes\n"; } } @@ -265,115 +252,9 @@ static Function *getContinuationGetAddrAndMD(Module &M) { return F; auto &C = M.getContext(); auto *I64 = Type::getInt64Ty(C); - // To avoid having multiple copies of the intrinsic for each referenced - // function type, keep existing inttoptr to convert the function pointer to - // i64, and pass that i64 to the intrinsic. 
- // TODO: With opaque pointers, instead just pass a ptr to the function. - auto *FuncTy = FunctionType::get(I64, {I64}, false); - return cast(M.getOrInsertFunction(Name, FuncTy).getCallee()); -} + auto *FuncTy = FunctionType::get(I64, {PointerType::get(C, 0)}, false); -// If this function returns false, we know that F cannot be used as pointer, -// e.g. because it is an intrinsic. -static bool canBeUsedAsPtr(const Function &F) { - return !F.getName().starts_with("dx.op"); -} - -// Collects all function pointers (uses of functions that are not calls), -// and adds metadata to them using the `continuations.getAddrAndMD` intrinsic. -// TODO: In the future, we might instead want to directly insert the intrinsic -// in places depending on function pointers (resume functions, and -// traversal). This function here is a stop-gap to enable implementing the -// intrinsic without having to deal with all corner cases that might arise -// above. Thus, we are just handling the cases known to occur. -// One might be tempted to speed this up by instead traversing usages of -// the continuation.continue intrinsics, getting the passed function -// pointers, and tracing these back. However, this way we would miss -// function pointers stored to memory, as we do for the return address -// stored in system data. -static bool addGetAddrAndMDIntrinsicCalls(Module &M) { - Function *GetAddrAndMD = getContinuationGetAddrAndMD(M); - IRBuilder<> B{M.getContext()}; - - bool Changed = false; - // We will first traverse all uses, and resolve everything up to constant - // expressions. However, there might be nested constant expressions, each - // having multiple users, so we resolve those using a worklist. - SmallVector CEWorkList; - SmallVector CurrentCEUsers; - - for (auto &F : M.functions()) { - // Speed-up: Skip F if it cannot be used as pointer, e.g. dx intrinsics. - if (!canBeUsedAsPtr(F)) - continue; - - CEWorkList.clear(); - for (auto *U : F.users()) { - // Ignore calls of the function. - auto *CI = dyn_cast(U); - if (CI && CI->getCalledFunction() == &F) - continue; - - if (auto *GA = dyn_cast(U)) { - // Ignore global aliases. Check that these have no users, - // as these would need to be changed. - assert(GA->user_empty()); - continue; - } - - // Must be ConstantExpr - ConstantExpr *CE = cast(U); - CEWorkList.push_back(CE); - } - - while (!CEWorkList.empty()) { - auto *CE = CEWorkList.pop_back_val(); - assert( - (isa( - CE)) && - "Unexpected use of function!"); - - // Copy the users of CE into a local SmallVector before traversing it, - // because we are going to add new users of CE that we do *not* want to - // traverse. - CurrentCEUsers.assign(CE->user_begin(), CE->user_end()); - for (User *CEU : CurrentCEUsers) { - if (auto *NestedCE = dyn_cast(CEU)) { - CEWorkList.push_back(NestedCE); - continue; - } - - if (auto *GA = dyn_cast(CEU)) { - assert(GA->user_empty()); - continue; - } - - // Final case: A real instruction using the function. Wrap - // the value into the intrinsic and pass that one to the instruction - // Set insertion point, and replace CE with the intrinsic - auto *I = cast(CEU); - assert(CE->getType() == Type::getInt64Ty(M.getContext()) && - "Function use should be as an i64!"); - B.SetInsertPoint(I); - - auto *AddrWithMD = B.CreateCall(GetAddrAndMD, {CE}); - - // Can't RAUW because the CE might be used by different instructions. - // Instead, manually replace the instruction's operand. 
- [[maybe_unused]] bool Found = false; - for (unsigned OpIdx = 0, E = I->getNumOperands(); OpIdx < E; ++OpIdx) { - if (I->getOperand(OpIdx) == CE) { - I->setOperand(OpIdx, AddrWithMD); - Found = true; - Changed = true; - } - } - assert(Found); - } - } - } - - return Changed; + return cast(M.getOrInsertFunction(Name, FuncTy).getCallee()); } /// Checks some properties guaranteed for a module containing continuations @@ -442,19 +323,6 @@ void DXILContPostProcessPassImpl::lowerGetResumePointAddr(Function &F) { // Search calls to GetResumePointAddr, and lower it to the argument of the // next continue call. Then remove it from that continue call. - // TODO: The return address being implicitly added to the next continue call, - // and then being implicitly removed by the use of this intrinsic feels - // a bit fragile. - // We are currently planning to move to a scheme where every await - // call is preceded by a call to GetResumePointAddr (in order to set - // a scheduling priority). If we decide to stick to that scheme, we - // could instead move lowering of GetResumePointAddr() to the - // continuation cleanup pass before forming continue calls, and then - // never add the resume address to the continue call there. I'm leaving - // this for later in case we change the scheme again to avoid - // unnecessary code churn. For the time being, the resume function - // being added to the continue statement is necessary for us to find - // it here. for (auto &Use : make_early_inc_range(GetResumePointAddr->uses())) { auto *CInst = dyn_cast(Use.getUser()); if (!CInst || !CInst->isCallee(&Use) || @@ -476,14 +344,12 @@ void DXILContPostProcessPassImpl::lowerGetResumePointAddr(Function &F) { } auto *ContinueCall = *FoundContinueCall; - Value *ReturnAddr = nullptr; - bool IsCpsFunc = lgc::cps::isCpsFunction(*CInst->getFunction()); - // Only used for non-cps functions. unsigned ReturnAddrArgNum = 1; + Value *ReturnAddr = nullptr; if (auto *Jump = dyn_cast(ContinueCall); Jump) { - ReturnAddr = Builder.CreateZExt(Jump->getTarget(), Builder.getInt64Ty()); + ReturnAddr = Jump->getTarget(); } else { auto Name = ContinueCall->getCalledFunction()->getName(); @@ -522,11 +388,6 @@ void DXILContPostProcessPassImpl::lowerGetResumePointAddr(Function &F) { CInst->replaceAllUsesWith(ReturnAddr); - if (IsCpsFunc) { - CInst->eraseFromParent(); - continue; - } - // Re-create the continuation.continue call without the return address // argument. 
SmallVector Args; @@ -577,10 +438,7 @@ void DXILContPostProcessPassImpl::handleValueI32Count(Function &F) { auto *Ty = getFuncArgPtrElementType(&F, 0); auto *Size = Builder.getInt32( Mod->getDataLayout().getTypeStoreSize(Ty).getFixedValue() / 4); - llvm::forEachCall(F, [&](CallInst &CInst) { - CInst.replaceAllUsesWith(Size); - CInst.eraseFromParent(); - }); + llvm::replaceCallsToFunction(F, *Size); } void DXILContPostProcessPassImpl::handleValueGetI32(Function &F) { @@ -626,60 +484,6 @@ void DXILContPostProcessPassImpl::handleValueSetI32(Function &F) { }); } -void DXILContPostProcessPassImpl::handleContPayloadRegisterI32Count( - Function &F) { - assert(F.arg_empty() - // register count - && F.getFunctionType()->getReturnType()->isIntegerTy(32)); - - uint32_t RegCount = - ContHelper::tryGetMaxUsedPayloadRegisterCount(*Mod).value_or(0); - auto *RegCountAsConstant = - ConstantInt::get(IntegerType::get(F.getContext(), 32), RegCount); - - llvm::forEachCall(F, [&](CallInst &CInst) { - CInst.replaceAllUsesWith(RegCountAsConstant); - CInst.eraseFromParent(); - }); -} - -void DXILContPostProcessPassImpl::handleContPayloadRegistersGetI32( - Function &F) { - assert(F.getReturnType()->isIntegerTy(32) && - F.arg_size() == 1 - // index - && F.getFunctionType()->getParamType(0)->isIntegerTy(32)); - - llvm::forEachCall(F, [&](CallInst &CInst) { - Builder.SetInsertPoint(&CInst); - auto *Addr = - Builder.CreateGEP(Registers->getValueType(), Registers, - {Builder.getInt32(0), CInst.getArgOperand(0)}); - auto *Load = Builder.CreateLoad(Builder.getInt32Ty(), Addr); - CInst.replaceAllUsesWith(Load); - CInst.eraseFromParent(); - }); -} - -void DXILContPostProcessPassImpl::handleContPayloadRegistersSetI32( - Function &F) { - assert(F.getReturnType()->isVoidTy() && - F.arg_size() == 2 - // index - && F.getFunctionType()->getParamType(0)->isIntegerTy(32) - // value - && F.getFunctionType()->getParamType(1)->isIntegerTy(32)); - - llvm::forEachCall(F, [&](CallInst &CInst) { - Builder.SetInsertPoint(&CInst); - auto *Addr = - Builder.CreateGEP(Registers->getValueType(), Registers, - {Builder.getInt32(0), CInst.getArgOperand(0)}); - Builder.CreateStore(CInst.getOperand(1), Addr); - CInst.eraseFromParent(); - }); -} - // Replace calls to _AmdContStack* with calls to lgc.cps dialect ops. // Do some simple constant propagation on the fly. 
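handleValueI32Count above folds calls to the corresponding _Amd helper into a constant derived from the DataLayout. A small sketch of that size computation, under the assumption used throughout this patch that one payload register is one dword (4 bytes):

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"

// Number of i32 registers (dwords) needed to store a value of type Ty.
static unsigned getTypeDwordCount(const llvm::DataLayout &DL, llvm::Type *Ty) {
  // getTypeStoreSize is in bytes; a dword is 4 bytes.
  return static_cast<unsigned>(DL.getTypeStoreSize(Ty).getFixedValue() / 4);
}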
void DXILContPostProcessPassImpl::handleContStackIntrinsic( @@ -737,23 +541,23 @@ void DXILContPostProcessPassImpl::handleContStackIntrinsic( Type *DestTy = CInst.getType(); bool IsMemoryAccess = false; - if (FuncName == "Alloc") { + if (FuncName.starts_with("Alloc")) { Value *SizeArg = ConstantFoldInstruction(CInst.getFunction(), CInst.getArgOperand(0)); Replacement = Builder.create(SizeArg); if (auto *Size = dyn_cast(SizeArg)) ContHelper::addStackSize(CInst.getFunction(), Size->getSExtValue()); - } else if (FuncName == "Free") { + } else if (FuncName.starts_with("Free")) { Value *SizeArg = ConstantFoldInstruction(CInst.getFunction(), CInst.getArgOperand(0)); Replacement = Builder.create(SizeArg); - } else if (FuncName == "SetPtr") { + } else if (FuncName.starts_with("SetPtr")) { Value *Vsp = CInst.getArgOperand(0); Replacement = Builder.create(Builder.CreateIntToPtr( Vsp, PointerType::get(Builder.getInt8Ty(), lgc::cps::stackAddrSpace))); - } else if (FuncName == "GetPtr") { + } else if (FuncName.starts_with("GetPtr")) { Replacement = Builder.create(); } else if (FuncName.starts_with("Load")) { Value *Addr = @@ -840,7 +644,7 @@ void DXILContPostProcessPassImpl::initializeProcessableFunctionData() { Data.Kind = Kind; Data.SystemDataArgumentIndex = - !IsCpsFunction ? SystemDataArgumentIndexRayGen : CpsArgIdxSystemData; + !IsCpsFunction ? SystemDataArgumentIndex : CpsArgIdxSystemData; Data.SystemDataTy = F.getFunctionType()->getParamType(Data.SystemDataArgumentIndex); @@ -859,8 +663,7 @@ void DXILContPostProcessPassImpl::initializeProcessableFunctionData() { Data.Kind = Kind; Data.SystemDataArgumentIndex = - !IsCpsFunction ? SystemDataArgumentIndexStartWithoutCsp - : CpsArgIdxSystemData; + !IsCpsFunction ? SystemDataArgumentIndex : CpsArgIdxSystemData; Data.SystemDataTy = F.getFunctionType()->getParamType(Data.SystemDataArgumentIndex); [[maybe_unused]] bool DidInsert = @@ -885,10 +688,9 @@ void DXILContPostProcessPassImpl::initializeProcessableFunctionData() { FunctionData Data = ToProcess[EntryF]; Data.IsStart = false; - Data.SystemDataArgumentIndex = - !lgc::cps::isCpsFunction(F) - ? SystemDataArgumentIndexContinuationWithoutCsp - : CpsArgIdxSystemData; + Data.SystemDataArgumentIndex = !lgc::cps::isCpsFunction(F) + ? 
SystemDataArgumentIndex + : CpsArgIdxSystemData; Data.SystemDataTy = F.getArg(Data.SystemDataArgumentIndex)->getType(); [[maybe_unused]] bool DidInsert = @@ -981,73 +783,6 @@ bool DXILContPostProcessPassImpl::replaceIntrinsicCalls( return true; } -std::pair -DXILContPostProcessPassImpl::insertSetupRayGen(Function &F, - const FunctionData &Data) { - // The start part of the RayGen shader is the only occurrence where we need to - // call SetupRayGen - if (Data.Kind != DXILShaderKind::RayGeneration || !Data.IsStart) - return {false, &F}; - - auto *FuncTy = F.getFunctionType(); - - assert(FuncTy->getNumParams() > Data.SystemDataArgumentIndex && - "Missing system data argument"); - - Argument *const SystemDataArgument = F.getArg(Data.SystemDataArgumentIndex); - - // Replace usages of the system data argument with the result of SetupRayGen - Builder.SetInsertPointPastAllocas(&F); - - auto *SystemDataInit = Builder.CreateCall(SetupRayGen); - assert(SystemDataInit->getType() == Data.SystemDataTy && - "SetupRayGen return type does not match system data type"); - SystemDataInit->setName("system.data"); - SystemDataArgument->replaceAllUsesWith(SystemDataInit); - CrossInliner.inlineCall(*SystemDataInit); - - // Change function signature to remove the system data argument - SmallVector ArgTypes; - ArgTypes.append(FuncTy->param_begin(), - FuncTy->param_begin() + Data.SystemDataArgumentIndex); - ArgTypes.append(FuncTy->param_begin() + (Data.SystemDataArgumentIndex + 1), - FuncTy->param_end()); - auto *NewFuncTy = FunctionType::get(FuncTy->getReturnType(), ArgTypes, false); - - Function *NewFunc = CompilerUtils::cloneFunctionHeader( - F, NewFuncTy, ArrayRef{}); - NewFunc->takeName(&F); - - llvm::moveFunctionBody(F, *NewFunc); - - F.replaceAllUsesWith(ConstantExpr::getBitCast(NewFunc, F.getType())); - F.eraseFromParent(); - - return {true, NewFunc}; -} - -bool DXILContPostProcessPassImpl::replaceIntrinsicCallsAndSetupRayGen() { - bool Changed = false; - - // We will change some function signatures and populate a new MapVector as we - // go, to then replace ToProcess - MapVector ToProcessNew; - ToProcessNew.reserve(ToProcess.size()); - - for (auto &[Func, Data] : ToProcess) { - Changed |= replaceIntrinsicCalls(*Func, Data); - - auto const [DidInsert, NewFunc] = insertSetupRayGen(*Func, Data); - Changed |= DidInsert; - - // Func could have been changed, but Data is the same - ToProcessNew.insert({NewFunc, std::move(Data)}); - } - - ToProcess = std::move(ToProcessNew); - return Changed; -} - // // Entry point for all lgc.cps lowering. 
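handleContStackIntrinsic above switched from exact name comparison to StringRef::starts_with, presumably so that the _AmdContStack* base names still match when they carry a suffix (for example a type mangling). A hedged sketch of that dispatch, covering only the prefixes visible in this hunk; the enum and function name are illustrative, not part of the patch:

#include "llvm/ADT/StringRef.h"

enum class ContStackOp { Alloc, Free, SetPtr, GetPtr, Load, Unknown };

// Classify the part of the callee name that follows "_AmdContStack".
// Prefix matching tolerates suffixed or mangled names.
static ContStackOp classifyContStackCallee(llvm::StringRef FuncName) {
  if (FuncName.starts_with("Alloc"))
    return ContStackOp::Alloc;
  if (FuncName.starts_with("Free"))
    return ContStackOp::Free;
  if (FuncName.starts_with("SetPtr"))
    return ContStackOp::SetPtr;
  if (FuncName.starts_with("GetPtr"))
    return ContStackOp::GetPtr;
  if (FuncName.starts_with("Load"))
    return ContStackOp::Load;
  return ContStackOp::Unknown;
}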
// @@ -1058,6 +793,7 @@ bool DXILContPostProcessPassImpl::lowerCpsOps() { DXILContPostProcessPassImpl &Self; bool &Changed; llvm_dialects::Builder &Builder; + Function *GetAddrAndMD; }; // Note: It is a bit unlucky that we are using both a visitor for @@ -1071,10 +807,10 @@ bool DXILContPostProcessPassImpl::lowerCpsOps() { .add( [](CpsVisitorState &State, lgc::cps::AsContinuationReferenceOp &AsCrOp) { - Value *LoweredReference = - lgc::cps::lowerAsContinuationReference(State.Builder, - AsCrOp); - AsCrOp.replaceAllUsesWith(LoweredReference); + State.Builder.SetInsertPoint(&AsCrOp); + auto *AddrWithMD = State.Builder.CreateCall(State.GetAddrAndMD, + {AsCrOp.getFn()}); + AsCrOp.replaceAllUsesWith(AddrWithMD); AsCrOp.eraseFromParent(); State.Changed = true; }) @@ -1085,7 +821,8 @@ bool DXILContPostProcessPassImpl::lowerCpsOps() { }) .build(); - CpsVisitorState State{*this, Changed, Builder}; + CpsVisitorState State{*this, Changed, Builder, + getContinuationGetAddrAndMD(*Mod)}; struct CspCandidateInfo { bool RequiresCspArgument = false; @@ -1098,21 +835,19 @@ bool DXILContPostProcessPassImpl::lowerCpsOps() { if (Func.isDeclaration()) continue; - if (Func.hasMetadata(ContHelper::MDContinuationName)) { - CandidateInfo.push_back( - {!ContHelper::isLegacyEntryFunction(&Func), &Func}); - continue; - } - if (lgc::rt::getLgcRtShaderStage(&Func) == lgc::rt::RayTracingShaderStage::KernelEntry) { CandidateInfo.push_back({false, &Func}); continue; } + if (Func.hasMetadata(ContHelper::MDContinuationName)) { + CandidateInfo.push_back({true, &Func}); + continue; + } + if (lgc::cps::isCpsFunction(Func)) { - CandidateInfo.push_back( - {!ContHelper::isLegacyEntryFunction(&Func), &Func}); + CandidateInfo.push_back({true, &Func}); continue; } } @@ -1204,15 +939,6 @@ bool DXILContPostProcessPassImpl::handleAmdInternals() { } else if (Name.starts_with("_AmdValueSetI32")) { Changed = true; handleValueSetI32(F); - } else if (Name.starts_with("_AmdContPayloadRegistersI32Count")) { - Changed = true; - handleContPayloadRegisterI32Count(F); - } else if (Name.starts_with("_AmdContPayloadRegistersGetI32")) { - Changed = true; - handleContPayloadRegistersGetI32(F); - } else if (Name.starts_with("_AmdContPayloadRegistersSetI32")) { - Changed = true; - handleContPayloadRegistersSetI32(F); } } @@ -1221,12 +947,11 @@ bool DXILContPostProcessPassImpl::handleAmdInternals() { DXILContPostProcessPassImpl::DXILContPostProcessPassImpl(Module &M, Module &GpurtLibrary) - : Mod{&M}, GpurtLibrary{&GpurtLibrary}, - SetupRayGen{GpurtLibrary.getFunction(ContDriverFunc::SetupRayGenName)}, - Builder{Mod->getContext()}, StackAddrspace{ - ContHelper::tryGetStackAddrspace(*Mod)} {} + : Mod{&M}, GpurtLibrary{&GpurtLibrary}, Builder{Mod->getContext()}, + StackAddrspace{ContHelper::tryGetStackAddrspace(*Mod)} {} -bool DXILContPostProcessPassImpl::run(ModuleAnalysisManager &AnalysisManager) { +PreservedAnalyses +DXILContPostProcessPassImpl::run(ModuleAnalysisManager &AnalysisManager) { bool Changed = false; StackLowering.emplace(Mod->getContext(), @@ -1241,7 +966,9 @@ bool DXILContPostProcessPassImpl::run(ModuleAnalysisManager &AnalysisManager) { Changed |= unfoldGlobals(); Changed |= handleAmdInternals(); Changed |= handleIntrinsicCalls(AnalysisManager); - Changed |= replaceIntrinsicCallsAndSetupRayGen(); + + for (auto &[Func, Data] : ToProcess) + Changed |= replaceIntrinsicCalls(*Func, Data); for (auto &F : make_early_inc_range(*Mod)) { auto FuncName = F.getName(); @@ -1260,9 +987,6 @@ bool DXILContPostProcessPassImpl::run(ModuleAnalysisManager 
&AnalysisManager) { Changed |= fixupDxilMetadata(*Mod); - // Change function pointer accesses to include metadata - Changed |= addGetAddrAndMDIntrinsicCalls(*Mod); - #ifndef NDEBUG checkContinuationsModule(*Mod, ContinueCalls); #endif @@ -1278,7 +1002,7 @@ bool DXILContPostProcessPassImpl::run(ModuleAnalysisManager &AnalysisManager) { Changed |= llvm::removeUnusedFunctionDecls(Mod, false); - return Changed; + return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); } } // anonymous namespace @@ -1291,7 +1015,5 @@ DXILContPostProcessPass::run(llvm::Module &Module, auto &GpurtContext = lgc::GpurtContext::get(Module.getContext()); DXILContPostProcessPassImpl Impl{ Module, GpurtContext.theModule ? *GpurtContext.theModule : Module}; - bool Changed = Impl.run(AnalysisManager); - - return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); + return Impl.run(AnalysisManager); } diff --git a/llvmraytracing/lib/LegacyCleanupContinuations.cpp b/llvmraytracing/lib/LegacyCleanupContinuations.cpp index bec489fba9..18fed80ce7 100644 --- a/llvmraytracing/lib/LegacyCleanupContinuations.cpp +++ b/llvmraytracing/lib/LegacyCleanupContinuations.cpp @@ -38,10 +38,10 @@ #include "compilerutils/CompilerUtils.h" #include "lgc/LgcCpsDialect.h" +#include "lgc/LgcIlCpsDialect.h" #include "lgc/LgcRtDialect.h" #include "llvm-dialects/Dialect/Builder.h" #include "llvmraytracing/Continuations.h" -#include "llvmraytracing/ContinuationsDialect.h" #include "llvmraytracing/ContinuationsUtil.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" @@ -61,7 +61,7 @@ class LegacyCleanupContinuationsPassImpl { LegacyCleanupContinuationsPassImpl( llvm::Module &Mod, llvm::ModuleAnalysisManager &AnalysisManager); - llvm::PreservedAnalyses run(); + PreservedAnalyses run(); private: struct ContinuationData { @@ -94,7 +94,7 @@ class LegacyCleanupContinuationsPassImpl { void handleContinue(ContinuationData &Data, Instruction *Ret); void handleSingleContinue(ContinuationData &Data, CallInst *Call, Value *ResumeFun); - void handleReturn(ContinuationData &Data, CallInst *ContRet); + void handleReturn(ContinuationData &Data, lgc::ilcps::ReturnOp &ContRet); Module &M; LLVMContext &Context; @@ -252,8 +252,8 @@ uint32_t getIncomingRegisterCount(Function *ResumeFunc) { std::optional RegCount; while (!Worklist.empty()) { auto *U = Worklist.pop_back_val(); - if (auto *Const = dyn_cast(U)) { - Worklist.append(Const->user_begin(), Const->user_end()); + if (isa(U) || isa(U)) { + Worklist.append(U->user_begin(), U->user_end()); continue; } assert(isa(U) && @@ -369,9 +369,13 @@ void LegacyCleanupContinuationsPassImpl::processContinuation( } else { B.SetInsertPoint(&*F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca()); - // Find arguments from continuation.returnvalue calls + AllArgTypes.push_back( + B.getInt64Ty()); // Dummy return address for resume functions + AllArgValues.push_back(nullptr); + + // Find arguments from lgc.ilcps.getreturnvalue calls for (auto &I : F->getEntryBlock()) { - if (auto *Intr = dyn_cast(&I)) { + if (auto *Intr = dyn_cast(&I)) { AllArgTypes.push_back(Intr->getType()); AllArgValues.push_back(Intr); InstsToRemove.push_back(Intr); @@ -410,14 +414,18 @@ void LegacyCleanupContinuationsPassImpl::processContinuation( llvm::moveFunctionBody(*F, *NewFunc); // Set arg names for new function + // Skip the dummy return address for non-start functions for (unsigned Idx = 0; Idx != NewFunc->getFunctionType()->params().size(); ++Idx) { - Argument *Arg = NewFunc->getArg(Idx); Value *OldVal = 
AllArgValues[Idx]; - if (OldVal) { - Arg->setName(OldVal->getName()); - OldVal->replaceAllUsesWith(Arg); - } + // Skip the dummy return address. + if (!OldVal) + continue; + + Argument *Arg = NewFunc->getArg(Idx); + Arg->setName(OldVal->getName()); + OldVal->replaceAllUsesWith(Arg); + if (IsStart) { Argument *OldArg = F->getArg(Idx); if (OldArg->hasInRegAttr()) @@ -465,8 +473,8 @@ void LegacyCleanupContinuationsPassImpl::processContinuation( } else if (I->getOpcode() == Instruction::Unreachable) { if (auto *Call = dyn_cast(--I->getIterator())) { if (auto *Called = Call->getCalledFunction()) { - if (Called->getName() == "continuation.return") - handleReturn(FuncData, Call); + if (auto *ContRet = dyn_cast(Call)) + handleReturn(FuncData, *ContRet); } } } @@ -587,7 +595,8 @@ void LegacyCleanupContinuationsPassImpl::handleSingleContinue( // Pass resume address as argument B.SetInsertPoint(Call); - auto *ReturnAddrInt = B.CreatePtrToInt(ResumeFun, I64); + auto *ContinuationReference = + B.create(I64, ResumeFun); bool IsWait = ContHelper::isWaitAwaitCall(*Call); Function *ContinueFunction = IsWait ? WaitContinue : Continue; @@ -598,7 +607,8 @@ void LegacyCleanupContinuationsPassImpl::handleSingleContinue( // The wait mask is the first argument after the function pointer if (IsWait) Args.push_back(*Call->arg_begin()); - Args.push_back(ReturnAddrInt); + Args.push_back(ContinuationReference); + Args.append(Call->arg_begin() + (IsWait ? 1 : 0), Call->arg_end()); auto *ContinueCall = B.CreateCall(ContinueFunction, Args); @@ -620,39 +630,41 @@ void LegacyCleanupContinuationsPassImpl::handleSingleContinue( } /// Transform -/// call void (i64, ...) @continuation.return(i64 %returnaddr, ) -/// unreachable +/// call void (i64, ...) @lgc.ilcps.return(i64 %returnaddr, ) unreachable /// to /// /// call void @continuation.continue(i64 %returnaddr, ) /// unreachable -void LegacyCleanupContinuationsPassImpl::handleReturn(ContinuationData &Data, - CallInst *ContRet) { - LLVM_DEBUG(dbgs() << "Converting return to continue: " << *ContRet << "\n"); - bool IsEntry = isa(ContRet->getArgOperand(0)); - B.SetInsertPoint(ContRet); +void LegacyCleanupContinuationsPassImpl::handleReturn( + ContinuationData &Data, lgc::ilcps::ReturnOp &ContRet) { + LLVM_DEBUG(dbgs() << "Converting return to continue: " << ContRet << "\n"); + bool IsEntry = isa(ContRet.getReturnAddr()); + B.SetInsertPoint(&ContRet); uint32_t NeededStackSize = Data.getContStateStackBytes(); if (NeededStackSize > 0) B.create(B.getInt32(NeededStackSize)); if (IsEntry) { - assert(ContRet->arg_size() == 1 && + assert(ContRet.getArgs().empty() && "Entry functions ignore the return value"); - llvm::terminateShader(B, ContRet); + llvm::terminateShader(B, &ContRet); } else { // Create the call to continuation.continue, but with the same argument list - // as for continuation.return. The CSP is appended during + // as for lgc.ilcps.return. The CSP is appended during // DXILContPostProcess. - SmallVector Args(ContRet->args()); + // Append the dummy return address as well. 
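The "dummy return address" mentioned above refers to the rebuild that follows: the arguments of the lgc.ilcps.return call are reused for a continuation.continue call, with a poison i64 inserted as a placeholder return address (the real CSP is appended later, in DXILContPostProcess). A minimal sketch of that rebuild, assuming ContRet is the call being replaced, its first argument is the return address, and Continue is the continuation.continue declaration:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"

static llvm::CallInst *rebuildAsContinue(llvm::IRBuilder<> &B,
                                         llvm::Function *Continue,
                                         llvm::CallInst &ContRet) {
  llvm::SmallVector<llvm::Value *> Args(ContRet.args());
  // Placeholder return address at argument position 1; filled in later.
  Args.insert(Args.begin() + 1, llvm::PoisonValue::get(B.getInt64Ty()));
  llvm::CallInst *ContinueCall = B.CreateCall(Continue, Args);
  // Keep annotations such as the outgoing register count.
  ContinueCall->copyMetadata(ContRet);
  return ContinueCall;
}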
+ SmallVector Args(ContRet.args()); + Args.insert(Args.begin() + 1, PoisonValue::get(B.getInt64Ty())); auto *ContinueCall = B.CreateCall(Continue, Args); Data.NewReturnContinues.push_back(ContinueCall); - ContinueCall->copyMetadata(*ContRet); + ContinueCall->copyMetadata(ContRet); assert(ContHelper::tryGetOutgoingRegisterCount(ContinueCall) && "Missing registercount metadata!"); - ContRet->eraseFromParent(); + ContRet.eraseFromParent(); } } @@ -667,7 +679,7 @@ LegacyCleanupContinuationsPassImpl::LegacyCleanupContinuationsPassImpl( ContFree = M.getFunction("continuation.free"); } -llvm::PreservedAnalyses LegacyCleanupContinuationsPassImpl::run() { +PreservedAnalyses LegacyCleanupContinuationsPassImpl::run() { bool Changed = false; // Map the entry function of a continuation to the analysis result @@ -676,14 +688,18 @@ llvm::PreservedAnalyses LegacyCleanupContinuationsPassImpl::run() { continue; if (auto *MD = F.getMetadata(ContHelper::MDContinuationName)) { analyzeContinuation(F, MD); - } else if (lgc::rt::getLgcRtShaderStage(&F) == - lgc::rt::RayTracingShaderStage::Traversal) { - Changed = true; - // Add !continuation metadata to Traversal after coroutine passes. - // The traversal loop is written as like the coroutine passes were applied - // manually. - MDTuple *ContMDTuple = MDTuple::get(Context, {ValueAsMetadata::get(&F)}); - F.setMetadata(ContHelper::MDContinuationName, ContMDTuple); + } else { + auto ShaderStage = lgc::rt::getLgcRtShaderStage(&F); + if (ShaderStage == lgc::rt::RayTracingShaderStage::Traversal || + ShaderStage == lgc::rt::RayTracingShaderStage::KernelEntry) { + Changed = true; + // Add !continuation metadata to KernelEntry and Traversal after + // coroutine passes. The traversal loop is written as like the coroutine + // passes were applied manually. + MDTuple *ContMDTuple = + MDTuple::get(Context, {ValueAsMetadata::get(&F)}); + F.setMetadata(ContHelper::MDContinuationName, ContMDTuple); + } } } @@ -707,9 +723,7 @@ llvm::PreservedAnalyses LegacyCleanupContinuationsPassImpl::run() { fixupDxilMetadata(M); } - if (Changed) - return PreservedAnalyses::none(); - return PreservedAnalyses::all(); + return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); } } // namespace diff --git a/llvmraytracing/lib/ContinuationsDialect.cpp b/llvmraytracing/lib/LgcIlCpsDialect.cpp similarity index 89% rename from llvmraytracing/lib/ContinuationsDialect.cpp rename to llvmraytracing/lib/LgcIlCpsDialect.cpp index f18a3243c2..facdcfab40 100644 --- a/llvmraytracing/lib/ContinuationsDialect.cpp +++ b/llvmraytracing/lib/LgcIlCpsDialect.cpp @@ -23,10 +23,10 @@ * **********************************************************************************************************************/ -//===- ContinuationsDialect.cpp - Dialect implementation ------------------===// +//===- LgcIlCpsDialect.cpp - Dialect implementation ------------------===// -#include "llvmraytracing/ContinuationsDialect.h" +#include "lgc/LgcIlCpsDialect.h" #define GET_INCLUDES #define GET_DIALECT_DEFS -#include "ContinuationsDialect.cpp.inc" +#include "LgcIlCpsDialect.cpp.inc" diff --git a/llvmraytracing/lib/LgcRtDialect.cpp b/llvmraytracing/lib/LgcRtDialect.cpp index 7d4ee1c3ec..0f4934a1da 100644 --- a/llvmraytracing/lib/LgcRtDialect.cpp +++ b/llvmraytracing/lib/LgcRtDialect.cpp @@ -37,6 +37,10 @@ using namespace llvm; +namespace llvm { +class LLVMContext; +} // namespace llvm + namespace { // Shader stage metadata to identify the shader stage of a given function. 
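The run() change above now tags both Traversal and KernelEntry with !continuation metadata that points back at the function itself. A small sketch of that tagging, with the metadata kind name passed in rather than taken from ContHelper::MDContinuationName:

#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Metadata.h"

static void markAsContinuation(llvm::Function &F, llvm::StringRef MDKind) {
  // A one-element tuple whose operand is the function itself.
  llvm::MDTuple *ContMDTuple =
      llvm::MDTuple::get(F.getContext(), {llvm::ValueAsMetadata::get(&F)});
  F.setMetadata(MDKind, ContMDTuple);
}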
@@ -65,16 +69,69 @@ constexpr const char ArgSizeMetadata[] = "lgc.rt.arg.size"; // bytes. constexpr const char AttributeSizeMetadata[] = "lgc.rt.attribute.size"; +// Pipeline-wide max attribute size module metadata, giving the maximum +// attribute size in bytes. +constexpr const char MaxAttributeSizeMetadata[] = "lgc.rt.max.attribute.size"; + +// Pipeline-wide max payload size module metadata, giving the maximum +// payload size in bytes. +constexpr const char MaxPayloadSizeMetadata[] = "lgc.rt.max.payload.size"; + // ============================================================================================== -// Wrapper around setMetadata for unsigned integer cases. -void setMetadataNumericValue(Function *func, StringRef Kind, size_t size) { - func->setMetadata( - Kind, MDNode::get(func->getContext(), - {ConstantAsMetadata::get(ConstantInt::get( - Type::getInt32Ty(func->getContext()), size))})); +// Helper to create an MDNode containing a constant. +MDNode *getMdNodeForNumericConstant(LLVMContext &context, size_t value) { + return MDNode::get(context, {ConstantAsMetadata::get(ConstantInt::get( + Type::getInt32Ty(context), value))}); } -} // anonymous namespace +// ============================================================================================== +// Helper to create an MDNode containing a constant. +std::optional extractNumericConstantFromMdNode(MDNode *node) { + if (!node) + return std::nullopt; + assert(node->getNumOperands() == 1); + if (auto *value = mdconst::dyn_extract(node->getOperand(0))) + return value->getZExtValue(); + + return std::nullopt; +} + +// ============================================================================================== +// Wrapper around setMetadata for unsigned integer cases, global object/function +// version. +void setMetadataNumericValue(GlobalObject *func, StringRef Kind, size_t size) { + func->setMetadata(Kind, + getMdNodeForNumericConstant(func->getContext(), size)); +} + +// ============================================================================================== +// Helper to obtain a constant from global object/function metadata. +std::optional getMetadataNumericValue(const GlobalObject *obj, + StringRef Kind) { + MDNode *node = obj->getMetadata(Kind); + return extractNumericConstantFromMdNode(node); +} + +// ============================================================================================== +// Wrapper around setMetadata for unsigned integer cases, module version. +void setMetadataNumericValue(Module *module, StringRef Kind, size_t size) { + auto *node = module->getOrInsertNamedMetadata(Kind); + node->clearOperands(); + node->addOperand(getMdNodeForNumericConstant(module->getContext(), size)); +} + +// ============================================================================================== +// Helper to obtain a constant from a named metadata value. 
+std::optional getMetadataNumericValue(const llvm::Module *module, + StringRef Kind) { + NamedMDNode *node = module->getNamedMetadata(Kind); + if (!node) + return std::nullopt; + assert(node->getNumOperands() == 1); + return extractNumericConstantFromMdNode(node->getOperand(0)); +} + +} // namespace // ============================================================================================== // Get the metadata IDs associated with the lgc.rt dialect, so the caller knows @@ -90,7 +147,10 @@ void lgc::rt::getLgcRtMetadataIds(LLVMContext &context, // ============================================================================================== // Sets the given shader stage to a LLVM function. If std::nullopt is // passed, then the shader stage metadata is removed from the function. -void lgc::rt::setLgcRtShaderStage(Function *func, +// func can instead be a GlobalVariable, allowing a front-end to use a +// GlobalVariable to represent a shader retrieved from the cache, and wants to +// mark it with a shader stage. +void lgc::rt::setLgcRtShaderStage(GlobalObject *func, std::optional stage) { if (stage.has_value()) setMetadataNumericValue(func, ShaderStageMetadata, @@ -102,15 +162,16 @@ void lgc::rt::setLgcRtShaderStage(Function *func, // ============================================================================================== // Get the lgc.rt shader stage from a given function. If there is no shader // stage metadata apparent, then std::nullopt is returned. +// func can instead be a GlobalVariable, allowing a front-end to use a +// GlobalVariable to represent a shader retrieved from the cache, and wants to +// mark it with a shader stage. std::optional -lgc::rt::getLgcRtShaderStage(const Function *func) { - MDNode *stageMetaNode = func->getMetadata(ShaderStageMetadata); - if (stageMetaNode) { - if (auto *value = - mdconst::dyn_extract(stageMetaNode->getOperand(0))) - return RayTracingShaderStage(value->getZExtValue()); +lgc::rt::getLgcRtShaderStage(const GlobalObject *func) { + std::optional mdValue = + getMetadataNumericValue(func, ShaderStageMetadata); + if (mdValue.has_value()) { + return RayTracingShaderStage(*mdValue); } - return std::nullopt; } @@ -154,15 +215,13 @@ Constant *lgc::rt::getPaqFromSize(LLVMContext &context, size_t size) { // this code. We assume that the language reader correctly called // setShaderArgSize for any callable shader. size_t lgc::rt::getShaderArgSize(Function *func) { - MDNode *node = func->getMetadata(ArgSizeMetadata); - - assert(node && "lgc::rt::getShaderArgSize: ArgSize metadata missing - forgot " - "to call setShaderArgSize?"); + std::optional result = getMetadataNumericValue(func, ArgSizeMetadata); - if (auto *value = mdconst::dyn_extract(node->getOperand(0))) - return value->getZExtValue(); + assert(result.has_value() && + "lgc::rt::getShaderArgSize: ArgSize metadata missing - forgot " + "to call setShaderArgSize?"); - return 0; + return result.value(); } // ============================================================================================== @@ -174,18 +233,41 @@ void lgc::rt::setShaderArgSize(Function *func, size_t size) { // ============================================================================================== // Get attribute size (in bytes) metadata for a ray-tracing shader function. 
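The new module-level helpers above store a single i32 constant under a named metadata node and read it back. A self-contained sketch of that layout; the function names here are illustrative, not the ones from the patch:

#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include <cstdint>
#include <optional>

static void setModuleI32Metadata(llvm::Module &M, llvm::StringRef Kind,
                                 uint32_t Value) {
  auto &Ctx = M.getContext();
  auto *Node = llvm::MDNode::get(
      Ctx, {llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
               llvm::Type::getInt32Ty(Ctx), Value))});
  auto *Named = M.getOrInsertNamedMetadata(Kind);
  Named->clearOperands(); // Keep exactly one operand.
  Named->addOperand(Node);
}

static std::optional<uint32_t> getModuleI32Metadata(const llvm::Module &M,
                                                    llvm::StringRef Kind) {
  llvm::NamedMDNode *Named = M.getNamedMetadata(Kind);
  if (!Named || Named->getNumOperands() != 1)
    return std::nullopt;
  llvm::MDNode *Node = Named->getOperand(0);
  if (auto *CI =
          llvm::mdconst::dyn_extract<llvm::ConstantInt>(Node->getOperand(0)))
    return static_cast<uint32_t>(CI->getZExtValue());
  return std::nullopt;
}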
std::optional lgc::rt::getShaderHitAttributeSize(const Function *func) { - MDNode *node = func->getMetadata(AttributeSizeMetadata); - if (!node) - return std::nullopt; - - if (auto *value = mdconst::dyn_extract(node->getOperand(0))) - return value->getZExtValue(); - - return std::nullopt; + return getMetadataNumericValue(func, AttributeSizeMetadata); } // ============================================================================================== // Set attribute size (in bytes) metadata for a ray-tracing shader function. void lgc::rt::setShaderHitAttributeSize(Function *func, size_t size) { + assert(getMaxHitAttributeSize(func->getParent()).value_or(size) >= size); setMetadataNumericValue(func, AttributeSizeMetadata, size); } + +// ============================================================================================== +// Get max hit attribute size (in bytes) metadata for a ray-tracing module. +// This is a pipeline-wide upper bound on the per-function hit attribute sizes. +std::optional +lgc::rt::getMaxHitAttributeSize(const llvm::Module *module) { + return getMetadataNumericValue(module, MaxAttributeSizeMetadata); +} + +// ============================================================================================== +// Set max hit attribute size (in bytes) metadata for a ray-tracing module. +// This is a pipeline-wide upper bound on the per-function hit attribute sizes. +void lgc::rt::setMaxHitAttributeSize(llvm::Module *module, size_t size) { + setMetadataNumericValue(module, MaxAttributeSizeMetadata, size); +} + +// ============================================================================================== +// Get max payload size (in bytes) metadata for a ray-tracing module. +// This is a pipeline-wide upper bound on the per-function payload sizes. +std::optional lgc::rt::getMaxPayloadSize(const llvm::Module *module) { + return getMetadataNumericValue(module, MaxPayloadSizeMetadata); +} + +// ============================================================================================== +// Set max hit attribute size (in bytes) metadata for a ray-tracing module. +// This is a pipeline-wide upper bound on the per-function payload sizes. +void lgc::rt::setMaxPayloadSize(llvm::Module *module, size_t size) { + setMetadataNumericValue(module, MaxPayloadSizeMetadata, size); +} diff --git a/llvmraytracing/lib/LowerAwait.cpp b/llvmraytracing/lib/LowerAwait.cpp index 1ff9b8fbd5..235e513ead 100644 --- a/llvmraytracing/lib/LowerAwait.cpp +++ b/llvmraytracing/lib/LowerAwait.cpp @@ -29,28 +29,40 @@ // a resume point. // // This pass introduces a global for the return address, which is saved at the -// start of a function and used in a `@continuation.return(i64)` call in the +// start of a function and used in a `@lgc.ilcps.return(i64)` call in the // end. 
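A short usage sketch of the new pipeline-wide accessors above, with illustrative byte sizes; setShaderHitAttributeSize asserts that the per-function size stays within the module-wide maximum recorded beforehand:

#include "lgc/LgcRtDialect.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"

void recordSizes(llvm::Module &M, llvm::Function &Intersection) {
  // Pipeline-wide upper bounds, stored as module metadata.
  lgc::rt::setMaxHitAttributeSize(&M, 32);
  lgc::rt::setMaxPayloadSize(&M, 48);
  // Per-function size; must not exceed the module-wide maximum (32 here).
  lgc::rt::setShaderHitAttributeSize(&Intersection, 16);
}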
// //===----------------------------------------------------------------------===// -#include "compilerutils/CompilerUtils.h" #include "lgc/LgcCpsDialect.h" -#include "llvm-dialects/Dialect/Builder.h" +#include "lgc/LgcIlCpsDialect.h" +#include "lgc/LgcRtDialect.h" #include "llvm-dialects/Dialect/Visitor.h" #include "llvmraytracing/Continuations.h" -#include "llvmraytracing/ContinuationsDialect.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" -#include using namespace llvm; #define DEBUG_TYPE "lower-await" +namespace { +class LowerAwaitPassImpl final { +public: + LowerAwaitPassImpl(Module &Mod); + PreservedAnalyses run(); + +private: + Module &Mod; + MapVector> ToProcess; + void collectContinuationFunctions(); + void processContinuations(bool IsLgcCpsMode); +}; +} // anonymous namespace + Function *llvm::getContinuationWaitContinue(Module &M) { auto *Name = "continuation.waitContinue"; if (auto *F = M.getFunction(Name)) @@ -66,35 +78,36 @@ Function *llvm::getContinuationWaitContinue(Module &M) { Function *llvm::getContinuationAwait(Module &M, Type *TokenTy, StructType *RetTy) { - std::string Name = "await."; - Name += RetTy->getStructName(); - if (auto *F = M.getFunction(Name)) - return F; + std::string Name = "await"; auto &C = M.getContext(); - AttributeList AL = + auto *AwaitTy = FunctionType::get(RetTy, TokenTy, false); + auto *AwaitFun = Function::Create( + AwaitTy, GlobalValue::LinkageTypes::ExternalLinkage, Name, &M); + AwaitFun->setAttributes( AttributeList::get(C, AttributeList::FunctionIndex, - {Attribute::NoUnwind, Attribute::WillReturn}); - return cast( - M.getOrInsertFunction(Name, AL, RetTy, TokenTy).getCallee()); + {Attribute::NoUnwind, Attribute::WillReturn})); + return AwaitFun; } -static Function *getContinuationReturn(Module &M) { - auto *Name = "continuation.return"; - if (auto *F = M.getFunction(Name)) - return F; - auto &C = M.getContext(); - auto *Void = Type::getVoidTy(C); - auto *FuncTy = FunctionType::get(Void, {}, true); - AttributeList AL = AttributeList::get(C, AttributeList::FunctionIndex, - {Attribute::NoReturn}); - return cast(M.getOrInsertFunction(Name, FuncTy, AL).getCallee()); -} +LowerAwaitPassImpl::LowerAwaitPassImpl(Module &Mod) : Mod{Mod} {} -LowerAwaitPass::LowerAwaitPass() {} +void LowerAwaitPassImpl::collectContinuationFunctions() { + for (auto &F : Mod.functions()) { + if (!F.getName().starts_with("await")) { + // Force processing annotated functions, even if they don't have await + // calls + if (F.hasMetadata(ContHelper::MDContinuationName)) + ToProcess.insert({&F, {}}); + continue; + } + for (auto *U : F.users()) { + if (auto *Inst = dyn_cast(U)) + ToProcess[Inst->getFunction()].push_back(Inst); + } + } +} -static void processContinuations( - Module &M, const MapVector> &ToProcess, - bool IsLgcCpsMode) { +void LowerAwaitPassImpl::processContinuations(bool IsLgcCpsMode) { // We definitely have a call that requires continuation in this function // // If this is the first time we've done this for this function @@ -105,7 +118,7 @@ static void processContinuations( // Replace the call with // co.flag = llvm.coro.suspend.retcon // unreachable - auto &Context = M.getContext(); + auto &Context = Mod.getContext(); auto *I8Ptr = Type::getInt8Ty(Context)->getPointerTo(); auto *I32 = Type::getInt32Ty(Context); auto *I64 = Type::getInt64Ty(Context); @@ -129,11 +142,10 @@ static void processContinuations( // Lgc.cps 
dialect will handle stack pointer and return address in // DXILContPostProcessPass. + bool IsTraversal = lgc::rt::getLgcRtShaderStage(F) == + lgc::rt::RayTracingShaderStage::Traversal; bool IsLegacyNonEntry = - !ContHelper::isLegacyEntryFunction(F) && !IsLgcCpsMode; - // Add passed return address. - if (IsLegacyNonEntry) - AllArgTypes.push_back(I64); + !ContHelper::isLegacyEntryFunction(F) && !IsLgcCpsMode && !IsTraversal; for (auto const &Arg : F->args()) AllArgTypes.push_back(Arg.getType()); @@ -151,13 +163,9 @@ static void processContinuations( // Transfer code from old function to new function llvm::moveFunctionBody(*F, *NewFunc); - // Set arg names for new function - if (IsLegacyNonEntry) - NewFunc->getArg(0)->setName("returnAddr"); - for (unsigned Idx = 0; Idx != F->getFunctionType()->params().size(); ++Idx) { - Argument *Arg = NewFunc->getArg(Idx + (IsLegacyNonEntry ? 1 : 0)); + Argument *Arg = NewFunc->getArg(Idx); Argument *OldArg = F->getArg(Idx); Arg->setName(OldArg->getName()); OldArg->replaceAllUsesWith(Arg); @@ -177,11 +185,11 @@ static void processContinuations( // We need one per continuation because they have different metadata SmallVector StrBuf; auto *ContProtoFunc = cast( - M.getOrInsertFunction( - (Twine("continuation.prototype.") + NewFunc->getName()) - .toStringRef(StrBuf), - FunctionType::get(NewRetTy, {I8Ptr, Type::getInt1Ty(Context)}, - false)) + Mod.getOrInsertFunction( + (Twine("continuation.prototype.") + NewFunc->getName()) + .toStringRef(StrBuf), + FunctionType::get(NewRetTy, {I8Ptr, Type::getInt1Ty(Context)}, + false)) .getCallee()); // Add metadata, marking it as a continuation function @@ -195,13 +203,15 @@ static void processContinuations( // Alloc and free prototypes too auto *ContMallocTy = FunctionType::get(I8Ptr, {I32}, false); auto *ContMalloc = dyn_cast( - M.getOrInsertFunction("continuation.malloc", ContMallocTy).getCallee()); + Mod.getOrInsertFunction("continuation.malloc", ContMallocTy) + .getCallee()); auto *ContMallocPtr = ConstantExpr::getBitCast(ContMalloc, I8Ptr); auto *ContDeallocTy = FunctionType::get(Type::getVoidTy(Context), {I8Ptr}, false); auto *ContDealloc = dyn_cast( - M.getOrInsertFunction("continuation.free", ContDeallocTy).getCallee()); + Mod.getOrInsertFunction("continuation.free", ContDeallocTy) + .getCallee()); auto *ContDeallocPtr = ConstantExpr::getBitCast(ContDealloc, I8Ptr); llvm_dialects::Builder B( @@ -242,7 +252,7 @@ static void processContinuations( SuspendRetconArg); auto *RetTy = CI->getType(); if (!RetTy->isVoidTy()) { - auto *RetVal = B.create(RetTy); + auto *RetVal = B.create(RetTy); CI->replaceAllUsesWith(RetVal); } CI->eraseFromParent(); @@ -257,70 +267,76 @@ static void processContinuations( SavedRetAddr = NewFunc->getArg(0); // Return addr else SavedRetAddr = UndefValue::get(I64); + } else { + // We omit the "return address" later, make sure the + // dialects verifier doesn't fail since we disallow `nullptr` arguments + // right now. 
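getContinuationAwait above now always creates a fresh external declaration named "await" and marks it nounwind and willreturn instead of caching one declaration per return type. A minimal sketch of creating such a declaration (the helper name is illustrative):

#include "llvm/IR/Attributes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"

static llvm::Function *createAwaitDecl(llvm::Module &M, llvm::Type *TokenTy,
                                       llvm::Type *RetTy) {
  auto &C = M.getContext();
  auto *FnTy = llvm::FunctionType::get(RetTy, {TokenTy}, /*isVarArg=*/false);
  auto *Fn = llvm::Function::Create(FnTy, llvm::GlobalValue::ExternalLinkage,
                                    "await", &M);
  Fn->setAttributes(llvm::AttributeList::get(
      C, llvm::AttributeList::FunctionIndex,
      {llvm::Attribute::NoUnwind, llvm::Attribute::WillReturn}));
  return Fn;
}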
+ SavedRetAddr = PoisonValue::get(I32); } - // Convert returns to continuation.return calls - auto *ContRet = getContinuationReturn(M); + + // Convert returns to lgc.ilcps.return calls for (auto &BB : *NewFunc) { auto *I = BB.getTerminator(); if (I->getOpcode() == Instruction::Ret) { - // Replace this instruction with a call to continuation.return + // Replace this instruction with a call to lgc.ilcps.return B.SetInsertPoint(I); - SmallVector RetVals; + SmallVector RetVals; if (!IsLgcCpsMode) { - RetVals.push_back(SavedRetAddr); if (I->getNumOperands() != 0) RetVals.push_back(I->getOperand(0)); } - auto *ContRetCall = B.CreateCall(ContRet, RetVals); + auto *ContRetOp = B.create(SavedRetAddr, RetVals); // DXILCont passes use annotations on the ret to pass information // on the shader exit to later passes. Copy such metadata to the ContRet // so later passes can pick it up from there. - ContRetCall->copyMetadata(*I); + ContRetOp->copyMetadata(*I); B.CreateUnreachable(); I->eraseFromParent(); } } } - fixupDxilMetadata(M); } -llvm::PreservedAnalyses -LowerAwaitPass::run(llvm::Module &M, - llvm::ModuleAnalysisManager &AnalysisManager) { - LLVM_DEBUG(dbgs() << "Run the lower-await pass\n"); - AnalysisManager.getResult(M); +PreservedAnalyses LowerAwaitPassImpl::run() { + struct VisitorPayload { + LowerAwaitPassImpl &Self; + bool HasCpsAwaitCalls = false; + }; - MapVector> ToProcess; static auto Visitor = - llvm_dialects::VisitorBuilder< - MapVector>>() - .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) - .add([](auto &ToProcess, auto &Op) { - ToProcess[Op.getFunction()].push_back(&Op); + llvm_dialects::VisitorBuilder() + .add([](VisitorPayload &Payload, auto &Op) { + Payload.Self.ToProcess[Op.getFunction()].push_back(&Op); + Payload.HasCpsAwaitCalls = true; }) .build(); - Visitor.visit(ToProcess, M); - bool IsLgcCpsMode = !ToProcess.empty() || ContHelper::isLgcCpsModule(M); - for (auto &F : M.functions()) { - if (!F.getName().starts_with("await.")) { - // Force processing annotated functions, even if they don't have await - // calls - if (F.hasMetadata(ContHelper::MDContinuationName)) - ToProcess[&F].size(); - continue; - } - for (auto *U : F.users()) { - if (auto *Inst = dyn_cast(U)) - ToProcess[Inst->getFunction()].push_back(Inst); - } - } + VisitorPayload P{*this}; + Visitor.visit(P, Mod); + + collectContinuationFunctions(); if (!ToProcess.empty()) { - processContinuations(M, ToProcess, IsLgcCpsMode); + bool IsLgcCpsMode = P.HasCpsAwaitCalls || ContHelper::isLgcCpsModule(Mod); + processContinuations(IsLgcCpsMode); + fixupDxilMetadata(Mod); return PreservedAnalyses::none(); } + return PreservedAnalyses::all(); } + +LowerAwaitPass::LowerAwaitPass() {} + +llvm::PreservedAnalyses +LowerAwaitPass::run(llvm::Module &M, + llvm::ModuleAnalysisManager &AnalysisManager) { + LLVM_DEBUG(dbgs() << "Run the lower-await pass\n"); + AnalysisManager.getResult(M); + + LowerAwaitPassImpl Impl{M}; + + return Impl.run(); +} diff --git a/llvmraytracing/lib/LowerRaytracingPipeline.cpp b/llvmraytracing/lib/LowerRaytracingPipeline.cpp index bb3791ea4d..7d662c5876 100644 --- a/llvmraytracing/lib/LowerRaytracingPipeline.cpp +++ b/llvmraytracing/lib/LowerRaytracingPipeline.cpp @@ -40,11 +40,11 @@ #include "compilerutils/CompilerUtils.h" #include "lgc/LgcCpsDialect.h" +#include "lgc/LgcIlCpsDialect.h" #include "lgc/LgcRtDialect.h" #include "llvm-dialects/Dialect/OpSet.h" #include "llvm-dialects/Dialect/Visitor.h" #include "llvmraytracing/Continuations.h" -#include "llvmraytracing/ContinuationsDialect.h" 
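LowerAwaitPass now follows the same wrapper-plus-impl split as the other passes touched in this change: the pass entry point only requests the dialect context and delegates to an implementation object that owns the per-run state. A generic sketch of that structure, with illustrative names:

#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"

namespace {
class ExamplePassImpl final {
public:
  ExamplePassImpl(llvm::Module &Mod) : Mod{Mod} {}

  llvm::PreservedAnalyses run() {
    bool Changed = false;
    // Per-run state (worklists, caches, ...) lives here, not in the pass.
    (void)Mod;
    return Changed ? llvm::PreservedAnalyses::none()
                   : llvm::PreservedAnalyses::all();
  }

private:
  llvm::Module &Mod;
};
} // anonymous namespace

class ExamplePass : public llvm::PassInfoMixin<ExamplePass> {
public:
  llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &) {
    ExamplePassImpl Impl{M};
    return Impl.run();
  }
};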
#include "llvmraytracing/ContinuationsUtil.h" #include "llvmraytracing/GpurtContext.h" #include "llvmraytracing/PayloadAccessQualifiers.h" @@ -253,6 +253,10 @@ class ModuleMetadataState final { return MaxUsedPayloadRegisterCount; } + uint32_t getMaxHitAttributeByteCount() const { + return MaxHitAttributeByteCount; + } + bool isInLgcCpsMode() const { return IsInLgcCpsMode; } void updateModuleMetadata() const; @@ -272,6 +276,8 @@ class ModuleMetadataState final { /// shader in the module. This excludes intersection shaders, which /// just pass through an existing payload. uint32_t MaxUsedPayloadRegisterCount = 0; + /// [In]: The maximum size of hit attribute stored on the module as metadata. + uint32_t MaxHitAttributeByteCount = 0; /// [In]: The address space used for the continuations stack. /// Either stack or global memory. ContStackAddrspace StackAddrspace = ContHelper::DefaultStackAddrspace; @@ -283,20 +289,9 @@ class ModuleMetadataState final { class LowerRaytracingPipelinePassImpl final { public: LowerRaytracingPipelinePassImpl(Module &M, Module &GpurtLibrary); - bool run(); + PreservedAnalyses run(); private: - struct FunctionConfig { - // Maximum allowed size of hit attributes to be used in a TraceRay together - // with this function, even if this function does not touch hit attributes - // (e.g. a Miss shader). - uint32_t MaxHitAttributeBytes = 0; - - bool operator==(const FunctionConfig &Other) const { - return MaxHitAttributeBytes == Other.MaxHitAttributeBytes; - } - }; - struct FunctionData { RayTracingShaderStage Kind = RayTracingShaderStage::Count; SmallVector TraceRayCalls; @@ -326,7 +321,6 @@ class LowerRaytracingPipelinePassImpl final { int32_t PayloadSpillSize = 0; /// Type of the incoming payload Type *IncomingPayload = nullptr; - FunctionConfig FuncConfig = {}; /// Serialization info for the incoming payload, if there is one. /// Also applies to the outgoing payload in that case. PAQSerializationInfoBase *IncomingPayloadSerializationInfo = nullptr; @@ -353,9 +347,126 @@ class LowerRaytracingPipelinePassImpl final { Type *NewRetTy = nullptr; }; + // Simplify some code used to handle padding and payload computation and + // related things. + class PayloadHelper final { + public: + PayloadHelper(Module &Mod, const DataLayout &DL, + llvm_dialects::Builder &Builder, bool CpsMode) + : Mod{Mod}, DL{DL}, Builder{Builder}, IsCpsMode{CpsMode} {} + + /// Append padding and payload to lgc.cps.jump calls. + void patchJumpCalls(Function *Parent, ArrayRef JumpCalls, + uint32_t PayloadStartDword) { + if (!IsCpsMode) + return; + + for (auto *Jump : JumpCalls) { + Builder.SetInsertPoint(Jump); + SmallVector NewTailArgs(Jump->getTail()); + + // Add padding so that payload starts at a fixed dword. + ContHelper::addPaddingValue(DL, Parent->getContext(), NewTailArgs, + PayloadStartDword); + + // Insert payload into tail args. + NewTailArgs.push_back(Parent->getArg(CpsArgIdxPayload)); + + Builder.create(Jump->getTarget(), Jump->getLevels(), + Jump->getState(), NewTailArgs); + Jump->dropAllReferences(); + Jump->eraseFromParent(); + } + } + + /// Create and initialize payload serialization storage from the incoming + /// payload argument. 
+ void initializePayloadSerializationStorage(Function *Parent, + FunctionData &Data) { + llvm_dialects::Builder::InsertPointGuard Guard{Builder}; + Builder.SetInsertPointPastAllocas(Parent); + Data.PayloadStorage = Builder.CreateAlloca(Data.PayloadStorageTy); + Data.PayloadStorage->setName("payload.serialization.alloca"); + // TODO: We shouldn't need to create the alloca for RGS. + if (Data.Kind != RayTracingShaderStage::RayGeneration) + Builder.CreateStore(Parent->getArg(Parent->arg_size() - 1), + Data.PayloadStorage); + } + + Type *getPayloadStorageTy(uint32_t MaxPayloadRegisterCount, + FunctionData &Data) { + uint32_t PayloadStorageI32s = + std::max(MaxPayloadRegisterCount, Data.MaxOutgoingPayloadI32s); + if (Data.IncomingPayloadSerializationInfo) + PayloadStorageI32s = + std::max(PayloadStorageI32s, + Data.IncomingPayloadSerializationInfo->MaxStorageI32s); + + return ArrayType::get(Builder.getInt32Ty(), PayloadStorageI32s); + } + + // Compute the dword at which payload starts in the argument at most in the + // argument list. Only valid for lgc.cps mode since we only compute padding + // there. + uint32_t getPayloadStartDword(FunctionData &Data, + uint32_t MaxHitAttributeBytes, + Type *TraversalDataTy) { + assert(TraversalDataTy && "Failed to detect traversal system data type"); + assert(IsCpsMode); + + // For lgc.cps mode, take into account that the return address and shader + // index dwords are inserted at a later stage. + // For non-lgc.cps mode, we do not use padding yet. + return 1 + 1 + getArgumentDwordCount(DL, TraversalDataTy) + + std::max(divideCeil(MaxHitAttributeBytes, RegisterBytes), + uint64_t(2)); + } + + /// Compute padding and payload arguments based on the passed arguments and + /// append them to ArgTys. + /// Returns a pair (paddingType, payloadType). + std::pair + computePaddingAndPayloadArgTys(SmallVectorImpl &ArgTys, + uint32_t PayloadSizeDwords, + uint32_t PayloadStartDword) { + // Compute padding type so that payload starts at a fixed dword. + Type *PaddingTy = ContHelper::getPaddingType(DL, Mod.getContext(), ArgTys, + PayloadStartDword); + Type *PayloadTy = ArrayType::get(Builder.getInt32Ty(), PayloadSizeDwords); + +#ifndef NDEBUG + LLVM_DEBUG( + dbgs() << "Computing padding and payload based on following data:\n" + << "Payload size: " << PayloadSizeDwords << " dwords\n" + << "Payload start dword: " << PayloadStartDword + << "\nArgument types:\n"); + for (Type *Ty : ArgTys) + LLVM_DEBUG(dbgs() << *Ty << ": " + << lgc::cps::getArgumentDwordCount(DL, Ty) + << " dwords\n"); + + LLVM_DEBUG(dbgs() << "Resulting padding type: " << *PaddingTy + << "\nResulting payload type: " << *PayloadTy + << "\n---\n"); +#endif + + ArgTys.push_back(PaddingTy); + ArgTys.push_back(PayloadTy); + + return {PaddingTy, PayloadTy}; + } + + private: + Module &Mod; + const DataLayout &DL; + llvm_dialects::Builder &Builder; + bool IsCpsMode = false; + }; + void replaceCall(FunctionData &Data, CallInst *Call, Function *Func, ContinuationCallType CallType); void handleRestoreSystemData(CallInst *Call); + void handleExitRayGen(const FunctionData &Data); void replaceContinuationCall(ContinuationCallType CallType, CallInst *Call, const FunctionData &Data, Value *PayloadOrAttrs, Type *PayloadOrAttrsTy); @@ -376,6 +487,13 @@ class LowerRaytracingPipelinePassImpl final { void collectGpuRtFunctions(); + // Computes an upper bound on the number of required payload registers + // for a TraceRay call, based on module-wide max attribute and payload size. 
+ // In lgc.cps mode, this determines the number of payload + // registers preserved by Intersection shaders. + // Doesn't apply to callable shaders. + unsigned getUpperBoundOnTraceRayPayloadRegisters() const; + // Copy the payload content between (global) payload storage and local // payload. Excludes the stack pointer or hit attributes which may also reside // in payload storage. If Stage is not set, all fields in SerializationInfo @@ -428,6 +546,9 @@ class LowerRaytracingPipelinePassImpl final { void processFunctionEntry(FunctionData &Data, Argument *SystemDataArgument); void processFunctionEnd(FunctionData &Data, FunctionEndData &EData); void processFunction(Function *F, FunctionData &FuncData); + void handleContPayloadRegisterI32Count(Function &F); + void handleContPayloadRegistersGetI32(Function &F); + void handleContPayloadRegistersSetI32(Function &F); void collectProcessableFunctions(); @@ -444,6 +565,7 @@ class LowerRaytracingPipelinePassImpl final { llvm_dialects::Builder Builder; ModuleMetadataState MetadataState; PAQSerializationInfoManager PAQManager; + PayloadHelper PayloadHelper; CompilerUtils::CrossModuleInliner CrossInliner; Type *I32; Type *TokenTy; @@ -465,12 +587,14 @@ class LowerRaytracingPipelinePassImpl final { Function *SetTriangleHitAttributes; Function *GetLocalRootIndex; Function *SetLocalRootIndex; + Function *ExitRayGen; Function *TraceRay; Function *CallShader; Function *ReportHit; Function *AcceptHit; Function *GetSbtAddress; Function *GetSbtStride; + MapVector ShaderStartOverloads; }; } // namespace @@ -497,6 +621,20 @@ ModuleMetadataState::ModuleMetadataState(Module &Module) : Mod{Module} { MaxUsedPayloadRegisterCount = std::max( MaxUsedPayloadRegisterCount, PreservedPayloadRegisterCount.value()); + // Use max hit attribute size from metadata, or use globally max allowed + // value for the max if metadata is not set + MaxHitAttributeByteCount = + getMaxHitAttributeSize(&Mod).value_or(GlobalMaxHitAttributeBytes); + + if (MaxHitAttributeByteCount % RegisterBytes != 0) { + auto AlignedMaxHitAttributeSize = + alignTo(MaxHitAttributeByteCount, RegisterBytes); + LLVM_DEBUG(dbgs() << "Aligning misaligned max hit attribute size " + << MaxHitAttributeByteCount << " to " + << AlignedMaxHitAttributeSize << "\n"); + MaxHitAttributeByteCount = AlignedMaxHitAttributeSize; + } + // Import StackAddrspace from metadata if set, otherwise from default auto StackAddrspaceMD = ContHelper::tryGetStackAddrspace(Module); StackAddrspace = StackAddrspaceMD.value_or(ContHelper::DefaultStackAddrspace); @@ -765,7 +903,7 @@ void LowerRaytracingPipelinePassImpl::replaceContinuationCall( // Specify hit attribute size also in case it is used for CallShader. // It is ignored by the implementation in that case. 
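// Illustration of the hit-attribute bookkeeping above: the module-wide maximum
// hit attribute size picked up in the ModuleMetadataState constructor is rounded
// up to a whole number of payload registers (dwords). This is a minimal
// standalone sketch; the helper name and the byte counts in the static_asserts
// are made-up examples, and it assumes RegisterBytes is one dword (4 bytes).
#include <cstdint>

constexpr uint32_t RegisterBytes = 4;

constexpr uint32_t resolveMaxHitAttributeBytes(uint32_t BytesFromMetadata, // 0 = metadata unset
                                               uint32_t GlobalMaxHitAttributeBytes) {
  uint32_t Bytes = BytesFromMetadata ? BytesFromMetadata : GlobalMaxHitAttributeBytes;
  // Mirror the alignTo() call above: round misaligned sizes up to the next dword boundary.
  return (Bytes + RegisterBytes - 1) / RegisterBytes * RegisterBytes;
}

static_assert(resolveMaxHitAttributeBytes(10, 32) == 12); // 10 bytes occupy 3 registers
static_assert(resolveMaxHitAttributeBytes(0, 32) == 32);  // unset metadata falls back to the global max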
PAQPayloadConfig PAQConfig = {PayloadOrAttrsTy, - Data.FuncConfig.MaxHitAttributeBytes}; + MetadataState.getMaxHitAttributeByteCount()}; if (CallType == ContinuationCallType::Traversal) { const auto *TraceRayInfo = &PAQManager.getOrCreateTraceRaySerializationInfo(PAQConfig); @@ -876,10 +1014,7 @@ void LowerRaytracingPipelinePassImpl::replaceContinuationCall( Args.append(Call->arg_begin() + 1, Call->arg_end()); } - auto *SystemDataTy = DispatchSystemDataTy; if (CallType == ContinuationCallType::AnyHit) { - assert(TraversalDataTy && "Failed to detect traversal system data type"); - SystemDataTy = TraversalDataTy; // Add hit attributes to arguments ArgTys.push_back(PayloadOrAttrsTy); auto *HitAttrs = Builder.CreateLoad(PayloadOrAttrsTy, PayloadOrAttrs); @@ -890,34 +1025,29 @@ void LowerRaytracingPipelinePassImpl::replaceContinuationCall( Value *NewCall = nullptr; if (MetadataState.isInLgcCpsMode()) { - // Add padding so that payload starts at a fixed dword. - // NOTE: Minus 1 as return address is not included - ContHelper::addPaddingType(*DL, *Context, ArgTys, - Data.FirstPayloadArgumentDword - 1); - Args.push_back(PoisonValue::get(ArgTys.back())); - // Put payload at last auto OutgoingPayloadI32s = std::min(OutgoingSerializationLayout ? OutgoingSerializationLayout->NumStorageI32s : MetadataState.getMaxPayloadRegisterCount(), MetadataState.getMaxPayloadRegisterCount()); - auto *OutgoingPayloadTy = ArrayType::get(I32, OutgoingPayloadI32s); - ArgTys.push_back(OutgoingPayloadTy); + + // Add padding so that payload starts at a fixed dword. + // NOTE: Minus 1 as return address is not included + const auto &[OutgoingPaddingTy, OutgoingPayloadTy] = + PayloadHelper.computePaddingAndPayloadArgTys( + ArgTys, OutgoingPayloadI32s, Data.FirstPayloadArgumentDword - 1); + Args.push_back(PoisonValue::get(OutgoingPaddingTy)); Args.push_back(Builder.CreateLoad(OutgoingPayloadTy, Data.PayloadStorage)); - auto *OrigRetTy = Call->getType(); + SmallVector ReturnedArgTys{Call->getType()}; // Add padding so that returned payload starts at a fixed dword. // NOTE: Minus 2 as return address and shader index are not included. - auto *PaddingTy = ContHelper::getPaddingType( - *DL, *Context, OrigRetTy, Data.FirstPayloadArgumentDword - 2); - - // Also need to return payload - auto *NewRetTy = - StructType::get(Builder.getContext(), - {OrigRetTy, PaddingTy, - ArrayType::get(I32, ReturnedRegisterCount.value())}); + PayloadHelper.computePaddingAndPayloadArgTys( + ReturnedArgTys, ReturnedRegisterCount.value(), + Data.FirstPayloadArgumentDword - 2); + auto *NewRetTy = StructType::get(Builder.getContext(), ReturnedArgTys); Annotatable = insertCpsAwait(NewRetTy, ShaderAddr, Call, Args, CallType, Data.Kind); @@ -930,13 +1060,21 @@ void LowerRaytracingPipelinePassImpl::replaceContinuationCall( NewCall = Builder.CreateExtractValue(NewCall, 0); } else { + // Patch the dummy return address into await calls resulting from + // WaitAwaitTraversal. Note: this needs to be removed once we have the + // TraversalEntry function. 
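// The padding computed by PayloadHelper::computePaddingAndPayloadArgTys above
// keeps the payload at a fixed argument dword: however many dwords the
// preceding arguments occupy, an [N x i32] padding array makes up the
// difference to FirstPayloadArgumentDword (minus the dwords, such as the
// return address, that are only appended later). A rough standalone sketch of
// that arithmetic; the helper name and the dword counts in the example are
// illustrative, the real logic lives in ContHelper::getPaddingType /
// addPaddingValue.
#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

uint32_t computePaddingDwords(const std::vector<uint32_t> &ArgDwordCounts,
                              uint32_t PayloadStartDword) {
  uint32_t UsedDwords =
      std::accumulate(ArgDwordCounts.begin(), ArgDwordCounts.end(), 0u);
  assert(UsedDwords <= PayloadStartDword && "arguments overlap the payload slot");
  return PayloadStartDword - UsedDwords;
}

// Example: with FirstPayloadArgumentDword == 13, the return address (1 dword)
// not yet part of the argument list, and arguments occupying {2, 8} dwords,
// computePaddingDwords({2, 8}, 13 - 1) returns 2, i.e. a [2 x i32] padding
// array is inserted before the payload.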
+ if (CallType == ContinuationCallType::Traversal) { + ArgTys.insert(ArgTys.begin() + 1, Builder.getInt64Ty()); + Args.insert(Args.begin() + 1, PoisonValue::get(Builder.getInt64Ty())); + } + auto *ShaderTy = FunctionType::get(TokenTy, ArgTys, false); auto *ShaderFun = Builder.CreateIntToPtr(ShaderAddr, ShaderTy->getPointerTo()); auto *Token = Builder.CreateCall(ShaderTy, ShaderFun, Args); auto *Await = - getContinuationAwait(*Mod, TokenTy, cast(SystemDataTy)); + getContinuationAwait(*Mod, TokenTy, cast(Call->getType())); NewCall = Builder.CreateCall(Await, {Token}); Annotatable = Token; } @@ -1048,28 +1186,6 @@ void LowerRaytracingPipelinePassImpl::replaceShaderRecordBufferCall( Call->eraseFromParent(); } -void LowerRaytracingPipelinePassImpl::handleGetFuncAddr(Function &Func) { - assert(Func.arg_empty() - // returns i64 or i32 - && (Func.getFunctionType()->getReturnType()->isIntegerTy(64) || - Func.getFunctionType()->getReturnType()->isIntegerTy(32))); - - auto Name = Func.getName(); - [[maybe_unused]] bool Consumed = Name.consume_front("_AmdGetFuncAddr"); - assert(Consumed); - - Constant *Addr = Mod->getFunction(Name); - if (!Addr) - report_fatal_error(Twine("Did not find function '") + Name + - "' requested by _AmdGetFuncAddr"); - Addr = ConstantExpr::getPtrToInt(Addr, Func.getReturnType()); - - llvm::forEachCall(Func, [&](llvm::CallInst &CInst) { - CInst.replaceAllUsesWith(Addr); - CInst.eraseFromParent(); - }); -} - void LowerRaytracingPipelinePassImpl::handleGetShaderKind(Function &Func) { assert(Func.getReturnType()->isIntegerTy(32) && Func.arg_size() == 0); @@ -1092,8 +1208,32 @@ void LowerRaytracingPipelinePassImpl::handleGetShaderKind(Function &Func) { }); } +void LowerRaytracingPipelinePassImpl::handleGetFuncAddr(Function &Func) { + assert(Func.arg_empty() + // returns i64 or i32 + && (Func.getFunctionType()->getReturnType()->isIntegerTy(64) || + Func.getFunctionType()->getReturnType()->isIntegerTy(32))); + + auto Name = Func.getName(); + [[maybe_unused]] bool Consumed = Name.consume_front("_AmdGetFuncAddr"); + assert(Consumed); + + Function *F = Mod->getFunction(Name); + if (!F) + report_fatal_error(Twine("Did not find function '") + Name + + "' requested by _AmdGetFuncAddr"); + + llvm::forEachCall(Func, [&](llvm::CallInst &CInst) { + auto *RetTy = Func.getReturnType(); + Builder.SetInsertPoint(&CInst); + Value *AsContRef = Builder.create(RetTy, F); + CInst.replaceAllUsesWith(AsContRef); + CInst.eraseFromParent(); + }); +} + void LowerRaytracingPipelinePassImpl::handleGetCurrentFuncAddr(Function &Func) { - assert(Func.arg_size() == 0 && + assert(Func.empty() && // Returns an i32 or i64 (Func.getReturnType()->isIntegerTy(32) || Func.getReturnType()->isIntegerTy(64))); @@ -1101,15 +1241,9 @@ void LowerRaytracingPipelinePassImpl::handleGetCurrentFuncAddr(Function &Func) { llvm::forEachCall(Func, [&](llvm::CallInst &CInst) { auto *F = CInst.getFunction(); auto *RetTy = Func.getReturnType(); - Value *FuncPtrToInt = nullptr; - if (MetadataState.isInLgcCpsMode()) { - // Add CPS level to function address - Builder.SetInsertPoint(&CInst); - FuncPtrToInt = Builder.create(RetTy, F); - } else { - FuncPtrToInt = ConstantExpr::getPtrToInt(F, RetTy); - } - CInst.replaceAllUsesWith(FuncPtrToInt); + Builder.SetInsertPoint(&CInst); + Value *AsContRef = Builder.create(RetTy, F); + CInst.replaceAllUsesWith(AsContRef); CInst.eraseFromParent(); }); } @@ -1342,7 +1476,7 @@ void LowerRaytracingPipelinePassImpl::copyHitAttributes( // We are in an intersection shader, which does not know the payload 
type. // Assume maximum possible size PayloadHitAttrBytes = - Data.FuncConfig.MaxHitAttributeBytes - InlineHitAttrsBytes; + MetadataState.getMaxHitAttributeByteCount() - InlineHitAttrsBytes; // Use hit attribute storage at fixed index PayloadHitAttrs = SimplifyingCreateConstGEP1_32(Builder, I32, Data.PayloadStorage, @@ -1351,7 +1485,7 @@ uint64_t HitAttrsBytes = DL->getTypeStoreSize(Data.HitAttributes).getFixedValue(); - if (HitAttrsBytes > Data.FuncConfig.MaxHitAttributeBytes) + if (HitAttrsBytes > MetadataState.getMaxHitAttributeByteCount()) report_fatal_error("Hit attributes are too large!"); assert(InlineHitAttrsBytes + PayloadHitAttrBytes >= HitAttrsBytes && "Insufficient hit attribute storage!"); @@ -1451,8 +1585,12 @@ void LowerRaytracingPipelinePassImpl::setGpurtEntryRegisterCountMetadata() { uint32_t InRegisterCount = 0; uint32_t OutRegisterCount = 0; - auto *Func = CI->getFunction(); - switch (getLgcRtShaderStage(Func).value()) { + auto *CallerFunc = CI->getFunction(); + auto ShaderStage = getLgcRtShaderStage(CallerFunc); + if (!ShaderStage) + continue; + + switch (ShaderStage.value()) { case RayTracingShaderStage::Traversal: InRegisterCount = MaxRegisterCount; OutRegisterCount = MaxRegisterCount; @@ -1470,10 +1608,10 @@ ContHelper::setOutgoingRegisterCount(CI, OutRegisterCount); MetadataState.updateMaxUsedPayloadRegisterCount(OutRegisterCount); - assert(ContHelper::tryGetIncomingRegisterCount(Func).value_or( - InRegisterCount) == InRegisterCount && + assert(ContHelper::tryGetIncomingRegisterCount(CallerFunc) + .value_or(InRegisterCount) == InRegisterCount && "Unexpected incoming register count on Traversal"); - ContHelper::setIncomingRegisterCount(Func, InRegisterCount); + ContHelper::setIncomingRegisterCount(CallerFunc, InRegisterCount); MetadataState.updateMaxUsedPayloadRegisterCount(InRegisterCount); } } @@ -1502,6 +1640,23 @@ void LowerRaytracingPipelinePassImpl::processFunctionEntry( // Initialize system data by copying the argument Data.SystemDataFirstStore = Builder.CreateStore(SystemDataArgument, Data.SystemData); + + // Shader preamble + // NOTE: Skip Traversal, as it can call its own shader start function in + // GPURT directly if needed. + if (Data.Kind != RayTracingShaderStage::Traversal) { + auto ShaderStart = ShaderStartOverloads[Data.SystemDataTy]; + if (ShaderStart) { + CrossInliner.inlineCall(Builder, ShaderStart, Data.SystemData); + } else if (Mod != GpurtLibrary) { + // Skip for tests that are not intended to test this functionality; + // otherwise we would need to handwrite _cont_ShaderStart for each test, + // which is redundant and unnecessary. + // But ensure that it is present in the production path, otherwise there + // could be correctness issues. + report_fatal_error("_cont_ShaderStart function is missing"); + } + } } void LowerRaytracingPipelinePassImpl::processFunctionEnd( @@ -1596,11 +1751,17 @@ void LowerRaytracingPipelinePassImpl::processFunctionEnd( ?
std::min(EData.OutgoingSerializationLayout->NumStorageI32s, MetadataState.getMaxPayloadRegisterCount()) : MetadataState.getMaxPayloadRegisterCount(); - if (MetadataState.isInLgcCpsMode()) { - if (Data.Kind == RayTracingShaderStage::RayGeneration) { - assert(!RetValue && "RayGen cannot return anything"); - Builder.CreateRetVoid(); - } else { + + if (Data.Kind == RayTracingShaderStage::RayGeneration) { + assert(!RetValue && "RayGen cannot return anything"); + if (ExitRayGen) + handleExitRayGen(Data); + + Builder.CreateRetVoid(); + } else { + Function *Parent = EData.Terminator->getFunction(); + + if (MetadataState.isInLgcCpsMode()) { uint32_t CpsRetLevel = getPotentialCpsReturnLevels(Data.Kind); // Jump to resume point of caller, pass Poison Rcr and ShaderIndex as they // are not meaningful for the case. @@ -1617,13 +1778,18 @@ void LowerRaytracingPipelinePassImpl::processFunctionEnd( ArrayType::get(I32, OutgoingRegisterCount), Data.PayloadStorage)); Ret = Builder.create( - EData.Terminator->getFunction()->getArg(CpsArgIdxReturnAddr), - CpsRetLevel, PoisonValue::get(StructType::get(Builder.getContext())), - TailArgs); + Parent->getArg(CpsArgIdxReturnAddr), CpsRetLevel, + PoisonValue::get(StructType::get(Builder.getContext())), TailArgs); + Builder.CreateUnreachable(); + } else { + SmallVector TailArgs; + + if (RetValue) + TailArgs.push_back(RetValue); + + Ret = Builder.create(Parent->getArg(0), TailArgs); Builder.CreateUnreachable(); } - } else { - Ret = RetValue ? Builder.CreateRet(RetValue) : Builder.CreateRetVoid(); } if (Ret) { @@ -1638,6 +1804,34 @@ void LowerRaytracingPipelinePassImpl::processFunctionEnd( EData.Terminator->eraseFromParent(); } +void LowerRaytracingPipelinePassImpl::handleExitRayGen( + const FunctionData &Data) { + assert(ExitRayGen && "Could not find ExitRayGen function"); + // Create a call to _cont_ExitRayGen + auto *SystemDataTy = + cast(getFuncArgPtrElementType(ExitRayGen, 0)); + auto *SystemData = getDXILSystemData(Builder, Data.SystemData, + Data.SystemDataTy, SystemDataTy); + CrossInliner.inlineCall(Builder, ExitRayGen, SystemData); +} + +unsigned +LowerRaytracingPipelinePassImpl::getUpperBoundOnTraceRayPayloadRegisters() + const { + unsigned MaxHitAttributeBytes = MetadataState.getMaxHitAttributeByteCount(); + unsigned AttributeBytes = + MaxHitAttributeBytes - + std::min(MaxHitAttributeBytes, + unsigned(getInlineHitAttrsBytes(*GpurtLibrary))); + unsigned PayloadBytes = getMaxPayloadSize(Mod).value_or( + MetadataState.getMaxPayloadRegisterCount() * RegisterBytes); + + unsigned IncomingStorageBytes = alignTo(AttributeBytes, RegisterBytes) + + alignTo(PayloadBytes, RegisterBytes); + return std::min(unsigned(divideCeil(IncomingStorageBytes, RegisterBytes)), + MetadataState.getMaxPayloadRegisterCount()); +} + void LowerRaytracingPipelinePassImpl::processFunction(Function *F, FunctionData &Data) { Builder.SetInsertPointPastAllocas(F); @@ -1646,8 +1840,8 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, SmallVector AllArgTypes; Type *NewRetTy; Type *SystemDataTy = nullptr; - Type *CpsArgPayloadTy = nullptr; + unsigned IncomingStorageI32s = 0; if (MetadataState.isInLgcCpsMode()) { // Create the CPS function header. 
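// The register bound computed by getUpperBoundOnTraceRayPayloadRegisters in
// the hunk above boils down to: dword-align the hit attribute bytes that do
// not fit into the inline registers, dword-align the maximum payload size,
// and clamp the sum (in dwords) against the register budget. A minimal
// standalone sketch; RegisterBytes is assumed to be one dword (4 bytes) and
// the numbers in the static_assert are purely illustrative, not values taken
// from GPURT.
#include <algorithm>
#include <cstdint>

constexpr uint32_t RegisterBytes = 4;

constexpr uint32_t alignToDword(uint32_t Bytes) {
  return (Bytes + RegisterBytes - 1) / RegisterBytes * RegisterBytes;
}

constexpr uint32_t upperBoundOnTraceRayPayloadRegisters(uint32_t MaxHitAttributeBytes,
                                                        uint32_t InlineHitAttrsBytes,
                                                        uint32_t MaxPayloadBytes,
                                                        uint32_t MaxPayloadRegisterCount) {
  // Only hit attributes that spill out of the inline registers consume payload storage.
  uint32_t AttributeBytes =
      MaxHitAttributeBytes - std::min(MaxHitAttributeBytes, InlineHitAttrsBytes);
  // Both summands are already dword-aligned, so plain division is exact here.
  uint32_t IncomingStorageBytes = alignToDword(AttributeBytes) + alignToDword(MaxPayloadBytes);
  return std::min(IncomingStorageBytes / RegisterBytes, MaxPayloadRegisterCount);
}

// Example: 32 attribute bytes with 8 of them inline, a 20-byte payload and a
// 30-register budget give (24 + 20) / 4 = 11 preserved payload registers.
static_assert(upperBoundOnTraceRayPayloadRegisters(32, 8, 20, 30) == 11);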
@@ -1667,27 +1861,12 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, AllArgTypes.push_back(Builder.getInt32Ty()); // Determine payload storage type - uint32_t PayloadStorageI32s = MetadataState.getMaxPayloadRegisterCount(); - PayloadStorageI32s = - std::max(PayloadStorageI32s, Data.MaxOutgoingPayloadI32s); - if (Data.IncomingPayloadSerializationInfo) - PayloadStorageI32s = - std::max(PayloadStorageI32s, - Data.IncomingPayloadSerializationInfo->MaxStorageI32s); - Data.PayloadStorageTy = ArrayType::get(I32, PayloadStorageI32s); - - // Determine payload starting dword - // NOTE: _AmdEnqueueAnyHit always passes a 2 dword barycentrics, need to - // take that into account in the situation that hit attributes are not used - // or less than 2 dwords. - assert(TraversalDataTy && "Failed to detect traversal system data type"); - Data.FirstPayloadArgumentDword = - 1 + 1 + getArgumentDwordCount(*DL, TraversalDataTy) + - std::max( - divideCeil(Data.FuncConfig.MaxHitAttributeBytes, RegisterBytes), - uint64_t(2)); + Data.PayloadStorageTy = PayloadHelper.getPayloadStorageTy( + MetadataState.getMaxPayloadRegisterCount(), Data); + + Data.FirstPayloadArgumentDword = PayloadHelper.getPayloadStartDword( + Data, MetadataState.getMaxHitAttributeByteCount(), TraversalDataTy); - unsigned IncomingStorageI32s = 0; if (Data.Kind != RayTracingShaderStage::RayGeneration && Data.Kind != RayTracingShaderStage::Intersection && Data.Kind != RayTracingShaderStage::Traversal) { @@ -1708,11 +1887,12 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, std::max(MetadataState.tryGetPreservedPayloadRegisterCount().value_or( MetadataState.getMaxPayloadRegisterCount()), MetadataState.getMaxUsedPayloadRegisterCount()); - } else { - // For IS, use max size - IncomingStorageI32s = MetadataState.getMaxPayloadRegisterCount(); + } else if (Data.Kind == RayTracingShaderStage::Intersection) { + IncomingStorageI32s = getUpperBoundOnTraceRayPayloadRegisters(); } - CpsArgPayloadTy = ArrayType::get(I32, IncomingStorageI32s); + } else { + // Pass in the return address. + AllArgTypes.push_back(Builder.getInt64Ty()); } const auto SystemDataArgumentIndex = AllArgTypes.size(); @@ -1754,7 +1934,11 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, break; } case RayTracingShaderStage::Traversal: { - SystemDataTy = getFuncArgPtrElementType(F, 0); + if (MetadataState.isInLgcCpsMode()) + SystemDataTy = getFuncArgPtrElementType(F, 0); + else + SystemDataTy = F->getArg(0)->getType(); + AllArgTypes.push_back(SystemDataTy); NewRetTy = SystemDataTy; break; @@ -1773,11 +1957,8 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, AllArgTypes.push_back(DummyArgTy); } - // Add padding so that payload starts at a fixed dword. - ContHelper::addPaddingType(*DL, *Context, AllArgTypes, - Data.FirstPayloadArgumentDword); - // Place payload at the end - AllArgTypes.push_back(CpsArgPayloadTy); + PayloadHelper.computePaddingAndPayloadArgTys( + AllArgTypes, IncomingStorageI32s, Data.FirstPayloadArgumentDword); } Data.PayloadSpillSize = computeNeededStackSizeForRegisterBuffer( @@ -1790,15 +1971,19 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, Data.PayloadStorageTy = PayloadStorageGlobal->getValueType(); } - auto *FunctionTypeRetTy = - MetadataState.isInLgcCpsMode() ? 
Builder.getVoidTy() : NewRetTy; + Type *FunctionTypeRetTy = nullptr; + if (MetadataState.isInLgcCpsMode()) + FunctionTypeRetTy = Builder.getVoidTy(); + else + FunctionTypeRetTy = NewRetTy; + // Create new function to change signature auto *NewFuncTy = FunctionType::get(FunctionTypeRetTy, AllArgTypes, false); Function *NewFunc = CompilerUtils::cloneFunctionHeader( *F, NewFuncTy, ArrayRef{}); NewFunc->takeName(F); // FIXME: Remove !types metadata to workaround an llvm bug. If struct types - // are referenced only from metadataa, LLVM omits the type declaration when + // are referenced only from metadata, LLVM omits the type declaration when // printing IR and fails to read it back in because of an unknown type. NewFunc->setMetadata("types", nullptr); @@ -1807,9 +1992,12 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, Data.SystemDataTy = cast(SystemDataTy); processFunctionEntry(Data, NewFunc->getArg(SystemDataArgumentIndex)); + Value *NewSystemData = nullptr; + uint64_t RetAddrArgIdx = 0; + if (MetadataState.isInLgcCpsMode()) { NewFunc->getArg(CpsArgIdxContState)->setName("cont.state"); - NewFunc->getArg(CpsArgIdxReturnAddr)->setName("return.addr"); + RetAddrArgIdx = CpsArgIdxReturnAddr; NewFunc->getArg(CpsArgIdxShaderIndex)->setName("shader.index"); if (Data.Kind != RayTracingShaderStage::RayGeneration) { NewFunc->getArg(CpsArgIdxSystemData)->setName("system.data"); @@ -1831,20 +2019,29 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, Data.SystemData->mutateType(getWithSamePointeeType( Data.SystemData->getType(), F->getArg(0)->getType()->getPointerAddressSpace())); - F->getArg(0)->replaceAllUsesWith(Data.SystemData); + NewSystemData = Data.SystemData; } else { // Create local payload storage for non-Traversal shader. - IRBuilder<>::InsertPointGuard Guard(Builder); - Builder.SetInsertPointPastAllocas(NewFunc); - Data.PayloadStorage = Builder.CreateAlloca(Data.PayloadStorageTy); - Data.PayloadStorage->setName("payload.alloca"); - // TODO: We shouldn't need to create the alloca for RGS. - if (Data.Kind != RayTracingShaderStage::RayGeneration) - Builder.CreateStore(NewFunc->getArg(CpsArgIdxPayload), - Data.PayloadStorage); + PayloadHelper.initializePayloadSerializationStorage(NewFunc, Data); } + } else if (Data.Kind == RayTracingShaderStage::Traversal) { + // Replace old system data argument with cloned functions' argument + NewSystemData = NewFunc->getArg(1); } + if (auto *ContPayloadRegistersGetI32 = + Mod->getFunction("_AmdContPayloadRegistersGetI32")) + handleContPayloadRegistersGetI32(*ContPayloadRegistersGetI32); + + if (auto *ContPayloadRegistersSetI32 = + Mod->getFunction("_AmdContPayloadRegistersSetI32")) + handleContPayloadRegistersSetI32(*ContPayloadRegistersSetI32); + + if (NewSystemData) + F->getArg(0)->replaceAllUsesWith(NewSystemData); + + NewFunc->getArg(RetAddrArgIdx)->setName("returnAddr"); + FunctionEndData EData; if (Data.Kind == RayTracingShaderStage::RayGeneration) { if (!MetadataState.isInLgcCpsMode()) { @@ -1867,6 +2064,16 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, PAQSerializationInfoBase *SerializationInfo = Data.IncomingPayloadSerializationInfo; + // Check that our assumptions about the number of required payload registers + // are correct. We exclude callable shaders because the max payload size + // doesn't apply to them. 
+ assert((Data.Kind == RayTracingShaderStage::Callable || + SerializationInfo == nullptr || + std::min(MetadataState.getMaxPayloadRegisterCount(), + SerializationInfo->MaxStorageI32s) <= + getUpperBoundOnTraceRayPayloadRegisters()) && + "Payload serialization layout uses too many registers!"); + // For ClosestHit and Miss, we need to determine the out layout // early on in order to determine which payload fields to save in case of // recursive TraceRay / CallShader. @@ -1974,15 +2181,22 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, } } else { if (!MetadataState.isInLgcCpsMode()) { - // Annotate intersection shader with the maximum number of registers - // used for payload - // TODO: When compiling a pipeline and not a library, we could figure - // out the pipeline-wide max (on a higher level than here) and use - // that instead. For a library compile, we can't know the max - // payload size of shaders in pipelines this shader is used in. - ContHelper::setIncomingRegisterCount( - NewFunc, MetadataState.getMaxPayloadRegisterCount()); - // Intentionally do NOT update MaxUsedPayloadRegisterCount + if (Data.Kind == RayTracingShaderStage::Intersection) { + // Annotate intersection shader with the maximum number of registers + // used for payload + // TODO: When compiling a pipeline and not a library, we could figure + // out the pipeline-wide max (on a higher level than here) and + // use that instead. For a library compile, we can't know the + // max payload size of shaders in pipelines this shader is used + // in. + ContHelper::setIncomingRegisterCount( + NewFunc, MetadataState.getMaxPayloadRegisterCount()); + // Intentionally do NOT update MaxUsedPayloadRegisterCount + } else { + assert(Data.Kind == RayTracingShaderStage::Traversal); + // Intentionally do nothing for Traversal. We explicitly add Traversal + // register count metadata elsewhere. + } } } @@ -2000,23 +2214,8 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, // functions, so we copy them beforehand. if (MetadataState.isInLgcCpsMode() && Data.Kind == RayTracingShaderStage::Traversal) { - // Fixup lgc.cps.jump - for (auto *Jump : Data.JumpCalls) { - Builder.SetInsertPoint(Jump); - SmallVector NewTailArgs(Jump->getTail()); - - // Add padding so that payload starts at a fixed dword. - ContHelper::addPaddingValue(*DL, *Context, NewTailArgs, - Data.FirstPayloadArgumentDword); - - // Insert payload into tail args. 
- NewTailArgs.push_back(NewFunc->getArg(CpsArgIdxPayload)); - - Builder.create(Jump->getTarget(), Jump->getLevels(), - Jump->getState(), NewTailArgs); - Jump->dropAllReferences(); - Jump->eraseFromParent(); - } + PayloadHelper.patchJumpCalls(NewFunc, Data.JumpCalls, + Data.FirstPayloadArgumentDword); } else { SmallVector BBs(make_pointer_range(*NewFunc)); for (auto *BB : BBs) { @@ -2089,19 +2288,55 @@ void LowerRaytracingPipelinePassImpl::processFunction(Function *F, #endif } -static uint32_t getMaxHitAttributeByteCount(const Function &F) { - // Use max hit attribute size from metadata, or use globally max allowed - // value for the max if metadata is not set - auto HitAttributeSizeFromMD = getShaderHitAttributeSize(&F); - uint32_t Result = HitAttributeSizeFromMD.value_or(GlobalMaxHitAttributeBytes); +void LowerRaytracingPipelinePassImpl::handleContPayloadRegisterI32Count( + Function &F) { + assert(F.arg_empty() + // register count + && F.getFunctionType()->getReturnType()->isIntegerTy(32)); - if (Result % RegisterBytes != 0) { - auto AlignedSize = alignTo(Result, RegisterBytes); - LLVM_DEBUG(dbgs() << "Aligning misaligned max hit attribute size " << Result - << " to " << AlignedSize << "\n"); - Result = AlignedSize; - } - return Result; + uint32_t RegCount = + ContHelper::tryGetMaxUsedPayloadRegisterCount(*Mod).value_or(0); + auto *RegCountAsConstant = + ConstantInt::get(IntegerType::get(F.getContext(), 32), RegCount); + + llvm::replaceCallsToFunction(F, *RegCountAsConstant); +} + +void LowerRaytracingPipelinePassImpl::handleContPayloadRegistersGetI32( + Function &F) { + assert(F.getReturnType()->isIntegerTy(32) && + F.arg_size() == 1 + // index + && F.getFunctionType()->getParamType(0)->isIntegerTy(32)); + + llvm::forEachCall(F, [&](CallInst &CInst) { + Builder.SetInsertPoint(&CInst); + auto *Addr = Builder.CreateGEP( + PayloadStorageGlobal->getValueType(), PayloadStorageGlobal, + {Builder.getInt32(0), CInst.getArgOperand(0)}); + auto *Load = Builder.CreateLoad(Builder.getInt32Ty(), Addr); + CInst.replaceAllUsesWith(Load); + CInst.eraseFromParent(); + }); +} + +void LowerRaytracingPipelinePassImpl::handleContPayloadRegistersSetI32( + Function &F) { + assert(F.getReturnType()->isVoidTy() && + F.arg_size() == 2 + // index + && F.getFunctionType()->getParamType(0)->isIntegerTy(32) + // value + && F.getFunctionType()->getParamType(1)->isIntegerTy(32)); + + llvm::forEachCall(F, [&](CallInst &CInst) { + Builder.SetInsertPoint(&CInst); + auto *Addr = Builder.CreateGEP( + PayloadStorageGlobal->getValueType(), PayloadStorageGlobal, + {Builder.getInt32(0), CInst.getArgOperand(0)}); + Builder.CreateStore(CInst.getOperand(1), Addr); + CInst.eraseFromParent(); + }); } void LowerRaytracingPipelinePassImpl::collectProcessableFunctions() { @@ -2114,11 +2349,6 @@ void LowerRaytracingPipelinePassImpl::collectProcessableFunctions() { if (Stage == RayTracingShaderStage::KernelEntry) continue; - // Skip Traversal for non-lgc.cps - if (Stage == RayTracingShaderStage::Traversal && - !MetadataState.isInLgcCpsMode()) - continue; - RayTracingShaderStage Kind = *Stage; switch (Kind) { case RayTracingShaderStage::RayGeneration: @@ -2130,18 +2360,14 @@ void LowerRaytracingPipelinePassImpl::collectProcessableFunctions() { case RayTracingShaderStage::Traversal: { FunctionData Data; Data.Kind = Kind; - Data.FuncConfig.MaxHitAttributeBytes = getMaxHitAttributeByteCount(Func); - LLVM_DEBUG(dbgs() << "Shader " << Func.getName() - << " uses max hit attribute size of " - << Data.FuncConfig.MaxHitAttributeBytes << "\n"); if 
(Kind != RayTracingShaderStage::Intersection && Kind != RayTracingShaderStage::RayGeneration && Kind != RayTracingShaderStage::Traversal) { assert(!Func.arg_empty() && "Shader must have at least one argument"); Data.IncomingPayload = getFuncArgPtrElementType(&Func, 0); - PAQPayloadConfig PAQConfig = {Data.IncomingPayload, - Data.FuncConfig.MaxHitAttributeBytes}; + PAQPayloadConfig PAQConfig = { + Data.IncomingPayload, MetadataState.getMaxHitAttributeByteCount()}; Data.IncomingPayloadSerializationInfo = &PAQManager.getOrCreateSerializationInfo(PAQConfig, Kind); assert(Data.IncomingPayloadSerializationInfo != nullptr && @@ -2281,6 +2507,12 @@ void LowerRaytracingPipelinePassImpl::collectGpuRtFunctions() { SetLocalRootIndex = getSetLocalRootIndex(*Mod); + ExitRayGen = GpurtLibrary->getFunction(ContDriverFunc::ExitRayGenName); + if (ExitRayGen) + assert(ExitRayGen->getReturnType()->isVoidTy() && + ExitRayGen->arg_size() == 1 && + ExitRayGen->getFunctionType()->getParamType(0)->isPointerTy()); + TraceRay = GpurtLibrary->getFunction(ContDriverFunc::TraceRayName); if (TraceRay) assert(TraceRay->getReturnType()->isVoidTy() && @@ -2321,6 +2553,17 @@ void LowerRaytracingPipelinePassImpl::collectGpuRtFunctions() { if (GetSbtStride) assert(GetSbtStride->getReturnType()->isIntegerTy(32) && GetSbtStride->arg_empty()); + + // _cont_ShaderStart has one overload for each system data type + llvm::for_each(GpurtLibrary->functions(), [&](Function &F) { + if (F.getName().starts_with(ContDriverFunc::ShaderStartName)) { + assert(F.getReturnType()->isVoidTy() && + F.arg_size() == 1 + // System data + && F.getFunctionType()->getParamType(0)->isPointerTy()); + ShaderStartOverloads[getFuncArgPtrElementType(&F, 0)] = &F; + } + }); } LowerRaytracingPipelinePassImpl::LowerRaytracingPipelinePassImpl( @@ -2328,9 +2571,10 @@ LowerRaytracingPipelinePassImpl::LowerRaytracingPipelinePassImpl( : Mod{&M}, GpurtLibrary{&GpurtLibrary}, Context{&M.getContext()}, DL{&M.getDataLayout()}, Builder{Mod->getContext()}, MetadataState{*Mod}, PAQManager{Mod, &GpurtLibrary, - MetadataState.getMaxPayloadRegisterCount()} {} + MetadataState.getMaxPayloadRegisterCount()}, + PayloadHelper{*Mod, *DL, Builder, MetadataState.isInLgcCpsMode()} {} -bool LowerRaytracingPipelinePassImpl::run() { +PreservedAnalyses LowerRaytracingPipelinePassImpl::run() { collectGpuRtFunctions(); DispatchSystemDataTy = getFuncArgPtrElementType(GetLocalRootIndex, 0); assert(DispatchSystemDataTy && "LowerRaytracingPipelinePassImpl::run: Could " @@ -2342,6 +2586,7 @@ bool LowerRaytracingPipelinePassImpl::run() { struct VisitorState { PAQSerializationInfoManager &PAQManager; MapVector &Processables; + ModuleMetadataState &Metadata; }; static const auto Visitor = @@ -2374,7 +2619,7 @@ bool LowerRaytracingPipelinePassImpl::run() { if (!isa(Op)) { PAQPayloadConfig PAQPayload = { - PayloadTy, Data->second.FuncConfig.MaxHitAttributeBytes}; + PayloadTy, State.Metadata.getMaxHitAttributeByteCount()}; uint32_t PayloadStorageI32s = 0; if (isa(Op)) { @@ -2406,7 +2651,7 @@ bool LowerRaytracingPipelinePassImpl::run() { }) .build(); - VisitorState S{PAQManager, ToProcess}; + VisitorState S{PAQManager, ToProcess, MetadataState}; Visitor.visit(S, *Mod); handleUnrematerializableCandidates(); @@ -2442,6 +2687,7 @@ bool LowerRaytracingPipelinePassImpl::run() { auto Name = F.getName(); if (Name.starts_with(ContDriverFunc::TraceRayName) || Name.starts_with(ContDriverFunc::CallShaderName) || + Name.starts_with(ContDriverFunc::ExitRayGenName) || Name.starts_with(ContDriverFunc::ReportHitName)) 
{ F.eraseFromParent(); } @@ -2456,7 +2702,11 @@ bool LowerRaytracingPipelinePassImpl::run() { MetadataState.updateModuleMetadata(); - return true; + if (auto *ContPayloadRegistersI32Count = + Mod->getFunction("_AmdContPayloadRegistersI32Count")) + handleContPayloadRegisterI32Count(*ContPayloadRegistersI32Count); + + return PreservedAnalyses::none(); } std::optional @@ -2490,7 +2740,5 @@ LowerRaytracingPipelinePass::run(llvm::Module &M, auto &GpurtContext = lgc::GpurtContext::get(M.getContext()); LowerRaytracingPipelinePassImpl Impl( M, GpurtContext.theModule ? *GpurtContext.theModule : M); - bool Changed = Impl.run(); - - return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); + return Impl.run(); } diff --git a/llvmraytracing/lib/PayloadAccessQualifiers.cpp b/llvmraytracing/lib/PayloadAccessQualifiers.cpp index 7f7729089c..f43fb1bef3 100644 --- a/llvmraytracing/lib/PayloadAccessQualifiers.cpp +++ b/llvmraytracing/lib/PayloadAccessQualifiers.cpp @@ -537,11 +537,15 @@ createPayloadRootNode(Type &PayloadType, MDTuple *TypeAnnotationMDTuple) { std::unique_ptr RootNode = std::make_unique(PAQNode{&PayloadType}); - RootNode->Children.reserve(NumElements); - for (uint32_t I = 0; I < NumElements; ++I) { - PAQNode ChildNode = {PayloadStructType->getElementType(I)}; - if (TypeAnnotationMDTuple) { + // If the payload type is PAQ-annotated, create child nodes + // with their access masks. Otherwise, set a trivial access + // mask on the root node, and let later hierarchical propagation + // expand the children. + if (TypeAnnotationMDTuple) { + RootNode->Children.reserve(NumElements); + for (uint32_t I = 0; I < NumElements; ++I) { + PAQNode ChildNode = {PayloadStructType->getElementType(I)}; // TypeAnnotationMDTuple should contain a nested tuple for every // element, consisting of a tag i32, and the bitmask i32. const MDOperand &FieldOperand = TypeAnnotationMDTuple->getOperand(I); @@ -576,16 +580,16 @@ createPayloadRootNode(Type &PayloadType, MDTuple *TypeAnnotationMDTuple) { ChildNode.AccessMask = importPAQAccessMaskFromDXILBitMask(BitMask.value()); } - } else { - // No metadata available, assume all read/all write - ChildNode.AccessMask.emplace(); - for (PAQShaderStage Stage : PAQShaderStages) { - ChildNode.AccessMask->set(Stage, PAQAccessKind::Write); - ChildNode.AccessMask->set(Stage, PAQAccessKind::Read); - } - } - RootNode->Children.push_back(std::move(ChildNode)); + RootNode->Children.push_back(std::move(ChildNode)); + } + } else { + // No metadata available, assume all read/all write + RootNode->AccessMask.emplace(); + for (PAQShaderStage Stage : PAQShaderStages) { + RootNode->AccessMask->set(Stage, PAQAccessKind::Write); + RootNode->AccessMask->set(Stage, PAQAccessKind::Read); + } } return RootNode; @@ -593,14 +597,16 @@ createPayloadRootNode(Type &PayloadType, MDTuple *TypeAnnotationMDTuple) { // Recursive implementation for createNestedStructHierarchy. // -// Creates child nodes, and sets the lifetime class. The access mask -// of Node is set by the caller. +// Creates child nodes, and sets the lifetime class. +// The access mask of Node can be set by the caller, +// in which case it is propagated to its children. +// If no access mask is set, the access mask is propagated +// from children if uniform. // For leaves, the lifetime class is set from the access mask (if set). // For inner nodes, the lifetime class is propagated from children if uniform. 
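// A toy model of the two propagation directions described in the comment
// above: a mask set on the parent is pushed down to all children, and a mask
// that is uniform across the children is pulled up to the parent. This is only
// a sketch; ToyNode and propagateMasks are made-up names, a plain unsigned
// stands in for PAQAccessMask, and lifetime classes are omitted.
#include <optional>
#include <vector>

struct ToyNode {
  std::optional<unsigned> AccessMask; // one bit per (shader stage, read/write) pair
  std::vector<ToyNode> Children;
};

void propagateMasks(ToyNode &Node) {
  for (ToyNode &Child : Node.Children) {
    // Downward: a parent mask applies to every child.
    if (Node.AccessMask && !Child.AccessMask)
      Child.AccessMask = Node.AccessMask;
    propagateMasks(Child);
  }
  // Upward: adopt the children's mask if it is present and uniform.
  if (!Node.AccessMask && !Node.Children.empty()) {
    bool Uniform = true;
    for (const ToyNode &Child : Node.Children)
      if (Child.AccessMask != Node.Children.front().AccessMask)
        Uniform = false;
    if (Uniform && Node.Children.front().AccessMask)
      Node.AccessMask = Node.Children.front().AccessMask;
  }
}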
static void createNestedStructHierarchyRecursively( - Type *Ty, PAQNode &Node, + PAQNode &Node, const MapVector> *ModulePayloadRootNodes) { - assert(Node.Children.empty() && "PAQ hierarchy already created!"); // If Node.AccessMask is unset, there are two possible cases: // - Node is a nested payload field. In this case, the field was *not* @@ -626,12 +632,10 @@ static void createNestedStructHierarchyRecursively( // However, in this case the whole struct is write(all) + read(all) anyways, // and nested payload structs can be ignored. const PAQNode *PayloadTypeRootNode = nullptr; - StructType *StructTy = dyn_cast(Ty); + StructType *StructTy = dyn_cast(Node.Ty); if (!Node.AccessMask) { bool IsNestedPayload = false; - if (StructTy) { - assert(ModulePayloadRootNodes != nullptr && - "Missing module payload root nodes!"); + if (StructTy && ModulePayloadRootNodes) { auto It = ModulePayloadRootNodes->find(StructTy); if (It != ModulePayloadRootNodes->end()) { IsNestedPayload = true; @@ -658,31 +662,62 @@ static void createNestedStructHierarchyRecursively( return; } - Node.Children.reserve(StructTy->getNumElements()); - bool LifetimeClassesAreUniform = true; - // Construct child nodes, and propagate their lifetime class if uniform + // Create child nodes, unless they already exist + // For PAQ-annotated payload root nodes, child nodes are already added earlier + // as part of PAQ qualifier import. + // If child nodes are already present, check that they are as expected + bool ChildrenArePrepopulated = !Node.Children.empty(); + if (ChildrenArePrepopulated) { + assert(Node.Children.size() == StructTy->getNumElements()); + } else { + Node.Children.reserve(StructTy->getNumElements()); + } for (uint32_t I = 0; I < StructTy->getNumElements(); ++I) { - Type *ChildTy = StructTy->getElementType(I); - PAQNode ChildNode{ChildTy}; + std::optional ChildAccessMask; if (Node.AccessMask) { // Use access mask from parent - ChildNode.AccessMask = Node.AccessMask; + ChildAccessMask = Node.AccessMask; } else if (PayloadTypeRootNode) { // Use access mask from payload type definition // May be unset if ChildTy is again a payload struct type - ChildNode.AccessMask = PayloadTypeRootNode->Children[I].AccessMask; + ChildAccessMask = PayloadTypeRootNode->Children[I].AccessMask; + } + if (ChildrenArePrepopulated) { + assert(!ChildAccessMask.has_value() || + Node.Children[I].AccessMask == ChildAccessMask); + } else { + Type *ChildTy = StructTy->getElementType(I); + Node.Children.emplace_back(); + Node.Children.back().Ty = ChildTy; + Node.Children.back().AccessMask = ChildAccessMask; } + } + + // Propagate into/from child nodes + bool LifetimeClassesAreUniform = true; + bool AccessMasksAreUniform = true; - createNestedStructHierarchyRecursively(ChildTy, ChildNode, - ModulePayloadRootNodes); - Node.Children.push_back(std::move(ChildNode)); + for (uint32_t I = 0; I < StructTy->getNumElements(); ++I) { + PAQNode &ChildNode = Node.Children.at(I); + + createNestedStructHierarchyRecursively(ChildNode, ModulePayloadRootNodes); - if (Node.Children.back().LifetimeClass != - Node.Children.front().LifetimeClass) + if (ChildNode.LifetimeClass != Node.Children.front().LifetimeClass) LifetimeClassesAreUniform = false; + if (ChildNode.AccessMask != Node.Children.front().AccessMask) + AccessMasksAreUniform = false; } if (LifetimeClassesAreUniform) Node.LifetimeClass = Node.Children[0].LifetimeClass; + // Propagate uniform access masks to the parent struct value + if (AccessMasksAreUniform && Node.Children[0].AccessMask.has_value()) { + 
PAQAccessMask CommonAccessMask = *Node.Children[0].AccessMask; + assert(Node.AccessMask.value_or(CommonAccessMask) == CommonAccessMask); + Node.AccessMask = CommonAccessMask; + assert(!Node.LifetimeClass.has_value() || + *Node.LifetimeClass == + lifetimeClassFromAccessMask(Node.AccessMask.value())); + } } [[maybe_unused]] static void dumpPAQTree(StructType *PayloadType, @@ -701,8 +736,9 @@ static void createNestedStructHierarchyRecursively( // of payload type, in which case qualifiers of nested fields need to // be determined from the nested payload type. // Hence, a map of all root nodes of payload structs in the module is passed. -// These are not yet hierarchically expanding (because that is what this -// function does), which is fine because only the root nodes are accessed. +// These are not yet hierarchically expanded (because that is what this +// function does), which is fine because only the root nodes of other payload +// types are accessed. // ModulePayloadRootNodes may be nullptr, in which case no unqualified fields // may exist in Node. // Note that setting an access mask for a node applies the same mask to its @@ -710,14 +746,8 @@ static void createNestedStructHierarchyRecursively( static void createNestedStructHierarchy( Type *PayloadType, PAQNode &Node, const MapVector> *ModulePayloadRootNodes) { - StructType *StructTy = cast(PayloadType); - for (uint32_t I = 0; I < StructTy->getNumElements(); ++I) { - PAQNode &ChildNode = Node.Children[I]; - createNestedStructHierarchyRecursively(StructTy->getElementType(I), - ChildNode, ModulePayloadRootNodes); - } - - LLVM_DEBUG(dumpPAQTree(StructTy, Node)); + createNestedStructHierarchyRecursively(Node, ModulePayloadRootNodes); + LLVM_DEBUG(dumpPAQTree(cast(PayloadType), Node)); } static std::unique_ptr @@ -858,15 +888,25 @@ importModulePayloadPAQNodes(const Module &M) { return PayloadRootNodes; } -void PAQNode::collectLeafNodes(SmallVectorImpl &Result) const { - if (Ty->isStructTy()) { - // If Node.LifetimeClass is set, we could keep the struct together - // instead of dissolving it into its elements, but dissolving - // has the advantage to reduce potential padding. - // Node.Children may be empty for empty structs, - // leading to intentionally non-represented subtrees. - for (const PAQNode &ChildNode : Children) - ChildNode.collectLeafNodes(Result); +void PAQNode::collectNodes(SmallVectorImpl &Result) const { + StructType *StructTy = dyn_cast(Ty); + if (StructTy) { + // For struct types where all fields have the same access mask + // (indicated by AccessMask being set), and are live at all + // (indicated by LifetimeClass being set), we have a choice of + // whether we represent the whole struct with a single node, + // or whether we recurse and represent each node individually. + // We choose to represent the whole struct by a single node, + // thereby avoiding padding between the storage intervals + // of smaller-than-dword (e.g. 16-bit) fields, as storage + // intervals are allocated on dword granularity. 
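// Worked example for the dword-granularity argument above; the field types are
// an illustrative assumption (two 16-bit fields sharing one access mask):
//   struct Attrs { uint16_t A; uint16_t B; };   // 4 bytes total
// Storage intervals are allocated in whole dwords, so
//   - a single node for the whole struct needs divideCeil(4, 4)                = 1 dword,
//   - one node per field would need            divideCeil(2, 4) + divideCeil(2, 4) = 2 dwords,
// and collapsing the struct into one node saves a payload register in this case.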
+ if (LifetimeClass.has_value() && AccessMask.has_value()) { + if (!StructTy->isEmptyTy()) + Result.push_back(this); + } else { + for (const PAQNode &ChildNode : Children) + ChildNode.collectNodes(Result); + } } else { // Fields with write() : read() have no lifetime class // and are not collected for serialization diff --git a/llvmraytracing/lib/RemoveTypesMetadata.cpp b/llvmraytracing/lib/RemoveTypesMetadata.cpp index f45803e3d1..031e3f84da 100644 --- a/llvmraytracing/lib/RemoveTypesMetadata.cpp +++ b/llvmraytracing/lib/RemoveTypesMetadata.cpp @@ -32,7 +32,6 @@ #include "llvmraytracing/Continuations.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/Module.h" -#include using namespace llvm; diff --git a/llvmraytracing/test/dx/cleanup-continuations-malloc.ll b/llvmraytracing/test/dx/cleanup-continuations-malloc.ll index 29b8085b78..cef999368a 100644 --- a/llvmraytracing/test/dx/cleanup-continuations-malloc.ll +++ b/llvmraytracing/test/dx/cleanup-continuations-malloc.ll @@ -1,24 +1,24 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each -passes='lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint' -S %s 2> %t.stderr | FileCheck %s -; RUN: count 0 < %t.stderr +; RUN: opt --verify-each -passes='lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint' -S %s --lint-abort-on-error | FileCheck %s -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %continuation.token = type { } declare void @await.void(%continuation.token*) declare %continuation.token* @async_fun() -define <4 x i32> @simple_await(<4 x i32> %arg) !continuation.registercount !1 { +define <4 x i32> @simple_await(i64 %dummyRet, <4 x i32> %arg) !continuation.registercount !1 { ; CHECK-LABEL: define void @simple_await( -; CHECK-SAME: i64 [[RETURNADDR:%.*]], <4 x i32> [[ARG:%.*]]) !continuation.registercount [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] !continuation.stacksize [[META3:![0-9]+]] !continuation.state [[META3]] { +; CHECK-SAME: i64 [[DUMMYRET:%.*]], <4 x i32> [[ARG:%.*]]) !continuation.registercount [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] !continuation.stacksize [[META3:![0-9]+]] !continuation.state [[META3]] { ; CHECK-NEXT: AllocaSpillBB: ; CHECK-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 24) ; CHECK-NEXT: [[ARG_SPILL_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 ; CHECK-NEXT: store <4 x i32> [[ARG]], ptr addrspace(32) [[ARG_SPILL_ADDR]], align 4 ; CHECK-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 1 -; CHECK-NEXT: store i64 [[RETURNADDR]], ptr addrspace(32) [[RETURNADDR_SPILL_ADDR]], align 4 -; CHECK-NEXT: call void (i64, ...) 
@continuation.continue(i64 ptrtoint (ptr @async_fun to i64), i64 ptrtoint (ptr @simple_await.resume.0 to i64)), !continuation.registercount [[META1]], !continuation.returnedRegistercount !1 +; CHECK-NEXT: store i64 [[DUMMYRET]], ptr addrspace(32) [[RETURNADDR_SPILL_ADDR]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @simple_await.resume.0) +; CHECK-NEXT: call void (i64, ...) @continuation.continue(i64 ptrtoint (ptr @async_fun to i64), i64 [[TMP0]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] ; CHECK-NEXT: unreachable ; %tok = call %continuation.token* @async_fun(), !continuation.registercount !1, !continuation.returnedRegistercount !1 @@ -26,16 +26,17 @@ define <4 x i32> @simple_await(<4 x i32> %arg) !continuation.registercount !1 { ret <4 x i32> %arg, !continuation.registercount !1 } -define void @simple_await_entry(<4 x i32> %arg, <4 x i32> addrspace(1)* %mem) !continuation.entry !0 !continuation.registercount !1 { +define void @simple_await_entry(i64 %dummyRet, <4 x i32> %arg, <4 x i32> addrspace(1)* %mem) !continuation.entry !0 !continuation.registercount !1 { ; CHECK-LABEL: define void @simple_await_entry( -; CHECK-SAME: <4 x i32> [[ARG:%.*]], ptr addrspace(1) [[MEM:%.*]]) !continuation.registercount [[META1]] !continuation.entry [[META4:![0-9]+]] !continuation [[META5:![0-9]+]] !continuation.stacksize [[META3]] !continuation.state [[META3]] { +; CHECK-SAME: i64 [[DUMMYRET:%.*]], <4 x i32> [[ARG:%.*]], ptr addrspace(1) [[MEM:%.*]]) !continuation.registercount [[META1]] !continuation.entry [[META4:![0-9]+]] !continuation [[META5:![0-9]+]] !continuation.stacksize [[META3]] !continuation.state [[META3]] { ; CHECK-NEXT: AllocaSpillBB: ; CHECK-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 24) ; CHECK-NEXT: [[MEM_SPILL_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_ENTRY_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 1 ; CHECK-NEXT: store ptr addrspace(1) [[MEM]], ptr addrspace(32) [[MEM_SPILL_ADDR]], align 4 ; CHECK-NEXT: [[ARG_SPILL_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_ENTRY_FRAME]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 ; CHECK-NEXT: store <4 x i32> [[ARG]], ptr addrspace(32) [[ARG_SPILL_ADDR]], align 4 -; CHECK-NEXT: call void (i64, ...) @continuation.continue(i64 ptrtoint (ptr @async_fun to i64), i64 ptrtoint (ptr @simple_await_entry.resume.0 to i64)), !continuation.registercount [[META1]], !continuation.returnedRegistercount !1 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @simple_await_entry.resume.0) +; CHECK-NEXT: call void (i64, ...) 
@continuation.continue(i64 ptrtoint (ptr @async_fun to i64), i64 [[TMP0]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] ; CHECK-NEXT: unreachable ; %tok = call %continuation.token* @async_fun(), !continuation.registercount !1, !continuation.returnedRegistercount !1 diff --git a/llvmraytracing/test/dx/cleanup-continuations.ll b/llvmraytracing/test/dx/cleanup-continuations.ll index a5372bc86c..6f61f71de3 100644 --- a/llvmraytracing/test/dx/cleanup-continuations.ll +++ b/llvmraytracing/test/dx/cleanup-continuations.ll @@ -1,8 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --version 3 -; RUN: opt --verify-each -passes='legacy-cleanup-continuations,lint' -S %s 2> %t.stderr | FileCheck %s -; RUN: count 0 < %t.stderr +; RUN: opt --verify-each -passes='legacy-cleanup-continuations,lint' -S %s --lint-abort-on-error | FileCheck %s -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %continuation.token = type { } %await_with_ret_value.Frame = type { i64 } @@ -10,18 +9,19 @@ target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16: %simple_await_entry.Frame = type { } declare %continuation.token* @async_fun() -declare i32 @continuations.getReturnValue__i32() #0 -declare void @continuation.return(i64, ...) +declare i32 @lgc.ilcps.getReturnValue__i32() #0 +declare void @lgc.ilcps.return(i64, ...) -define { i8*, %continuation.token* } @simple_await(i8* %0) !continuation !0 !continuation.registercount !4 { +define { i8*, %continuation.token* } @simple_await(i64 %dummyRet, i8* %0) !continuation !0 !continuation.registercount !4 { ; CHECK-LABEL: define void @simple_await( -; CHECK-SAME: ) !continuation [[META1:![0-9]+]] !continuation.registercount [[META2:![0-9]+]] !continuation.stacksize [[META3:![0-9]+]] !continuation.state [[META3]] { +; CHECK-SAME: i64 [[DUMMYRET:%.*]]) !continuation [[META1:![0-9]+]] !continuation.registercount [[META2:![0-9]+]] !continuation.stacksize [[META3:![0-9]+]] !continuation.state [[META3]] { ; CHECK-NEXT: AllocaSpillBB: ; CHECK-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8) ; CHECK-NEXT: [[FRAMEPTR:%.*]] = bitcast ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]] to ptr addrspace(32) ; CHECK-NEXT: [[DOTSPILL_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME:%.*]], ptr addrspace(32) [[FRAMEPTR]], i32 0, i32 0 ; CHECK-NEXT: store i64 -1, ptr addrspace(32) [[DOTSPILL_ADDR]], align 4 -; CHECK-NEXT: call void (i64, ...) @continuation.continue(i64 ptrtoint (ptr @async_fun to i64), i64 ptrtoint (ptr @simple_await.resume.0 to i64)), !continuation.registercount [[META2]], !continuation.returnedRegistercount !2 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @simple_await.resume.0) +; CHECK-NEXT: call void (i64, ...) 
@continuation.continue(i64 ptrtoint (ptr @async_fun to i64), i64 [[TMP0]]), !continuation.registercount [[META2]], !continuation.returnedRegistercount [[META2]] ; CHECK-NEXT: unreachable ; AllocaSpillBB: @@ -35,7 +35,7 @@ AllocaSpillBB: define internal { i8*, %continuation.token* } @simple_await.resume.0(i8* noalias nonnull align 16 dereferenceable(8) %0, i1 %1) !continuation !0 { ; CHECK-LABEL: define dso_local void @simple_await.resume.0( -; CHECK-SAME: ) !continuation [[META1]] !continuation.registercount [[META2]] { +; CHECK-SAME: i64 [[TMP0:%.*]]) !continuation [[META1]] !continuation.registercount [[META2]] { ; CHECK-NEXT: entryresume.0: ; CHECK-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8) ; CHECK-NEXT: [[FRAMEPTR:%.*]] = bitcast ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]] to ptr addrspace(32) @@ -43,7 +43,7 @@ define internal { i8*, %continuation.token* } @simple_await.resume.0(i8* noalias ; CHECK-NEXT: [[DOTRELOAD_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME:%.*]], ptr addrspace(32) [[FRAMEPTR]], i32 0, i32 0 ; CHECK-NEXT: [[DOTRELOAD:%.*]] = load i64, ptr addrspace(32) [[DOTRELOAD_ADDR]], align 4 ; CHECK-NEXT: call void @lgc.cps.free(i32 8) -; CHECK-NEXT: call void (i64, ...) @continuation.continue(i64 [[DOTRELOAD]]), !continuation.registercount [[META2]] +; CHECK-NEXT: call void (i64, ...) @continuation.continue(i64 [[DOTRELOAD]], i64 poison, i64 undef), !continuation.registercount [[META2]] ; CHECK-NEXT: unreachable ; entryresume.0: @@ -51,17 +51,18 @@ entryresume.0: %vFrame = bitcast %simple_await.Frame* %FramePtr to i8* %.reload.addr = getelementptr inbounds %simple_await.Frame, %simple_await.Frame* %FramePtr, i32 0, i32 0 %.reload = load i64, i64* %.reload.addr, align 4 - call void (i64, ...) @continuation.return(i64 %.reload), !continuation.registercount !4 + call void (i64, ...) @lgc.ilcps.return(i64 %.reload, i64 undef), !continuation.registercount !4 unreachable } -define { i8*, %continuation.token* } @simple_await_entry(i8* %0) !continuation.entry !2 !continuation !3 !continuation.registercount !4 { +define { i8*, %continuation.token* } @simple_await_entry(i64 %dummyRet, i8* %0) !continuation.entry !2 !continuation !3 !continuation.registercount !4 { ; CHECK-LABEL: define void @simple_await_entry( -; CHECK-SAME: ) !continuation [[META4:![0-9]+]] !continuation.registercount [[META2]] !continuation.entry [[META5:![0-9]+]] !continuation.stacksize [[META3]] !continuation.state [[META3]] { +; CHECK-SAME: i64 [[DUMMYRET:%.*]]) !continuation [[META4:![0-9]+]] !continuation.registercount [[META2]] !continuation.entry [[META5:![0-9]+]] !continuation.stacksize [[META3]] !continuation.state [[META3]] { ; CHECK-NEXT: AllocaSpillBB: ; CHECK-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8) ; CHECK-NEXT: [[FRAMEPTR:%.*]] = bitcast ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]] to ptr addrspace(32) -; CHECK-NEXT: call void (i64, ...) @continuation.continue(i64 ptrtoint (ptr @async_fun to i64), i64 ptrtoint (ptr @simple_await_entry.resume.0 to i64)), !continuation.registercount [[META2]], !continuation.returnedRegistercount !2 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @simple_await_entry.resume.0) +; CHECK-NEXT: call void (i64, ...) 
@continuation.continue(i64 ptrtoint (ptr @async_fun to i64), i64 [[TMP0]]), !continuation.registercount [[META2]], !continuation.returnedRegistercount [[META2]] ; CHECK-NEXT: unreachable ; AllocaSpillBB: @@ -75,7 +76,7 @@ AllocaSpillBB: define internal { i8*, %continuation.token* } @simple_await_entry.resume.0(i8* noalias nonnull align 16 dereferenceable(8) %0, i1 %1) !continuation.entry !2 !continuation !3 { ; CHECK-LABEL: define dso_local void @simple_await_entry.resume.0( -; CHECK-SAME: ) !continuation [[META4]] !continuation.registercount [[META2]] { +; CHECK-SAME: i64 [[TMP0:%.*]]) !continuation [[META4]] !continuation.registercount [[META2]] { ; CHECK-NEXT: entryresume.0: ; CHECK-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8) ; CHECK-NEXT: [[FRAMEPTR:%.*]] = bitcast ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]] to ptr addrspace(32) @@ -88,18 +89,19 @@ define internal { i8*, %continuation.token* } @simple_await_entry.resume.0(i8* n entryresume.0: %FramePtr = bitcast i8* %0 to %simple_await_entry.Frame* %vFrame = bitcast %simple_await_entry.Frame* %FramePtr to i8* - call void (i64, ...) @continuation.return(i64 undef), !continuation.registercount !4 + call void (i64, ...) @lgc.ilcps.return(i64 undef), !continuation.registercount !4 unreachable } -define { i8*, %continuation.token* } @await_with_ret_value(i8* %0) !continuation !1 !continuation.registercount !4 { +define { i8*, %continuation.token* } @await_with_ret_value(i64 %dummyRet, i8* %0) !continuation !1 !continuation.registercount !4 { ; CHECK-LABEL: define void @await_with_ret_value( -; CHECK-SAME: ) !continuation [[META6:![0-9]+]] !continuation.registercount [[META2]] !continuation.stacksize [[META3]] !continuation.state [[META3]] { +; CHECK-SAME: i64 [[DUMMYRET:%.*]]) !continuation [[META6:![0-9]+]] !continuation.registercount [[META2]] !continuation.stacksize [[META3]] !continuation.state [[META3]] { ; CHECK-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8) ; CHECK-NEXT: [[FRAMEPTR:%.*]] = bitcast ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]] to ptr addrspace(32) ; CHECK-NEXT: [[DOTSPILL_ADDR:%.*]] = getelementptr inbounds [[AWAIT_WITH_RET_VALUE_FRAME:%.*]], ptr addrspace(32) [[FRAMEPTR]], i32 0, i32 0 ; CHECK-NEXT: store i64 -1, ptr addrspace(32) [[DOTSPILL_ADDR]], align 4 -; CHECK-NEXT: call void (i64, ...) @continuation.continue(i64 ptrtoint (ptr @async_fun to i64), i64 ptrtoint (ptr @await_with_ret_value.resume.0 to i64)), !continuation.registercount [[META2]], !continuation.returnedRegistercount !2 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @await_with_ret_value.resume.0) +; CHECK-NEXT: call void (i64, ...) 
@continuation.continue(i64 ptrtoint (ptr @async_fun to i64), i64 [[TMP1]]), !continuation.registercount [[META2]], !continuation.returnedRegistercount [[META2]] ; CHECK-NEXT: unreachable ; %FramePtr = bitcast i8* %0 to %await_with_ret_value.Frame* @@ -112,22 +114,22 @@ define { i8*, %continuation.token* } @await_with_ret_value(i8* %0) !continuation define internal { i8*, %continuation.token* } @await_with_ret_value.resume.0(i8* noalias nonnull align 16 dereferenceable(8) %0, i1 %1) !continuation !1 { ; CHECK-LABEL: define dso_local void @await_with_ret_value.resume.0( -; CHECK-SAME: i32 [[RES1:%.*]]) !continuation [[META6]] !continuation.registercount [[META2]] { +; CHECK-SAME: i64 [[TMP0:%.*]], i32 [[RES1:%.*]]) !continuation [[META6]] !continuation.registercount [[META2]] { ; CHECK-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8) ; CHECK-NEXT: [[FRAMEPTR:%.*]] = bitcast ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]] to ptr addrspace(32) ; CHECK-NEXT: [[VFRAME:%.*]] = bitcast ptr addrspace(32) [[FRAMEPTR]] to ptr addrspace(32) ; CHECK-NEXT: [[DOTRELOAD_ADDR:%.*]] = getelementptr inbounds [[AWAIT_WITH_RET_VALUE_FRAME:%.*]], ptr addrspace(32) [[FRAMEPTR]], i32 0, i32 0 ; CHECK-NEXT: [[DOTRELOAD:%.*]] = load i64, ptr addrspace(32) [[DOTRELOAD_ADDR]], align 4 ; CHECK-NEXT: call void @lgc.cps.free(i32 8) -; CHECK-NEXT: call void (i64, ...) @continuation.continue(i64 [[DOTRELOAD]], i32 [[RES1]]), !continuation.registercount [[META2]] +; CHECK-NEXT: call void (i64, ...) @continuation.continue(i64 [[DOTRELOAD]], i64 poison, i32 [[RES1]], i64 undef), !continuation.registercount [[META2]] ; CHECK-NEXT: unreachable ; %FramePtr = bitcast i8* %0 to %await_with_ret_value.Frame* %vFrame = bitcast %await_with_ret_value.Frame* %FramePtr to i8* %.reload.addr = getelementptr inbounds %await_with_ret_value.Frame, %await_with_ret_value.Frame* %FramePtr, i32 0, i32 0 %.reload = load i64, i64* %.reload.addr, align 4 - %res = call i32 @continuations.getReturnValue__i32() - call void (i64, ...) @continuation.return(i64 %.reload, i32 %res), !continuation.registercount !4 + %res = call i32 @lgc.ilcps.getReturnValue__i32() + call void (i64, ...) @lgc.ilcps.return(i64 %.reload, i32 %res, i64 undef), !continuation.registercount !4 unreachable } @@ -145,7 +147,8 @@ attributes #0 = { nounwind } ; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind } ; CHECK: attributes #[[ATTR1:[0-9]+]] = { noreturn } ; CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind willreturn memory(inaccessiblemem: readwrite) } -; CHECK: attributes #[[ATTR3:[0-9]+]] = { nounwind willreturn memory(inaccessiblemem: read) } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { nounwind willreturn } +; CHECK: attributes #[[ATTR4:[0-9]+]] = { nounwind willreturn memory(inaccessiblemem: read) } ;. 
; CHECK: [[META0:![0-9]+]] = !{i32 21} ; CHECK: [[META1]] = !{ptr @simple_await} diff --git a/llvmraytracing/test/dx/closest-hit-procedural.ll b/llvmraytracing/test/dx/closest-hit-procedural.ll index 80b5bca1c8..efbe700e7b 100644 --- a/llvmraytracing/test/dx/closest-hit-procedural.ll +++ b/llvmraytracing/test/dx/closest-hit-procedural.ll @@ -1,11 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S %s 2> %t0.stderr | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s -; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S %s 2> %t1.stderr | FileCheck -check-prefix=DXILCONTPOSTPROCESS %s -; RUN: count 0 < %t1.stderr +; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s +; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=DXILCONTPOSTPROCESS %s ; Check a procedural closest hit shader with hit attributes that does not fit into system data alone -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %dx.types.Handle = type { i8* } %struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } @@ -119,14 +118,14 @@ declare !types !35 i32 @_cont_HitKind(%struct.SystemData* nocapture readnone, %s ; Function Attrs: nounwind define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct.HitAttributes* nocapture readonly %attr) #3 !types !36 { ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @ClosestHit( -; LOWERRAYTRACINGPIPELINE-SAME: [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR4:[0-9]+]] !lgc.rt.shaderstage [[META19:![0-9]+]] !continuation [[META20:![0-9]+]] !continuation.registercount [[META16:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR4:[0-9]+]] !lgc.rt.shaderstage [[META19:![0-9]+]] !continuation [[META20:![0-9]+]] !continuation.registercount [[META16:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 ; 
LOWERRAYTRACINGPIPELINE-NEXT: [[HITATTRS:%.*]] = alloca [[STRUCT_HITATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_SYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP8]], ptr [[TMP5]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 @@ -153,7 +152,7 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds ([[STRUCT_RAYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_3_CLOSESTHIT_IN_PAYLOAD_ATTR_2_I32S]], ptr addrspace(20) @PAYLOAD, i32 0, i32 0, i32 2), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP26]], ptr [[TMP25]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP27]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP30]], ptr addrspace(20) @PAYLOAD, align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 1 @@ -167,7 +166,8 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP37]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP39:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP38]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP39]], !continuation.registercount [[META16]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP39]]), !continuation.registercount [[META16]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; DXILCONTPOSTPROCESS-LABEL: define void @ClosestHit( ; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR1:[0-9]+]] !lgc.rt.shaderstage [[META18:![0-9]+]] !continuation [[META19:![0-9]+]] !continuation.registercount [[META16:![0-9]+]] !continuation.state [[META14:![0-9]+]] { @@ -201,7 +201,7 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_LOAD:%.*]] = load <3 x i32>, ptr [[DOTFCA_0_GEP]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_LOAD]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[TMP12:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[RETURNADDR]], i32 [[TMP12]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META16]] +; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP12]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META16]] ; DXILCONTPOSTPROCESS-NEXT: unreachable ; ret void diff --git a/llvmraytracing/test/dx/closest-hit-traceray.ll b/llvmraytracing/test/dx/closest-hit-traceray.ll index 683eb79e8f..0b161b949d 100644 --- a/llvmraytracing/test/dx/closest-hit-traceray.ll +++ b/llvmraytracing/test/dx/closest-hit-traceray.ll @@ -1,10 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S %s 2> %t0.stderr | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s -; RUN: count 0 < %t0.stderr -; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S %s 2> %t1.stderr | FileCheck -check-prefix=DXILCONTPOSTPROCESS %s -; RUN: count 0 < %t1.stderr +; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s +; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=DXILCONTPOSTPROCESS %s -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %dx.types.Handle = type { i8* } %struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } @@ -117,14 +115,14 @@ declare !types !35 i32 @_cont_HitKind(%struct.SystemData* nocapture readnone, %s ; Function Attrs: nounwind define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct.BuiltInTriangleIntersectionAttributes* nocapture readonly %attr) #3 !types !36 { ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @ClosestHit( -; LOWERRAYTRACINGPIPELINE-SAME: [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] !continuation [[META18:![0-9]+]] !lgc.rt.shaderstage [[META19:![0-9]+]] !continuation.registercount [[META16:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] !continuation [[META18:![0-9]+]] !lgc.rt.shaderstage [[META19:![0-9]+]] !continuation.registercount [[META16:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: 
[[TMP2:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[HITATTRS:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_SYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP8]], ptr [[TMP5]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 @@ -149,7 +147,7 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = alloca [[STRUCT_RAYPAYLOAD]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = bitcast ptr [[TMP25]] to ptr -; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP26]]) #[[ATTR9:[0-9]+]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP26]]) #[[ATTR10:[0-9]+]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP25]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> zeroinitializer, ptr [[TMP27]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP23]]) @@ -161,7 +159,7 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[NEWDATA_I:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] [[AMD_DX_TRAVERSAL:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[NEWDATA_I]], ptr [[TMP31]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP32]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP35]], ptr addrspace(20) @PAYLOAD, align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 1 @@ -175,7 +173,8 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. 
; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP42]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP43]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP44]], !continuation.registercount [[META16]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP44]]), !continuation.registercount [[META16]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; DXILCONTPOSTPROCESS-LABEL: define void @ClosestHit( ; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !continuation [[META18:![0-9]+]] !lgc.rt.shaderstage [[META19:![0-9]+]] !continuation.registercount [[META16:![0-9]+]] !continuation.state [[META14:![0-9]+]] { @@ -222,7 +221,7 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_LOAD:%.*]] = load <3 x i32>, ptr [[DOTFCA_0_GEP]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_LOAD]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[TMP16:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP16]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META16]] +; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP16]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META16]] ; DXILCONTPOSTPROCESS-NEXT: unreachable ; %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 diff --git a/llvmraytracing/test/dx/closest-hit.ll b/llvmraytracing/test/dx/closest-hit.ll index 8f9291ee73..1b22be001c 100644 --- a/llvmraytracing/test/dx/closest-hit.ll +++ b/llvmraytracing/test/dx/closest-hit.ll @@ -1,8 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S %s 2> %t0.stderr | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s -; RUN: count 0 < %t0.stderr +; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } %struct.SystemData = type { %struct.DispatchSystemData } @@ -104,14 +103,14 @@ declare !types !29 i32 
@_cont_HitKind(%struct.SystemData* nocapture readnone, %s ; Function Attrs: nounwind define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct.BuiltInTriangleIntersectionAttributes* nocapture readonly %attr) #3 !types !30 { ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @ClosestHit( -; LOWERRAYTRACINGPIPELINE-SAME: [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR4:[0-9]+]] !lgc.rt.shaderstage [[META13:![0-9]+]] !continuation [[META14:![0-9]+]] !continuation.registercount [[META10:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR4:[0-9]+]] !lgc.rt.shaderstage [[META13:![0-9]+]] !continuation [[META14:![0-9]+]] !continuation.registercount [[META10:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[HITATTRS:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_SYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP8]], ptr [[TMP5]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 @@ -130,7 +129,7 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. ; LOWERRAYTRACINGPIPELINE-NEXT: [[BARYPTR:%.*]] = getelementptr inbounds [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[HITATTRS]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[BARY:%.*]] = load <2 x float>, ptr [[BARYPTR]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store <2 x float> [[BARY]], ptr [[PTR]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP19]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP22]], ptr addrspace(20) @PAYLOAD, align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 1 @@ -138,7 +137,8 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP25]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP26]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP27]], !continuation.registercount [[META10]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP27]]), !continuation.registercount [[META10]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; %ptr = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %payload, i32 0, i32 0 %baryPtr = getelementptr inbounds %struct.BuiltInTriangleIntersectionAttributes, %struct.BuiltInTriangleIntersectionAttributes* %attr, i32 0, i32 0 diff --git a/llvmraytracing/test/dx/continuation-registercount.ll b/llvmraytracing/test/dx/continuation-registercount.ll index a3964eacd1..cbe8bf81d0 100644 --- a/llvmraytracing/test/dx/continuation-registercount.ll +++ b/llvmraytracing/test/dx/continuation-registercount.ll @@ -1,21 +1,15 @@ -; RUN: grep -v SKIP_LINE_BY_DEFAULT %s | \ -; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S 2> %t0.stderr | \ -; RUN: FileCheck -check-prefix=POSTPROCESS-REGCOUNT %s -; RUN: count 0 < %t0.stderr +; RUN: grep -v MAX_REG_10 %s | \ +; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S --lint-abort-on-error | \ +; RUN: FileCheck -check-prefixes=COMMON,MAX30 %s ; -; RUN: grep -v SKIP_LINE_BY_DEFAULT %s | \ -; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S 2> %t1.stderr | \ -; RUN: FileCheck -check-prefix=POSTPROCESS-REGCOUNT2 %s -; RUN: count 0 < %t1.stderr -; -; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S %s 2> %t2.stderr | \ -; RUN: FileCheck -check-prefix=POSTPROCESS-REGCOUNT-FEWREGS %s -; RUN: count 0 < %t2.stderr +; RUN: grep -v MAX_REG_30 %s | \ +; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S --lint-abort-on-error | \ +; RUN: FileCheck -check-prefixes=COMMON,MAX10 %s ; The order of metadata on functions is non-deterministic, so make two different runs to match both of them. ; The 'grep' commands filter out a metadata node that reduces the payload register count. 
-target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %dx.types.Handle = type { i8* } %struct.DispatchSystemData = type { i32 } @@ -25,7 +19,8 @@ target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16: %struct.AnyHitTraversalData = type { %struct.TraversalData, %struct.HitData } %struct.HitData = type { float, i32 } %struct.TheirParams = type { [10 x i32] } -%struct.RayPayload = type { [9 x i32] } +%struct.RayPayload = type { [15 x i32] } +%struct.PayloadWithI16 = type { i16, i16 } %dx.types.ResourceProperties = type { i32, i32 } %struct.MyParams = type { [26 x i32] } %struct.TheirParams2 = type { [27 x i32] } @@ -67,6 +62,10 @@ declare !types !29 void @_cont_AcceptHit(%struct.AnyHitTraversalData* nocapture ; Function Attrs: alwaysinline declare i1 @opaqueIsEnd() #0 +define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !types !{!"function", !"void", !{i32 0, %struct.DispatchSystemData poison}} { + ret void +} + ; Function Attrs: alwaysinline define i1 @_cont_IsEndSearch(%struct.TraversalData* %data) #0 !types !31 { %isEnd = call i1 @opaqueIsEnd() @@ -121,8 +120,15 @@ define i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hi ret i1 true } -; POSTPROCESS-REGCOUNT-DAG: call void (i64, ...) @continuation.continue(i64 2, {{.*}}, %struct.DispatchSystemData %{{[^ ]+}}), !continuation.registercount ![[callshader_registercount:[0-9]+]] -; POSTPROCESS-REGCOUNT-DAG: ![[callshader_registercount]] = !{i32 10} +; COMMON-DAG: ![[MD_I32_1:[0-9]+]] = !{i32 1} +; COMMON-DAG: ![[MD_I32_10:[0-9]+]] = !{i32 10} +; MAX30-DAG: ![[MD_I32_15:[0-9]+]] = !{i32 15} +; MAX30-DAG: ![[MD_I32_26:[0-9]+]] = !{i32 26} +; MAX30-DAG: ![[MD_I32_27:[0-9]+]] = !{i32 27} +; MAX30-DAG: ![[MD_I32_30:[0-9]+]] = !{i32 30} + +; COMMON-DAG: define void @main( +; COMMON-DAG: call void (i64, ...) @continuation.continue(i64 2, {{.*}}, %struct.DispatchSystemData %{{[^ ]+}}), !continuation.registercount ![[MD_I32_10]] define void @main() { %params = alloca %struct.TheirParams, align 4 @@ -130,8 +136,9 @@ define void @main() { ret void } -; POSTPROCESS-REGCOUNT-DAG: call void (i64, ...) @continuation.continue(i64 4, {{.*}} %struct.TraversalData %{{[^ ]+}}), !continuation.registercount ![[traceray_registercount:[0-9]+]] -; POSTPROCESS-REGCOUNT-DAG: ![[traceray_registercount]] = !{i32 15} +; COMMON-DAG: define void @mainTrace( +; MAX10-DAG: call void (i64, ...) @continuation.continue(i64 4, {{.*}} %struct.TraversalData %{{.*}}), !continuation.registercount ![[MD_I32_10]] +; MAX30-DAG: call void (i64, ...) 
@continuation.continue(i64 4, {{.*}} %struct.TraversalData %{{.*}}), !continuation.registercount ![[MD_I32_15]] define void @mainTrace() { %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 @@ -144,16 +151,11 @@ define void @mainTrace() { ret void } -; POSTPROCESS-REGCOUNT-DAG: define void @called({{.*}}%struct.DispatchSystemData %0){{.*}} !continuation.registercount ![[called_registercount:[0-9]+]] -; POSTPROCESS-REGCOUNT-DAG: define dso_local void @called.resume.0({{.*}}%struct.DispatchSystemData %0){{.*}} !continuation.registercount ![[called_resume_registercount:[0-9]+]] -; POSTPROCESS-REGCOUNT-DAG: ![[called_registercount]] = !{i32 26} -; POSTPROCESS-REGCOUNT-DAG: ![[called_resume_registercount]] = !{i32 27} - ; If we set maxPayloadRegisterCount to 10, both functions use only 10 payload registers. -; Note that due to metadata uniquing, both use the same metadata node. -; POSTPROCESS-REGCOUNT-FEWREGS-DAG: define void @called({{.*}}%struct.DispatchSystemData %0){{.*}} !continuation.registercount ![[registercount:[0-9]+]] -; POSTPROCESS-REGCOUNT-FEWREGS-DAG: define dso_local void @called.resume.0({{.*}}%struct.DispatchSystemData %0){{.*}} !continuation.registercount ![[registercount]] -; POSTPROCESS-REGCOUNT-FEWREGS-DAG: ![[registercount]] = !{i32 10} +; MAX10-DAG: define void @called({{.*}}%struct.DispatchSystemData %0){{.*}} !continuation.registercount ![[MD_I32_10]] +; MAX10-DAG: define dso_local void @called.resume.0({{.*}}%struct.DispatchSystemData{{.*}} !continuation.registercount ![[MD_I32_10]] +; MAX30-DAG: define void @called({{.*}}%struct.DispatchSystemData %0){{.*}} !continuation.registercount ![[MD_I32_26]] +; MAX30-DAG: define dso_local void @called.resume.0({{.*}}%struct.DispatchSystemData{{.*}} !continuation.registercount ![[MD_I32_27]] define void @called(%struct.MyParams* %arg) !types !39 { %params = alloca %struct.TheirParams2, align 4 @@ -161,10 +163,12 @@ define void @called(%struct.MyParams* %arg) !types !39 { ret void } -; POSTPROCESS-REGCOUNT-DAG: define void @Intersection({{.*}}%struct.AnyHitTraversalData %0){{.*}} !continuation.registercount ![[intersection_registercount:[0-9]+]] -; POSTPROCESS-REGCOUNT-DAG: define dso_local void @Intersection.resume.0({{.*}}%struct.AnyHitTraversalData %0){{.*}} !continuation.registercount ![[intersection_registercount]] -; POSTPROCESS-REGCOUNT-DAG: call void (i64, ...) @continuation.continue(i64 3, {{.*}} float 4.000000e+00, i32 0, %struct.BuiltInTriangleIntersectionAttributes {{.*}}), !continuation.registercount ![[intersection_registercount]] -; POSTPROCESS-REGCOUNT-DAG: ![[intersection_registercount]] = !{i32 30} +; MAX10-DAG: define void @Intersection({{.*}}%struct.AnyHitTraversalData %0){{.*}} !continuation.registercount ![[MD_I32_10]] +; MAX10-DAG: define dso_local void @Intersection.resume.0({{.*}}%struct.AnyHitTraversalData{{.*}} !continuation.registercount ![[MD_I32_10]] +; MAX10-DAG: call void (i64, ...) @continuation.continue(i64 3, {{.*}} float 4.000000e+00, i32 0, %struct.BuiltInTriangleIntersectionAttributes {{.*}}), !continuation.registercount ![[MD_I32_10]] +; MAX30-DAG: define void @Intersection({{.*}}%struct.AnyHitTraversalData %0){{.*}} !continuation.registercount ![[MD_I32_30]] +; MAX30-DAG: define dso_local void @Intersection.resume.0({{.*}}%struct.AnyHitTraversalData{{.*}} !continuation.registercount ![[MD_I32_30]] +; MAX30-DAG: call void (i64, ...) 
@continuation.continue(i64 3, {{.*}} float 4.000000e+00, i32 0, %struct.BuiltInTriangleIntersectionAttributes {{.*}}), !continuation.registercount ![[MD_I32_30]] define void @Intersection() #3 { %a = alloca %struct.BuiltInTriangleIntersectionAttributes, align 4 @@ -172,33 +176,36 @@ define void @Intersection() #3 { ret void } -; POSTPROCESS-REGCOUNT2-DAG: define void @AnyHit({{.*}}%struct.AnyHitTraversalData %0, %struct.BuiltInTriangleIntersectionAttributes %1){{.*}} !continuation.registercount ![[anyhit_registercount:[0-9]+]] -; POSTPROCESS-REGCOUNT2-DAG: ![[anyhit_registercount]] = !{i32 15} +; MAX10-DAG: define void @AnyHit({{.*}}%struct.AnyHitTraversalData %0, %struct.BuiltInTriangleIntersectionAttributes %1){{.*}} !continuation.registercount ![[MD_I32_10]] +; MAX30-DAG: define void @AnyHit({{.*}}%struct.AnyHitTraversalData %0, %struct.BuiltInTriangleIntersectionAttributes %1){{.*}} !continuation.registercount ![[MD_I32_15]] define void @AnyHit(%struct.RayPayload* noalias nocapture %payload, %struct.BuiltInTriangleIntersectionAttributes* nocapture readonly %attr) #3 !types !41 { ret void } -; With fixed hit attribute registers and without PAQs, ClosestHitOut also contains storage for hit attributes, -; so we re-used the anyhit_registercount metadata for the match. -; POSTPROCESS-REGCOUNT2-DAG: define void @ClosestHit({{.*}}%struct.SystemData %0){{.*}} !continuation.registercount ![[anyhit_registercount]] +; With fixed hit attribute registers and without PAQs, ClosestHitOut also contains storage for hit attributes +; MAX10-DAG: define void @ClosestHit({{.*}}%struct.SystemData %0){{.*}} !continuation.registercount ![[MD_I32_10]] +; MAX30-DAG: define void @ClosestHit({{.*}}%struct.SystemData %0){{.*}} !continuation.registercount ![[MD_I32_15]] define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct.BuiltInTriangleIntersectionAttributes* nocapture readonly %attr) #3 !types !41 { ret void } -declare void @continuation.continue(i64, ...) +; COMMON-DAG: define void @Miss16({{.*}}%struct.SystemData %0){{.*}} !continuation.registercount ![[MD_I32_1]] +define void @Miss16(%struct.PayloadWithI16* noalias nocapture %payload) !types !55 { + ret void +} -; POSTPROCESS-REGCOUNT-FEWREGS-DAG: define %struct._AmdTraversalResultData @_cont_Traversal({{.*}} !continuation.registercount ![[registercount]] -; ^--- this MD node has value 10 -; POSTPROCESS-REGCOUNT-FEWREGS-DAG: call {{.*}} @continuation.continue({{.*}} !continuation.registercount ![[registercount]] -; POSTPROCESS-REGCOUNT-DAG: define %struct._AmdTraversalResultData @_cont_Traversal({{.*}} !continuation.registercount ![[intersection_registercount]] -; ^--- this MD node has value 30 -; POSTPROCESS-REGCOUNT-DAG: call {{.*}} @continuation.continue({{.*}} !continuation.registercount ![[intersection_registercount]] +declare void @_AmdEnqueueAnyHit(i64, %struct._AmdSystemData) #0 + +; MAX10-DAG: define void @_cont_Traversal({{.*}} !continuation.registercount ![[MD_I32_10]] +; MAX10-DAG: call {{.*}} @continuation.continue({{.*}} !continuation.registercount ![[MD_I32_10]] +; MAX30-DAG: define void @_cont_Traversal({{.*}} !continuation.registercount ![[MD_I32_27]] +; MAX30-DAG: call {{.*}} @continuation.continue({{.*}} !continuation.registercount ![[MD_I32_27]] define void @_cont_Traversal(%struct._AmdTraversalResultData* noalias nocapture sret(%struct._AmdTraversalResultData) %agg.result, %struct._AmdSystemData* noalias %data) !types !44 { - call void (i64, ...) 
@continuation.continue(i64 0, i8 addrspace(21)* undef) - ret void + call void @_AmdEnqueueAnyHit(i64 0, %struct._AmdSystemData undef) + unreachable } ; Function Attrs: nounwind @@ -227,8 +234,12 @@ attributes #3 = { nounwind } !dx.version = !{!1} !dx.valver = !{!1} !dx.shaderModel = !{!2} -!dx.entryPoints = !{!3, !6, !13, !15, !17, !19, !21} -!continuation.maxPayloadRegisterCount = !{!23} ; SKIP_LINE_BY_DEFAULT +!dx.entryPoints = !{!3, !6, !13, !15, !17, !19, !21, !57} +!continuation.maxPayloadRegisterCount = !{!23} ; 10; only for MAX_REG_10 +!continuation.maxPayloadRegisterCount = !{!53} ; 30; only for MAX_REG_30 +!continuation.preservedPayloadRegisterCount = !{!23} ; 10; only for MAX_REG_10 +!continuation.preservedPayloadRegisterCount = !{!54} ; 27; only for MAX_REG_30 +!lgc.rt.max.attribute.size = !{!60} !0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} !1 = !{i32 1, i32 6} @@ -283,3 +294,11 @@ attributes #3 = { nounwind } !50 = !{!"function", !"void", i32 poison, i32 poison, !51} !51 = !{i32 0, %struct.TheirParams2 poison} !52 = !{!"function", i1 poison, i32 poison, float poison, i32 poison, !43} +!53 = !{i32 30} +!54 = !{i32 27} +!55 = !{!"function", !"void", !56} +!56 = !{i32 0, %struct.PayloadWithI16 poison} +!57 = !{void (%struct.PayloadWithI16*)* @Miss16, !"Miss16", null, null, !58} +!58 = !{i32 8, i32 11, i32 6, i32 24, i32 5, !59} +!59 = !{i32 0} +!60 = !{i32 8} diff --git a/llvmraytracing/test/dx/continuation-stacksize.ll b/llvmraytracing/test/dx/continuation-stacksize.ll index f3b7b288fd..176f580c7f 100644 --- a/llvmraytracing/test/dx/continuation-stacksize.ll +++ b/llvmraytracing/test/dx/continuation-stacksize.ll @@ -1,13 +1,11 @@ ; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' \ -; RUN: -S %s 2> %t0.stderr | FileCheck -check-prefix=POSTPROCESS-STACKSIZE %s -; RUN: count 0 < %t0.stderr +; RUN: -S %s --lint-abort-on-error | FileCheck -check-prefix=POSTPROCESS-STACKSIZE %s ; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' \ -; RUN: -S %s 2> %t1.stderr | FileCheck -check-prefix=POSTPROCESS-STATESIZE %s -; RUN: count 0 < %t1.stderr +; RUN: -S %s --lint-abort-on-error | FileCheck -check-prefix=POSTPROCESS-STATESIZE %s ; The order of metadata on functions is non-deterministic, so make two different runs to match both of them. 
-target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %dx.types.Handle = type { i8* } %struct.DispatchSystemData = type { i32 } @@ -42,6 +40,10 @@ declare !types !17 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTrian ; Function Attrs: nounwind memory(none) declare !types !19 void @_AmdRestoreSystemData(%struct.DispatchSystemData*) #1 +define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !types !{!"function", !"void", !{i32 0, %struct.DispatchSystemData poison}} { + ret void +} + ; Function Attrs: alwaysinline define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) #0 !types !21 { ret i32 5 @@ -69,9 +71,9 @@ define void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) #0 !typ ; LOWERRAYTRACINGPIPELINE-STACKSIZE-DAG: define void @main(%struct.DispatchSystemData %0){{.*}} !continuation.stacksize ![[main_stacksize:[0-9]+]] ; LOWERRAYTRACINGPIPELINE-STACKSIZE-DAG: ![[main_stacksize]] = !{i32 140} -; POSTPROCESS-STACKSIZE-DAG: define void @main(){{.*}} !continuation.stacksize ![[main_stacksize:[0-9]+]] +; POSTPROCESS-STACKSIZE-DAG: define void @main({{.*}}){{.*}} !continuation.stacksize ![[main_stacksize:[0-9]+]] ; POSTPROCESS-STACKSIZE-DAG: ![[main_stacksize]] = !{i32 140} -; POSTPROCESS-STATESIZE-DAG: define void @main(){{.*}} !continuation.state ![[main_state:[0-9]+]] +; POSTPROCESS-STATESIZE-DAG: define void @main({{.*}}){{.*}} !continuation.state ![[main_state:[0-9]+]] ; POSTPROCESS-STATESIZE-DAG: ![[main_state]] = !{i32 0} define void @main() { @@ -102,7 +104,7 @@ define void @mainTrace() { ret void } -; LOWERRAYTRACINGPIPELINE-STACKSIZE-DAG: define %struct.DispatchSystemData @called(%struct.DispatchSystemData %0){{.*}} !continuation.stacksize ![[called_stacksize:[0-9]+]] +; LOWERRAYTRACINGPIPELINE-STACKSIZE-DAG: define %struct.DispatchSystemData @called({{.*}}%struct.DispatchSystemData %0){{.*}} !continuation.stacksize ![[called_stacksize:[0-9]+]] ; LOWERRAYTRACINGPIPELINE-STACKSIZE-DAG: ![[called_stacksize]] = !{i32 144} ; CLEANUP-STACKSIZE-DAG: define void @called({{.*}}%struct.DispatchSystemData %0){{.*}} !continuation.stacksize ![[called_stacksize:[0-9]+]] diff --git a/llvmraytracing/test/dx/continuation-state.ll b/llvmraytracing/test/dx/continuation-state.ll index 014ea354ba..1d2e413cb4 100644 --- a/llvmraytracing/test/dx/continuation-state.ll +++ b/llvmraytracing/test/dx/continuation-state.ll @@ -1,11 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3 -; RUN: opt --verify-each -passes='lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint' -S %s 2> %t0.stderr | FileCheck -check-prefix=CLEANUP %s -; RUN: count 0 < %t0.stderr +; RUN: opt --verify-each -passes='lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint' -S %s --lint-abort-on-error | FileCheck -check-prefix=CLEANUP %s ; RUN: opt --verify-each 
-passes='lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint' \ -; RUN: -S %s 2> %t1.stderr | FileCheck -check-prefix=REGISTERBUFFER %s -; RUN: count 0 < %t1.stderr +; RUN: -S %s --lint-abort-on-error | FileCheck -check-prefix=REGISTERBUFFER %s -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %continuation.token = type { } @@ -15,13 +13,13 @@ declare %continuation.token* @async_fun() @PAYLOAD = external addrspace(20) global [30 x i32] -define <4 x i32> @simple_await(<4 x i32> %arg) !continuation.registercount !1 { +define <4 x i32> @simple_await(i64 %returnAddr, <4 x i32> %arg) !continuation.registercount !1 { %tok = call %continuation.token* @async_fun(), !continuation.registercount !1, !continuation.returnedRegistercount !1 call void @await.void(%continuation.token* %tok) ret <4 x i32> %arg, !continuation.registercount !1 } -define void @simple_await_entry(<4 x i32> %arg, <4 x i32> addrspace(1)* %mem) !continuation.entry !0 !continuation.registercount !1 { +define void @simple_await_entry(i64 %returnAddr, <4 x i32> %arg, <4 x i32> addrspace(1)* %mem) !continuation.entry !0 !continuation.registercount !1 { %tok = call %continuation.token* @async_fun(), !continuation.registercount !1, !continuation.returnedRegistercount !1 call void @await.void(%continuation.token* %tok) store <4 x i32> %arg, <4 x i32> addrspace(1)* %mem @@ -43,12 +41,13 @@ define void @simple_await_entry(<4 x i32> %arg, <4 x i32> addrspace(1)* %mem) !c ; CLEANUP-NEXT: store <4 x i32> [[ARG]], ptr addrspace(32) [[ARG_SPILL_ADDR]], align 4 ; CLEANUP-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 1 ; CLEANUP-NEXT: store i64 [[RETURNADDR]], ptr addrspace(32) [[RETURNADDR_SPILL_ADDR]], align 4 -; CLEANUP-NEXT: call void (i64, ...) @continuation.continue(i64 ptrtoint (ptr @async_fun to i64), i64 ptrtoint (ptr @simple_await.resume.0 to i64)), !continuation.registercount [[META2]], !continuation.returnedRegistercount !2 +; CLEANUP-NEXT: [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @simple_await.resume.0) +; CLEANUP-NEXT: call void (i64, ...) 
@continuation.continue(i64 ptrtoint (ptr @async_fun to i64), i64 [[TMP0]]), !continuation.registercount [[META2]], !continuation.returnedRegistercount [[META2]] ; CLEANUP-NEXT: unreachable ; ; ; CLEANUP-LABEL: define dso_local void @simple_await.resume.0( -; CLEANUP-SAME: ) !continuation.registercount [[META2]] !continuation [[META3]] { +; CLEANUP-SAME: i64 [[TMP0:%.*]]) !continuation.registercount [[META2]] !continuation [[META3]] { ; CLEANUP-NEXT: entryresume.0: ; CLEANUP-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 24) ; CLEANUP-NEXT: [[ARG_RELOAD_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 @@ -56,24 +55,25 @@ define void @simple_await_entry(<4 x i32> %arg, <4 x i32> addrspace(1)* %mem) !c ; CLEANUP-NEXT: [[RETURNADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 1 ; CLEANUP-NEXT: [[RETURNADDR_RELOAD:%.*]] = load i64, ptr addrspace(32) [[RETURNADDR_RELOAD_ADDR]], align 4 ; CLEANUP-NEXT: call void @lgc.cps.free(i32 24) -; CLEANUP-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD]], <4 x i32> [[ARG_RELOAD]]), !continuation.registercount [[META2]] +; CLEANUP-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD]], i64 poison, <4 x i32> [[ARG_RELOAD]]), !continuation.registercount [[META2]] ; CLEANUP-NEXT: unreachable ; ; ; CLEANUP-LABEL: define void @simple_await_entry( -; CLEANUP-SAME: <4 x i32> [[ARG:%.*]], ptr addrspace(1) [[MEM:%.*]]) !continuation.registercount [[META2]] !continuation.entry [[META5:![0-9]+]] !continuation [[META6:![0-9]+]] !continuation.stacksize [[META4]] !continuation.state [[META4]] { +; CLEANUP-SAME: i64 [[RETURNADDR:%.*]], <4 x i32> [[ARG:%.*]], ptr addrspace(1) [[MEM:%.*]]) !continuation.registercount [[META2]] !continuation.entry [[META5:![0-9]+]] !continuation [[META6:![0-9]+]] !continuation.stacksize [[META4]] !continuation.state [[META4]] { ; CLEANUP-NEXT: AllocaSpillBB: ; CLEANUP-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 24) ; CLEANUP-NEXT: [[MEM_SPILL_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_ENTRY_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 1 ; CLEANUP-NEXT: store ptr addrspace(1) [[MEM]], ptr addrspace(32) [[MEM_SPILL_ADDR]], align 4 ; CLEANUP-NEXT: [[ARG_SPILL_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_ENTRY_FRAME]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 ; CLEANUP-NEXT: store <4 x i32> [[ARG]], ptr addrspace(32) [[ARG_SPILL_ADDR]], align 4 -; CLEANUP-NEXT: call void (i64, ...) @continuation.continue(i64 ptrtoint (ptr @async_fun to i64), i64 ptrtoint (ptr @simple_await_entry.resume.0 to i64)), !continuation.registercount [[META2]], !continuation.returnedRegistercount !2 +; CLEANUP-NEXT: [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @simple_await_entry.resume.0) +; CLEANUP-NEXT: call void (i64, ...) 
@continuation.continue(i64 ptrtoint (ptr @async_fun to i64), i64 [[TMP0]]), !continuation.registercount [[META2]], !continuation.returnedRegistercount [[META2]] ; CLEANUP-NEXT: unreachable ; ; ; CLEANUP-LABEL: define dso_local void @simple_await_entry.resume.0( -; CLEANUP-SAME: ) !continuation.registercount [[META2]] !continuation [[META6]] { +; CLEANUP-SAME: i64 [[TMP0:%.*]]) !continuation.registercount [[META2]] !continuation [[META6]] { ; CLEANUP-NEXT: entryresume.0: ; CLEANUP-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 24) ; CLEANUP-NEXT: [[MEM_RELOAD_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_ENTRY_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 1 @@ -95,12 +95,13 @@ define void @simple_await_entry(<4 x i32> %arg, <4 x i32> addrspace(1)* %mem) !c ; REGISTERBUFFER-NEXT: store <4 x i32> [[ARG]], ptr addrspace(32) [[ARG_SPILL_ADDR]], align 4 ; REGISTERBUFFER-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 1 ; REGISTERBUFFER-NEXT: store i64 [[RETURNADDR]], ptr addrspace(32) [[RETURNADDR_SPILL_ADDR]], align 4 -; REGISTERBUFFER-NEXT: call void (i64, ...) @continuation.continue(i64 ptrtoint (ptr @async_fun to i64), i64 ptrtoint (ptr @simple_await.resume.0 to i64)), !continuation.registercount [[META2]], !continuation.returnedRegistercount !2 +; REGISTERBUFFER-NEXT: [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @simple_await.resume.0) +; REGISTERBUFFER-NEXT: call void (i64, ...) @continuation.continue(i64 ptrtoint (ptr @async_fun to i64), i64 [[TMP0]]), !continuation.registercount [[META2]], !continuation.returnedRegistercount [[META2]] ; REGISTERBUFFER-NEXT: unreachable ; ; ; REGISTERBUFFER-LABEL: define dso_local void @simple_await.resume.0( -; REGISTERBUFFER-SAME: ) !continuation.registercount [[META2]] !continuation [[META3]] { +; REGISTERBUFFER-SAME: i64 [[TMP0:%.*]]) !continuation.registercount [[META2]] !continuation [[META3]] { ; REGISTERBUFFER-NEXT: entryresume.0: ; REGISTERBUFFER-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 24) ; REGISTERBUFFER-NEXT: [[ARG_RELOAD_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 @@ -108,24 +109,25 @@ define void @simple_await_entry(<4 x i32> %arg, <4 x i32> addrspace(1)* %mem) !c ; REGISTERBUFFER-NEXT: [[RETURNADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 1 ; REGISTERBUFFER-NEXT: [[RETURNADDR_RELOAD:%.*]] = load i64, ptr addrspace(32) [[RETURNADDR_RELOAD_ADDR]], align 4 ; REGISTERBUFFER-NEXT: call void @lgc.cps.free(i32 24) -; REGISTERBUFFER-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD]], <4 x i32> [[ARG_RELOAD]]), !continuation.registercount [[META2]] +; REGISTERBUFFER-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[RETURNADDR_RELOAD]], i64 poison, <4 x i32> [[ARG_RELOAD]]), !continuation.registercount [[META2]] ; REGISTERBUFFER-NEXT: unreachable ; ; ; REGISTERBUFFER-LABEL: define void @simple_await_entry( -; REGISTERBUFFER-SAME: <4 x i32> [[ARG:%.*]], ptr addrspace(1) [[MEM:%.*]]) !continuation.registercount [[META2]] !continuation.entry [[META5:![0-9]+]] !continuation [[META6:![0-9]+]] !continuation.stacksize [[META4]] !continuation.state [[META4]] { +; REGISTERBUFFER-SAME: i64 [[RETURNADDR:%.*]], <4 x i32> [[ARG:%.*]], ptr addrspace(1) [[MEM:%.*]]) !continuation.registercount [[META2]] !continuation.entry [[META5:![0-9]+]] !continuation [[META6:![0-9]+]] !continuation.stacksize [[META4]] !continuation.state [[META4]] { ; REGISTERBUFFER-NEXT: AllocaSpillBB: ; REGISTERBUFFER-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 24) ; REGISTERBUFFER-NEXT: [[MEM_SPILL_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_ENTRY_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 1 ; REGISTERBUFFER-NEXT: store ptr addrspace(1) [[MEM]], ptr addrspace(32) [[MEM_SPILL_ADDR]], align 4 ; REGISTERBUFFER-NEXT: [[ARG_SPILL_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_ENTRY_FRAME]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 ; REGISTERBUFFER-NEXT: store <4 x i32> [[ARG]], ptr addrspace(32) [[ARG_SPILL_ADDR]], align 4 -; REGISTERBUFFER-NEXT: call void (i64, ...) @continuation.continue(i64 ptrtoint (ptr @async_fun to i64), i64 ptrtoint (ptr @simple_await_entry.resume.0 to i64)), !continuation.registercount [[META2]], !continuation.returnedRegistercount !2 +; REGISTERBUFFER-NEXT: [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @simple_await_entry.resume.0) +; REGISTERBUFFER-NEXT: call void (i64, ...) 
@continuation.continue(i64 ptrtoint (ptr @async_fun to i64), i64 [[TMP0]]), !continuation.registercount [[META2]], !continuation.returnedRegistercount [[META2]] ; REGISTERBUFFER-NEXT: unreachable ; ; ; REGISTERBUFFER-LABEL: define dso_local void @simple_await_entry.resume.0( -; REGISTERBUFFER-SAME: ) !continuation.registercount [[META2]] !continuation [[META6]] { +; REGISTERBUFFER-SAME: i64 [[TMP0:%.*]]) !continuation.registercount [[META2]] !continuation [[META6]] { ; REGISTERBUFFER-NEXT: entryresume.0: ; REGISTERBUFFER-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 24) ; REGISTERBUFFER-NEXT: [[MEM_RELOAD_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_ENTRY_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 1 diff --git a/llvmraytracing/test/dx/continuation-without-await.ll b/llvmraytracing/test/dx/continuation-without-await.ll index 4e4bb95f7b..f47ee48bee 100644 --- a/llvmraytracing/test/dx/continuation-without-await.ll +++ b/llvmraytracing/test/dx/continuation-without-await.ll @@ -1,19 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3 -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S %s 2> %t0.stderr | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s -; RUN: count 0 < %t0.stderr +; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s ; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,remove-types-metadata' \ -; RUN: -S %s 2> %t1.stderr | FileCheck -check-prefix=CLEANUP %s -; RUN: count 0 < %t1.stderr +; RUN: -S %s --lint-abort-on-error | FileCheck -check-prefix=CLEANUP %s ; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,remove-types-metadata' \ -; RUN: -S %s 2> %t2.stderr | FileCheck -check-prefix=REGISTERBUFFER %s -; RUN: count 0 < %t2.stderr +; RUN: -S %s --lint-abort-on-error | FileCheck -check-prefix=REGISTERBUFFER %s ; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' \ -; RUN: -S %s 2> %t3.stderr | FileCheck -check-prefix=POSTPROCESS %s -; RUN: count 0 < %t3.stderr +; RUN: -S %s --lint-abort-on-error | FileCheck -check-prefix=POSTPROCESS %s ; @called and @main_no_call must be marked as continuation and end with a continue call to the return address -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = 
"e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %dx.types.Handle = type { i8* } %struct.DispatchSystemData = type { i32 } @@ -30,8 +26,6 @@ target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16: declare i32 @_cont_GetContinuationStackAddr() -declare %struct.DispatchSystemData @_cont_SetupRayGen() - declare %struct.DispatchSystemData @_AmdAwaitTraversal(i64, %struct.TraversalData) declare %struct.DispatchSystemData @_AmdAwaitShader(i64, %struct.DispatchSystemData) @@ -41,6 +35,10 @@ declare !types !16 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTrian ; Function Attrs: nounwind memory(none) declare !types !18 void @_AmdRestoreSystemData(%struct.DispatchSystemData*) #0 +define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !types !{!"function", !"void", !{i32 0, %struct.DispatchSystemData poison}} { + ret void +} + define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) !types !20 { ret i32 5 } @@ -130,41 +128,41 @@ attributes #2 = { nounwind } ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define void @main( -; LOWERRAYTRACINGPIPELINE-SAME: [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !continuation.entry [[META19:![0-9]+]] !continuation.registercount [[META8]] !continuation [[META20:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !continuation.entry [[META19:![0-9]+]] !continuation.registercount [[META8]] !continuation [[META20:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[PARAMS:%.*]] = alloca [[STRUCT_THEIRPARAMS:%.*]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_THEIRPARAMS]] zeroinitializer, ptr [[PARAMS]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[DIS_DATA_I:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_THEIRPARAMS]], ptr [[PARAMS]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_THEIRPARAMS]], ptr [[PARAMS]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP2]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP5]], ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = call ptr inttoptr (i64 2 to ptr)([[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]]), !continuation.registercount [[META21:![0-9]+]], !continuation.returnedRegistercount !21 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] [[AWAIT_STRUCT_DISPATCHSYSTEMDATA:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP6]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = call ptr inttoptr (i64 2 to ptr)([[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]]), !continuation.registercount [[META21:![0-9]+]], !continuation.returnedRegistercount [[META21]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] 
[[AWAIT:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP4]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_THEIRPARAMS]] poison, ptr [[PARAMS]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_THEIRPARAMS]], ptr [[PARAMS]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_THEIRPARAMS]], ptr [[PARAMS]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP11]], ptr [[TMP8]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP7]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) -; LOWERRAYTRACINGPIPELINE-NEXT: ret void, !continuation.registercount [[META16:![0-9]+]] +; LOWERRAYTRACINGPIPELINE-NEXT: ret void ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define void @main_no_call( -; LOWERRAYTRACINGPIPELINE-SAME: [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.entry [[META19]] !continuation.registercount [[META8]] !continuation [[META22:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.entry [[META19]] !continuation.registercount [[META8]] !continuation [[META22:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) -; LOWERRAYTRACINGPIPELINE-NEXT: ret void, !continuation.registercount [[META16]] +; LOWERRAYTRACINGPIPELINE-NEXT: ret void ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @called( -; LOWERRAYTRACINGPIPELINE-SAME: [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META23:![0-9]+]] !continuation.registercount [[META17:![0-9]+]] !continuation [[META24:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META23:![0-9]+]] !continuation.registercount [[META17:![0-9]+]] !continuation [[META24:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_MYPARAMS:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP2]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP2]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP6]], ptr [[TMP3]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 @@ -174,7 +172,7 @@ attributes #2 = { nounwind } ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 2), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP10]], ptr [[TMP9]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = 
getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP2]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP2]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP11]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP14]], ptr addrspace(20) @PAYLOAD, align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 1 @@ -184,7 +182,8 @@ attributes #2 = { nounwind } ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP18]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 2), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP19]], !continuation.registercount [[META17]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP19]]), !continuation.registercount [[META17]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; CLEANUP-LABEL: define i32 @_cont_GetLocalRootIndex( @@ -193,21 +192,22 @@ attributes #2 = { nounwind } ; ; ; CLEANUP-LABEL: define void @main( -; CLEANUP-SAME: [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !continuation.entry [[META19:![0-9]+]] !continuation.registercount [[META8]] !continuation [[META20:![0-9]+]] !continuation.state [[META8]] { +; CLEANUP-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !continuation.entry [[META19:![0-9]+]] !continuation.registercount [[META8]] !continuation [[META20:![0-9]+]] !continuation.state [[META8]] { ; CLEANUP-NEXT: AllocaSpillBB: ; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 ; CLEANUP-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; CLEANUP-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT]], 0 ; CLEANUP-NEXT: store i32 0, ptr addrspace(20) @PAYLOAD, align 4 -; CLEANUP-NEXT: call void (i64, ...) @continuation.continue(i64 2, i64 ptrtoint (ptr @main.resume.0 to i64), [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]]), !continuation.registercount [[META21:![0-9]+]], !continuation.returnedRegistercount !21 +; CLEANUP-NEXT: [[TMP1:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @main.resume.0) +; CLEANUP-NEXT: call void (i64, ...) 
@continuation.continue(i64 2, i64 [[TMP1]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]]), !continuation.registercount [[META21:![0-9]+]], !continuation.returnedRegistercount [[META21]] ; CLEANUP-NEXT: unreachable ; ; ; CLEANUP-LABEL: define dso_local void @main.resume.0( -; CLEANUP-SAME: [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.registercount [[META21]] !continuation [[META20]] { +; CLEANUP-SAME: i64 [[TMP0:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP1:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.registercount [[META21]] !continuation [[META20]] { ; CLEANUP-NEXT: entryresume.0: -; CLEANUP-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 -; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT1:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 +; CLEANUP-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 +; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT1:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], 0 ; CLEANUP-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; CLEANUP-NEXT: ret void ; CLEANUP: entryresume.0.split: @@ -215,7 +215,7 @@ attributes #2 = { nounwind } ; ; ; CLEANUP-LABEL: define void @main_no_call( -; CLEANUP-SAME: [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.entry [[META19]] !continuation.registercount [[META8]] !continuation [[META22:![0-9]+]] !continuation.state [[META8]] { +; CLEANUP-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.entry [[META19]] !continuation.registercount [[META8]] !continuation [[META22:![0-9]+]] !continuation.state [[META8]] { ; CLEANUP-NEXT: AllocaSpillBB: ; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 ; CLEANUP-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) @@ -236,7 +236,7 @@ attributes #2 = { nounwind } ; CLEANUP-NEXT: store i32 [[TMP2]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 1), align 4 ; CLEANUP-NEXT: store i32 [[TMP3]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 2), align 4 ; CLEANUP-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT]], 0 -; CLEANUP-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META17]] +; CLEANUP-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[RETURNADDR]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META17]] ; CLEANUP-NEXT: unreachable ; ; @@ -246,21 +246,22 @@ attributes #2 = { nounwind } ; ; ; REGISTERBUFFER-LABEL: define void @main( -; REGISTERBUFFER-SAME: [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !continuation.entry [[META19:![0-9]+]] !continuation.registercount [[META8]] !continuation [[META20:![0-9]+]] !continuation.state [[META8]] { +; REGISTERBUFFER-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !continuation.entry [[META19:![0-9]+]] !continuation.registercount [[META8]] !continuation [[META20:![0-9]+]] !continuation.state [[META8]] { ; REGISTERBUFFER-NEXT: AllocaSpillBB: ; REGISTERBUFFER-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 ; REGISTERBUFFER-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; REGISTERBUFFER-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT]], 0 ; REGISTERBUFFER-NEXT: store i32 0, ptr addrspace(20) @PAYLOAD, align 4 -; REGISTERBUFFER-NEXT: call void (i64, ...) @continuation.continue(i64 2, i64 ptrtoint (ptr @main.resume.0 to i64), [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]]), !continuation.registercount [[META21:![0-9]+]], !continuation.returnedRegistercount !21 +; REGISTERBUFFER-NEXT: [[TMP1:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @main.resume.0) +; REGISTERBUFFER-NEXT: call void (i64, ...) @continuation.continue(i64 2, i64 [[TMP1]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]]), !continuation.registercount [[META21:![0-9]+]], !continuation.returnedRegistercount [[META21]] ; REGISTERBUFFER-NEXT: unreachable ; ; ; REGISTERBUFFER-LABEL: define dso_local void @main.resume.0( -; REGISTERBUFFER-SAME: [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.registercount [[META21]] !continuation [[META20]] { +; REGISTERBUFFER-SAME: i64 [[TMP0:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP1:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.registercount [[META21]] !continuation [[META20]] { ; REGISTERBUFFER-NEXT: entryresume.0: -; REGISTERBUFFER-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 -; REGISTERBUFFER-NEXT: [[DOTFCA_0_EXTRACT1:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 +; REGISTERBUFFER-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 +; REGISTERBUFFER-NEXT: [[DOTFCA_0_EXTRACT1:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], 0 ; REGISTERBUFFER-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; REGISTERBUFFER-NEXT: ret void ; REGISTERBUFFER: entryresume.0.split: @@ -268,7 +269,7 @@ attributes #2 = { nounwind } ; ; ; REGISTERBUFFER-LABEL: define void @main_no_call( -; REGISTERBUFFER-SAME: [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.entry [[META19]] !continuation.registercount [[META8]] !continuation [[META22:![0-9]+]] !continuation.state [[META8]] { +; REGISTERBUFFER-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.entry [[META19]] !continuation.registercount [[META8]] !continuation [[META22:![0-9]+]] !continuation.state [[META8]] { ; REGISTERBUFFER-NEXT: AllocaSpillBB: ; REGISTERBUFFER-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = 
extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 ; REGISTERBUFFER-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) @@ -289,7 +290,7 @@ attributes #2 = { nounwind } ; REGISTERBUFFER-NEXT: store i32 [[TMP2]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 1), align 4 ; REGISTERBUFFER-NEXT: store i32 [[TMP3]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 2), align 4 ; REGISTERBUFFER-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT]], 0 -; REGISTERBUFFER-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META17]] +; REGISTERBUFFER-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META17]] ; REGISTERBUFFER-NEXT: unreachable ; ; @@ -299,29 +300,27 @@ attributes #2 = { nounwind } ; ; ; POSTPROCESS-LABEL: define void @main( -; POSTPROCESS-SAME: ) !lgc.rt.shaderstage [[META8:![0-9]+]] !continuation.entry [[META19:![0-9]+]] !continuation.registercount [[META8]] !continuation [[META20:![0-9]+]] !continuation.state [[META8]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !continuation.entry [[META19:![0-9]+]] !continuation.registercount [[META8]] !continuation [[META20:![0-9]+]] !continuation.state [[META8]] { ; POSTPROCESS-NEXT: AllocaSpillBB: ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; POSTPROCESS-NEXT: [[TMP0:%.*]] = call i32 @_cont_GetContinuationStackAddr() -; POSTPROCESS-NEXT: store i32 [[TMP0]], ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: [[SYSTEM_DATA:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[_CONT_SETUPRAYGEN:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]() -; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[SYSTEM_DATA]], 0 +; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 ; POSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; POSTPROCESS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT]], 0 ; POSTPROCESS-NEXT: store i32 0, ptr addrspace(20) @REGISTERS, align 4 +; POSTPROCESS-NEXT: [[TMP2:%.*]] = call i64 @continuation.getAddrAndMD(ptr @main.resume.0) ; POSTPROCESS-NEXT: [[TMP1:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: [[TMP2:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @main.resume.0 to i64)) -; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 2, i32 [[TMP1]], i64 [[TMP2]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]]), !continuation.registercount [[META21:![0-9]+]], !continuation.returnedRegistercount !21 +; POSTPROCESS-NEXT: call void (i64, ...) 
@continuation.continue(i64 2, i32 [[TMP1]], i64 [[TMP2]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]]), !continuation.registercount [[META21:![0-9]+]], !continuation.returnedRegistercount [[META21]] ; POSTPROCESS-NEXT: unreachable ; ; ; POSTPROCESS-LABEL: define dso_local void @main.resume.0( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.registercount [[META21]] !continuation [[META20]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP1:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.registercount [[META21]] !continuation [[META20]] { ; POSTPROCESS-NEXT: entryresume.0: ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(20) @REGISTERS, align 4 -; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT1:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 +; POSTPROCESS-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(20) @REGISTERS, align 4 +; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT1:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], 0 ; POSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; POSTPROCESS-NEXT: ret void ; POSTPROCESS: entryresume.0.split: @@ -329,13 +328,11 @@ attributes #2 = { nounwind } ; ; ; POSTPROCESS-LABEL: define void @main_no_call( -; POSTPROCESS-SAME: ) !lgc.rt.shaderstage [[META8]] !continuation.entry [[META19]] !continuation.registercount [[META8]] !continuation [[META22:![0-9]+]] !continuation.state [[META8]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.entry [[META19]] !continuation.registercount [[META8]] !continuation [[META22:![0-9]+]] !continuation.state [[META8]] { ; POSTPROCESS-NEXT: AllocaSpillBB: ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; POSTPROCESS-NEXT: [[TMP0:%.*]] = call i32 @_cont_GetContinuationStackAddr() -; POSTPROCESS-NEXT: store i32 [[TMP0]], ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: [[SYSTEM_DATA:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[_CONT_SETUPRAYGEN:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]() -; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[SYSTEM_DATA]], 0 +; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 ; POSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; POSTPROCESS-NEXT: ret void ; POSTPROCESS: AllocaSpillBB.split: @@ -357,6 +354,6 @@ attributes #2 = { nounwind } ; POSTPROCESS-NEXT: store i32 [[TMP3]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 2), align 4 ; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT]], 0 ; POSTPROCESS-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP4]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META17]] +; POSTPROCESS-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[RETURNADDR]], i32 [[TMP4]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META17]] ; POSTPROCESS-NEXT: unreachable ; diff --git a/llvmraytracing/test/dx/dxil-cont-convert-lgc-rt-op-trace-payload-type.ll b/llvmraytracing/test/dx/dxil-cont-convert-lgc-rt-op-trace-payload-type.ll index 0d52591c11..415975625d 100644 --- a/llvmraytracing/test/dx/dxil-cont-convert-lgc-rt-op-trace-payload-type.ll +++ b/llvmraytracing/test/dx/dxil-cont-convert-lgc-rt-op-trace-payload-type.ll @@ -1,11 +1,8 @@ -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint' -S %s 2> %t0.stderr | FileCheck -check-prefix=PAYLOADTYPE-OPAQUE %s -; RUN: count 0 < %t0.stderr -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint' -S %s 2> %t1.stderr | FileCheck -check-prefix=PAYLOADTYPE2-OPAQUE %s -; RUN: count 0 < %t1.stderr -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint' -S %s 2> %t2.stderr | FileCheck -check-prefix=PAYLOADTYPE3-OPAQUE %s -; RUN: count 0 < %t2.stderr - -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint' -S %s --lint-abort-on-error | FileCheck -check-prefix=PAYLOADTYPE-OPAQUE %s +; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint' -S %s --lint-abort-on-error | FileCheck -check-prefix=PAYLOADTYPE2-OPAQUE %s +; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint' -S %s --lint-abort-on-error | FileCheck -check-prefix=PAYLOADTYPE3-OPAQUE %s + +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %struct.MyParams = type { [48 x i32] } %struct.TheirParams = type { [64 x i32] } diff --git a/llvmraytracing/test/dx/dxil-cont-convert-lgc-rt-op-trace.ll b/llvmraytracing/test/dx/dxil-cont-convert-lgc-rt-op-trace.ll index 2e3e4948b1..a704ebc222 100644 --- a/llvmraytracing/test/dx/dxil-cont-convert-lgc-rt-op-trace.ll +++ b/llvmraytracing/test/dx/dxil-cont-convert-lgc-rt-op-trace.ll @@ -1,8 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint' -S %s 2> %t0.stderr | FileCheck %s -; RUN: count 0 < %t0.stderr +; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint' -S %s --lint-abort-on-error | FileCheck %s -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %struct.MyParams = type { [48 x i32] } %struct.TheirParams = type { [64 x i32] } @@ -82,8 +81,8 @@ define void @mainTrace() { ; CHECK-NEXT: [[TMP3:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 ; CHECK-NEXT: 
[[TMP4:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = call [[DX_TYPES_HANDLE]] @dx.op.createHandleForLib.dx.types.Handle(i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) -; CHECK-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] @dx.op.annotateHandle(i32 216, [[DX_TYPES_HANDLE]] [[TMP6]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; CHECK-NEXT: [[TMP6:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) +; CHECK-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP6]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; CHECK-NEXT: [[TMP8:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP7]]) ; CHECK-NEXT: call void (...) @lgc.rt.trace.ray(i64 [[TMP8]], i32 16, i32 -1, i32 0, i32 1, i32 0, <3 x float> zeroinitializer, float 0x3F50624DE0000000, <3 x float> , float 1.000000e+04, ptr [[TMP4]], [1 x i32] [i32 272]), !cont.payload.type [[META27:![0-9]+]] ; CHECK-NEXT: ret void diff --git a/llvmraytracing/test/dx/dxil-cont-convert-lgc-rt-op.ll b/llvmraytracing/test/dx/dxil-cont-convert-lgc-rt-op.ll index 3320a0c84a..17f7e087f8 100644 --- a/llvmraytracing/test/dx/dxil-cont-convert-lgc-rt-op.ll +++ b/llvmraytracing/test/dx/dxil-cont-convert-lgc-rt-op.ll @@ -1,8 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function ClosestHit --version 3 -; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,remove-types-metadata' -S %s 2> %t0.stderr | FileCheck %s -; RUN: count 0 < %t0.stderr +; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck %s -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %struct.DispatchSystemData = type { <3 x i32> } %struct.SystemData = type { %struct.DispatchSystemData } diff --git a/llvmraytracing/test/dx/dxil-cont-intrinsic-prepare.ll b/llvmraytracing/test/dx/dxil-cont-intrinsic-prepare.ll index 3d3585152a..e8bd6d7720 100644 --- a/llvmraytracing/test/dx/dxil-cont-intrinsic-prepare.ll +++ b/llvmraytracing/test/dx/dxil-cont-intrinsic-prepare.ll @@ -1,8 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3 -; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint' -S %s 2> %t.stderr | FileCheck %s -; RUN: count 0 < %t.stderr +; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint' -S %s --lint-abort-on-error | FileCheck %s -target datalayout = 
"e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %struct.DispatchSystemData = type { i32 } %struct.TraversalData = type { %struct.SystemData, i32, i64 } @@ -114,7 +113,7 @@ attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[TMP1]], i32 0, i32 2 ; CHECK-NEXT: store i64 [[ADDR]], ptr [[A]], align 4 ; CHECK-NEXT: [[TMP8:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[TMP1]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] @_AmdAwait(i64 3, [[STRUCT_TRAVERSALDATA]] [[TMP8]]) +; CHECK-NEXT: [[TMP9:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] [[_AMDAWAIT:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i64 3, [[STRUCT_TRAVERSALDATA]] [[TMP8]]) ; CHECK-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP9]], ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP2]], i32 0, i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 diff --git a/llvmraytracing/test/dx/dxil-cont-post-process-report-sizes.ll b/llvmraytracing/test/dx/dxil-cont-post-process-report-sizes.ll index 114fd214c6..62fc6da8f8 100644 --- a/llvmraytracing/test/dx/dxil-cont-post-process-report-sizes.ll +++ b/llvmraytracing/test/dx/dxil-cont-post-process-report-sizes.ll @@ -1,8 +1,8 @@ -; RUN: opt --report-cont-state-sizes --verify-each -passes='dxil-cont-post-process,lint,remove-types-metadata' -S %s 2>&1 | FileCheck %s --check-prefix=REPORT-CONT-SIZES -; RUN: opt --report-payload-register-sizes --verify-each -passes='dxil-cont-post-process,lint,remove-types-metadata' -S %s 2>&1 | FileCheck %s --check-prefix=REPORT-PAYLOAD-SIZES -; RUN: opt --report-system-data-sizes --verify-each -passes='dxil-cont-post-process,lint,remove-types-metadata' -S %s 2>&1 | FileCheck %s --check-prefix=REPORT-SYSTEM-DATA-SIZES +; RUN: opt --report-cont-state-sizes --verify-each -passes='dxil-cont-post-process,lint,remove-types-metadata' -S %s --lint-abort-on-error 2>&1 | FileCheck %s --check-prefix=REPORT-CONT-SIZES +; RUN: opt --report-payload-register-sizes --verify-each -passes='dxil-cont-post-process,lint,remove-types-metadata' -S %s --lint-abort-on-error 2>&1 | FileCheck %s --check-prefix=REPORT-PAYLOAD-SIZES +; RUN: opt --report-system-data-sizes --verify-each -passes='dxil-cont-post-process,lint,remove-types-metadata' -S %s --lint-abort-on-error 2>&1 | FileCheck %s --check-prefix=REPORT-SYSTEM-DATA-SIZES -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %struct.DispatchSystemData = type { i32 } %struct.CHSSystemData = type { [100 x i32] } @@ -15,7 +15,7 @@ declare 
void @continuation.continue(i64, ...) ; REPORT-CONT-SIZES: Continuation state size of "RayGen" (raygeneration): 108 bytes ; REPORT-PAYLOAD-SIZES: Incoming and max outgoing payload VGPR size of "RayGen" (raygeneration): 28 and 24 bytes -define void @RayGen(%struct.DispatchSystemData %0) !continuation.entry !0 !continuation !3 !continuation.state !5 !continuation.registercount !7 !lgc.rt.shaderstage !12 { +define void @RayGen(i64 %dummyRetAddr, %struct.DispatchSystemData %0) !continuation.entry !0 !continuation !3 !continuation.state !5 !continuation.registercount !7 !lgc.rt.shaderstage !12 { %csp = alloca i32, align 4 %cspInit = call i32 @continuation.initialContinuationStackPtr() store i32 %cspInit, i32* %csp @@ -26,7 +26,7 @@ define void @RayGen(%struct.DispatchSystemData %0) !continuation.entry !0 !conti ; This is needed as fake continuation of RayGen, because we only report continuation state sizes ; if we find a continuation function using !continuation metadata. ; REPORT-SYSTEM-DATA-SIZES-DAG: Incoming system data of "RayGen.resume.0" (raygeneration) is "struct.DispatchSystemData", size: 4 bytes -define void @RayGen.resume.0(%struct.DispatchSystemData %0) !continuation !3 !lgc.rt.shaderstage !12 { +define void @RayGen.resume.0(i64 %0, %struct.DispatchSystemData %1) !continuation !3 !lgc.rt.shaderstage !12 { ret void } diff --git a/llvmraytracing/test/dx/dxil-cont-post-process.ll b/llvmraytracing/test/dx/dxil-cont-post-process.ll index 2bd1e5d275..f1df1da558 100644 --- a/llvmraytracing/test/dx/dxil-cont-post-process.ll +++ b/llvmraytracing/test/dx/dxil-cont-post-process.ll @@ -1,8 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each -passes='dxil-cont-post-process,lint' -S %s 2> %t.stderr | FileCheck %s -; RUN: count 0 < %t.stderr +; RUN: opt --verify-each -passes='dxil-cont-post-process,lint' -S %s --lint-abort-on-error | FileCheck %s -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %struct.DispatchSystemData = type { i32 } @@ -37,27 +36,19 @@ end: ; preds = %complete, %0 ret %struct.DispatchSystemData %data } -define void @RayGen(%struct.DispatchSystemData %0) !lgc.rt.shaderstage !5 !continuation.entry !0 !continuation !3 { +define void @RayGen(i64 %dummyRetAddr, %struct.DispatchSystemData %0) !lgc.rt.shaderstage !5 !continuation.entry !0 !continuation !3 { ; CHECK-LABEL: define void @RayGen( -; CHECK-SAME: ) !lgc.rt.shaderstage [[META3:![0-9]+]] !continuation.entry [[META4:![0-9]+]] !continuation [[META5:![0-9]+]] { +; CHECK-SAME: i32 [[CSPINIT:%.*]], i64 [[DUMMYRETADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META3:![0-9]+]] !continuation.entry [[META4:![0-9]+]] !continuation [[META5:![0-9]+]] { ; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @_cont_GetContinuationStackAddr() -; CHECK-NEXT: store i32 [[TMP1]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[C_I:%.*]] = load i1, ptr @debug_global, align 1 -; CHECK-NEXT: br i1 [[C_I]], label [[COMPLETE_I:%.*]], label 
[[_CONT_SETUPRAYGEN_EXIT:%.*]] -; CHECK: complete.i: -; CHECK-NEXT: ret void -; CHECK: complete.i.split: -; CHECK-NEXT: br label [[_CONT_SETUPRAYGEN_EXIT]] -; CHECK: _cont_SetupRayGen.exit: +; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 ; CHECK-NEXT: ret void ; ret void } -define void @RayGen.resume.0(%struct.DispatchSystemData %0) !lgc.rt.shaderstage !5 !continuation !3 { +define void @RayGen.resume.0(i64 %0, %struct.DispatchSystemData %1) !lgc.rt.shaderstage !5 !continuation !3 { ; CHECK-LABEL: define void @RayGen.resume.0( -; CHECK-SAME: i32 [[CSPINIT:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META3]] !continuation [[META5]] { +; CHECK-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP1:%.*]]) !lgc.rt.shaderstage [[META3]] !continuation [[META5]] { ; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 ; CHECK-NEXT: ret void diff --git a/llvmraytracing/test/dx/dxil-cont-prepare-traversal.ll b/llvmraytracing/test/dx/dxil-cont-prepare-traversal.ll index dff10ffd45..917b3c4eb0 100644 --- a/llvmraytracing/test/dx/dxil-cont-prepare-traversal.ll +++ b/llvmraytracing/test/dx/dxil-cont-prepare-traversal.ll @@ -1,10 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3 -; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,remove-types-metadata' -S %s 2> %t0.stderr | FileCheck --check-prefix=PREPARE %s -; RUN: count 0 < %t0.stderr -; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S %s 2> %t1.stderr | FileCheck --check-prefix=ALL %s -; RUN: count 0 < %t1.stderr +; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck --check-prefix=PREPARE %s +; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck --check-prefix=ALL %s -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %struct.TraversalData = type { %struct.SystemData, i32 } %struct.SystemData = type { %struct.DispatchSystemData, float } @@ -27,7 +25,7 @@ declare !types !3 void @"\01?_AmdValueSetI32Something@@YA_KXZ"(%struct.Traversal declare !types !8 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) ; Function Attrs: nounwind -define void @_cont_Traversal(i32 %stackPtr, %struct.TraversalData* %data) #0 !types !4 { +define void @_cont_Traversal(%struct.TraversalData* %data) #0 !types !4 { %1 = getelementptr inbounds 
%struct.TraversalData, %struct.TraversalData* %data, i32 0, i32 1 %2 = load i32, i32* %1, align 4 %3 = icmp eq i32 %2, 0 @@ -48,11 +46,11 @@ define void @_cont_Traversal(i32 %stackPtr, %struct.TraversalData* %data) #0 !ty %a3 = add i32 %a2, %i3 %a4 = add i32 %a3, %i4 %addr = zext i32 %a4 to i64 - call void @_AmdWaitEnqueueCall(i64 %addr, i64 -1, i32 %stackPtr, %struct.SystemData* %4) #2 + call void @_AmdWaitEnqueueCall(i64 %addr, i64 -1, i32 0, %struct.SystemData* %4) #2 br label %7 6: ; preds = %0 - call void @_AmdWaitEnqueue(i64 0, i64 -1, i32 %stackPtr, %struct.SystemData* %4) #2 + call void @_AmdWaitEnqueue(i64 0, i64 -1, i32 2, %struct.SystemData* %4) #2 br label %7 7: ; preds = %6, %5 @@ -71,13 +69,13 @@ attributes #2 = { nounwind } !1 = !{i32 0, %struct.TraversalData poison} !2 = !{!"function", i32 poison, !1, i32 poison} !3 = !{!"function", !"void", !1, i32 poison, i32 poison} -!4 = !{!"function", !"void", i32 poison, !1} +!4 = !{!"function", !"void", !1} !5 = !{!"function", !"void", i64 poison, i64 poison, i32 poison, !6} !6 = !{i32 0, %struct.SystemData poison} !7 = !{i32 0, %struct.DispatchSystemData poison} !8 = !{!"function", i32 poison, !7} ; PREPARE-LABEL: define void @_cont_Traversal( -; PREPARE-SAME: i32 [[STACKPTR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[DATA:%.*]]) #[[ATTR1:[0-9]+]] !lgc.rt.shaderstage [[META0:![0-9]+]] { +; PREPARE-SAME: [[STRUCT_TRAVERSALDATA:%.*]] [[DATA:%.*]]) #[[ATTR1:[0-9]+]] !lgc.rt.shaderstage [[META0:![0-9]+]] { ; PREPARE-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_TRAVERSALDATA]], align 8 ; PREPARE-NEXT: store [[STRUCT_TRAVERSALDATA]] [[DATA]], ptr [[TMP1]], align 4 ; PREPARE-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[TMP1]], i32 0, i32 1 @@ -99,28 +97,33 @@ attributes #2 = { nounwind } ; PREPARE-NEXT: [[A4:%.*]] = add i32 [[A3]], [[I4]] ; PREPARE-NEXT: [[ADDR:%.*]] = zext i32 [[A4]] to i64 ; PREPARE-NEXT: [[TMP7:%.*]] = load [[STRUCT_SYSTEMDATA:%.*]], ptr [[TMP5]], align 4 -; PREPARE-NEXT: call void (i64, i64, ...) @continuation.waitContinue(i64 [[ADDR]], i64 -1, i32 [[STACKPTR]], i64 ptrtoint (ptr @_cont_Traversal to i64), [[STRUCT_SYSTEMDATA]] [[TMP7]]) +; PREPARE-NEXT: [[TMP10:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @_cont_Traversal) +; PREPARE-NEXT: call void (i64, i64, ...) @continuation.waitContinue(i64 [[ADDR]], i64 -1, i32 0, i64 [[TMP10]], [[STRUCT_SYSTEMDATA]] [[TMP7]]) ; PREPARE-NEXT: unreachable -; PREPARE: 8: +; PREPARE: 9: ; PREPARE-NEXT: [[TMP9:%.*]] = load [[STRUCT_SYSTEMDATA]], ptr [[TMP5]], align 4 -; PREPARE-NEXT: call void (i64, i64, ...) @continuation.waitContinue(i64 0, i64 -1, i32 [[STACKPTR]], [[STRUCT_SYSTEMDATA]] [[TMP9]]) +; PREPARE-NEXT: call void (i64, i64, ...) 
@continuation.waitContinue(i64 0, i64 -1, i32 2, [[STRUCT_SYSTEMDATA]] [[TMP9]]) ; PREPARE-NEXT: unreachable -; PREPARE: 10: +; PREPARE: 11: ; PREPARE-NEXT: ret void ; ; ; ALL-LABEL: define void @_cont_Traversal( -; ALL-SAME: i32 [[CSPINIT:%.*]], i32 [[STACKPTR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[DATA:%.*]]) #[[ATTR0:[0-9]+]] !lgc.rt.shaderstage [[META2:![0-9]+]] !continuation [[META3:![0-9]+]] !continuation.registercount [[META0:![0-9]+]] { +; ALL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] !lgc.rt.shaderstage [[META2:![0-9]+]] !continuation [[META3:![0-9]+]] !continuation.registercount [[META0:![0-9]+]] !continuation.state [[META4:![0-9]+]] { +; ALL-NEXT: AllocaSpillBB: ; ALL-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_TRAVERSALDATA]], align 8 ; ALL-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; ALL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; ALL-NEXT: [[DATA_FCA_0_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[DATA]], 0, 0, 0 +; ALL-NEXT: [[DOTFCA_0_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 0, 0, 0 +; ALL-NEXT: [[DOTFCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 0, 1 +; ALL-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 1 +; ALL-NEXT: [[DATA_FCA_0_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 0, 0, 0 ; ALL-NEXT: [[DATA_FCA_0_0_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[TMP1]], i32 0, i32 0, i32 0, i32 0 ; ALL-NEXT: store i32 [[DATA_FCA_0_0_0_EXTRACT]], ptr [[DATA_FCA_0_0_0_GEP]], align 4 -; ALL-NEXT: [[DATA_FCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[DATA]], 0, 1 +; ALL-NEXT: [[DATA_FCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 0, 1 ; ALL-NEXT: [[DATA_FCA_0_1_GEP:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[TMP1]], i32 0, i32 0, i32 1 ; ALL-NEXT: store float [[DATA_FCA_0_1_EXTRACT]], ptr [[DATA_FCA_0_1_GEP]], align 4 -; ALL-NEXT: [[DATA_FCA_1_EXTRACT:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[DATA]], 1 +; ALL-NEXT: [[DATA_FCA_1_EXTRACT:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 1 ; ALL-NEXT: [[DATA_FCA_1_GEP:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[TMP1]], i32 0, i32 1 ; ALL-NEXT: store i32 [[DATA_FCA_1_EXTRACT]], ptr [[DATA_FCA_1_GEP]], align 4 ; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[TMP1]], i32 0, i32 1 @@ -147,9 +150,9 @@ attributes #2 = { nounwind } ; ALL-NEXT: [[DOTFCA_1_GEP:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[TMP5]], i32 0, i32 1 ; ALL-NEXT: [[DOTFCA_1_LOAD:%.*]] = load float, ptr [[DOTFCA_1_GEP]], align 4 ; ALL-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] [[DOTFCA_0_0_INSERT]], float [[DOTFCA_1_LOAD]], 1 +; ALL-NEXT: [[TMP12:%.*]] = call i64 @continuation.getAddrAndMD(ptr @_cont_Traversal) ; ALL-NEXT: [[TMP11:%.*]] = load i32, ptr [[CSP]], align 4 -; ALL-NEXT: [[TMP12:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @_cont_Traversal to i64)) -; ALL-NEXT: call void (i64, i64, ...) @continuation.waitContinue(i64 [[ADDR]], i64 -1, i32 [[TMP11]], i32 [[STACKPTR]], i64 [[TMP12]], [[STRUCT_SYSTEMDATA]] [[DOTFCA_1_INSERT]]), !continuation.registercount [[META0]] +; ALL-NEXT: call void (i64, i64, ...) 
@continuation.waitContinue(i64 [[ADDR]], i64 -1, i32 [[TMP11]], i32 0, i64 [[TMP12]], [[STRUCT_SYSTEMDATA]] [[DOTFCA_1_INSERT]]), !continuation.registercount [[META0]] ; ALL-NEXT: unreachable ; ALL: 13: ; ALL-NEXT: [[DOTFCA_0_0_GEP1:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[TMP5]], i32 0, i32 0, i32 0 @@ -159,8 +162,6 @@ attributes #2 = { nounwind } ; ALL-NEXT: [[DOTFCA_1_LOAD5:%.*]] = load float, ptr [[DOTFCA_1_GEP4]], align 4 ; ALL-NEXT: [[DOTFCA_1_INSERT6:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] [[DOTFCA_0_0_INSERT3]], float [[DOTFCA_1_LOAD5]], 1 ; ALL-NEXT: [[TMP14:%.*]] = load i32, ptr [[CSP]], align 4 -; ALL-NEXT: call void (i64, i64, ...) @continuation.waitContinue(i64 0, i64 -1, i32 [[TMP14]], i32 [[STACKPTR]], [[STRUCT_SYSTEMDATA]] [[DOTFCA_1_INSERT6]]), !continuation.registercount [[META0]] +; ALL-NEXT: call void (i64, i64, ...) @continuation.waitContinue(i64 0, i64 -1, i32 [[TMP14]], i32 2, [[STRUCT_SYSTEMDATA]] [[DOTFCA_1_INSERT6]]), !continuation.registercount [[META0]] ; ALL-NEXT: unreachable -; ALL: 15: -; ALL-NEXT: ret void ; diff --git a/llvmraytracing/test/dx/dxil-cps-stack-lowering-global.ll b/llvmraytracing/test/dx/dxil-cps-stack-lowering-global.ll index d449f86723..f3acbd5aff 100644 --- a/llvmraytracing/test/dx/dxil-cps-stack-lowering-global.ll +++ b/llvmraytracing/test/dx/dxil-cps-stack-lowering-global.ll @@ -1,8 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3 -; RUN: opt --verify-each -passes='dxil-cont-post-process,lint,remove-types-metadata' -S %s 2> %t0.stderr | FileCheck -check-prefix=CPS-STACK-LOWERING-CPS %s -; RUN: count 0 < %t0.stderr +; RUN: opt --verify-each -passes='dxil-cont-post-process,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=CPS-STACK-LOWERING-CPS %s -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %dx.types.Handle = type { ptr } %struct.DispatchSystemData = type { i32 } @@ -169,7 +168,7 @@ attributes #6 = { nounwind willreturn memory(inaccessiblemem: read) } ; CPS-STACK-LOWERING-CPS-NEXT: store i32 [[TMP4]], ptr [[CSP]], align 4 ; CPS-STACK-LOWERING-CPS-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP3]] ; CPS-STACK-LOWERING-CPS-NEXT: store i32 [[RETURN_ADDR]], ptr addrspace(22) [[TMP5]], align 4 -; CPS-STACK-LOWERING-CPS-NEXT: [[TMP6:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] @continuations.getSystemData.s_struct.DispatchSystemDatas() +; CPS-STACK-LOWERING-CPS-NEXT: [[TMP6:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] [[CONTINUATIONS_GETSYSTEMDATA_S_STRUCT_DISPATCHSYSTEMDATAS:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]() ; CPS-STACK-LOWERING-CPS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP6]], 0 ; CPS-STACK-LOWERING-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; CPS-STACK-LOWERING-CPS-NEXT: [[TMP7:%.*]] = add i32 [[TMP3]], 9 @@ -177,8 +176,8 @@ attributes #6 = { nounwind willreturn memory(inaccessiblemem: read) } ; CPS-STACK-LOWERING-CPS-NEXT: store i32 99, ptr addrspace(22) 
[[TMP8]], align 4 ; CPS-STACK-LOWERING-CPS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT]], 0 ; CPS-STACK-LOWERING-CPS-NEXT: store i32 undef, ptr addrspace(20) @REGISTERS, align 4 +; CPS-STACK-LOWERING-CPS-NEXT: [[TMP10:%.*]] = call i64 @continuation.getAddrAndMD(ptr @called.resume.0) ; CPS-STACK-LOWERING-CPS-NEXT: [[TMP9:%.*]] = load i32, ptr [[CSP]], align 4 -; CPS-STACK-LOWERING-CPS-NEXT: [[TMP10:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @called.resume.0 to i64)) ; CPS-STACK-LOWERING-CPS-NEXT: call void (i64, ...) @continuation.continue(i64 2, i32 [[TMP9]], i64 [[TMP10]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]]), !continuation.registercount [[META16]] ; CPS-STACK-LOWERING-CPS-NEXT: unreachable ; diff --git a/llvmraytracing/test/dx/dxil-cps-stack-lowering-scratch.ll b/llvmraytracing/test/dx/dxil-cps-stack-lowering-scratch.ll index b0ec32bdb7..4d16405118 100644 --- a/llvmraytracing/test/dx/dxil-cps-stack-lowering-scratch.ll +++ b/llvmraytracing/test/dx/dxil-cps-stack-lowering-scratch.ll @@ -1,8 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3 -; RUN: opt --verify-each -passes='dxil-cont-post-process,lint,remove-types-metadata' -S %s 2> %t0.stderr | FileCheck -check-prefix=CPS-STACK-LOWERING-CPS %s -; RUN: count 0 < %t0.stderr +; RUN: opt --verify-each -passes='dxil-cont-post-process,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=CPS-STACK-LOWERING-CPS %s -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %dx.types.Handle = type { ptr } %struct.DispatchSystemData = type { i32 } @@ -166,7 +165,7 @@ attributes #6 = { nounwind willreturn memory(inaccessiblemem: read) } ; CPS-STACK-LOWERING-CPS-NEXT: [[TMP3:%.*]] = inttoptr i32 [[TMP1]] to ptr addrspace(21) ; CPS-STACK-LOWERING-CPS-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP3]], i32 0 ; CPS-STACK-LOWERING-CPS-NEXT: store i32 [[RETURN_ADDR]], ptr addrspace(21) [[TMP4]], align 4 -; CPS-STACK-LOWERING-CPS-NEXT: [[TMP5:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] @continuations.getSystemData.s_struct.DispatchSystemDatas() +; CPS-STACK-LOWERING-CPS-NEXT: [[TMP5:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] [[CONTINUATIONS_GETSYSTEMDATA_S_STRUCT_DISPATCHSYSTEMDATAS:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]() ; CPS-STACK-LOWERING-CPS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP5]], 0 ; CPS-STACK-LOWERING-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; CPS-STACK-LOWERING-CPS-NEXT: [[TMP6:%.*]] = add i32 [[TMP1]], 9 @@ -175,8 +174,8 @@ attributes #6 = { nounwind willreturn memory(inaccessiblemem: read) } ; CPS-STACK-LOWERING-CPS-NEXT: store i32 99, ptr addrspace(21) [[TMP8]], align 4 ; CPS-STACK-LOWERING-CPS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT]], 0 ; CPS-STACK-LOWERING-CPS-NEXT: store i32 undef, ptr addrspace(20) @REGISTERS, align 4 +; 
CPS-STACK-LOWERING-CPS-NEXT: [[TMP10:%.*]] = call i64 @continuation.getAddrAndMD(ptr @called.resume.0) ; CPS-STACK-LOWERING-CPS-NEXT: [[TMP9:%.*]] = load i32, ptr [[CSP]], align 4 -; CPS-STACK-LOWERING-CPS-NEXT: [[TMP10:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @called.resume.0 to i64)) ; CPS-STACK-LOWERING-CPS-NEXT: call void (i64, ...) @continuation.continue(i64 2, i32 [[TMP9]], i64 [[TMP10]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]]), !continuation.registercount [[META16]] ; CPS-STACK-LOWERING-CPS-NEXT: unreachable ; diff --git a/llvmraytracing/test/dx/global-mem-stack.ll b/llvmraytracing/test/dx/global-mem-stack.ll index 9dfec793e6..95a7dfc352 100644 --- a/llvmraytracing/test/dx/global-mem-stack.ll +++ b/llvmraytracing/test/dx/global-mem-stack.ll @@ -1,8 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3 -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S %s 2> %t.stderr | FileCheck %s -; RUN: count 0 < %t.stderr +; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck %s -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %struct.DispatchSystemData = type { <3 x i32> } %struct.TraversalData = type { %struct.SystemData, %struct.HitData, <3 x float>, <3 x float>, float, i64 } @@ -213,6 +212,6 @@ define void @MyClosestHitShader(%struct.RayPayload* noalias nocapture %payload, ; CHECK-NEXT: store i32 [[TMP43]], ptr addrspace(22) [[TMP44]], align 4 ; CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison, <3 x i32> [[DOTFCA_0_0_EXTRACT]], 0 ; CHECK-NEXT: [[TMP45:%.*]] = load i32, ptr [[CSP]], align 4 -; CHECK-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP45]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META8]] +; CHECK-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[RETURNADDR]], i32 [[TMP45]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META8]] ; CHECK-NEXT: unreachable ; diff --git a/llvmraytracing/test/dx/intersection-registercount.ll b/llvmraytracing/test/dx/intersection-registercount.ll index 9c6db7f9ce..1faa240456 100644 --- a/llvmraytracing/test/dx/intersection-registercount.ll +++ b/llvmraytracing/test/dx/intersection-registercount.ll @@ -1,5 +1,4 @@ -; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S %s 2> %t1.stderr | FileCheck %s -; RUN: count 0 < %t1.stderr +; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck %s ; Check that the size of @REGISTERS is as big as the continuation.registercount when there is an intersection shader ; CHECK: @REGISTERS = external addrspace(20) global [25 x i32] @@ -8,7 +7,7 @@ ; CHECK: define void @Intersection{{.*}}!continuation.registercount ![[MDREGCOUNT:[0-9]+]] ; CHECK: ![[MDREGCOUNT]] = !{i32 25} -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %dx.types.Handle = type { i8* } %struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } diff --git a/llvmraytracing/test/dx/intrinsics/cont-payload-registers-get-i32.ll b/llvmraytracing/test/dx/intrinsics/cont-payload-registers-get-i32.ll index d0ac18dfac..5f4629bae2 100644 --- a/llvmraytracing/test/dx/intrinsics/cont-payload-registers-get-i32.ll +++ b/llvmraytracing/test/dx/intrinsics/cont-payload-registers-get-i32.ll @@ -1,6 +1,6 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function main --version 2 -; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S %s 2> %t.stderr | FileCheck %s -; RUN: count 0 < %t.stderr +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function main --version 3 +; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=ALL %s +; RUN: opt --verify-each 
-passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s %struct.DispatchSystemData = type { i32 } %struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } @@ -12,7 +12,6 @@ declare i32 @_AmdContPayloadRegistersGetI32(i32) declare %struct.DispatchSystemData @_cont_SetupRayGen() -declare i32 @_cont_GetContinuationStackAddr() #0 declare !types !9 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) @@ -20,21 +19,33 @@ declare !types !11 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTrian declare !types !12 i32 @_cont_HitKind(%struct.DispatchSystemData*, %struct.HitData*) +define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !types !{!"function", !"void", !{i32 0, %struct.DispatchSystemData poison}} { + ret void +} + define void @main() { -; CHECK-LABEL: define void @main -; CHECK-SAME: () !continuation [[META11:![0-9]+]] !lgc.rt.shaderstage [[META5:![0-9]+]] !continuation.entry [[META12:![0-9]+]] !continuation.registercount [[META5]] !continuation.state [[META5]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @_cont_GetContinuationStackAddr() -; CHECK-NEXT: store i32 [[TMP0]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[SYSTEM_DATA:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA:%.*]] @_cont_SetupRayGen() -; CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[SYSTEM_DATA]], 0 -; CHECK-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds ([30 x i32], ptr addrspace(20) @REGISTERS, i32 0, i32 5), align 4 -; CHECK-NEXT: store i32 [[TMP1]], ptr @debug_global, align 4 -; CHECK-NEXT: ret void -; CHECK: entry.split: -; CHECK-NEXT: unreachable +; ALL-LABEL: define void @main( +; ALL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !continuation [[META12:![0-9]+]] !lgc.rt.shaderstage [[META5:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation.registercount [[META5]] !continuation.state [[META5]] { +; ALL-NEXT: entry: +; ALL-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; ALL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; ALL-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 +; ALL-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) +; ALL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds ([30 x i32], ptr addrspace(20) @REGISTERS, i32 0, i32 5), align 4 +; ALL-NEXT: store i32 [[TMP2]], ptr @debug_global, align 4 +; ALL-NEXT: ret void +; ALL: entry.split: +; ALL-NEXT: unreachable +; +; LOWERRAYTRACINGPIPELINE-LABEL: define void @main( +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !continuation [[META12:![0-9]+]] !lgc.rt.shaderstage [[META5:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation.registercount [[META5]] { +; LOWERRAYTRACINGPIPELINE-NEXT: entry: +; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 +; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) +; LOWERRAYTRACINGPIPELINE-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds ([30 x 
i32], ptr addrspace(20) @PAYLOAD, i32 0, i32 5), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[VAL]], ptr @debug_global, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: ret void ; entry: %val = call i32 @_AmdContPayloadRegistersGetI32(i32 5) diff --git a/llvmraytracing/test/dx/intrinsics/cont-payload-registers-i32-count.ll b/llvmraytracing/test/dx/intrinsics/cont-payload-registers-i32-count.ll index 0362e57888..b50f4f4e27 100644 --- a/llvmraytracing/test/dx/intrinsics/cont-payload-registers-i32-count.ll +++ b/llvmraytracing/test/dx/intrinsics/cont-payload-registers-i32-count.ll @@ -1,6 +1,6 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S %s 2> %t0.stderr | FileCheck -check-prefix=MINCOUNT %s -; RUN: count 0 < %t0.stderr +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=MINCOUNT %s +; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE-MINCOUNT %s %struct.DispatchSystemData = type { i32 } @@ -9,25 +9,35 @@ declare i32 @_AmdContPayloadRegistersI32Count() declare %struct.DispatchSystemData @_cont_SetupRayGen() -declare i32 @_cont_GetContinuationStackAddr() #0 declare !types !9 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) +define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !types !{!"function", !"void", !{i32 0, %struct.DispatchSystemData poison}} { + ret void +} + define void @main() { -; MINCOUNT-LABEL: define void @main -; MINCOUNT-SAME: () !continuation [[META10:![0-9]+]] !lgc.rt.shaderstage [[META5:![0-9]+]] !continuation.entry [[META11:![0-9]+]] !continuation.registercount [[META5]] !continuation.state [[META5]] { +; MINCOUNT-LABEL: define void @main( +; MINCOUNT-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !continuation [[META10:![0-9]+]] !lgc.rt.shaderstage [[META5:![0-9]+]] !continuation.entry [[META11:![0-9]+]] !continuation.registercount [[META5]] !continuation.state [[META5]] { ; MINCOUNT-NEXT: entry: ; MINCOUNT-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; MINCOUNT-NEXT: [[TMP0:%.*]] = call i32 @_cont_GetContinuationStackAddr() -; MINCOUNT-NEXT: store i32 [[TMP0]], ptr [[CSP]], align 4 -; MINCOUNT-NEXT: [[SYSTEM_DATA:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA:%.*]] @_cont_SetupRayGen() -; MINCOUNT-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[SYSTEM_DATA]], 0 +; MINCOUNT-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; MINCOUNT-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 ; MINCOUNT-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; 
MINCOUNT-NEXT: store i32 11, ptr @debug_global, align 4 ; MINCOUNT-NEXT: ret void ; MINCOUNT: entry.split: ; MINCOUNT-NEXT: unreachable ; +; LOWERRAYTRACINGPIPELINE-MINCOUNT-LABEL: define void @main( +; LOWERRAYTRACINGPIPELINE-MINCOUNT-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !continuation [[META10:![0-9]+]] !lgc.rt.shaderstage [[META5:![0-9]+]] !continuation.entry [[META11:![0-9]+]] !continuation.registercount [[META5]] { +; LOWERRAYTRACINGPIPELINE-MINCOUNT-NEXT: entry: +; LOWERRAYTRACINGPIPELINE-MINCOUNT-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 +; LOWERRAYTRACINGPIPELINE-MINCOUNT-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-MINCOUNT-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) +; LOWERRAYTRACINGPIPELINE-MINCOUNT-NEXT: store i32 11, ptr @debug_global, align 4 +; LOWERRAYTRACINGPIPELINE-MINCOUNT-NEXT: ret void +; entry: %val = call i32 @_AmdContPayloadRegistersI32Count() store i32 %val, i32* @debug_global, align 4 diff --git a/llvmraytracing/test/dx/intrinsics/cont-payload-registers-set-i32.ll b/llvmraytracing/test/dx/intrinsics/cont-payload-registers-set-i32.ll index 5de545c731..14a1f07454 100644 --- a/llvmraytracing/test/dx/intrinsics/cont-payload-registers-set-i32.ll +++ b/llvmraytracing/test/dx/intrinsics/cont-payload-registers-set-i32.ll @@ -1,6 +1,6 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function main --version 2 -; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S %s 2> %t.stderr | FileCheck %s -; RUN: count 0 < %t.stderr +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function main --version 3 +; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=ALL %s +; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s %struct.DispatchSystemData = type { i32 } %struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } @@ -10,7 +10,6 @@ declare void @_AmdContPayloadRegistersSetI32(i32, i32) declare %struct.DispatchSystemData @_cont_SetupRayGen() -declare i32 @_cont_GetContinuationStackAddr() #0 declare !types !9 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) @@ -18,20 +17,31 @@ declare !types !11 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTrian declare !types !12 i32 @_cont_HitKind(%struct.DispatchSystemData*, %struct.HitData*) +define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !types !{!"function", !"void", !{i32 0, %struct.DispatchSystemData poison}} { + ret void +} + define void @main() { -; CHECK-LABEL: define void @main -; CHECK-SAME: () !continuation [[META11:![0-9]+]] !lgc.rt.shaderstage [[META5:![0-9]+]] 
!continuation.entry [[META12:![0-9]+]] !continuation.registercount [[META5]] !continuation.state [[META5]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @_cont_GetContinuationStackAddr() -; CHECK-NEXT: store i32 [[TMP0]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[SYSTEM_DATA:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA:%.*]] @_cont_SetupRayGen() -; CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[SYSTEM_DATA]], 0 -; CHECK-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) -; CHECK-NEXT: store i32 42, ptr addrspace(20) getelementptr inbounds ([30 x i32], ptr addrspace(20) @REGISTERS, i32 0, i32 5), align 4 -; CHECK-NEXT: ret void -; CHECK: entry.split: -; CHECK-NEXT: unreachable +; ALL-LABEL: define void @main( +; ALL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !continuation [[META12:![0-9]+]] !lgc.rt.shaderstage [[META5:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation.registercount [[META5]] !continuation.state [[META5]] { +; ALL-NEXT: entry: +; ALL-NEXT: [[CSP:%.*]] = alloca i32, align 4 +; ALL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; ALL-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 +; ALL-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) +; ALL-NEXT: store i32 42, ptr addrspace(20) getelementptr inbounds ([30 x i32], ptr addrspace(20) @REGISTERS, i32 0, i32 5), align 4 +; ALL-NEXT: ret void +; ALL: entry.split: +; ALL-NEXT: unreachable +; +; LOWERRAYTRACINGPIPELINE-LABEL: define void @main( +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !continuation [[META12:![0-9]+]] !lgc.rt.shaderstage [[META5:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation.registercount [[META5]] { +; LOWERRAYTRACINGPIPELINE-NEXT: entry: +; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 +; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 42, ptr addrspace(20) getelementptr inbounds ([30 x i32], ptr addrspace(20) @PAYLOAD, i32 0, i32 5), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: ret void ; entry: call void @_AmdContPayloadRegistersSetI32(i32 5, i32 42) diff --git a/llvmraytracing/test/dx/intrinsics/cont-stack-access.ll b/llvmraytracing/test/dx/intrinsics/cont-stack-access.ll index 9d0dc51dc7..413c7c4492 100644 --- a/llvmraytracing/test/dx/intrinsics/cont-stack-access.ll +++ b/llvmraytracing/test/dx/intrinsics/cont-stack-access.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: grep -v SKIP_GLOBAL_ADDRSPACE %s | opt --verify-each -passes='dxil-cont-post-process,lint,remove-types-metadata' -S 2> %t0.stderr | FileCheck %s -check-prefix=STACK_SCRATCH -; RUN: count 0 < %t0.stderr -; RUN: grep -v SKIP_SCRATCH_ADDRSPACE %s | opt --verify-each -passes='dxil-cont-post-process,lint,remove-types-metadata' -S 2> %t1.stderr | FileCheck %s -check-prefix=STACK_GLOBAL -; RUN: count 0 < %t1.stderr +; RUN: grep -v SKIP_GLOBAL_ADDRSPACE %s | opt --verify-each -passes='dxil-cont-post-process,lint,remove-types-metadata' -S --lint-abort-on-error | FileCheck %s -check-prefix=STACK_SCRATCH +; RUN: grep -v SKIP_SCRATCH_ADDRSPACE %s | opt --verify-each 
-passes='dxil-cont-post-process,lint,remove-types-metadata' -S --lint-abort-on-error | FileCheck %s -check-prefix=STACK_GLOBAL declare i32 @_AmdContStackAlloc(i32 %size) declare i32 @_AmdContStackLoadI32(i32 %addr) diff --git a/llvmraytracing/test/dx/intrinsics/cont-stack-alloc.ll b/llvmraytracing/test/dx/intrinsics/cont-stack-alloc.ll index 1a883d0619..1cef618038 100644 --- a/llvmraytracing/test/dx/intrinsics/cont-stack-alloc.ll +++ b/llvmraytracing/test/dx/intrinsics/cont-stack-alloc.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function main --version 2 -; RUN: opt --verify-each -passes='cgscc(inline),lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S %s 2> %t.stderr | FileCheck %s -; RUN: count 0 < %t.stderr +; RUN: opt --verify-each -passes='cgscc(inline),lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck %s declare i32 @_AmdContStackAlloc(i32 %size) declare i32 @_AmdContPayloadRegistersI32Count() @@ -18,15 +17,17 @@ declare !types !12 i32 @_cont_HitKind(%struct.DispatchSystemData*, %struct.HitDa @debug_global = external global i32 +define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !types !{!"function", !"void", !{i32 0, %struct.DispatchSystemData poison}} { + ret void +} + define void @main() !lgc.rt.shaderstage !17 { ; CHECK-LABEL: define void @main -; CHECK-SAME: () !lgc.rt.shaderstage [[META5:![0-9]+]] !continuation.entry [[META11:![0-9]+]] !continuation.registercount [[META5]] !continuation [[META12:![0-9]+]] !continuation.stacksize [[META13:![0-9]+]] !continuation.state [[META5]] { +; CHECK-SAME: (i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META5:![0-9]+]] !continuation.entry [[META11:![0-9]+]] !continuation.registercount [[META5]] !continuation [[META12:![0-9]+]] !continuation.stacksize [[META13:![0-9]+]] !continuation.state [[META5]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @_cont_GetContinuationStackAddr() -; CHECK-NEXT: store i32 [[TMP0]], ptr [[CSP]], align 4 -; CHECK-NEXT: [[SYSTEM_DATA:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA:%.*]] @_cont_SetupRayGen() -; CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[SYSTEM_DATA]], 0 +; CHECK-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 ; CHECK-NEXT: [[PL_BYTES:%.*]] = mul i32 30, 4 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[CSP]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 120 diff --git a/llvmraytracing/test/dx/intrinsics/continuation-stack-is-global-false.ll b/llvmraytracing/test/dx/intrinsics/continuation-stack-is-global-false.ll index 9c47ed0d7f..e52eb1c016 100644 --- a/llvmraytracing/test/dx/intrinsics/continuation-stack-is-global-false.ll +++ b/llvmraytracing/test/dx/intrinsics/continuation-stack-is-global-false.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each 
-passes='dxil-cont-lgc-rt-op-converter,lint,dxil-cont-intrinsic-prepare,lint' -S %s 2> %t.stderr | FileCheck %s -; RUN: count 0 < %t.stderr +; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,dxil-cont-intrinsic-prepare,lint' -S %s --lint-abort-on-error | FileCheck %s %struct.DispatchSystemData = type { i32 } diff --git a/llvmraytracing/test/dx/intrinsics/continuation-stack-is-global-true.ll b/llvmraytracing/test/dx/intrinsics/continuation-stack-is-global-true.ll index cb440e7b9a..31da237f86 100644 --- a/llvmraytracing/test/dx/intrinsics/continuation-stack-is-global-true.ll +++ b/llvmraytracing/test/dx/intrinsics/continuation-stack-is-global-true.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,dxil-cont-intrinsic-prepare,lint' -S %s 2> %t.stderr | FileCheck %s -; RUN: count 0 < %t.stderr +; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,dxil-cont-intrinsic-prepare,lint' -S %s --lint-abort-on-error | FileCheck %s %struct.DispatchSystemData = type { i32 } diff --git a/llvmraytracing/test/dx/intrinsics/get-current-func-addr.ll b/llvmraytracing/test/dx/intrinsics/get-current-func-addr.ll index 16407ac36d..bc4726f56f 100644 --- a/llvmraytracing/test/dx/intrinsics/get-current-func-addr.ll +++ b/llvmraytracing/test/dx/intrinsics/get-current-func-addr.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes='lower-raytracing-pipeline,lint' -S 2> %t0.stderr | FileCheck --check-prefix=CHECK %s -; RUN: count 0 < %t0.stderr -; RUN: opt --verify-each -passes='lower-raytracing-pipeline,lint' -S %s 2> %t1.stderr | FileCheck --check-prefix=CHECK-CPS %s -; RUN: count 0 < %t1.stderr +; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes='lower-raytracing-pipeline,lint' -S --lint-abort-on-error | FileCheck --check-prefix=CHECK %s +; RUN: opt --verify-each -passes='lower-raytracing-pipeline,lint' -S %s --lint-abort-on-error | FileCheck --check-prefix=CHECK-CPS %s %struct.DispatchSystemData = type { i32 } @@ -13,7 +11,8 @@ declare !types !2 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) define void @MyRayGen() { ; CHECK-LABEL: define void @MyRayGen() { ; CHECK-NEXT: AllocaSpillBB: -; CHECK-NEXT: call void @Use(i64 ptrtoint (ptr @MyRayGen to i64)) +; CHECK-NEXT: [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @MyRayGen) +; CHECK-NEXT: call void @Use(i64 [[TMP0]]) ; CHECK-NEXT: ret void ; ; CHECK-CPS-LABEL: define void @MyRayGen() { @@ -31,7 +30,8 @@ AllocaSpillBB: define void @MyRayGen.resume.0() { ; CHECK-LABEL: define void @MyRayGen.resume.0() { ; CHECK-NEXT: entryresume.0: -; CHECK-NEXT: call void @Use(i64 ptrtoint (ptr @MyRayGen.resume.0 to i64)) +; CHECK-NEXT: [[TMP0:%.*]] = call i64 (...) 
@lgc.cps.as.continuation.reference__i64(ptr @MyRayGen.resume.0) +; CHECK-NEXT: call void @Use(i64 [[TMP0]]) ; CHECK-NEXT: ret void ; ; CHECK-CPS-LABEL: define void @MyRayGen.resume.0() { diff --git a/llvmraytracing/test/dx/intrinsics/get-flags.ll b/llvmraytracing/test/dx/intrinsics/get-flags.ll index e442c896be..2cc945f7e6 100644 --- a/llvmraytracing/test/dx/intrinsics/get-flags.ll +++ b/llvmraytracing/test/dx/intrinsics/get-flags.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint' -S %s 2> %t.stderr | FileCheck %s -; RUN: count 0 < %t.stderr +; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint' -S %s --lint-abort-on-error | FileCheck %s declare i32 @_AmdContinuationsGetFlags() diff --git a/llvmraytracing/test/dx/intrinsics/get-rtip.ll b/llvmraytracing/test/dx/intrinsics/get-rtip.ll index 0ddfdcebe7..08faf97e4e 100644 --- a/llvmraytracing/test/dx/intrinsics/get-rtip.ll +++ b/llvmraytracing/test/dx/intrinsics/get-rtip.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint' -S %s 2> %t.stderr | FileCheck %s -; RUN: count 0 < %t.stderr +; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint' -S %s --lint-abort-on-error | FileCheck %s declare i32 @_AmdGetRtip() diff --git a/llvmraytracing/test/dx/intrinsics/get-setting.ll b/llvmraytracing/test/dx/intrinsics/get-setting.ll index f6a2433fc4..d9d38e41a6 100644 --- a/llvmraytracing/test/dx/intrinsics/get-setting.ll +++ b/llvmraytracing/test/dx/intrinsics/get-setting.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint' -S %s 2> %t.stderr | FileCheck %s -; RUN: count 0 < %t.stderr +; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint' -S %s --lint-abort-on-error | FileCheck %s declare i32 @_AmdGetSetting_123() diff --git a/llvmraytracing/test/dx/intrinsics/get-shader-kind.ll b/llvmraytracing/test/dx/intrinsics/get-shader-kind.ll index d5b25bae75..ad9bd34f88 100644 --- a/llvmraytracing/test/dx/intrinsics/get-shader-kind.ll +++ b/llvmraytracing/test/dx/intrinsics/get-shader-kind.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint' -S %s 2> %t.stderr | FileCheck %s -; RUN: count 0 < %t.stderr +; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint' -S %s --lint-abort-on-error | FileCheck %s %struct.DispatchSystemData = type { i32 } %struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } @@ -29,22 +28,23 @@ define float @_cont_RayTCurrent() { ; Note: DXILShaderKind::Miss has value 11 define void @MyMiss(%struct.Payload* %payload) !types !1 !lgc.rt.shaderstage !16 { ; CHECK-LABEL: define %struct.DispatchSystemData @MyMiss -; CHECK-SAME: ([[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META12:![0-9]+]] !continuation.registercount [[META5:![0-9]+]] !continuation [[META13:![0-9]+]] { +; CHECK-SAME: (i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META12:![0-9]+]] !continuation.registercount [[META5:![0-9]+]] !continuation [[META13:![0-9]+]] { ; 
CHECK-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_PAYLOAD:%.*]], align 8 ; CHECK-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @_cont_GetLocalRootIndex(ptr [[SYSTEM_DATA_ALLOCA]]) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_PAYLOAD]], ptr [[TMP2]], i32 0, i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_PAYLOAD]], ptr [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 ; CHECK-NEXT: store i32 [[TMP5]], ptr [[TMP4]], align 4 ; CHECK-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[TMP3]]) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_PAYLOAD]], ptr [[TMP2]], i32 0, i32 0 ; CHECK-NEXT: store i32 11, ptr [[TMP6]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_PAYLOAD]], ptr [[TMP2]], i32 0, i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_PAYLOAD]], ptr [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4 ; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(20) @PAYLOAD, align 4 ; CHECK-NEXT: [[TMP9:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-NEXT: ret [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP9]], !continuation.registercount [[META5]] +; CHECK-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP9]]), !continuation.registercount [[META5]] +; CHECK-NEXT: unreachable ; %1 = call i32 @_AmdGetShaderKind() %2 = getelementptr inbounds %struct.Payload, %struct.Payload* %payload, i32 0, i32 0 diff --git a/llvmraytracing/test/dx/intrinsics/shader-index.ll b/llvmraytracing/test/dx/intrinsics/shader-index.ll index 0ac86c8c75..6913bd849b 100644 --- a/llvmraytracing/test/dx/intrinsics/shader-index.ll +++ b/llvmraytracing/test/dx/intrinsics/shader-index.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint' -S %s 2> %t.stderr | FileCheck %s -; RUN: count 0 < %t.stderr +; RUN: opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint" -S %s --lint-abort-on-error | FileCheck %s %struct.DispatchSystemData = type { i32 } %struct.Payload = type { i32 } @@ -17,9 +16,13 @@ define i1 @_cont_ReportHit(%struct.DispatchSystemData* %data, float %t, i32 %hit ret i1 true } +define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !types !{!"function", !"void", !{i32 0, %struct.DispatchSystemData poison}} { + ret void +} + define void @main() !lgc.rt.shaderstage !24 { ; CHECK-LABEL: define void @main( -; CHECK-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META13:![0-9]+]] !lgc.cps [[META13]] !continuation [[META14:![0-9]+]] { +; CHECK-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META13:![0-9]+]] !lgc.cps [[META10:![0-9]+]] !continuation [[META14:![0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; CHECK-NEXT: [[PAYLOAD_ALLOCA:%.*]] = alloca [30 x i32], align 4 @@ -36,24 +39,24 @@ entry: define void @callable(%struct.Payload* %payload) !types 
!22 !lgc.rt.shaderstage !25 { ; CHECK-LABEL: define void @callable( -; CHECK-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META15:![0-9]+]] !lgc.cps [[META10:![0-9]+]] !continuation [[META16:![0-9]+]] { +; CHECK-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META15:![0-9]+]] !lgc.cps [[META16:![0-9]+]] !continuation [[META17:![0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; CHECK-NEXT: [[PAYLOAD_ALLOCA:%.*]] = alloca [30 x i32], align 4 ; CHECK-NEXT: [[TMP0:%.*]] = alloca [[STRUCT_PAYLOAD:%.*]], align 8 ; CHECK-NEXT: store [1 x i32] [[PAYLOAD]], ptr [[PAYLOAD_ALLOCA]], align 4 ; CHECK-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[SYSTEM_DATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_PAYLOAD]], ptr [[TMP0]], i32 0, i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_PAYLOAD]], ptr [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[PAYLOAD_ALLOCA]], align 4 ; CHECK-NEXT: store i32 [[TMP2]], ptr [[TMP1]], align 4 ; CHECK-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[SHADER_INDEX]]) ; CHECK-NEXT: store i32 [[SHADER_INDEX]], ptr @debug_global, align 4 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_PAYLOAD]], ptr [[TMP0]], i32 0, i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 -; CHECK-NEXT: store i32 [[TMP5]], ptr [[PAYLOAD_ALLOCA]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load [1 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 -; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR]], i32 3, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP7]], [8 x i32] poison, [1 x i32] [[TMP8]]), !continuation.registercount [[META10]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_PAYLOAD]], ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 +; CHECK-NEXT: store i32 [[TMP4]], ptr [[PAYLOAD_ALLOCA]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load [1 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 +; CHECK-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP5]], [8 x i32] poison, [1 x i32] [[TMP6]]), !continuation.registercount [[META10]] ; CHECK-NEXT: unreachable ; entry: diff --git a/llvmraytracing/test/dx/intrinsics/value-i32.ll b/llvmraytracing/test/dx/intrinsics/value-i32.ll index 5bc45184a7..d6952f0be3 100644 --- a/llvmraytracing/test/dx/intrinsics/value-i32.ll +++ b/llvmraytracing/test/dx/intrinsics/value-i32.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt --verify-each -passes='dxil-cont-post-process,lint' -S %s 2> %t.stderr | FileCheck %s -; RUN: count 0 < %t.stderr +; RUN: opt --verify-each -passes='dxil-cont-post-process,lint' -S %s --lint-abort-on-error | FileCheck %s %struct.Payload = type { float, i32, i64, i32 } @@ -12,7 +11,7 @@ declare !types !3 void @_AmdValueSetI32(%struct.Payload*, i32, i32) define i32 @count(%struct.Payload* %pl) !types !0 { ; CHECK-LABEL: define i32 @count -; CHECK-SAME: (ptr [[PL:%.*]]) !types [[META0:![0-9]+]] { +; CHECK-SAME: (ptr [[PL:%.*]]) !types [[META1:![0-9]+]] { ; CHECK-NEXT: ret i32 5 ; %val = call i32 @_AmdValueI32Count(%struct.Payload* %pl) @@ -21,7 +20,7 @@ define i32 @count(%struct.Payload* %pl) !types !0 { define i32 @get(%struct.Payload* %pl) !types !0 { ; CHECK-LABEL: define i32 @get -; CHECK-SAME: (ptr [[PL:%.*]]) !types [[META0]] { +; CHECK-SAME: (ptr [[PL:%.*]]) !types [[META1]] { ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PL]], i32 2 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 ; CHECK-NEXT: ret i32 [[TMP2]] @@ -32,7 +31,7 @@ define i32 @get(%struct.Payload* %pl) !types !0 { define void @set(%struct.Payload* %pl, i32 %val) !types !4 { ; CHECK-LABEL: define void @set -; CHECK-SAME: (ptr [[PL:%.*]], i32 [[VAL:%.*]]) !types [[META2:![0-9]+]] { +; CHECK-SAME: (ptr [[PL:%.*]], i32 [[VAL:%.*]]) !types [[META3:![0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PL]], i32 2 ; CHECK-NEXT: store i32 [[VAL]], ptr [[TMP1]], align 4 ; CHECK-NEXT: ret void diff --git a/llvmraytracing/test/dx/lower-await.ll b/llvmraytracing/test/dx/lower-await.ll index 04e054bf5c..c3ba72d4ad 100644 --- a/llvmraytracing/test/dx/lower-await.ll +++ b/llvmraytracing/test/dx/lower-await.ll @@ -1,12 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each -passes='lower-await,lint' -S %s 2> %t0.stderr | FileCheck -check-prefix=AWAIT %s -; RUN: count 0 < %t0.stderr -; RUN: opt --verify-each -passes='lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint' -S %s 2> %t1.stderr | FileCheck -check-prefix=CORO %s -; RUN: count 0 < %t1.stderr -; RUN: opt --verify-each -passes='lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint' -S %s 2> %t2.stderr | FileCheck -check-prefix=CLEANED %s -; RUN: count 0 < %t2.stderr +; RUN: opt --verify-each -passes='lower-await,lint' -S %s --lint-abort-on-error | FileCheck -check-prefix=AWAIT %s +; RUN: opt --verify-each -passes='lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint' -S %s --lint-abort-on-error | FileCheck -check-prefix=CORO %s +; RUN: opt --verify-each -passes='lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint' -S %s --lint-abort-on-error | FileCheck -check-prefix=CLEANED %s -target datalayout = 
"e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %continuation.token = type { } @@ -16,33 +13,34 @@ declare %continuation.token* @async_fun() declare %continuation.token* @async_fun_with_waitmask(i64) declare %continuation.token* @async_fun_with_arg(i32) -define void @simple_await() !continuation.registercount !1 { +define void @simple_await(i64 %dummyRetAddr) !continuation.registercount !1 { ; AWAIT-LABEL: define { ptr, ptr } @simple_await( -; AWAIT-SAME: i64 [[RETURNADDR:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] { +; AWAIT-SAME: i64 [[DUMMYRETADDR:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] { ; AWAIT-NEXT: [[TMP2:%.*]] = call token @llvm.coro.id.retcon(i32 8, i32 4, ptr [[TMP0]], ptr @continuation.prototype.simple_await, ptr @continuation.malloc, ptr @continuation.free) ; AWAIT-NEXT: [[TMP3:%.*]] = call ptr @llvm.coro.begin(token [[TMP2]], ptr null) -; AWAIT-NEXT: [[TOK:%.*]] = call ptr @async_fun(), !continuation.registercount [[META1]], !continuation.returnedRegistercount !1 +; AWAIT-NEXT: [[TOK:%.*]] = call ptr @async_fun(), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] ; AWAIT-NEXT: [[TMP4:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TOK]]) -; AWAIT-NEXT: call void (...) @continuation.return(i64 [[RETURNADDR]]), !continuation.registercount [[META1]] +; AWAIT-NEXT: call void (...) 
@lgc.ilcps.return(i64 [[DUMMYRETADDR]]), !continuation.registercount [[META1]] ; AWAIT-NEXT: unreachable ; ; CORO-LABEL: define { ptr, ptr } @simple_await( -; CORO-SAME: i64 [[RETURNADDR:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] { +; CORO-SAME: i64 [[DUMMYRETADDR:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] { ; CORO-NEXT: AllocaSpillBB: ; CORO-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME:%.*]], ptr [[TMP0]], i32 0, i32 0 -; CORO-NEXT: store i64 [[RETURNADDR]], ptr [[RETURNADDR_SPILL_ADDR]], align 4 -; CORO-NEXT: [[TOK:%.*]] = call ptr @async_fun(), !continuation.registercount [[META1]], !continuation.returnedRegistercount !1 -; CORO-NEXT: [[TMP1:%.*]] = insertvalue { ptr, ptr } {{undef|poison}}, ptr @simple_await.resume.0, 0 +; CORO-NEXT: store i64 [[DUMMYRETADDR]], ptr [[RETURNADDR_SPILL_ADDR]], align 4 +; CORO-NEXT: [[TOK:%.*]] = call ptr @async_fun(), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] +; CORO-NEXT: [[TMP1:%.*]] = insertvalue { ptr, ptr } poison, ptr @simple_await.resume.0, 0 ; CORO-NEXT: [[TMP2:%.*]] = insertvalue { ptr, ptr } [[TMP1]], ptr [[TOK]], 1 ; CORO-NEXT: ret { ptr, ptr } [[TMP2]] ; ; CLEANED-LABEL: define void @simple_await( -; CLEANED-SAME: i64 [[RETURNADDR:%.*]]) !continuation.registercount [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] !continuation.stacksize [[META3:![0-9]+]] !continuation.state [[META3]] { +; CLEANED-SAME: i64 [[DUMMYRETADDR:%.*]]) !continuation.registercount [[META1:![0-9]+]] !continuation [[META2:![0-9]+]] !continuation.stacksize [[META3:![0-9]+]] !continuation.state [[META3]] { ; CLEANED-NEXT: AllocaSpillBB: ; CLEANED-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8) ; CLEANED-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[SIMPLE_AWAIT_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 -; CLEANED-NEXT: store i64 [[RETURNADDR]], ptr addrspace(32) [[RETURNADDR_SPILL_ADDR]], align 4 -; CLEANED-NEXT: call void (i64, ...) @continuation.continue(i64 ptrtoint (ptr @async_fun to i64), i64 ptrtoint (ptr @simple_await.resume.0 to i64)), !continuation.registercount [[META1]], !continuation.returnedRegistercount !1 +; CLEANED-NEXT: store i64 [[DUMMYRETADDR]], ptr addrspace(32) [[RETURNADDR_SPILL_ADDR]], align 4 +; CLEANED-NEXT: [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @simple_await.resume.0) +; CLEANED-NEXT: call void (i64, ...) 
@continuation.continue(i64 ptrtoint (ptr @async_fun to i64), i64 [[TMP0]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] ; CLEANED-NEXT: unreachable ; %tok = call %continuation.token* @async_fun(), !continuation.registercount !1, !continuation.returnedRegistercount !1 @@ -55,23 +53,24 @@ define void @simple_await_entry() !continuation.entry !0 !continuation.registerc ; AWAIT-SAME: ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation.entry [[META3:![0-9]+]] !continuation [[META4:![0-9]+]] { ; AWAIT-NEXT: [[TMP2:%.*]] = call token @llvm.coro.id.retcon(i32 8, i32 4, ptr [[TMP0]], ptr @continuation.prototype.simple_await_entry, ptr @continuation.malloc, ptr @continuation.free) ; AWAIT-NEXT: [[TMP3:%.*]] = call ptr @llvm.coro.begin(token [[TMP2]], ptr null) -; AWAIT-NEXT: [[TOK:%.*]] = call ptr @async_fun(), !continuation.registercount [[META1]], !continuation.returnedRegistercount !1 +; AWAIT-NEXT: [[TOK:%.*]] = call ptr @async_fun(), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] ; AWAIT-NEXT: [[TMP4:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TOK]]) -; AWAIT-NEXT: call void (...) @continuation.return(i64 undef) +; AWAIT-NEXT: call void (...) @lgc.ilcps.return(i64 undef) ; AWAIT-NEXT: unreachable ; ; CORO-LABEL: define { ptr, ptr } @simple_await_entry( ; CORO-SAME: ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation.entry [[META3:![0-9]+]] !continuation [[META4:![0-9]+]] { ; CORO-NEXT: AllocaSpillBB: -; CORO-NEXT: [[TOK:%.*]] = call ptr @async_fun(), !continuation.registercount [[META1]], !continuation.returnedRegistercount !1 -; CORO-NEXT: [[TMP1:%.*]] = insertvalue { ptr, ptr } {{undef|poison}}, ptr @simple_await_entry.resume.0, 0 +; CORO-NEXT: [[TOK:%.*]] = call ptr @async_fun(), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] +; CORO-NEXT: [[TMP1:%.*]] = insertvalue { ptr, ptr } poison, ptr @simple_await_entry.resume.0, 0 ; CORO-NEXT: [[TMP2:%.*]] = insertvalue { ptr, ptr } [[TMP1]], ptr [[TOK]], 1 ; CORO-NEXT: ret { ptr, ptr } [[TMP2]] ; ; CLEANED-LABEL: define void @simple_await_entry( ; CLEANED-SAME: ) !continuation.registercount [[META1]] !continuation.entry [[META4:![0-9]+]] !continuation [[META5:![0-9]+]] !continuation.state [[META1]] { ; CLEANED-NEXT: AllocaSpillBB: -; CLEANED-NEXT: call void (i64, ...) @continuation.continue(i64 ptrtoint (ptr @async_fun to i64), i64 ptrtoint (ptr @simple_await_entry.resume.0 to i64)), !continuation.registercount [[META1]], !continuation.returnedRegistercount !1 +; CLEANED-NEXT: [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @simple_await_entry.resume.0) +; CLEANED-NEXT: call void (i64, ...) 
@continuation.continue(i64 ptrtoint (ptr @async_fun to i64), i64 [[TMP0]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] ; CLEANED-NEXT: unreachable ; %tok = call %continuation.token* @async_fun(), !continuation.registercount !1, !continuation.returnedRegistercount !1 @@ -80,33 +79,34 @@ define void @simple_await_entry() !continuation.entry !0 !continuation.registerc ret void } -define void @await_with_arg(i32 %i) !continuation.registercount !1 { +define void @await_with_arg(i64 %dummyRetAddr, i32 %i) !continuation.registercount !1 { ; AWAIT-LABEL: define { ptr, ptr } @await_with_arg( -; AWAIT-SAME: i64 [[RETURNADDR:%.*]], i32 [[I:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation [[META5:![0-9]+]] { +; AWAIT-SAME: i64 [[DUMMYRETADDR:%.*]], i32 [[I:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation [[META5:![0-9]+]] { ; AWAIT-NEXT: [[TMP2:%.*]] = call token @llvm.coro.id.retcon(i32 8, i32 4, ptr [[TMP0]], ptr @continuation.prototype.await_with_arg, ptr @continuation.malloc, ptr @continuation.free) ; AWAIT-NEXT: [[TMP3:%.*]] = call ptr @llvm.coro.begin(token [[TMP2]], ptr null) -; AWAIT-NEXT: [[TOK:%.*]] = call ptr @async_fun_with_arg(i32 [[I]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount !1 +; AWAIT-NEXT: [[TOK:%.*]] = call ptr @async_fun_with_arg(i32 [[I]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] ; AWAIT-NEXT: [[TMP4:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TOK]]) -; AWAIT-NEXT: call void (...) @continuation.return(i64 [[RETURNADDR]]), !continuation.registercount [[META1]] +; AWAIT-NEXT: call void (...) @lgc.ilcps.return(i64 [[DUMMYRETADDR]]), !continuation.registercount [[META1]] ; AWAIT-NEXT: unreachable ; ; CORO-LABEL: define { ptr, ptr } @await_with_arg( -; CORO-SAME: i64 [[RETURNADDR:%.*]], i32 [[I:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation [[META5:![0-9]+]] { +; CORO-SAME: i64 [[DUMMYRETADDR:%.*]], i32 [[I:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation [[META5:![0-9]+]] { ; CORO-NEXT: AllocaSpillBB: ; CORO-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[AWAIT_WITH_ARG_FRAME:%.*]], ptr [[TMP0]], i32 0, i32 0 -; CORO-NEXT: store i64 [[RETURNADDR]], ptr [[RETURNADDR_SPILL_ADDR]], align 4 -; CORO-NEXT: [[TOK:%.*]] = call ptr @async_fun_with_arg(i32 [[I]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount !1 -; CORO-NEXT: [[TMP1:%.*]] = insertvalue { ptr, ptr } {{undef|poison}}, ptr @await_with_arg.resume.0, 0 +; CORO-NEXT: store i64 [[DUMMYRETADDR]], ptr [[RETURNADDR_SPILL_ADDR]], align 4 +; CORO-NEXT: [[TOK:%.*]] = call ptr @async_fun_with_arg(i32 [[I]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] +; CORO-NEXT: [[TMP1:%.*]] = insertvalue { ptr, ptr } poison, ptr @await_with_arg.resume.0, 0 ; CORO-NEXT: [[TMP2:%.*]] = insertvalue { ptr, ptr } [[TMP1]], ptr [[TOK]], 1 ; CORO-NEXT: ret { ptr, ptr } [[TMP2]] ; ; CLEANED-LABEL: define void @await_with_arg( -; CLEANED-SAME: i64 [[RETURNADDR:%.*]], i32 [[I:%.*]]) !continuation.registercount [[META1]] !continuation [[META6:![0-9]+]] !continuation.stacksize [[META3]] !continuation.state [[META3]] { +; CLEANED-SAME: i64 [[DUMMYRETADDR:%.*]], i32 [[I:%.*]]) !continuation.registercount [[META1]] !continuation [[META6:![0-9]+]] !continuation.stacksize [[META3]] !continuation.state [[META3]] { ; CLEANED-NEXT: 
AllocaSpillBB: ; CLEANED-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8) ; CLEANED-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[AWAIT_WITH_ARG_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 -; CLEANED-NEXT: store i64 [[RETURNADDR]], ptr addrspace(32) [[RETURNADDR_SPILL_ADDR]], align 4 -; CLEANED-NEXT: call void (i64, ...) @continuation.continue(i64 ptrtoint (ptr @async_fun_with_arg to i64), i64 ptrtoint (ptr @await_with_arg.resume.0 to i64), i32 [[I]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount !1 +; CLEANED-NEXT: store i64 [[DUMMYRETADDR]], ptr addrspace(32) [[RETURNADDR_SPILL_ADDR]], align 4 +; CLEANED-NEXT: [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @await_with_arg.resume.0) +; CLEANED-NEXT: call void (i64, ...) @continuation.continue(i64 ptrtoint (ptr @async_fun_with_arg to i64), i64 [[TMP0]], i32 [[I]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] ; CLEANED-NEXT: unreachable ; %tok = call %continuation.token* @async_fun_with_arg(i32 %i), !continuation.registercount !1, !continuation.returnedRegistercount !1 @@ -114,34 +114,35 @@ define void @await_with_arg(i32 %i) !continuation.registercount !1 { ret void, !continuation.registercount !1 } -define i32 @await_with_ret_value() !continuation.registercount !1 { +define i32 @await_with_ret_value(i64 %dummyRetAddr) !continuation.registercount !1 { ; AWAIT-LABEL: define { ptr, ptr } @await_with_ret_value( -; AWAIT-SAME: i64 [[RETURNADDR:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation [[META6:![0-9]+]] { +; AWAIT-SAME: i64 [[DUMMYRETADDR:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation [[META6:![0-9]+]] { ; AWAIT-NEXT: [[TMP2:%.*]] = call token @llvm.coro.id.retcon(i32 8, i32 4, ptr [[TMP0]], ptr @continuation.prototype.await_with_ret_value, ptr @continuation.malloc, ptr @continuation.free) ; AWAIT-NEXT: [[TMP3:%.*]] = call ptr @llvm.coro.begin(token [[TMP2]], ptr null) -; AWAIT-NEXT: [[TOK:%.*]] = call ptr @async_fun(), !continuation.registercount [[META1]], !continuation.returnedRegistercount !1 +; AWAIT-NEXT: [[TOK:%.*]] = call ptr @async_fun(), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] ; AWAIT-NEXT: [[TMP4:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TOK]]) -; AWAIT-NEXT: [[TMP5:%.*]] = call i32 @continuations.getReturnValue__i32() -; AWAIT-NEXT: call void (...) @continuation.return(i64 [[RETURNADDR]], i32 [[TMP5]]), !continuation.registercount [[META1]] +; AWAIT-NEXT: [[TMP5:%.*]] = call i32 @lgc.ilcps.getReturnValue__i32() +; AWAIT-NEXT: call void (...) 
@lgc.ilcps.return(i64 [[DUMMYRETADDR]], i32 [[TMP5]]), !continuation.registercount [[META1]] ; AWAIT-NEXT: unreachable ; ; CORO-LABEL: define { ptr, ptr } @await_with_ret_value( -; CORO-SAME: i64 [[RETURNADDR:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation [[META6:![0-9]+]] { +; CORO-SAME: i64 [[DUMMYRETADDR:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation [[META6:![0-9]+]] { ; CORO-NEXT: AllocaSpillBB: ; CORO-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[AWAIT_WITH_RET_VALUE_FRAME:%.*]], ptr [[TMP0]], i32 0, i32 0 -; CORO-NEXT: store i64 [[RETURNADDR]], ptr [[RETURNADDR_SPILL_ADDR]], align 4 -; CORO-NEXT: [[TOK:%.*]] = call ptr @async_fun(), !continuation.registercount [[META1]], !continuation.returnedRegistercount !1 -; CORO-NEXT: [[TMP1:%.*]] = insertvalue { ptr, ptr } {{undef|poison}}, ptr @await_with_ret_value.resume.0, 0 +; CORO-NEXT: store i64 [[DUMMYRETADDR]], ptr [[RETURNADDR_SPILL_ADDR]], align 4 +; CORO-NEXT: [[TOK:%.*]] = call ptr @async_fun(), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] +; CORO-NEXT: [[TMP1:%.*]] = insertvalue { ptr, ptr } poison, ptr @await_with_ret_value.resume.0, 0 ; CORO-NEXT: [[TMP2:%.*]] = insertvalue { ptr, ptr } [[TMP1]], ptr [[TOK]], 1 ; CORO-NEXT: ret { ptr, ptr } [[TMP2]] ; ; CLEANED-LABEL: define void @await_with_ret_value( -; CLEANED-SAME: i64 [[RETURNADDR:%.*]]) !continuation.registercount [[META1]] !continuation [[META7:![0-9]+]] !continuation.stacksize [[META3]] !continuation.state [[META3]] { +; CLEANED-SAME: i64 [[DUMMYRETADDR:%.*]]) !continuation.registercount [[META1]] !continuation [[META7:![0-9]+]] !continuation.stacksize [[META3]] !continuation.state [[META3]] { ; CLEANED-NEXT: AllocaSpillBB: ; CLEANED-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8) ; CLEANED-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[AWAIT_WITH_RET_VALUE_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 -; CLEANED-NEXT: store i64 [[RETURNADDR]], ptr addrspace(32) [[RETURNADDR_SPILL_ADDR]], align 4 -; CLEANED-NEXT: call void (i64, ...) @continuation.continue(i64 ptrtoint (ptr @async_fun to i64), i64 ptrtoint (ptr @await_with_ret_value.resume.0 to i64)), !continuation.registercount [[META1]], !continuation.returnedRegistercount !1 +; CLEANED-NEXT: store i64 [[DUMMYRETADDR]], ptr addrspace(32) [[RETURNADDR_SPILL_ADDR]], align 4 +; CLEANED-NEXT: [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @await_with_ret_value.resume.0) +; CLEANED-NEXT: call void (i64, ...) 
@continuation.continue(i64 ptrtoint (ptr @async_fun to i64), i64 [[TMP0]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] ; CLEANED-NEXT: unreachable ; %tok = call %continuation.token* @async_fun(), !continuation.registercount !1, !continuation.returnedRegistercount !1 @@ -149,33 +150,34 @@ define i32 @await_with_ret_value() !continuation.registercount !1 { ret i32 %res, !continuation.registercount !1 } -define void @wait_await() !continuation.registercount !1 { +define void @wait_await(i64 %dummyRetAddr) !continuation.registercount !1 { ; AWAIT-LABEL: define { ptr, ptr } @wait_await( -; AWAIT-SAME: i64 [[RETURNADDR:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation [[META7:![0-9]+]] { +; AWAIT-SAME: i64 [[DUMMYRETADDR:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation [[META7:![0-9]+]] { ; AWAIT-NEXT: [[TMP2:%.*]] = call token @llvm.coro.id.retcon(i32 8, i32 4, ptr [[TMP0]], ptr @continuation.prototype.wait_await, ptr @continuation.malloc, ptr @continuation.free) ; AWAIT-NEXT: [[TMP3:%.*]] = call ptr @llvm.coro.begin(token [[TMP2]], ptr null) -; AWAIT-NEXT: [[TOK:%.*]] = call ptr @async_fun_with_waitmask(i64 -1), !continuation.registercount [[META1]], !continuation.returnedRegistercount !1, !continuation.wait.await [[META3]] +; AWAIT-NEXT: [[TOK:%.*]] = call ptr @async_fun_with_waitmask(i64 -1), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]], !continuation.wait.await [[META3]] ; AWAIT-NEXT: [[TMP4:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TOK]]) -; AWAIT-NEXT: call void (...) @continuation.return(i64 [[RETURNADDR]]), !continuation.registercount [[META1]] +; AWAIT-NEXT: call void (...) @lgc.ilcps.return(i64 [[DUMMYRETADDR]]), !continuation.registercount [[META1]] ; AWAIT-NEXT: unreachable ; ; CORO-LABEL: define { ptr, ptr } @wait_await( -; CORO-SAME: i64 [[RETURNADDR:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation [[META7:![0-9]+]] { +; CORO-SAME: i64 [[DUMMYRETADDR:%.*]], ptr [[TMP0:%.*]]) !continuation.registercount [[META1]] !continuation [[META7:![0-9]+]] { ; CORO-NEXT: AllocaSpillBB: ; CORO-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[WAIT_AWAIT_FRAME:%.*]], ptr [[TMP0]], i32 0, i32 0 -; CORO-NEXT: store i64 [[RETURNADDR]], ptr [[RETURNADDR_SPILL_ADDR]], align 4 -; CORO-NEXT: [[TOK:%.*]] = call ptr @async_fun_with_waitmask(i64 -1), !continuation.registercount [[META1]], !continuation.returnedRegistercount !1, !continuation.wait.await [[META3]] -; CORO-NEXT: [[TMP1:%.*]] = insertvalue { ptr, ptr } {{undef|poison}}, ptr @wait_await.resume.0, 0 +; CORO-NEXT: store i64 [[DUMMYRETADDR]], ptr [[RETURNADDR_SPILL_ADDR]], align 4 +; CORO-NEXT: [[TOK:%.*]] = call ptr @async_fun_with_waitmask(i64 -1), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]], !continuation.wait.await [[META3]] +; CORO-NEXT: [[TMP1:%.*]] = insertvalue { ptr, ptr } poison, ptr @wait_await.resume.0, 0 ; CORO-NEXT: [[TMP2:%.*]] = insertvalue { ptr, ptr } [[TMP1]], ptr [[TOK]], 1 ; CORO-NEXT: ret { ptr, ptr } [[TMP2]] ; ; CLEANED-LABEL: define void @wait_await( -; CLEANED-SAME: i64 [[RETURNADDR:%.*]]) !continuation.registercount [[META1]] !continuation [[META8:![0-9]+]] !continuation.stacksize [[META3]] !continuation.state [[META3]] { +; CLEANED-SAME: i64 [[DUMMYRETADDR:%.*]]) !continuation.registercount [[META1]] !continuation [[META8:![0-9]+]] !continuation.stacksize [[META3]] !continuation.state 
[[META3]] { ; CLEANED-NEXT: AllocaSpillBB: ; CLEANED-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8) ; CLEANED-NEXT: [[RETURNADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[WAIT_AWAIT_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 -; CLEANED-NEXT: store i64 [[RETURNADDR]], ptr addrspace(32) [[RETURNADDR_SPILL_ADDR]], align 4 -; CLEANED-NEXT: call void (i64, i64, ...) @continuation.waitContinue(i64 ptrtoint (ptr @async_fun_with_waitmask to i64), i64 -1, i64 ptrtoint (ptr @wait_await.resume.0 to i64)), !continuation.registercount [[META1]], !continuation.returnedRegistercount !1 +; CLEANED-NEXT: store i64 [[DUMMYRETADDR]], ptr addrspace(32) [[RETURNADDR_SPILL_ADDR]], align 4 +; CLEANED-NEXT: [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @wait_await.resume.0) +; CLEANED-NEXT: call void (i64, i64, ...) @continuation.waitContinue(i64 ptrtoint (ptr @async_fun_with_waitmask to i64), i64 -1, i64 [[TMP0]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] ; CLEANED-NEXT: unreachable ; %tok = call %continuation.token* @async_fun_with_waitmask(i64 -1), !continuation.wait.await !0, !continuation.registercount !1, !continuation.returnedRegistercount !1 diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-call-shader.ll b/llvmraytracing/test/dx/lower-rt-pipeline-call-shader.ll index 1f0a0edfe6..e122b59b9d 100644 --- a/llvmraytracing/test/dx/lower-rt-pipeline-call-shader.ll +++ b/llvmraytracing/test/dx/lower-rt-pipeline-call-shader.ll @@ -1,14 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3 -; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S 2> %t0.stderr | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s -; RUN: count 0 < %t0.stderr -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S %s 2> %t1.stderr | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE-CPS %s -; RUN: count 0 < %t1.stderr -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,register-buffer,lint,remove-types-metadata' -S %s 2> %t2.stderr | FileCheck -check-prefix=REGISTERBUFFER-CPS %s -; RUN: count 0 < %t2.stderr -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S %s 2> %t3.stderr | FileCheck -check-prefix=POSTPROCESS-CPS %s -; RUN: count 0 < %t3.stderr +; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s +; RUN: opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata" -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE-CPS %s +; RUN: opt --verify-each 
-passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,register-buffer,lint,remove-types-metadata" -S %s --lint-abort-on-error | FileCheck -check-prefix=REGISTERBUFFER-CPS %s +; RUN: opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata" -S %s --lint-abort-on-error | FileCheck -check-prefix=POSTPROCESS-CPS %s -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %dx.types.Handle = type { i8* } %struct.DispatchSystemData = type { i32 } @@ -39,6 +35,10 @@ declare !types !15 %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTrian declare !types !17 void @_AmdRestoreSystemData(%struct.DispatchSystemData*) +define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !types !{!"function", !"void", !{i32 0, %struct.DispatchSystemData poison}} { + ret void +} + define void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) !types !18 { %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, %struct.DispatchSystemData %dis_data) @@ -94,24 +94,24 @@ attributes #0 = { nounwind } ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define void @main( -; LOWERRAYTRACINGPIPELINE-SAME: [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !continuation.entry [[META16:![0-9]+]] !continuation.registercount [[META8]] !continuation [[META17:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !continuation.entry [[META16:![0-9]+]] !continuation.registercount [[META8]] !continuation [[META17:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[PARAMS:%.*]] = alloca [[STRUCT_THEIRPARAMS:%.*]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; LOWERRAYTRACINGPIPELINE-NEXT: [[DIS_DATA_I:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_THEIRPARAMS]], ptr [[PARAMS]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP2]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP5]], ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = call ptr inttoptr (i64 2 to ptr)([[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]]), !continuation.registercount [[META14:![0-9]+]], !continuation.returnedRegistercount !14 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = call 
[[STRUCT_DISPATCHSYSTEMDATA]] [[AWAIT_STRUCT_DISPATCHSYSTEMDATA:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP6]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_THEIRPARAMS]], ptr [[PARAMS]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP3]], ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = call ptr inttoptr (i64 2 to ptr)([[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]]), !continuation.registercount [[META14:![0-9]+]], !continuation.returnedRegistercount [[META14]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] [[AWAIT:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP4]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_THEIRPARAMS]] poison, ptr [[PARAMS]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_THEIRPARAMS]], ptr [[PARAMS]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP11]], ptr [[TMP8]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP7]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_THEIRPARAMS]], ptr [[PARAMS]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP7]], ptr [[TMP6]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP5]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) -; LOWERRAYTRACINGPIPELINE-NEXT: ret void, !continuation.registercount [[META13:![0-9]+]] +; LOWERRAYTRACINGPIPELINE-NEXT: ret void ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define i32 @_cont_GetLocalRootIndex( @@ -120,26 +120,26 @@ attributes #0 = { nounwind } ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @main( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !lgc.cps [[META8]] !continuation [[META16:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !lgc.cps [[META14:![0-9]+]] !continuation [[META16:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PARAMS:%.*]] = alloca [[STRUCT_THEIRPARAMS:%.*]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_ALLOCA:%.*]] = alloca [30 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DIS_DATA_I:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_THEIRPARAMS]], ptr [[PARAMS]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_THEIRPARAMS]], ptr [[PARAMS]], i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = load i32, 
ptr [[TMP2]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP3]], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = load [1 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [19 x i32], [1 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa19i32a1i32s(i32 2, i32 2, i32 5, [20 x i32] poison, [1 x i32] [[TMP8]]), !continuation.returnedRegistercount !14, !continuation.registercount [[META14:![0-9]+]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [19 x i32], [1 x i32] } [[TMP9]], 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [1 x i32] [[TMP10]], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [19 x i32], [1 x i32] } [[TMP9]], 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP4:%.*]] = load [1 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP5:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [19 x i32], [1 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa19i32a1i32s(i32 2, i32 4, i32 5, [20 x i32] poison, [1 x i32] [[TMP4]]), !continuation.returnedRegistercount [[META14]], !continuation.registercount [[META14]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [19 x i32], [1 x i32] } [[TMP5]], 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [1 x i32] [[TMP6]], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [19 x i32], [1 x i32] } [[TMP5]], 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_THEIRPARAMS]] poison, ptr [[PARAMS]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_THEIRPARAMS]], ptr [[PARAMS]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP13:%.*]] = load i32, ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP13]], ptr [[TMP12]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP11]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_THEIRPARAMS]], ptr [[PARAMS]], i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = load i32, ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP9]], ptr [[TMP8]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP7]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: ret void ; @@ -150,19 +150,19 @@ attributes #0 = { nounwind } ; ; ; REGISTERBUFFER-CPS-LABEL: define void @main( -; REGISTERBUFFER-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !lgc.cps [[META8]] !continuation [[META16:![0-9]+]] { +; REGISTERBUFFER-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !lgc.cps [[META14:![0-9]+]] !continuation [[META16:![0-9]+]] { ; REGISTERBUFFER-CPS-NEXT: AllocaSpillBB: ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_0_EXTRACT3:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 ; REGISTERBUFFER-CPS-NEXT: call void 
@amd.dx.setLocalRootIndex(i32 0) ; REGISTERBUFFER-CPS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT3]], 0 ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [1 x i32] poison, i32 undef, 0 ; REGISTERBUFFER-CPS-NEXT: [[TMP1:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @main.resume.0) -; REGISTERBUFFER-CPS-NEXT: call void (...) @lgc.cps.jump(i32 2, i32 2, {} poison, i64 [[TMP1]], i32 5, [20 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]), !continuation.returnedRegistercount !14, !continuation.registercount [[META14:![0-9]+]] +; REGISTERBUFFER-CPS-NEXT: call void (...) @lgc.cps.jump(i32 2, i32 4, {} poison, i64 [[TMP1]], i32 5, [20 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]), !continuation.returnedRegistercount [[META14]], !continuation.registercount [[META14]] ; REGISTERBUFFER-CPS-NEXT: unreachable ; ; ; REGISTERBUFFER-CPS-LABEL: define dso_local void @main.resume.0( -; REGISTERBUFFER-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [19 x i32], [1 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META8]] !lgc.cps [[META8]] !continuation [[META16]] { +; REGISTERBUFFER-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [19 x i32], [1 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META8]] !lgc.cps [[META14]] !continuation [[META16]] { ; REGISTERBUFFER-CPS-NEXT: entryresume.0: ; REGISTERBUFFER-CPS-NEXT: [[TMP4:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [19 x i32], [1 x i32] } [[TMP3]], 2 ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [1 x i32] [[TMP4]], 0 @@ -178,23 +178,22 @@ attributes #0 = { nounwind } ; ; ; POSTPROCESS-CPS-LABEL: define void @main( -; POSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !lgc.cps [[META8]] !continuation [[META16:![0-9]+]] { +; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !lgc.cps [[META14:![0-9]+]] !continuation [[META16:![0-9]+]] { ; POSTPROCESS-CPS-NEXT: AllocaSpillBB: ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: [[SYSTEM_DATA:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[_CONT_SETUPRAYGEN:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]() -; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_EXTRACT3:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[SYSTEM_DATA]], 0 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_EXTRACT3:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 ; POSTPROCESS-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; POSTPROCESS-CPS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT3]], 0 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [1 x i32] poison, i32 undef, 0 +; POSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = call i64 @continuation.getAddrAndMD(ptr @main.resume.0) ; POSTPROCESS-CPS-NEXT: [[TMP3:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @main.resume.0 to i64)) -; POSTPROCESS-CPS-NEXT: call void (i64, ...) 
@continuation.continue(i64 2, i32 [[TMP3]], i64 [[TMP4]], i32 5, [20 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]), !continuation.returnedRegistercount !14, !continuation.registercount [[META14:![0-9]+]] +; POSTPROCESS-CPS-NEXT: call void (i64, ...) @continuation.continue(i64 2, i32 [[TMP3]], i64 [[TMP4]], i32 5, [20 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]), !continuation.returnedRegistercount [[META14]], !continuation.registercount [[META14]] ; POSTPROCESS-CPS-NEXT: unreachable ; ; ; POSTPROCESS-CPS-LABEL: define dso_local void @main.resume.0( -; POSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [19 x i32], [1 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META8]] !lgc.cps [[META8]] !continuation [[META16]] { +; POSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [19 x i32], [1 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META8]] !lgc.cps [[META14]] !continuation [[META16]] { ; POSTPROCESS-CPS-NEXT: entryresume.0: ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-exit-raygen.ll b/llvmraytracing/test/dx/lower-rt-pipeline-exit-raygen.ll new file mode 100644 index 0000000000..9e86e4437f --- /dev/null +++ b/llvmraytracing/test/dx/lower-rt-pipeline-exit-raygen.ll @@ -0,0 +1,87 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3 +; RUN: opt --verify-each -passes="lower-raytracing-pipeline,lint,remove-types-metadata" -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s + +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" + +%dx.types.Handle = type { i8* } +%struct.DispatchSystemData = type { <3 x i32> } +%struct.TraversalData = type { %struct.SystemData, %struct.HitData, <3 x float>, <3 x float>, float, i64 } +%struct.SystemData = type { %struct.DispatchSystemData, %struct.BuiltInTriangleIntersectionAttributes } +%struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } +%struct.HitData = type { float, i32 } +%struct.RaytracingAccelerationStructure = type { i32 } +%"class.RWTexture2D<vector<float, 4> >" = type { <4 x float> } + +@"\01?Scene@@3URaytracingAccelerationStructure@@A" = external constant %dx.types.Handle, align 4 +@"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" = external constant %dx.types.Handle, align 4 + +declare void @continuation.waitContinue(i64, i64, ...) 
noreturn + +declare !types !24 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) + +define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !types !{!"function", !"void", !{i32 0, %struct.DispatchSystemData poison}} { + %dispatchPayloadPtr = getelementptr inbounds %struct.DispatchSystemData, ptr %data, i32 0, i32 0 + %dispatchPayload = load <3 x i32>, ptr %dispatchPayloadPtr, align 4 + %deadLaneDispatchPayload = insertelement <3 x i32> %dispatchPayload, i32 -11, i32 0 + %systemData = insertvalue %struct.SystemData poison, <3 x i32> %deadLaneDispatchPayload, 0, 0 + %addrSuffix = load i32, ptr %data, align 4 + %addr = zext i32 %addrSuffix to i64 + call void @continuation.waitContinue(i64 %addr, i64 -1, %struct.SystemData %systemData) + unreachable +} + +; Function Attrs: nounwind +define void @MyRayGen() #0 !lgc.rt.shaderstage !20 { + ret void +} + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="0" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.ident = !{!0} +!dx.version = !{!1} +!dx.valver = !{!1} +!dx.shaderModel = !{!2} +!dx.resources = !{!3} +!dx.typeAnnotations = !{!10} +!dx.entryPoints = !{!21} + +!0 = !{!"clang version 3.7.0 (tags/RELEASE_370/final)"} +!1 = !{i32 1, i32 6} +!2 = !{!"lib", i32 6, i32 6} +!3 = !{!4, !7, null, null} +!4 = !{!5} +!5 = !{i32 0, %struct.RaytracingAccelerationStructure* bitcast (%dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A" to %struct.RaytracingAccelerationStructure*), !"Scene", i32 0, i32 0, i32 1, i32 16, i32 0, !6} +!6 = !{i32 0, i32 4} +!7 = !{!8} +!8 = !{i32 0, %"class.RWTexture2D<vector<float, 4> >"* bitcast (%dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" to %"class.RWTexture2D<vector<float, 4> >"*), !"RenderTarget", i32 0, i32 0, i32 1, i32 2, i1 false, i1 false, i1 false, !9} +!9 = !{i32 0, i32 9} +!10 = !{i32 1, void ()* @MyRayGen, !11} +!11 = !{!12} +!12 = !{i32 1, !13, !13} +!13 = !{} +!14 = !{!12, !15, !16} +!15 = !{i32 2, !13, !13} +!16 = !{i32 0, !13, !13} +!17 = !{!12, !15} +!18 = !{null, !"", null, !3, !19} +!19 = !{i32 0, i64 65536} +!20 = !{i32 0} +!21 = !{void ()* @MyRayGen, !"MyRayGen", null, null, !22} +!22 = !{i32 8, i32 7, i32 5, !23} +!23 = !{i32 0} +!24 = !{!"function", i32 poison, !25} +!25 = !{i32 0, %struct.DispatchSystemData poison} ; LOWERRAYTRACINGPIPELINE-LABEL: define void @MyRayGen( +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR1:[0-9]+]] !lgc.rt.shaderstage [[META16:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation.registercount [[META16]] !continuation [[META19:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 +; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[DISPATCHPAYLOAD_I:%.*]] = load <3 x i32>, ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[DEADLANEDISPATCHPAYLOAD_I:%.*]] = insertelement <3 x i32> [[DISPATCHPAYLOAD_I]], i32 -11, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEMDATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] poison, <3 x i32> [[DEADLANEDISPATCHPAYLOAD_I]], 0, 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[ADDRSUFFIX_I:%.*]] = load i32, ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; 
LOWERRAYTRACINGPIPELINE-NEXT: [[ADDR_I:%.*]] = zext i32 [[ADDRSUFFIX_I]] to i64 +; LOWERRAYTRACINGPIPELINE-NEXT: call void @continuation.waitContinue(i64 [[ADDR_I]], i64 -1, [[STRUCT_SYSTEMDATA]] [[SYSTEMDATA_I]]) #[[ATTR3:[0-9]+]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable +; LOWERRAYTRACINGPIPELINE: _cont_ExitRayGen.exit: +; LOWERRAYTRACINGPIPELINE-NEXT: ret void +; diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics-hit.ll b/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics-hit.ll index 4dd21e2073..d3b03afd08 100644 --- a/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics-hit.ll +++ b/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics-hit.ll @@ -1,10 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S %s 2> %t0.stderr | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s -; RUN: count 0 < %t0.stderr -; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S %s 2> %t1.stderr | FileCheck -check-prefix=DXILCONTPOSTPROCESS %s -; RUN: count 0 < %t1.stderr +; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s +; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=DXILCONTPOSTPROCESS %s -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %dx.types.Handle = type { i8* } %struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } @@ -41,6 +39,10 @@ declare !types !29 void @_cont_AcceptHit(%struct.AnyHitTraversalData* nocapture declare i1 @opaqueIsEnd() #0 +define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !types !{!"function", !"void", !{i32 0, %struct.DispatchSystemData poison}} { + ret void +} + define i1 @_cont_IsEndSearch(%struct.TraversalData* %data) #0 !types !31 { ; LOWERRAYTRACINGPIPELINE-LABEL: define i1 @_cont_IsEndSearch( ; LOWERRAYTRACINGPIPELINE-SAME: ptr [[DATA:%.*]]) #[[ATTR0:[0-9]+]] { @@ -181,20 +183,18 @@ declare !types !46 [4 x <3 x float>] @_cont_WorldToObject4x3(%struct.DispatchSys ; Function Attrs: nounwind define void @RayGen() #3 { ; LOWERRAYTRACINGPIPELINE-LABEL: define void @RayGen( -; LOWERRAYTRACINGPIPELINE-SAME: [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] 
!lgc.rt.shaderstage [[META18:![0-9]+]] !continuation [[META29:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation.registercount [[META18]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] !lgc.rt.shaderstage [[META18:![0-9]+]] !continuation [[META29:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation.registercount [[META18]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) -; LOWERRAYTRACINGPIPELINE-NEXT: ret void, !continuation.registercount [[META25:![0-9]+]] +; LOWERRAYTRACINGPIPELINE-NEXT: ret void ; ; DXILCONTPOSTPROCESS-LABEL: define void @RayGen( -; DXILCONTPOSTPROCESS-SAME: ) #[[ATTR5:[0-9]+]] !lgc.rt.shaderstage [[META18:![0-9]+]] !continuation [[META28:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation.registercount [[META18]] !continuation.state [[META18]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] !lgc.rt.shaderstage [[META18:![0-9]+]] !continuation [[META28:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation.registercount [[META18]] !continuation.state [[META18]] { ; DXILCONTPOSTPROCESS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; DXILCONTPOSTPROCESS-NEXT: [[TMP0:%.*]] = call i32 @_cont_GetContinuationStackAddr() -; DXILCONTPOSTPROCESS-NEXT: store i32 [[TMP0]], ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: [[SYSTEM_DATA:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[_CONT_SETUPRAYGEN:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]() -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[SYSTEM_DATA]], 0 +; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 ; DXILCONTPOSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; DXILCONTPOSTPROCESS-NEXT: ret void ; DXILCONTPOSTPROCESS: AllocaSpillBB.split: @@ -206,7 +206,7 @@ define void @RayGen() #3 { ; Function Attrs: nounwind define void @Intersection() #3 { ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.AnyHitTraversalData @Intersection( -; LOWERRAYTRACINGPIPELINE-SAME: [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META30:![0-9]+]] !continuation [[META31:![0-9]+]] !continuation.registercount [[META25]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META30:![0-9]+]] !continuation [[META31:![0-9]+]] !continuation.registercount [[META25:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_HITDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8 @@ -231,18 +231,20 @@ define void @Intersection() #3 { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA_I:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], 
align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP11]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = call ptr inttoptr (i64 3 to ptr)([[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I]], float 4.000000e+00, i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP12]]), !continuation.registercount [[META25]], !continuation.returnedRegistercount !25 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = call [[STRUCT_ANYHITTRAVERSALDATA]] [[AWAIT_STRUCT_ANYHITTRAVERSALDATA:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP13]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = call ptr inttoptr (i64 3 to ptr)([[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I]], float 4.000000e+00, i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP12]]), !continuation.registercount [[META25]], !continuation.returnedRegistercount [[META25]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = call [[STRUCT_ANYHITTRAVERSALDATA]] [[AWAIT:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP13]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP14]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[ISEND_I:%.*]] = call i1 @opaqueIsEnd() ; LOWERRAYTRACINGPIPELINE-NEXT: br i1 [[ISEND_I]], label [[TMP16:%.*]], label [[TMP18:%.*]] ; LOWERRAYTRACINGPIPELINE: 16: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP17]], !continuation.registercount [[META25]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP17]]), !continuation.registercount [[META25]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 18: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP19]], !continuation.registercount [[META25]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP19]]), !continuation.registercount [[META25]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; DXILCONTPOSTPROCESS-LABEL: define void @Intersection( ; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META29:![0-9]+]] !continuation [[META30:![0-9]+]] !continuation.registercount [[META25:![0-9]+]] !continuation.stacksize [[META31:![0-9]+]] !continuation.state [[META31]] { @@ -294,9 +296,9 @@ define void @Intersection() #3 { ; DXILCONTPOSTPROCESS-NEXT: [[TRAV_DATA_I_FCA_1_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_0_4_INSERT]], float [[DOTFCA_1_0_EXTRACT]], 1, 0 ; DXILCONTPOSTPROCESS-NEXT: [[TRAV_DATA_I_FCA_1_1_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_0_INSERT]], i32 [[DOTFCA_1_1_EXTRACT]], 1, 1 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] poison, <2 x float> undef, 0 +; DXILCONTPOSTPROCESS-NEXT: [[TMP12:%.*]] = call i64 @continuation.getAddrAndMD(ptr @Intersection.resume.0) ; DXILCONTPOSTPROCESS-NEXT: [[TMP11:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: [[TMP12:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @Intersection.resume.0 to i64)) -; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 3, i32 [[TMP11]], i64 [[TMP12]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_1_INSERT]], float 4.000000e+00, i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META25]], !continuation.returnedRegistercount !25 +; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) 
@continuation.continue(i64 3, i32 [[TMP11]], i64 [[TMP12]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_1_INSERT]], float 4.000000e+00, i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META25]], !continuation.returnedRegistercount [[META25]] ; DXILCONTPOSTPROCESS-NEXT: unreachable ; %1 = call float @dx.op.rayTMin.f32(i32 153) @@ -311,7 +313,7 @@ define void @Intersection() #3 { ; Function Attrs: nounwind define void @AnyHit(%struct.RayPayload* noalias nocapture %payload, %struct.BuiltInTriangleIntersectionAttributes* nocapture readonly %attr) #3 !types !47 { ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.AnyHitTraversalData @AnyHit( -; LOWERRAYTRACINGPIPELINE-SAME: [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META32:![0-9]+]] !continuation [[META33:![0-9]+]] !continuation.registercount [[META26:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META32:![0-9]+]] !continuation [[META33:![0-9]+]] !continuation.registercount [[META26:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_HITDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = alloca [[STRUCT_HITDATA]], align 8 @@ -323,16 +325,16 @@ define void @AnyHit(%struct.RayPayload* noalias nocapture %payload, %struct.Buil ; LOWERRAYTRACINGPIPELINE-NEXT: [[HITATTRSALLOCA:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP8]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP8]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP13]], ptr [[TMP10]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP8]], i32 0, i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP17]], ptr [[TMP12]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP8]], i32 0, i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP59:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP59]], ptr [[TMP14]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP8]], i32 0, i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], 
i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP65:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP65]], ptr [[TMP16]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 @@ -373,16 +375,16 @@ define void @AnyHit(%struct.RayPayload* noalias nocapture %payload, %struct.Buil ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = insertvalue [[STRUCT_RAYPAYLOAD]] [[TMP37]], i32 [[RES_I5]], 3 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_RAYPAYLOAD]] [[TMP38]], ptr [[TMP8]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP8]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP8]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP68:%.*]] = load i32, ptr [[TMP39]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP68]], ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP8]], i32 0, i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP71:%.*]] = load i32, ptr [[TMP41]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP71]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP8]], i32 0, i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[TMP41]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP58:%.*]] = load i32, ptr [[TMP43]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP58]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP8]], i32 0, i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[TMP41]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP62:%.*]] = load i32, ptr [[TMP45]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP62]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 @@ -395,7 +397,8 @@ define void @AnyHit(%struct.RayPayload* noalias nocapture %payload, %struct.Buil ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP54:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @_cont_SetTriangleHitAttributes(ptr [[TMP54]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP53]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP55:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP55]], !continuation.registercount [[META26]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP55]]), !continuation.registercount [[META26]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; DXILCONTPOSTPROCESS-LABEL: define void @AnyHit( ; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META32:![0-9]+]] !continuation [[META33:![0-9]+]] !continuation.registercount [[META26:![0-9]+]] !continuation.state [[META18]] { @@ -543,7 +546,7 @@ define void @AnyHit(%struct.RayPayload* noalias nocapture %payload, %struct.Buil ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_1_1_LOAD:%.*]] = load i32, ptr [[DOTFCA_1_1_GEP17]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_1_1_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_0_INSERT]], i32 [[DOTFCA_1_1_LOAD]], 1, 1 ; DXILCONTPOSTPROCESS-NEXT: [[TMP30:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP30]], [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_1_INSERT]]), !continuation.registercount [[META26]] +; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP30]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_1_INSERT]]), !continuation.registercount [[META26]] ; DXILCONTPOSTPROCESS-NEXT: unreachable ; %1 = call float @dx.op.rayTMin.f32(i32 153) @@ -561,7 +564,7 @@ define void @AnyHit(%struct.RayPayload* noalias nocapture %payload, %struct.Buil ; Function Attrs: nounwind define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct.BuiltInTriangleIntersectionAttributes* nocapture readonly %attr) #3 !types !47 { ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @ClosestHit( -; LOWERRAYTRACINGPIPELINE-SAME: [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META34:![0-9]+]] !continuation [[META35:![0-9]+]] !continuation.registercount [[META26]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META34:![0-9]+]] !continuation [[META35:![0-9]+]] !continuation.registercount [[META26]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_HITDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_HITDATA]], align 8 @@ -571,16 +574,16 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. 
; LOWERRAYTRACINGPIPELINE-NEXT: [[HITATTRS:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_SYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP6]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP6]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP11]], ptr [[TMP8]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP6]], i32 0, i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP15]], ptr [[TMP10]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP6]], i32 0, i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP43]], ptr [[TMP12]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP6]], i32 0, i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP55:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP55]], ptr [[TMP14]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) @@ -606,21 +609,22 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. 
; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_HITDATA]] [[TMP30]], ptr [[TMP3]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[RESPTR_I:%.*]] = getelementptr [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[RES_I1:%.*]] = load i32, ptr [[RESPTR_I]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP6]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP6]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = load i32, ptr [[TMP31]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP42]], ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP6]], i32 0, i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = load i32, ptr [[TMP33]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP46]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP6]], i32 0, i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP33]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP50:%.*]] = load i32, ptr [[TMP35]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP50]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP6]], i32 0, i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[TMP33]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP54:%.*]] = load i32, ptr [[TMP37]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP54]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP39]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP40]], !continuation.registercount [[META26]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP40]]), !continuation.registercount [[META26]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; DXILCONTPOSTPROCESS-LABEL: define void @ClosestHit( ; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR5]] !lgc.rt.shaderstage [[META34:![0-9]+]] !continuation [[META35:![0-9]+]] !continuation.registercount [[META26]] !continuation.state [[META18]] { @@ -676,7 +680,7 @@ define void @ClosestHit(%struct.RayPayload* noalias nocapture %payload, %struct. ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_LOAD:%.*]] = load <3 x i32>, ptr [[DOTFCA_0_GEP]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_LOAD]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[TMP19:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[RETURNADDR]], i32 [[TMP19]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META26]] +; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP19]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META26]] ; DXILCONTPOSTPROCESS-NEXT: unreachable ; %1 = call float @dx.op.rayTMin.f32(i32 153) diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics.ll b/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics.ll index a558cdf01b..54c79eb65b 100644 --- a/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics.ll +++ b/llvmraytracing/test/dx/lower-rt-pipeline-intrinsics.ll @@ -1,10 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function ClosestHit --version 3 -; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S %s 2> %t0.stderr | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s -; RUN: count 0 < %t0.stderr -; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S %s 2> %t1.stderr | FileCheck -check-prefix=DXILCONTPOSTPROCESS %s -; RUN: count 0 < %t1.stderr +; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s +; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=DXILCONTPOSTPROCESS %s -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %dx.types.Handle = type { i8* } %struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } @@ -97,7 +95,7 @@ declare !types !36 i32 @_cont_HitKind(%struct.SystemData* nocapture readnone, %s ; Function Attrs: nounwind define void @ClosestHit(%struct.RayPayload* %0, %struct.BuiltInTriangleIntersectionAttributes* %1) #3 !types !37 { ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @ClosestHit( -; LOWERRAYTRACINGPIPELINE-SAME: [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META23:![0-9]+]] !continuation [[META24:![0-9]+]] !continuation.registercount [[META20:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META23:![0-9]+]] !continuation [[META24:![0-9]+]] !continuation.registercount 
[[META20:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_HITDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_HITDATA]], align 8 @@ -115,16 +113,16 @@ define void @ClosestHit(%struct.RayPayload* %0, %struct.BuiltInTriangleIntersect ; LOWERRAYTRACINGPIPELINE-NEXT: [[HITATTRS:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_SYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP14]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP14]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP19]], ptr [[TMP16]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP14]], i32 0, i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP23]], ptr [[TMP18]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP14]], i32 0, i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP79:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP79]], ptr [[TMP20]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP14]], i32 0, i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP91:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP91]], ptr [[TMP22]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) @@ -195,21 +193,22 @@ define void @ClosestHit(%struct.RayPayload* %0, %struct.BuiltInTriangleIntersect ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP65:%.*]] = call [[STRUCT_HITDATA]] [[_CONT_GETCOMMITTEDSTATE]](ptr [[SYSTEM_DATA_ALLOCA]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_HITDATA]] [[TMP65]], ptr [[TMP7]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP66:%.*]] = call i32 @_cont_HitKind(ptr [[SYSTEM_DATA_ALLOCA]], ptr [[TMP7]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP14]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP14]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP78:%.*]] = load i32, ptr [[TMP67]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP78]], ptr addrspace(20) @PAYLOAD, align 4 -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP69:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP14]], i32 0, i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, ptr [[TMP67]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP82:%.*]] = load i32, ptr [[TMP69]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP82]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP71:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP14]], i32 0, i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[TMP69]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP86:%.*]] = load i32, ptr [[TMP71]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP86]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP73:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP14]], i32 0, i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, ptr [[TMP69]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP90:%.*]] = load i32, ptr [[TMP73]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP90]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP75:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP76:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP75]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP76]], !continuation.registercount [[META20]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP76]]), !continuation.registercount [[META20]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; DXILCONTPOSTPROCESS-LABEL: define void @ClosestHit( ; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation [[META23:![0-9]+]] !continuation.registercount [[META20:![0-9]+]] !continuation.state [[META18:![0-9]+]] { @@ -360,7 +359,7 @@ define void @ClosestHit(%struct.RayPayload* %0, %struct.BuiltInTriangleIntersect ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_LOAD:%.*]] = load <3 x i32>, ptr [[DOTFCA_0_GEP]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_LOAD]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[TMP57:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP57]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META20]] +; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[RETURNADDR]], i32 [[TMP57]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META20]] ; DXILCONTPOSTPROCESS-NEXT: unreachable ; %a = call i32 @dx.op.dispatchRaysIndex.i32(i32 145, i8 0) diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-large-payload.ll b/llvmraytracing/test/dx/lower-rt-pipeline-large-payload.ll index b027b2b936..858295ca4d 100644 --- a/llvmraytracing/test/dx/lower-rt-pipeline-large-payload.ll +++ b/llvmraytracing/test/dx/lower-rt-pipeline-large-payload.ll @@ -3,16 +3,12 @@ ; We set the max number of payload registers to 2, so relatively small payloads need to spill already. ; This results in a bit nicer result IR, containing less "spam" copying payload fields around. ; We also set a max hit attribute size ensuring there is no need for hit attribute storage in the payload. -; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S 2> %t0.stderr | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s -; RUN: count 0 < %t0.stderr -; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,remove-types-metadata' -S 2> %t1.stderr | FileCheck -check-prefix=CLEANUP %s -; RUN: count 0 < %t1.stderr -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,remove-types-metadata' -S %s 2> %t2.stderr | FileCheck -check-prefix=CLEANUP-CPS %s -; RUN: count 0 < %t2.stderr -; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S 2> %t3.stderr | FileCheck -check-prefix=DXILCONTPOSTPROCESS %s -; RUN: count 0 < %t3.stderr +; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s +; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck -check-prefix=CLEANUP %s +; RUN: opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,remove-types-metadata" -S %s --lint-abort-on-error | FileCheck -check-prefix=CLEANUP-CPS %s +; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck -check-prefix=DXILCONTPOSTPROCESS %s -target datalayout = 
"e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %dx.types.Handle = type { i8* } %dx.types.ResourceProperties = type { i32, i32 } @@ -33,6 +29,9 @@ target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16: ; Need _cont_ReportHit to get system data type declare !types !206 i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) +; Function Attrs: nounwind +declare i64 @_AmdGetResumePointAddr() #3 + declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #3 declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.types.Handle) #4 declare !types !200 void @dx.op.traceRay.struct.SmallPayload(i32, %dx.types.Handle, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, %struct.SmallPayload*) @@ -43,14 +42,15 @@ define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64 %0, i32 %1, i %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 %sys_data = insertvalue %struct.SystemData undef, %struct.DispatchSystemData %dis_data, 0 %trav_data = insertvalue %struct.TraversalData undef, %struct.SystemData %sys_data, 0 - %trav_data2 = insertvalue %struct.TraversalData %trav_data, i64 -1, 5 - %newdata = call %struct.DispatchSystemData @_AmdAwaitTraversal(i64 4, %struct.TraversalData %trav_data2) + %addr = call i64 @_AmdGetResumePointAddr() #3 + %trav_data2 = insertvalue %struct.TraversalData %trav_data, i64 %addr, 5 + %newdata = call %struct.DispatchSystemData @_AmdWaitAwaitTraversal(i64 4, i64 -1, %struct.TraversalData %trav_data2) store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) ret void } -define void @Miss(%struct.SmallPayload* noalias nocapture %outerpayload) !types !204 !lgc.rt.attribute.size !32 { +define void @Miss(%struct.SmallPayload* noalias nocapture %outerpayload) !types !204 { %p1 = alloca %struct.SmallPayload %p2 = alloca %struct.MediumPayload %p3 = alloca %struct.LargePayload @@ -74,7 +74,7 @@ define void @Miss(%struct.SmallPayload* noalias nocapture %outerpayload) !types declare %struct.DispatchSystemData @_cont_SetupRayGen() #1 ; Function Attrs: alwaysinline -declare %struct.DispatchSystemData @_AmdAwaitTraversal(i64, %struct.TraversalData) #1 +declare %struct.DispatchSystemData @_AmdWaitAwaitTraversal(i64, i64, %struct.TraversalData) #1 ; Function Attrs: alwaysinline declare %struct.DispatchSystemData @_AmdAwaitShader(i64, %struct.DispatchSystemData) #1 @@ -118,6 +118,7 @@ attributes #3 = { nounwind memory(none) } !dx.entryPoints = !{!12, !14} !continuation.maxPayloadRegisterCount = !{!31} !lgc.cps.module = !{} +!lgc.rt.max.attribute.size = !{!32} !0 = !{!"dxcoob 2019.05.00"} !1 = !{i32 1, i32 7} @@ -162,7 +163,7 @@ attributes #3 = { nounwind memory(none) } !205 = !{i32 0, %struct.AnyHitTraversalData poison} !206 = !{!"function", i1 poison, !205, float poison, i32 poison} ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @Miss( -; LOWERRAYTRACINGPIPELINE-SAME: 
[[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.attribute.size [[META15:![0-9]+]] !lgc.rt.shaderstage [[META16:![0-9]+]] !continuation.registercount [[META17:![0-9]+]] !continuation [[META18:![0-9]+]] !continuation.stacksize [[META19:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META16:![0-9]+]] !continuation.registercount [[META17:![0-9]+]] !continuation [[META18:![0-9]+]] !continuation.stacksize [[META19:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[P1:%.*]] = alloca [[STRUCT_SMALLPAYLOAD:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[P2:%.*]] = alloca [[STRUCT_MEDIUMPAYLOAD:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[P3:%.*]] = alloca [[STRUCT_LARGEPAYLOAD:%.*]], align 8 @@ -171,9 +172,9 @@ attributes #3 = { nounwind memory(none) } ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_SMALLPAYLOAD]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_SYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_SMALLPAYLOAD]], ptr [[TMP2]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP7]], ptr [[TMP4]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_SMALLPAYLOAD]], ptr [[TMP2]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP5]], ptr [[TMP4]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_SMALLPAYLOAD]] zeroinitializer, ptr [[P1]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_MEDIUMPAYLOAD]] zeroinitializer, ptr [[P2]], align 4 @@ -181,95 +182,98 @@ attributes #3 = { nounwind memory(none) } ; LOWERRAYTRACINGPIPELINE-NEXT: [[T1:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[T2:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[T1]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[T3:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[T2]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[T3]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[DIS_DATA_I:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP9]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[T3]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[DIS_DATA_I:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP7]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], 0 ; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 -1, 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_SMALLPAYLOAD]], ptr [[P1]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP10]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP13]], ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = call ptr inttoptr (i64 4 to ptr)([[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]]), !continuation.registercount [[META17]], !continuation.returnedRegistercount !17 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] [[AWAIT_STRUCT_DISPATCHSYSTEMDATA:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP11]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[ADDR_I:%.*]] = call i64 @_AmdGetResumePointAddr() #[[ATTR0:[0-9]+]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[ADDR_I]], 5 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_SMALLPAYLOAD]], ptr [[P1]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP9]], ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 -1, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]]), !continuation.registercount [[META17]], !continuation.wait.await [[META6:![0-9]+]], !continuation.returnedRegistercount [[META17]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] [[AWAIT:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP10]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_SMALLPAYLOAD]] poison, ptr [[P1]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_SMALLPAYLOAD]], ptr [[P1]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP19]], ptr [[TMP16]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP15]], ptr [[TMP9]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_SMALLPAYLOAD]], ptr [[P1]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP13]], ptr [[TMP12]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP11]], ptr [[TMP7]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; LOWERRAYTRACINGPIPELINE-NEXT: br label [[DOTSPLIT10:%.*]] -; LOWERRAYTRACINGPIPELINE: .split10: -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[T3]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[DIS_DATA_I1:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP21]], align 4 +; LOWERRAYTRACINGPIPELINE: .split12: +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[T3]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds 
[[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[DIS_DATA_I1:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP16]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYS_DATA_I2:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I1]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA_I3:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I2]], 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA2_I4:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I3]], i64 -1, 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = ptrtoint ptr [[PAYLOAD_SPILL_ALLOCA]] to i32 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP22]], ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = load ptr addrspace(32), ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_MEDIUMPAYLOAD]], ptr [[P2]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP24]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP28]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 1), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP29]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP25]], ptr addrspace(32) [[TMP23]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP23]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP29]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP34]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP35]], ptr addrspace(32) [[TMP33]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP36:%.*]] = call ptr inttoptr (i64 4 to ptr)([[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I4]]), !continuation.registercount [[META13:![0-9]+]], !continuation.returnedRegistercount !13 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] [[AWAIT_STRUCT_DISPATCHSYSTEMDATA]](ptr [[TMP36]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[ADDR_I4:%.*]] = call i64 @_AmdGetResumePointAddr() #[[ATTR0]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA2_I5:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I3]], i64 [[ADDR_I4]], 5 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = ptrtoint ptr [[PAYLOAD_SPILL_ALLOCA]] to i32 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP33]], ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load ptr addrspace(32), ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_MEDIUMPAYLOAD]], ptr [[P2]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP19]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 1), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP21]], ptr addrspace(32) [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP17]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, 
ptr [[TMP20]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP24]], ptr addrspace(32) [[TMP22]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 -1, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I5]]), !continuation.registercount [[META13:![0-9]+]], !continuation.wait.await [[META6]], !continuation.returnedRegistercount [[META13]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] [[AWAIT_1:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP25]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_MEDIUMPAYLOAD]] poison, ptr [[P2]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = load ptr addrspace(32), ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT_MEDIUMPAYLOAD]], ptr [[P2]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 1), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP43]], ptr [[TMP39]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(32) [[TMP38]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP31]], ptr [[TMP44]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, ptr [[TMP44]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP49:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP38]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP50:%.*]] = load i32, ptr addrspace(32) [[TMP49]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP50]], ptr [[TMP48]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP51:%.*]] = load ptr addrspace(32), ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP37]], ptr [[TMP21]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = load ptr addrspace(32), ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT_MEDIUMPAYLOAD]], ptr [[P2]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 1), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP29]], ptr [[TMP28]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(32) [[TMP27]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP31]], ptr [[TMP30]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[TMP30]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP27]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = load i32, ptr addrspace(32) [[TMP42]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP43]], ptr [[TMP32]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = load ptr addrspace(32), ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP26]], ptr [[TMP16]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; LOWERRAYTRACINGPIPELINE-NEXT: br label [[DOTSPLIT9:%.*]] -; LOWERRAYTRACINGPIPELINE: .split9: -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP52:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[T3]]) +; LOWERRAYTRACINGPIPELINE: .split11: +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[T3]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[DIS_DATA_I5:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP53]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYS_DATA_I6:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I5]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA_I7:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I6]], 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA2_I8:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I7]], i64 -1, 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP54:%.*]] = ptrtoint ptr [[PAYLOAD_SPILL_ALLOCA]] to i32 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP54]], ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP55:%.*]] = load ptr addrspace(32), ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP56:%.*]] = getelementptr inbounds [[STRUCT_LARGEPAYLOAD]], ptr [[P3]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP60:%.*]] = load i32, ptr [[TMP56]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP60]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 1), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, ptr [[TMP56]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = load i32, ptr [[TMP61]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP45]], ptr addrspace(32) [[TMP55]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP55]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, ptr [[TMP61]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP67:%.*]] = load i32, ptr [[TMP66]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP67]], ptr addrspace(32) [[TMP65]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP55]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, ptr [[TMP61]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP70:%.*]] = load i32, ptr [[TMP69]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP70]], ptr addrspace(32) [[TMP68]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP55]], i32 3 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, ptr [[TMP61]], i32 3 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP73:%.*]] = load i32, ptr [[TMP72]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP73]], ptr addrspace(32) [[TMP71]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP74:%.*]] = call ptr inttoptr (i64 4 to ptr)([[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I8]]), !continuation.registercount [[META13]], !continuation.returnedRegistercount !13 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP75:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] [[AWAIT_STRUCT_DISPATCHSYSTEMDATA]](ptr [[TMP74]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[ADDR_I9:%.*]] = call i64 @_AmdGetResumePointAddr() #[[ATTR0]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA2_I10:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I7]], i64 [[ADDR_I9]], 5 +; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[PAYLOAD_SPILL_ALLOCA]] to i32 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP38]], ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP39:%.*]] = load ptr addrspace(32), ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT_LARGEPAYLOAD]], ptr [[P3]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = load i32, ptr [[TMP40]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP41]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 1), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP54:%.*]] = getelementptr inbounds i32, ptr [[TMP40]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP55:%.*]] = load i32, ptr [[TMP54]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP55]], ptr addrspace(32) [[TMP39]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP39]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[TMP54]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP57:%.*]] = load i32, ptr [[TMP45]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP57]], ptr addrspace(32) [[TMP56]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP39]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, ptr [[TMP54]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP48]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP49]], ptr addrspace(32) [[TMP47]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP39]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, ptr [[TMP54]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP52:%.*]] = load i32, ptr [[TMP51]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP52]], ptr addrspace(32) [[TMP50]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP58:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 -1, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I10]]), !continuation.registercount [[META13]], !continuation.wait.await [[META6]], !continuation.returnedRegistercount [[META13]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP75:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] [[AWAIT_2:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP58]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_LARGEPAYLOAD]] poison, ptr [[P3]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP76:%.*]] = load ptr addrspace(32), ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_LARGEPAYLOAD]], ptr [[P3]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_LARGEPAYLOAD]], ptr [[P3]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP81:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 1), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP81]], ptr [[TMP77]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP82:%.*]] = getelementptr inbounds i32, ptr [[TMP77]], i32 1 @@ -292,12 +296,13 @@ attributes #3 = { nounwind memory(none) } ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; LOWERRAYTRACINGPIPELINE-NEXT: br label [[DOTSPLIT:%.*]] ; LOWERRAYTRACINGPIPELINE: .split: -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP96:%.*]] = getelementptr inbounds 
[[STRUCT_SMALLPAYLOAD]], ptr [[TMP2]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP99:%.*]] = load i32, ptr [[TMP96]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP70:%.*]] = getelementptr inbounds [[STRUCT_SMALLPAYLOAD]], ptr [[TMP2]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP99:%.*]] = load i32, ptr [[TMP70]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP99]], ptr addrspace(20) @PAYLOAD, align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP100:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP101:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP100]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP101]], !continuation.registercount [[META17]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP101]]), !continuation.registercount [[META17]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define i32 @_cont_GetLocalRootIndex( @@ -306,7 +311,7 @@ attributes #3 = { nounwind memory(none) } ; ; ; CLEANUP-LABEL: define void @Miss( -; CLEANUP-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.attribute.size [[META15:![0-9]+]] !lgc.rt.shaderstage [[META16:![0-9]+]] !continuation.registercount [[META17:![0-9]+]] !continuation [[META18:![0-9]+]] !continuation.stacksize [[META19:![0-9]+]] !continuation.state [[META20:![0-9]+]] { +; CLEANUP-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META16:![0-9]+]] !continuation.registercount [[META17:![0-9]+]] !continuation [[META18:![0-9]+]] !continuation.stacksize [[META19:![0-9]+]] !continuation.state [[META20:![0-9]+]] { ; CLEANUP-NEXT: AllocaSpillBB: ; CLEANUP-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 28) ; CLEANUP-NEXT: [[PAYLOAD_SPILL_ALLOCA:%.*]] = getelementptr inbounds [[MISS_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 @@ -324,19 +329,21 @@ attributes #3 = { nounwind memory(none) } ; CLEANUP-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison, <3 x i32> [[DOTFCA_0_0_EXTRACT]], 0 ; CLEANUP-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 ; CLEANUP-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 -; CLEANUP-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 -1, 5 +; CLEANUP-NEXT: [[ADDR_I:%.*]] = call i64 @_AmdGetResumePointAddr() #[[ATTR0:[0-9]+]] +; CLEANUP-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[ADDR_I]], 5 ; CLEANUP-NEXT: store i32 0, ptr addrspace(20) @PAYLOAD, align 4 -; CLEANUP-NEXT: call void (i64, ...) @continuation.continue(i64 4, i64 ptrtoint (ptr @Miss.resume.0 to i64), [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]]), !continuation.registercount [[META17]], !continuation.returnedRegistercount !17 +; CLEANUP-NEXT: [[TMP3:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @Miss.resume.0) +; CLEANUP-NEXT: call void (i64, i64, ...) 
@continuation.waitContinue(i64 4, i64 -1, i64 [[TMP3]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]]), !continuation.registercount [[META17]], !continuation.returnedRegistercount [[META17]] ; CLEANUP-NEXT: unreachable ; ; ; CLEANUP-LABEL: define dso_local void @Miss.resume.0( -; CLEANUP-SAME: [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.attribute.size [[META15]] !lgc.rt.shaderstage [[META16]] !continuation.registercount [[META17]] !continuation [[META18]] { +; CLEANUP-SAME: i64 [[TMP0:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP1:%.*]]) !lgc.rt.shaderstage [[META16]] !continuation.registercount [[META17]] !continuation [[META18]] { ; CLEANUP-NEXT: entryresume.0: ; CLEANUP-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 28) ; CLEANUP-NEXT: [[PAYLOAD_SPILL_ALLOCA:%.*]] = getelementptr inbounds [[MISS_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 -; CLEANUP-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 -; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 +; CLEANUP-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 +; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], 0 ; CLEANUP-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; CLEANUP-NEXT: [[T110:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 ; CLEANUP-NEXT: [[T29:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[T110]]) @@ -345,70 +352,74 @@ attributes #3 = { nounwind memory(none) } ; CLEANUP-NEXT: [[DIS_DATA_I1_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT]], 0 ; CLEANUP-NEXT: [[SYS_DATA_I2:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I1_FCA_0_INSERT]], 0 ; CLEANUP-NEXT: [[TRAV_DATA_I3:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I2]], 0 -; CLEANUP-NEXT: [[TRAV_DATA2_I4:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I3]], i64 -1, 5 +; CLEANUP-NEXT: [[ADDR_I4:%.*]] = call i64 @_AmdGetResumePointAddr() #[[ATTR0]] +; CLEANUP-NEXT: [[TRAV_DATA2_I5:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I3]], i64 [[ADDR_I4]], 5 ; CLEANUP-NEXT: [[TMP3:%.*]] = ptrtoint ptr addrspace(32) [[PAYLOAD_SPILL_ALLOCA]] to i32 ; CLEANUP-NEXT: store i32 [[TMP3]], ptr addrspace(20) @PAYLOAD, align 4 ; CLEANUP-NEXT: [[TMP4:%.*]] = load ptr addrspace(32), ptr addrspace(20) @PAYLOAD, align 4 ; CLEANUP-NEXT: store i32 0, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 1), align 4 ; CLEANUP-NEXT: store i32 0, ptr addrspace(32) [[TMP4]], align 4 -; CLEANUP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP4]], i32 1 -; CLEANUP-NEXT: store i32 0, ptr addrspace(32) [[TMP7]], align 4 -; CLEANUP-NEXT: call void (i64, ...) @continuation.continue(i64 4, i64 ptrtoint (ptr @Miss.resume.1 to i64), [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I4]]), !continuation.registercount [[META13:![0-9]+]], !continuation.returnedRegistercount !13 +; CLEANUP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP4]], i32 1 +; CLEANUP-NEXT: store i32 0, ptr addrspace(32) [[TMP6]], align 4 +; CLEANUP-NEXT: [[TMP7:%.*]] = call i64 (...) 
@lgc.cps.as.continuation.reference__i64(ptr @Miss.resume.1) +; CLEANUP-NEXT: call void (i64, i64, ...) @continuation.waitContinue(i64 4, i64 -1, i64 [[TMP7]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I5]]), !continuation.registercount [[META13:![0-9]+]], !continuation.returnedRegistercount [[META13]] ; CLEANUP-NEXT: unreachable ; ; ; CLEANUP-LABEL: define dso_local void @Miss.resume.1( -; CLEANUP-SAME: [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.attribute.size [[META15]] !lgc.rt.shaderstage [[META16]] !continuation.registercount [[META13]] !continuation [[META18]] { +; CLEANUP-SAME: i64 [[TMP0:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP1:%.*]]) !lgc.rt.shaderstage [[META16]] !continuation.registercount [[META13]] !continuation [[META18]] { ; CLEANUP-NEXT: entryresume.1: ; CLEANUP-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 28) ; CLEANUP-NEXT: [[PAYLOAD_SPILL_ALLOCA:%.*]] = getelementptr inbounds [[MISS_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 -; CLEANUP-NEXT: [[TMP1:%.*]] = load ptr addrspace(32), ptr addrspace(20) @PAYLOAD, align 4 +; CLEANUP-NEXT: [[TMP6:%.*]] = load ptr addrspace(32), ptr addrspace(20) @PAYLOAD, align 4 ; CLEANUP-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 1), align 4 -; CLEANUP-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(32) [[TMP1]], align 4 -; CLEANUP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 1 ; CLEANUP-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(32) [[TMP6]], align 4 +; CLEANUP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP6]], i32 1 +; CLEANUP-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(32) [[TMP5]], align 4 ; CLEANUP-NEXT: [[TMP8:%.*]] = load ptr addrspace(32), ptr addrspace(20) @PAYLOAD, align 4 -; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT12:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 +; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT12:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], 0 ; CLEANUP-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; CLEANUP-NEXT: [[T17:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 ; CLEANUP-NEXT: [[T26:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[T17]]) ; CLEANUP-NEXT: [[T35:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[T26]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) -; CLEANUP-NEXT: [[TMP9:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[T35]]) +; CLEANUP-NEXT: [[TMP13:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[T35]]) ; CLEANUP-NEXT: [[DIS_DATA_I5_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT12]], 0 ; CLEANUP-NEXT: [[SYS_DATA_I6:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I5_FCA_0_INSERT]], 0 ; CLEANUP-NEXT: [[TRAV_DATA_I7:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I6]], 0 -; CLEANUP-NEXT: [[TRAV_DATA2_I8:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I7]], i64 -1, 5 -; CLEANUP-NEXT: [[TMP10:%.*]] = ptrtoint ptr addrspace(32) [[PAYLOAD_SPILL_ALLOCA]] to i32 -; CLEANUP-NEXT: store i32 [[TMP10]], ptr addrspace(20) 
@PAYLOAD, align 4 -; CLEANUP-NEXT: [[TMP11:%.*]] = load ptr addrspace(32), ptr addrspace(20) @PAYLOAD, align 4 +; CLEANUP-NEXT: [[ADDR_I9:%.*]] = call i64 @_AmdGetResumePointAddr() #[[ATTR0]] +; CLEANUP-NEXT: [[TRAV_DATA2_I10:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I7]], i64 [[ADDR_I9]], 5 +; CLEANUP-NEXT: [[TMP14:%.*]] = ptrtoint ptr addrspace(32) [[PAYLOAD_SPILL_ALLOCA]] to i32 +; CLEANUP-NEXT: store i32 [[TMP14]], ptr addrspace(20) @PAYLOAD, align 4 +; CLEANUP-NEXT: [[TMP9:%.*]] = load ptr addrspace(32), ptr addrspace(20) @PAYLOAD, align 4 ; CLEANUP-NEXT: store i32 0, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 1), align 4 +; CLEANUP-NEXT: store i32 0, ptr addrspace(32) [[TMP9]], align 4 +; CLEANUP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP9]], i32 1 +; CLEANUP-NEXT: store i32 0, ptr addrspace(32) [[TMP10]], align 4 +; CLEANUP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP9]], i32 2 ; CLEANUP-NEXT: store i32 0, ptr addrspace(32) [[TMP11]], align 4 -; CLEANUP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP11]], i32 1 -; CLEANUP-NEXT: store i32 0, ptr addrspace(32) [[TMP14]], align 4 -; CLEANUP-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP11]], i32 2 -; CLEANUP-NEXT: store i32 0, ptr addrspace(32) [[TMP15]], align 4 -; CLEANUP-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP11]], i32 3 -; CLEANUP-NEXT: store i32 0, ptr addrspace(32) [[TMP16]], align 4 -; CLEANUP-NEXT: call void (i64, ...) @continuation.continue(i64 4, i64 ptrtoint (ptr @Miss.resume.2 to i64), [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I8]]), !continuation.registercount [[META13]], !continuation.returnedRegistercount !13 +; CLEANUP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP9]], i32 3 +; CLEANUP-NEXT: store i32 0, ptr addrspace(32) [[TMP12]], align 4 +; CLEANUP-NEXT: [[TMP16:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @Miss.resume.2) +; CLEANUP-NEXT: call void (i64, i64, ...) 
@continuation.waitContinue(i64 4, i64 -1, i64 [[TMP16]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I10]]), !continuation.registercount [[META13]], !continuation.returnedRegistercount [[META13]] ; CLEANUP-NEXT: unreachable ; ; ; CLEANUP-LABEL: define dso_local void @Miss.resume.2( -; CLEANUP-SAME: [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.attribute.size [[META15]] !lgc.rt.shaderstage [[META16]] !continuation.registercount [[META13]] !continuation [[META18]] { +; CLEANUP-SAME: i64 [[TMP0:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP1:%.*]]) !lgc.rt.shaderstage [[META16]] !continuation.registercount [[META13]] !continuation [[META18]] { ; CLEANUP-NEXT: entryresume.2: ; CLEANUP-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 28) ; CLEANUP-NEXT: [[PAYLOAD_SPILL_ALLOCA:%.*]] = getelementptr inbounds [[MISS_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 -; CLEANUP-NEXT: [[TMP1:%.*]] = load ptr addrspace(32), ptr addrspace(20) @PAYLOAD, align 4 -; CLEANUP-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 1), align 4 -; CLEANUP-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(32) [[TMP1]], align 4 -; CLEANUP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 1 -; CLEANUP-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(32) [[TMP6]], align 4 -; CLEANUP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 2 -; CLEANUP-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(32) [[TMP8]], align 4 -; CLEANUP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 3 +; CLEANUP-NEXT: [[TMP4:%.*]] = load ptr addrspace(32), ptr addrspace(20) @PAYLOAD, align 4 +; CLEANUP-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 1), align 4 +; CLEANUP-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(32) [[TMP4]], align 4 +; CLEANUP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP4]], i32 1 +; CLEANUP-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(32) [[TMP6]], align 4 +; CLEANUP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP4]], i32 2 ; CLEANUP-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(32) [[TMP10]], align 4 +; CLEANUP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP4]], i32 3 +; CLEANUP-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(32) [[TMP9]], align 4 ; CLEANUP-NEXT: [[TMP12:%.*]] = load ptr addrspace(32), ptr addrspace(20) @PAYLOAD, align 4 -; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT14:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 +; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT14:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], 0 ; CLEANUP-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; CLEANUP-NEXT: [[DOTRELOAD_ADDR:%.*]] = getelementptr inbounds [[MISS_FRAME]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 2 ; CLEANUP-NEXT: [[DOTRELOAD:%.*]] = load i32, ptr addrspace(32) [[DOTRELOAD_ADDR]], align 4 @@ -417,7 +428,7 @@ attributes #3 = { nounwind memory(none) } ; CLEANUP-NEXT: store i32 [[DOTRELOAD]], ptr addrspace(20) @PAYLOAD, align 4 ; CLEANUP-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT14]], 0 ; CLEANUP-NEXT: call void @lgc.cps.free(i32 28) -; CLEANUP-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[RETURNADDR_RELOAD]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META17]] +; CLEANUP-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META17]] ; CLEANUP-NEXT: unreachable ; ; @@ -427,12 +438,12 @@ attributes #3 = { nounwind memory(none) } ; ; ; CLEANUP-CPS-LABEL: define void @Miss( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [27 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.attribute.size [[META15:![0-9]+]] !lgc.rt.shaderstage [[META16:![0-9]+]] !lgc.cps [[META17:![0-9]+]] !continuation [[META18:![0-9]+]] !continuation.stacksize [[META19:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [27 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META16:![0-9]+]] !lgc.cps [[META13:![0-9]+]] !continuation [[META17:![0-9]+]] !continuation.stacksize [[META18:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 24) ; CLEANUP-CPS-NEXT: [[PAYLOAD_SPILL_ALLOCA:%.*]] = getelementptr inbounds [[MISS_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 ; CLEANUP-CPS-NEXT: [[RETURN_ADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[MISS_FRAME]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 1 -; CLEANUP-CPS-NEXT: store i32 [[RETURN_ADDR]], ptr addrspace(32) [[RETURN_ADDR_SPILL_ADDR]], align 4 +; CLEANUP-CPS-NEXT: store i32 [[RETURNADDR]], ptr addrspace(32) [[RETURN_ADDR_SPILL_ADDR]], align 4 ; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [1 x i32] [[PAYLOAD]], 0 ; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT_SPILL_ADDR:%.*]] = getelementptr inbounds [[MISS_FRAME]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 2 ; CLEANUP-CPS-NEXT: store i32 [[PAYLOAD_FCA_0_EXTRACT]], ptr addrspace(32) [[PAYLOAD_FCA_0_EXTRACT_SPILL_ADDR]], align 4 @@ -445,15 +456,15 @@ attributes #3 = { nounwind memory(none) } ; CLEANUP-CPS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison, <3 x i32> [[SYSTEM_DATA_FCA_0_0_EXTRACT]], 0 ; CLEANUP-CPS-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 ; CLEANUP-CPS-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 -; CLEANUP-CPS-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 -1, 5 -; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT13:%.*]] = insertvalue [1 x i32] poison, i32 0, 0 ; CLEANUP-CPS-NEXT: [[TMP1:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @Miss.resume.0) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 4, i32 4, {} poison, i64 [[TMP1]], i32 5, [30 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT13]]), !continuation.registercount [[META17]], !continuation.returnedRegistercount !17 +; CLEANUP-CPS-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[TMP1]], 5 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT13:%.*]] = insertvalue [1 x i32] poison, i32 0, 0 +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 4, i32 -1, {} poison, i64 -1, i64 [[TMP1]], i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [8 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT13]]), !continuation.registercount [[META19:![0-9]+]], !continuation.wait.await [[META6:![0-9]+]], !continuation.returnedRegistercount [[META19]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define dso_local void @Miss.resume.0( -; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [27 x i32], [1 x i32] } [[TMP3:%.*]]) !lgc.rt.attribute.size [[META15]] !lgc.rt.shaderstage [[META16]] !lgc.cps [[META17]] !continuation [[META18]] !continuation.stacksize [[META19]] { +; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [27 x i32], [1 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META16]] !lgc.cps [[META13]] !continuation [[META17]] !continuation.stacksize [[META18]] { ; CLEANUP-CPS-NEXT: entryresume.0: ; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 24) ; CLEANUP-CPS-NEXT: [[PAYLOAD_SPILL_ALLOCA:%.*]] = getelementptr inbounds [[MISS_FRAME:%.*]], ptr addrspace(32) [[TMP4]], i32 0, i32 0 @@ -469,21 +480,21 @@ attributes #3 = { nounwind memory(none) } ; CLEANUP-CPS-NEXT: [[DIS_DATA_I1_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT45]], 0 ; CLEANUP-CPS-NEXT: [[SYS_DATA_I2:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I1_FCA_0_INSERT]], 0 ; CLEANUP-CPS-NEXT: [[TRAV_DATA_I3:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I2]], 0 -; CLEANUP-CPS-NEXT: [[TRAV_DATA2_I4:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I3]], i64 -1, 5 +; CLEANUP-CPS-NEXT: [[TMP11:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @Miss.resume.1) +; CLEANUP-CPS-NEXT: [[TRAV_DATA2_I5:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I3]], i64 [[TMP11]], 5 ; CLEANUP-CPS-NEXT: [[TMP8:%.*]] = ptrtoint ptr addrspace(32) [[PAYLOAD_SPILL_ALLOCA]] to i32 ; CLEANUP-CPS-NEXT: [[TMP9:%.*]] = inttoptr i32 [[TMP8]] to ptr addrspace(32) ; CLEANUP-CPS-NEXT: store i32 0, ptr addrspace(32) [[TMP9]], align 4 -; CLEANUP-CPS-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP9]], i32 1 -; CLEANUP-CPS-NEXT: store i32 0, ptr addrspace(32) [[TMP12]], align 4 +; CLEANUP-CPS-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP9]], i32 1 +; CLEANUP-CPS-NEXT: store i32 0, ptr addrspace(32) [[TMP10]], align 4 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT17:%.*]] = insertvalue [2 x i32] poison, i32 [[TMP8]], 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[DOTFCA_0_INSERT17]], i32 0, 1 -; CLEANUP-CPS-NEXT: [[TMP13:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @Miss.resume.1) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 4, i32 4, {} poison, i64 [[TMP13]], i32 5, [30 x i32] poison, [2 x i32] [[DOTFCA_1_INSERT]]), !continuation.registercount [[META13:![0-9]+]], !continuation.returnedRegistercount !13 +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 4, i32 -1, {} poison, i64 -1, i64 [[TMP11]], i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I5]], [8 x i32] poison, [2 x i32] [[DOTFCA_1_INSERT]]), !continuation.registercount [[META13]], !continuation.wait.await [[META6]], !continuation.returnedRegistercount [[META13]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define dso_local void @Miss.resume.1( -; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [27 x i32], [2 x i32] } [[TMP3:%.*]]) !lgc.rt.attribute.size [[META15]] !lgc.rt.shaderstage [[META16]] !lgc.cps [[META17]] !continuation [[META18]] !continuation.stacksize [[META19]] { +; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [27 x i32], [2 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META16]] !lgc.cps [[META13]] !continuation [[META17]] !continuation.stacksize [[META18]] { ; CLEANUP-CPS-NEXT: entryresume.1: ; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 24) ; CLEANUP-CPS-NEXT: [[PAYLOAD_SPILL_ALLOCA:%.*]] = getelementptr inbounds [[MISS_FRAME:%.*]], ptr addrspace(32) [[TMP4]], i32 0, i32 0 @@ -493,37 +504,37 @@ attributes #3 = { nounwind memory(none) } ; CLEANUP-CPS-NEXT: [[TMP6:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [2 x i32] } [[TMP3]], 0 ; CLEANUP-CPS-NEXT: [[TMP7:%.*]] = inttoptr i32 [[DOTFCA_0_EXTRACT18]] to ptr addrspace(32) ; CLEANUP-CPS-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(32) [[TMP7]], align 4 -; CLEANUP-CPS-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP7]], i32 1 -; CLEANUP-CPS-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(32) [[TMP11]], align 4 -; CLEANUP-CPS-NEXT: [[TMP13:%.*]] = inttoptr i32 [[DOTFCA_0_EXTRACT18]] to ptr addrspace(32) +; CLEANUP-CPS-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP7]], i32 1 +; CLEANUP-CPS-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(32) [[TMP9]], align 4 +; CLEANUP-CPS-NEXT: [[TMP11:%.*]] = inttoptr i32 [[DOTFCA_0_EXTRACT18]] to ptr addrspace(32) ; CLEANUP-CPS-NEXT: [[DOTFCA_0_EXTRACT47:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP6]], 0 ; CLEANUP-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; CLEANUP-CPS-NEXT: [[T17:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 ; CLEANUP-CPS-NEXT: [[T26:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[T17]]) ; CLEANUP-CPS-NEXT: [[T35:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[T26]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) -; CLEANUP-CPS-NEXT: [[TMP14:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[T35]]) +; CLEANUP-CPS-NEXT: [[TMP12:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[T35]]) ; CLEANUP-CPS-NEXT: [[DIS_DATA_I5_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT47]], 0 ; CLEANUP-CPS-NEXT: [[SYS_DATA_I6:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I5_FCA_0_INSERT]], 0 ; CLEANUP-CPS-NEXT: [[TRAV_DATA_I7:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I6]], 0 -; CLEANUP-CPS-NEXT: [[TRAV_DATA2_I8:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I7]], 
i64 -1, 5 -; CLEANUP-CPS-NEXT: [[TMP15:%.*]] = ptrtoint ptr addrspace(32) [[PAYLOAD_SPILL_ALLOCA]] to i32 -; CLEANUP-CPS-NEXT: [[TMP16:%.*]] = inttoptr i32 [[TMP15]] to ptr addrspace(32) +; CLEANUP-CPS-NEXT: [[TMP18:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @Miss.resume.2) +; CLEANUP-CPS-NEXT: [[TRAV_DATA2_I10:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I7]], i64 [[TMP18]], 5 +; CLEANUP-CPS-NEXT: [[TMP13:%.*]] = ptrtoint ptr addrspace(32) [[PAYLOAD_SPILL_ALLOCA]] to i32 +; CLEANUP-CPS-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP13]] to ptr addrspace(32) +; CLEANUP-CPS-NEXT: store i32 0, ptr addrspace(32) [[TMP14]], align 4 +; CLEANUP-CPS-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP14]], i32 1 +; CLEANUP-CPS-NEXT: store i32 0, ptr addrspace(32) [[TMP15]], align 4 +; CLEANUP-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP14]], i32 2 ; CLEANUP-CPS-NEXT: store i32 0, ptr addrspace(32) [[TMP16]], align 4 -; CLEANUP-CPS-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP16]], i32 1 -; CLEANUP-CPS-NEXT: store i32 0, ptr addrspace(32) [[TMP19]], align 4 -; CLEANUP-CPS-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP16]], i32 2 -; CLEANUP-CPS-NEXT: store i32 0, ptr addrspace(32) [[TMP20]], align 4 -; CLEANUP-CPS-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP16]], i32 3 -; CLEANUP-CPS-NEXT: store i32 0, ptr addrspace(32) [[TMP21]], align 4 -; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT23:%.*]] = insertvalue [2 x i32] poison, i32 [[TMP15]], 0 +; CLEANUP-CPS-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP14]], i32 3 +; CLEANUP-CPS-NEXT: store i32 0, ptr addrspace(32) [[TMP17]], align 4 +; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT23:%.*]] = insertvalue [2 x i32] poison, i32 [[TMP13]], 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_1_INSERT26:%.*]] = insertvalue [2 x i32] [[DOTFCA_0_INSERT23]], i32 0, 1 -; CLEANUP-CPS-NEXT: [[TMP22:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @Miss.resume.2) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 4, i32 4, {} poison, i64 [[TMP22]], i32 5, [30 x i32] poison, [2 x i32] [[DOTFCA_1_INSERT26]]), !continuation.registercount [[META13]], !continuation.returnedRegistercount !13 +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 4, i32 -1, {} poison, i64 -1, i64 [[TMP18]], i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I10]], [8 x i32] poison, [2 x i32] [[DOTFCA_1_INSERT26]]), !continuation.registercount [[META13]], !continuation.wait.await [[META6]], !continuation.returnedRegistercount [[META13]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define dso_local void @Miss.resume.2( -; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [27 x i32], [2 x i32] } [[TMP3:%.*]]) !lgc.rt.attribute.size [[META15]] !lgc.rt.shaderstage [[META16]] !lgc.cps [[META17]] !continuation [[META18]] !continuation.stacksize [[META19]] { +; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [27 x i32], [2 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META16]] !lgc.cps [[META13]] !continuation [[META17]] !continuation.stacksize [[META18]] { ; CLEANUP-CPS-NEXT: entryresume.2: ; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 24) ; CLEANUP-CPS-NEXT: [[PAYLOAD_SPILL_ALLOCA:%.*]] = getelementptr inbounds [[MISS_FRAME:%.*]], ptr addrspace(32) [[TMP4]], i32 0, i32 0 @@ -533,13 +544,13 @@ attributes #3 = { nounwind memory(none) } ; CLEANUP-CPS-NEXT: [[TMP6:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [27 x i32], [2 x i32] } [[TMP3]], 0 ; CLEANUP-CPS-NEXT: [[TMP7:%.*]] = inttoptr i32 [[DOTFCA_0_EXTRACT27]] to ptr addrspace(32) ; CLEANUP-CPS-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(32) [[TMP7]], align 4 -; CLEANUP-CPS-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP7]], i32 1 +; CLEANUP-CPS-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP7]], i32 1 +; CLEANUP-CPS-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(32) [[TMP9]], align 4 +; CLEANUP-CPS-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP7]], i32 2 ; CLEANUP-CPS-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(32) [[TMP11]], align 4 -; CLEANUP-CPS-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP7]], i32 2 +; CLEANUP-CPS-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP7]], i32 3 ; CLEANUP-CPS-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(32) [[TMP13]], align 4 -; CLEANUP-CPS-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP7]], i32 3 -; CLEANUP-CPS-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(32) [[TMP15]], align 4 -; CLEANUP-CPS-NEXT: [[TMP17:%.*]] = inttoptr i32 [[DOTFCA_0_EXTRACT27]] to ptr addrspace(32) +; CLEANUP-CPS-NEXT: [[TMP15:%.*]] = inttoptr i32 [[DOTFCA_0_EXTRACT27]] to ptr addrspace(32) ; CLEANUP-CPS-NEXT: [[DOTFCA_0_EXTRACT49:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP6]], 0 ; CLEANUP-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT_RELOAD_ADDR:%.*]] = getelementptr inbounds [[MISS_FRAME]], ptr addrspace(32) [[TMP4]], i32 0, i32 2 @@ -549,7 +560,7 @@ attributes #3 = { nounwind memory(none) } ; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT44:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT49]], 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [1 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT_RELOAD]], 0 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 24) -; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURN_ADDR_RELOAD]], i32 3, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT44]], [27 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META17]] +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR_RELOAD]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT44]], [27 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META19]] ; CLEANUP-CPS-NEXT: unreachable ; ; @@ -559,7 +570,7 @@ attributes #3 = { nounwind memory(none) } ; ; ; DXILCONTPOSTPROCESS-LABEL: define void @Miss( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.attribute.size [[META15:![0-9]+]] !lgc.rt.shaderstage [[META16:![0-9]+]] !continuation.registercount [[META17:![0-9]+]] !continuation [[META18:![0-9]+]] !continuation.stacksize [[META19:![0-9]+]] !continuation.state [[META20:![0-9]+]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META16:![0-9]+]] !continuation.registercount [[META17:![0-9]+]] !continuation [[META18:![0-9]+]] !continuation.stacksize [[META19:![0-9]+]] !continuation.state [[META20:![0-9]+]] { ; DXILCONTPOSTPROCESS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -584,23 +595,23 @@ attributes #3 = { nounwind memory(none) } ; DXILCONTPOSTPROCESS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison, <3 x i32> [[DOTFCA_0_0_EXTRACT]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 -; DXILCONTPOSTPROCESS-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 -1, 5 +; DXILCONTPOSTPROCESS-NEXT: [[TMP12:%.*]] = call i64 @continuation.getAddrAndMD(ptr @Miss.resume.0) +; DXILCONTPOSTPROCESS-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[TMP12]], 5 ; DXILCONTPOSTPROCESS-NEXT: store i32 0, ptr addrspace(20) @REGISTERS, align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP11:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: [[TMP12:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @Miss.resume.0 to i64)) -; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 4, i32 [[TMP11]], i64 [[TMP12]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]]), !continuation.registercount [[META17]], !continuation.returnedRegistercount !17 +; DXILCONTPOSTPROCESS-NEXT: call void (i64, i64, ...) 
@continuation.waitContinue(i64 4, i64 -1, i32 [[TMP11]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]]), !continuation.registercount [[META17]], !continuation.returnedRegistercount [[META17]] ; DXILCONTPOSTPROCESS-NEXT: unreachable ; ; ; DXILCONTPOSTPROCESS-LABEL: define dso_local void @Miss.resume.0( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.attribute.size [[META15]] !lgc.rt.shaderstage [[META16]] !continuation.registercount [[META17]] !continuation [[META18]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP1:%.*]]) !lgc.rt.shaderstage [[META16]] !continuation.registercount [[META17]] !continuation [[META18]] { ; DXILCONTPOSTPROCESS-NEXT: entryresume.0: ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: [[TMP1:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], -28 +; DXILCONTPOSTPROCESS-NEXT: [[TMP13:%.*]] = load i32, ptr [[CSP]], align 4 +; DXILCONTPOSTPROCESS-NEXT: [[TMP2:%.*]] = add i32 [[TMP13]], -28 ; DXILCONTPOSTPROCESS-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(20) @REGISTERS, align 4 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], 0 ; DXILCONTPOSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; DXILCONTPOSTPROCESS-NEXT: [[T110:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 ; DXILCONTPOSTPROCESS-NEXT: [[T29:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[T110]]) @@ -609,7 +620,8 @@ attributes #3 = { nounwind memory(none) } ; DXILCONTPOSTPROCESS-NEXT: [[DIS_DATA_I1_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[SYS_DATA_I2:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I1_FCA_0_INSERT]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[TRAV_DATA_I3:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I2]], 0 -; DXILCONTPOSTPROCESS-NEXT: [[TRAV_DATA2_I4:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I3]], i64 -1, 5 +; DXILCONTPOSTPROCESS-NEXT: [[TMP12:%.*]] = call i64 @continuation.getAddrAndMD(ptr @Miss.resume.1) +; DXILCONTPOSTPROCESS-NEXT: [[TRAV_DATA2_I5:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I3]], i64 [[TMP12]], 5 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[TMP2]], ptr addrspace(20) @REGISTERS, align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(20) @REGISTERS, align 4 ; DXILCONTPOSTPROCESS-NEXT: store i32 0, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 1), align 4 @@ -621,18 +633,17 @@ attributes #3 = { nounwind memory(none) } ; DXILCONTPOSTPROCESS-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP9]], i32 0 ; DXILCONTPOSTPROCESS-NEXT: store i32 0, ptr addrspace(21) [[TMP10]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP11:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: [[TMP12:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @Miss.resume.1 to i64)) -; 
DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 4, i32 [[TMP11]], i64 [[TMP12]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I4]]), !continuation.registercount [[META13:![0-9]+]], !continuation.returnedRegistercount !13 +; DXILCONTPOSTPROCESS-NEXT: call void (i64, i64, ...) @continuation.waitContinue(i64 4, i64 -1, i32 [[TMP11]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I5]]), !continuation.registercount [[META13:![0-9]+]], !continuation.returnedRegistercount [[META13]] ; DXILCONTPOSTPROCESS-NEXT: unreachable ; ; ; DXILCONTPOSTPROCESS-LABEL: define dso_local void @Miss.resume.1( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.attribute.size [[META15]] !lgc.rt.shaderstage [[META16]] !continuation.registercount [[META13]] !continuation [[META18]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP1:%.*]]) !lgc.rt.shaderstage [[META16]] !continuation.registercount [[META13]] !continuation [[META18]] { ; DXILCONTPOSTPROCESS-NEXT: entryresume.1: ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: [[TMP1:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], -28 +; DXILCONTPOSTPROCESS-NEXT: [[TMP28:%.*]] = load i32, ptr [[CSP]], align 4 +; DXILCONTPOSTPROCESS-NEXT: [[TMP2:%.*]] = add i32 [[TMP28]], -28 ; DXILCONTPOSTPROCESS-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(20) @REGISTERS, align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 1), align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(21) @@ -643,7 +654,7 @@ attributes #3 = { nounwind memory(none) } ; DXILCONTPOSTPROCESS-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP9]], i32 0 ; DXILCONTPOSTPROCESS-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(21) [[TMP10]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(20) @REGISTERS, align 4 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT12:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT12:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], 0 ; DXILCONTPOSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; DXILCONTPOSTPROCESS-NEXT: [[T17:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 ; DXILCONTPOSTPROCESS-NEXT: [[T26:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[T17]]) @@ -652,7 +663,8 @@ attributes #3 = { nounwind memory(none) } ; DXILCONTPOSTPROCESS-NEXT: [[DIS_DATA_I5_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT12]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[SYS_DATA_I6:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I5_FCA_0_INSERT]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[TRAV_DATA_I7:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I6]], 0 -; DXILCONTPOSTPROCESS-NEXT: [[TRAV_DATA2_I8:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I7]], i64 -1, 5 +; DXILCONTPOSTPROCESS-NEXT: [[TMP27:%.*]] = call i64 @continuation.getAddrAndMD(ptr @Miss.resume.2) +; 
DXILCONTPOSTPROCESS-NEXT: [[TRAV_DATA2_I10:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I7]], i64 [[TMP27]], 5 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[TMP2]], ptr addrspace(20) @REGISTERS, align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(20) @REGISTERS, align 4 ; DXILCONTPOSTPROCESS-NEXT: store i32 0, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 1), align 4 @@ -672,18 +684,17 @@ attributes #3 = { nounwind memory(none) } ; DXILCONTPOSTPROCESS-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP24]], i32 0 ; DXILCONTPOSTPROCESS-NEXT: store i32 0, ptr addrspace(21) [[TMP25]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP26:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: [[TMP27:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @Miss.resume.2 to i64)) -; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 4, i32 [[TMP26]], i64 [[TMP27]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I8]]), !continuation.registercount [[META13]], !continuation.returnedRegistercount !13 +; DXILCONTPOSTPROCESS-NEXT: call void (i64, i64, ...) @continuation.waitContinue(i64 4, i64 -1, i32 [[TMP26]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I10]]), !continuation.registercount [[META13]], !continuation.returnedRegistercount [[META13]] ; DXILCONTPOSTPROCESS-NEXT: unreachable ; ; ; DXILCONTPOSTPROCESS-LABEL: define dso_local void @Miss.resume.2( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.attribute.size [[META15]] !lgc.rt.shaderstage [[META16]] !continuation.registercount [[META13]] !continuation [[META18]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP1:%.*]]) !lgc.rt.shaderstage [[META16]] !continuation.registercount [[META13]] !continuation [[META18]] { ; DXILCONTPOSTPROCESS-NEXT: entryresume.2: ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: [[TMP1:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], -28 +; DXILCONTPOSTPROCESS-NEXT: [[TMP30:%.*]] = load i32, ptr [[CSP]], align 4 +; DXILCONTPOSTPROCESS-NEXT: [[TMP2:%.*]] = add i32 [[TMP30]], -28 ; DXILCONTPOSTPROCESS-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(20) @REGISTERS, align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 1), align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP5:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(21) @@ -702,7 +713,7 @@ attributes #3 = { nounwind memory(none) } ; DXILCONTPOSTPROCESS-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP17]], i32 0 ; DXILCONTPOSTPROCESS-NEXT: [[TMP19:%.*]] = load i32, ptr addrspace(21) [[TMP18]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(20) @REGISTERS, align 4 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT14:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT14:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], 0 ; DXILCONTPOSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; DXILCONTPOSTPROCESS-NEXT: [[TMP21:%.*]] = add i32 [[TMP2]], 24 ; DXILCONTPOSTPROCESS-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP21]] to ptr addrspace(21) @@ -718,7 +729,7 @@ attributes #3 = { nounwind memory(none) } 
; DXILCONTPOSTPROCESS-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], -28 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[TMP28]], ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP29:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP29]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META17]] +; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP29]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META17]] ; DXILCONTPOSTPROCESS-NEXT: unreachable ; ; diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-simple-call-shader.ll b/llvmraytracing/test/dx/lower-rt-pipeline-simple-call-shader.ll index 69840cc200..fb08b19985 100644 --- a/llvmraytracing/test/dx/lower-rt-pipeline-simple-call-shader.ll +++ b/llvmraytracing/test/dx/lower-rt-pipeline-simple-call-shader.ll @@ -1,28 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3 -; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S 2> %t0.stderr | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s -; RUN: count 0 < %t0.stderr -; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,remove-types-metadata' \ -; RUN: -S 2> %t1.stderr | FileCheck -check-prefix=CLEANUP %s -; RUN: count 0 < %t1.stderr -; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,remove-types-metadata' \ -; RUN: -S 2> %t2.stderr | FileCheck -check-prefix=REGISTERBUFFER %s -; RUN: count 0 < %t2.stderr -; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' \ -; RUN: -S 2> %t3.stderr | FileCheck -check-prefix=POSTPROCESS %s -; RUN: count 0 < %t3.stderr -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S %s 2> %t4.stderr | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE-CPS %s -; RUN: count 0 < %t4.stderr -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,remove-types-metadata' \ -; RUN: -S %s 2> %t5.stderr | FileCheck -check-prefix=CLEANUP-CPS %s -; RUN: count 0 < %t5.stderr -; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,register-buffer,lint,remove-types-metadata' \ -; RUN: -S %s 2> %t6.stderr | FileCheck -check-prefix=REGISTERBUFFER-CPS %s -; RUN: count 0 < %t6.stderr -; RUN: opt --verify-each 
-passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' \ -; RUN: -S %s 2> %t7.stderr | FileCheck -check-prefix=POSTPROCESS-CPS %s -; RUN: count 0 < %t7.stderr +; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s +; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,remove-types-metadata" \ +; RUN: -S --lint-abort-on-error | FileCheck -check-prefix=CLEANUP %s +; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,remove-types-metadata" \ +; RUN: -S --lint-abort-on-error | FileCheck -check-prefix=REGISTERBUFFER %s +; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata" \ +; RUN: -S --lint-abort-on-error | FileCheck -check-prefix=POSTPROCESS %s +; RUN: opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata" -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE-CPS %s +; RUN: opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,remove-types-metadata" \ +; RUN: -S %s --lint-abort-on-error | FileCheck -check-prefix=CLEANUP-CPS %s +; RUN: opt --verify-each -passes="dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,register-buffer,lint,remove-types-metadata" \ +; RUN: -S %s --lint-abort-on-error | FileCheck -check-prefix=REGISTERBUFFER-CPS %s +; RUN: opt --verify-each -passes="dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata" \ +; RUN: -S %s --lint-abort-on-error | FileCheck -check-prefix=POSTPROCESS-CPS %s -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = 
"e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %dx.types.Handle = type { i8* } %struct.DispatchSystemData = type { i32 } @@ -126,25 +118,25 @@ attributes #1 = { alwaysinline } ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @called( -; LOWERRAYTRACINGPIPELINE-SAME: [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META16:![0-9]+]] !continuation.registercount [[META14:![0-9]+]] !continuation [[META17:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META16:![0-9]+]] !continuation.registercount [[META14:![0-9]+]] !continuation [[META17:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_MYPARAMS:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP2]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP6]], ptr [[TMP3]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP2]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP4]], ptr [[TMP3]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; LOWERRAYTRACINGPIPELINE-NEXT: [[DIS_DATA_I:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP2]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP7]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP10]], ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = call ptr inttoptr (i64 2 to ptr)([[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]]), !continuation.registercount [[META14]], !continuation.returnedRegistercount !14 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] [[AWAIT_STRUCT_DISPATCHSYSTEMDATA:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP8]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP2]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP6]], ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = call ptr inttoptr (i64 2 to ptr)([[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]]), !continuation.registercount [[META14]], !continuation.returnedRegistercount [[META14]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] [[AWAIT:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP7]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_MYPARAMS]] poison, ptr [[TMP2]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP2]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] 
= load i32, ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP16]], ptr [[TMP13]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP12]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP2]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP10]], ptr [[TMP9]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP8]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; LOWERRAYTRACINGPIPELINE-NEXT: br label [[DOTSPLIT:%.*]] ; LOWERRAYTRACINGPIPELINE: .split: @@ -152,11 +144,12 @@ attributes #1 = { alwaysinline } ; LOWERRAYTRACINGPIPELINE-NEXT: [[A:%.*]] = extractelement <3 x i32> [[TMP17]], i8 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.dimensions() ; LOWERRAYTRACINGPIPELINE-NEXT: [[B:%.*]] = extractelement <3 x i32> [[TMP18]], i8 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP2]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP2]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP19]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP22]], ptr addrspace(20) @PAYLOAD, align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP23]], !continuation.registercount [[META14]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP23]]), !continuation.registercount [[META14]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; CLEANUP-LABEL: define i32 @_cont_GetLocalRootIndex( @@ -175,16 +168,17 @@ attributes #1 = { alwaysinline } ; CLEANUP-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; CLEANUP-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT]], 0 ; CLEANUP-NEXT: store i32 [[TMP1]], ptr addrspace(20) @PAYLOAD, align 4 -; CLEANUP-NEXT: call void (i64, ...) @continuation.continue(i64 2, i64 ptrtoint (ptr @called.resume.0 to i64), [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]]), !continuation.registercount [[META14]], !continuation.returnedRegistercount !14 +; CLEANUP-NEXT: [[TMP2:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @called.resume.0) +; CLEANUP-NEXT: call void (i64, ...) 
@continuation.continue(i64 2, i64 [[TMP2]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]]), !continuation.registercount [[META14]], !continuation.returnedRegistercount [[META14]] ; CLEANUP-NEXT: unreachable ; ; ; CLEANUP-LABEL: define dso_local void @called.resume.0( -; CLEANUP-SAME: [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META16]] !continuation.registercount [[META14]] !continuation [[META17]] { +; CLEANUP-SAME: i64 [[TMP0:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP1:%.*]]) !lgc.rt.shaderstage [[META16]] !continuation.registercount [[META14]] !continuation [[META17]] { ; CLEANUP-NEXT: entryresume.0: ; CLEANUP-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8) -; CLEANUP-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 -; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT3:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 +; CLEANUP-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 +; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT3:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], 0 ; CLEANUP-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; CLEANUP-NEXT: [[RETURNADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[CALLED_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 ; CLEANUP-NEXT: [[RETURNADDR_RELOAD:%.*]] = load i64, ptr addrspace(32) [[RETURNADDR_RELOAD_ADDR]], align 4 @@ -192,10 +186,10 @@ attributes #1 = { alwaysinline } ; CLEANUP-NEXT: [[A:%.*]] = extractelement <3 x i32> [[TMP2]], i8 0 ; CLEANUP-NEXT: [[TMP3:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.dimensions() ; CLEANUP-NEXT: [[B:%.*]] = extractelement <3 x i32> [[TMP3]], i8 0 -; CLEANUP-NEXT: store i32 [[TMP1]], ptr addrspace(20) @PAYLOAD, align 4 +; CLEANUP-NEXT: store i32 [[TMP4]], ptr addrspace(20) @PAYLOAD, align 4 ; CLEANUP-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT3]], 0 ; CLEANUP-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META14]] +; CLEANUP-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META14]] ; CLEANUP-NEXT: unreachable ; ; @@ -215,16 +209,17 @@ attributes #1 = { alwaysinline } ; REGISTERBUFFER-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; REGISTERBUFFER-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT]], 0 ; REGISTERBUFFER-NEXT: store i32 [[TMP1]], ptr addrspace(20) @PAYLOAD, align 4 -; REGISTERBUFFER-NEXT: call void (i64, ...) @continuation.continue(i64 2, i64 ptrtoint (ptr @called.resume.0 to i64), [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]]), !continuation.registercount [[META14]], !continuation.returnedRegistercount !14 +; REGISTERBUFFER-NEXT: [[TMP2:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @called.resume.0) +; REGISTERBUFFER-NEXT: call void (i64, ...) 
@continuation.continue(i64 2, i64 [[TMP2]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]]), !continuation.registercount [[META14]], !continuation.returnedRegistercount [[META14]] ; REGISTERBUFFER-NEXT: unreachable ; ; ; REGISTERBUFFER-LABEL: define dso_local void @called.resume.0( -; REGISTERBUFFER-SAME: [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META16]] !continuation.registercount [[META14]] !continuation [[META17]] { +; REGISTERBUFFER-SAME: i64 [[TMP0:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP1:%.*]]) !lgc.rt.shaderstage [[META16]] !continuation.registercount [[META14]] !continuation [[META17]] { ; REGISTERBUFFER-NEXT: entryresume.0: ; REGISTERBUFFER-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8) -; REGISTERBUFFER-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 -; REGISTERBUFFER-NEXT: [[DOTFCA_0_EXTRACT3:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 +; REGISTERBUFFER-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 +; REGISTERBUFFER-NEXT: [[DOTFCA_0_EXTRACT3:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], 0 ; REGISTERBUFFER-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; REGISTERBUFFER-NEXT: [[RETURNADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[CALLED_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 ; REGISTERBUFFER-NEXT: [[RETURNADDR_RELOAD:%.*]] = load i64, ptr addrspace(32) [[RETURNADDR_RELOAD_ADDR]], align 4 @@ -232,10 +227,10 @@ attributes #1 = { alwaysinline } ; REGISTERBUFFER-NEXT: [[A:%.*]] = extractelement <3 x i32> [[TMP2]], i8 0 ; REGISTERBUFFER-NEXT: [[TMP3:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.dimensions() ; REGISTERBUFFER-NEXT: [[B:%.*]] = extractelement <3 x i32> [[TMP3]], i8 0 -; REGISTERBUFFER-NEXT: store i32 [[TMP1]], ptr addrspace(20) @PAYLOAD, align 4 +; REGISTERBUFFER-NEXT: store i32 [[TMP4]], ptr addrspace(20) @PAYLOAD, align 4 ; REGISTERBUFFER-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT3]], 0 ; REGISTERBUFFER-NEXT: call void @lgc.cps.free(i32 8) -; REGISTERBUFFER-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META14]] +; REGISTERBUFFER-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META14]] ; REGISTERBUFFER-NEXT: unreachable ; ; @@ -260,30 +255,30 @@ attributes #1 = { alwaysinline } ; POSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; POSTPROCESS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT]], 0 ; POSTPROCESS-NEXT: store i32 [[TMP5]], ptr addrspace(20) @REGISTERS, align 4 +; POSTPROCESS-NEXT: [[TMP7:%.*]] = call i64 @continuation.getAddrAndMD(ptr @called.resume.0) ; POSTPROCESS-NEXT: [[TMP6:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: [[TMP7:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @called.resume.0 to i64)) -; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 2, i32 [[TMP6]], i64 [[TMP7]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]]), !continuation.registercount [[META14]], !continuation.returnedRegistercount !14 +; POSTPROCESS-NEXT: call void (i64, ...) 
@continuation.continue(i64 2, i32 [[TMP6]], i64 [[TMP7]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]]), !continuation.registercount [[META14]], !continuation.returnedRegistercount [[META14]] ; POSTPROCESS-NEXT: unreachable ; ; ; POSTPROCESS-LABEL: define dso_local void @called.resume.0( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META16]] !continuation.registercount [[META14]] !continuation [[META17]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP1:%.*]]) !lgc.rt.shaderstage [[META16]] !continuation.registercount [[META14]] !continuation [[META17]] { ; POSTPROCESS-NEXT: entryresume.0: -; POSTPROCESS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 +; POSTPROCESS-NEXT: [[SYSTEM_DATA_ALLOCA1:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; POSTPROCESS-NEXT: [[TMP1:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], -8 +; POSTPROCESS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], ptr [[SYSTEM_DATA_ALLOCA1]], align 4 +; POSTPROCESS-NEXT: [[TMP11:%.*]] = load i32, ptr [[CSP]], align 4 +; POSTPROCESS-NEXT: [[TMP2:%.*]] = add i32 [[TMP11]], -8 ; POSTPROCESS-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(20) @REGISTERS, align 4 -; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT3:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 +; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT3:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], 0 ; POSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; POSTPROCESS-NEXT: [[TMP4:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(21) ; POSTPROCESS-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP4]], i32 0 ; POSTPROCESS-NEXT: [[RETURNADDR_RELOAD:%.*]] = load i64, ptr addrspace(21) [[TMP5]], align 4 -; POSTPROCESS-NEXT: [[TMP6:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[SYSTEM_DATA_ALLOCA]]) +; POSTPROCESS-NEXT: [[TMP6:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[SYSTEM_DATA_ALLOCA1]]) ; POSTPROCESS-NEXT: [[A:%.*]] = extractelement <3 x i32> [[TMP6]], i8 0 -; POSTPROCESS-NEXT: [[TMP7:%.*]] = call <3 x i32> @_cont_DispatchRaysDimensions3(ptr [[SYSTEM_DATA_ALLOCA]]) +; POSTPROCESS-NEXT: [[TMP7:%.*]] = call <3 x i32> @_cont_DispatchRaysDimensions3(ptr [[SYSTEM_DATA_ALLOCA1]]) ; POSTPROCESS-NEXT: [[B:%.*]] = extractelement <3 x i32> [[TMP7]], i8 0 ; POSTPROCESS-NEXT: store i32 [[TMP3]], ptr addrspace(20) @REGISTERS, align 4 ; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT3]], 0 @@ -291,7 +286,7 @@ attributes #1 = { alwaysinline } ; POSTPROCESS-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], -8 ; POSTPROCESS-NEXT: store i32 [[TMP9]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP10:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP10]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META14]] +; POSTPROCESS-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP10]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META14]] ; POSTPROCESS-NEXT: unreachable ; ; @@ -301,43 +296,43 @@ attributes #1 = { alwaysinline } ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @called( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META16:![0-9]+]] !lgc.cps [[META14:![0-9]+]] !continuation [[META17:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META16:![0-9]+]] !lgc.cps [[META17:![0-9]+]] !continuation [[META18:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_ALLOCA:%.*]] = alloca [30 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_MYPARAMS:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [1 x i32] [[PAYLOAD]], ptr [[PAYLOAD_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[SYSTEM_DATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP1]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP1]], i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = load i32, ptr [[PAYLOAD_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP3]], ptr [[TMP2]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[SHADER_INDEX]]) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DIS_DATA_I:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP1]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP8]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP1]], i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP5]], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = load [1 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] } (...) 
@lgc.cps.await__sl_s_struct.DispatchSystemDatasa8i32a1i32s(i32 2, i32 2, i32 5, [9 x i32] poison, [1 x i32] [[TMP14]]), !continuation.registercount [[META14]], !continuation.returnedRegistercount !14 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] } [[TMP15]], 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [1 x i32] [[TMP16]], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] } [[TMP15]], 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = load [1 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] } (...) @lgc.cps.await__sl_s_struct.DispatchSystemDatasa8i32a1i32s(i32 2, i32 4, i32 5, [9 x i32] poison, [1 x i32] [[TMP6]]), !continuation.registercount [[META14:![0-9]+]], !continuation.returnedRegistercount [[META14]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] } [[TMP7]], 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [1 x i32] [[TMP8]], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] } [[TMP7]], 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_MYPARAMS]] poison, ptr [[TMP1]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP1]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP1]], i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = load i32, ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP11]], ptr [[TMP18]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP17]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP11]], ptr [[TMP10]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP9]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br label [[DOTSPLIT:%.*]] ; LOWERRAYTRACINGPIPELINE-CPS: .split: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[A:%.*]] = extractelement <3 x i32> [[TMP24]], i8 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.dimensions() -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[B:%.*]] = extractelement <3 x i32> [[TMP25]], i8 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP1]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP26]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP19]], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP32:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP33:%.*]] = load [1 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURN_ADDR]], i32 3, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP32]], [8 x i32] poison, [1 x i32] [[TMP33]]), !continuation.registercount [[META14]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP12:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[A:%.*]] = extractelement <3 x i32> [[TMP12]], i8 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP13:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.dimensions() +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[B:%.*]] = extractelement <3 x i32> [[TMP13]], i8 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP1]], i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP15]], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = load [1 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP16]], [8 x i32] poison, [1 x i32] [[TMP17]]), !continuation.registercount [[META14]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; @@ -347,23 +342,23 @@ attributes #1 = { alwaysinline } ; ; ; CLEANUP-CPS-LABEL: define void @called( -; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META16:![0-9]+]] !lgc.cps [[META14:![0-9]+]] !continuation [[META17:![0-9]+]] { +; CLEANUP-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META16:![0-9]+]] !lgc.cps [[META17:![0-9]+]] !continuation [[META18:![0-9]+]] { ; CLEANUP-CPS-NEXT: AllocaSpillBB: ; CLEANUP-CPS-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8) ; CLEANUP-CPS-NEXT: [[RETURN_ADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[CALLED_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 -; CLEANUP-CPS-NEXT: store i32 [[RETURN_ADDR]], ptr addrspace(32) [[RETURN_ADDR_SPILL_ADDR]], align 4 +; CLEANUP-CPS-NEXT: store i32 [[RETURNADDR]], ptr addrspace(32) [[RETURN_ADDR_SPILL_ADDR]], align 4 ; CLEANUP-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [1 x i32] [[PAYLOAD]], 0 ; CLEANUP-CPS-NEXT: [[SYSTEM_DATA_FCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[SYSTEM_DATA]], 0 ; CLEANUP-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[SHADER_INDEX]]) ; CLEANUP-CPS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[SYSTEM_DATA_FCA_0_EXTRACT]], 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT4:%.*]] = insertvalue [1 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 ; CLEANUP-CPS-NEXT: [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @called.resume.0) -; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 2, i32 2, {} poison, i64 [[TMP0]], i32 5, [9 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT4]]), !continuation.registercount [[META14]], !continuation.returnedRegistercount !14 +; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 2, i32 4, {} poison, i64 [[TMP0]], i32 5, [9 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT4]]), !continuation.registercount [[META14:![0-9]+]], !continuation.returnedRegistercount [[META14]] ; CLEANUP-CPS-NEXT: unreachable ; ; ; CLEANUP-CPS-LABEL: define dso_local void @called.resume.0( -; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [8 x i32], [1 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META16]] !lgc.cps [[META14]] !continuation [[META17]] { +; CLEANUP-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [8 x i32], [1 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META16]] !lgc.cps [[META17]] !continuation [[META18]] { ; CLEANUP-CPS-NEXT: entryresume.0: ; CLEANUP-CPS-NEXT: [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] }, align 8 ; CLEANUP-CPS-NEXT: store { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] } [[TMP3]], ptr [[TMP4]], align 4 @@ -384,7 +379,7 @@ attributes #1 = { alwaysinline } ; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT9:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT10]], 0 ; CLEANUP-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [1 x i32] poison, i32 [[DOTFCA_0_EXTRACT]], 0 ; CLEANUP-CPS-NEXT: call void @lgc.cps.free(i32 8) -; CLEANUP-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR_RELOAD]], i32 3, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [8 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META14]] +; CLEANUP-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURN_ADDR_RELOAD]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [8 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META14]] ; CLEANUP-CPS-NEXT: unreachable ; ; @@ -394,23 +389,23 @@ attributes #1 = { alwaysinline } ; ; ; REGISTERBUFFER-CPS-LABEL: define void @called( -; REGISTERBUFFER-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !continuation [[META16:![0-9]+]] !lgc.rt.shaderstage [[META17:![0-9]+]] !lgc.cps [[META14:![0-9]+]] { +; REGISTERBUFFER-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !continuation [[META16:![0-9]+]] !lgc.rt.shaderstage [[META17:![0-9]+]] !lgc.cps [[META18:![0-9]+]] { ; REGISTERBUFFER-CPS-NEXT: AllocaSpillBB: ; REGISTERBUFFER-CPS-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8) ; REGISTERBUFFER-CPS-NEXT: [[RETURN_ADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[CALLED_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 -; REGISTERBUFFER-CPS-NEXT: store i32 [[RETURN_ADDR]], ptr addrspace(32) [[RETURN_ADDR_SPILL_ADDR]], align 4 +; REGISTERBUFFER-CPS-NEXT: store i32 [[RETURNADDR]], ptr addrspace(32) [[RETURN_ADDR_SPILL_ADDR]], align 4 ; REGISTERBUFFER-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [1 x i32] [[PAYLOAD]], 0 ; REGISTERBUFFER-CPS-NEXT: [[SYSTEM_DATA_FCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[SYSTEM_DATA]], 0 ; REGISTERBUFFER-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[SHADER_INDEX]]) ; REGISTERBUFFER-CPS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[SYSTEM_DATA_FCA_0_EXTRACT]], 0 ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_0_INSERT4:%.*]] = insertvalue [1 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 ; REGISTERBUFFER-CPS-NEXT: [[TMP0:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @called.resume.0) -; REGISTERBUFFER-CPS-NEXT: call void (...) @lgc.cps.jump(i32 2, i32 2, {} poison, i64 [[TMP0]], i32 5, [9 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT4]]), !continuation.registercount [[META14]], !continuation.returnedRegistercount !14 +; REGISTERBUFFER-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 2, i32 4, {} poison, i64 [[TMP0]], i32 5, [9 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT4]]), !continuation.registercount [[META14:![0-9]+]], !continuation.returnedRegistercount [[META14]] ; REGISTERBUFFER-CPS-NEXT: unreachable ; ; ; REGISTERBUFFER-CPS-LABEL: define dso_local void @called.resume.0( -; REGISTERBUFFER-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [8 x i32], [1 x i32] } [[TMP3:%.*]]) !continuation [[META16]] !lgc.rt.shaderstage [[META17]] !lgc.cps [[META14]] { +; REGISTERBUFFER-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [8 x i32], [1 x i32] } [[TMP3:%.*]]) !continuation [[META16]] !lgc.rt.shaderstage [[META17]] !lgc.cps [[META18]] { ; REGISTERBUFFER-CPS-NEXT: entryresume.0: ; REGISTERBUFFER-CPS-NEXT: [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] }, align 8 ; REGISTERBUFFER-CPS-NEXT: store { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] } [[TMP3]], ptr [[TMP4]], align 4 @@ -431,7 +426,7 @@ attributes #1 = { alwaysinline } ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_0_INSERT9:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT10]], 0 ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [1 x i32] poison, i32 [[DOTFCA_0_EXTRACT]], 0 ; REGISTERBUFFER-CPS-NEXT: call void @lgc.cps.free(i32 8) -; REGISTERBUFFER-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR_RELOAD]], i32 3, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [8 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META14]] +; REGISTERBUFFER-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR_RELOAD]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [8 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META14]] ; REGISTERBUFFER-CPS-NEXT: unreachable ; ; @@ -441,7 +436,7 @@ attributes #1 = { alwaysinline } ; ; ; POSTPROCESS-CPS-LABEL: define void @called( -; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !continuation [[META16:![0-9]+]] !lgc.rt.shaderstage [[META17:![0-9]+]] !lgc.cps [[META14:![0-9]+]] { +; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [1 x i32] [[PAYLOAD:%.*]]) !continuation [[META16:![0-9]+]] !lgc.rt.shaderstage [[META17:![0-9]+]] !lgc.cps [[META18:![0-9]+]] { ; POSTPROCESS-CPS-NEXT: AllocaSpillBB: ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -450,20 +445,20 @@ attributes #1 = { alwaysinline } ; POSTPROCESS-CPS-NEXT: store i32 [[TMP1]], ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP2:%.*]] = inttoptr i32 [[TMP0]] to ptr addrspace(21) ; POSTPROCESS-CPS-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP2]], i32 0 -; POSTPROCESS-CPS-NEXT: store i32 [[RETURN_ADDR]], ptr addrspace(21) [[TMP3]], align 4 +; POSTPROCESS-CPS-NEXT: store i32 [[RETURNADDR]], ptr addrspace(21) [[TMP3]], align 4 ; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [1 x i32] [[PAYLOAD]], 0 ; 
POSTPROCESS-CPS-NEXT: [[SYSTEM_DATA_FCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[SYSTEM_DATA]], 0 ; POSTPROCESS-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[SHADER_INDEX]]) ; POSTPROCESS-CPS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[SYSTEM_DATA_FCA_0_EXTRACT]], 0 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT4:%.*]] = insertvalue [1 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 +; POSTPROCESS-CPS-NEXT: [[TMP5:%.*]] = call i64 @continuation.getAddrAndMD(ptr @called.resume.0) ; POSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: [[TMP5:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @called.resume.0 to i64)) -; POSTPROCESS-CPS-NEXT: call void (i64, ...) @continuation.continue(i64 2, i32 [[TMP4]], i64 [[TMP5]], i32 5, [9 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT4]]), !continuation.registercount [[META14]], !continuation.returnedRegistercount !14 +; POSTPROCESS-CPS-NEXT: call void (i64, ...) @continuation.continue(i64 2, i32 [[TMP4]], i64 [[TMP5]], i32 5, [9 x i32] poison, [1 x i32] [[DOTFCA_0_INSERT4]]), !continuation.registercount [[META14:![0-9]+]], !continuation.returnedRegistercount [[META14]] ; POSTPROCESS-CPS-NEXT: unreachable ; ; ; POSTPROCESS-CPS-LABEL: define dso_local void @called.resume.0( -; POSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [8 x i32], [1 x i32] } [[TMP3:%.*]]) !continuation [[META16]] !lgc.rt.shaderstage [[META17]] !lgc.cps [[META14]] { +; POSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [8 x i32], [1 x i32] } [[TMP3:%.*]]) !continuation [[META16]] !lgc.rt.shaderstage [[META17]] !lgc.cps [[META18]] { ; POSTPROCESS-CPS-NEXT: entryresume.0: ; POSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [8 x i32], [1 x i32] }, align 8 ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-small-payload-field.ll b/llvmraytracing/test/dx/lower-rt-pipeline-small-payload-field.ll index 15b1889fbb..9668194c58 100644 --- a/llvmraytracing/test/dx/lower-rt-pipeline-small-payload-field.ll +++ b/llvmraytracing/test/dx/lower-rt-pipeline-small-payload-field.ll @@ -2,14 +2,22 @@ ; Test copying of fields between local and global payload whose size ; is not a multiple of i32s, requiring copies at a smaller granularity ; for at least a suffix of the fields. -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S %s 2> %t.stderr | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s -; RUN: count 0 < %t.stderr - -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -; The last two fields are relevant. The i16 needs special treatment, -; as well as the last two bytes of the <3 x i16>. 
-%struct.Payload = type { [5 x i32], i16, <3 x i16> } +; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck %s + +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" + +; This payload struct is PAQed as follows: +; struct [raypayload] Payload +; { +; int v[5] : write(caller) : read(miss, caller); +; min16uint smallField : write(miss) : read(caller); +; min16uint3 smallFieldVec : write(miss) : read(caller); +; }; +; The last two fields are particularly relevant. +; The i16 needs special treatment, as well as the last two bytes of the <3 x i16>. +%struct.PAQPayload = type { [5 x i32], i16, <3 x i16> } +; Identical, but without PAQ: +%struct.NoPAQPayload = type { [5 x i32], i16, <3 x i16> } %struct.DispatchSystemData = type { i32 } %struct.TraversalData = type { %struct.SystemData } %struct.SystemData = type { %struct.DispatchSystemData } @@ -18,8 +26,15 @@ target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16: %struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } ; Function Attrs: nounwind -define void @Miss(%struct.Payload* noalias nocapture %payload) #0 !types !17 { - %1 = getelementptr inbounds %struct.Payload, %struct.Payload* %payload, i32 0, i32 1 +define void @MissPAQ(%struct.PAQPayload* noalias nocapture %payload) #0 !types !17 { + %1 = getelementptr inbounds %struct.PAQPayload, %struct.PAQPayload* %payload, i32 0, i32 1 + store i16 17, i16* %1, align 4 + ret void +} + +; Function Attrs: nounwind +define void @MissNoPAQ(%struct.NoPAQPayload* noalias nocapture %payload) #0 !types !31 { + %1 = getelementptr inbounds %struct.NoPAQPayload, %struct.NoPAQPayload* %payload, i32 0, i32 1 store i16 17, i16* %1, align 4 ret void } @@ -70,27 +85,27 @@ attributes #3 = { nounwind memory(none) } !dx.shaderModel = !{!2} !dx.typeAnnotations = !{!3} !dx.dxrPayloadAnnotations = !{!8} -!dx.entryPoints = !{!12, !14} +!dx.entryPoints = !{!12, !14, !33} !0 = !{!"dxcoob 2019.05.00"} !1 = !{i32 1, i32 7} !2 = !{!"lib", i32 6, i32 7} -!3 = !{i32 1, void (%struct.Payload*)* @Miss, !4} +!3 = !{i32 1, void (%struct.PAQPayload*)* @MissPAQ, !4} !4 = !{!5, !7} !5 = !{i32 1, !6, !6} !6 = !{} !7 = !{i32 2, !6, !6} -!8 = !{i32 0, %struct.Payload undef, !9} +!8 = !{i32 0, %struct.PAQPayload undef, !9} !9 = !{!10, !11, !11} !10 = !{i32 0, i32 259} !11 = !{i32 0, i32 513} !12 = !{null, !"", null, null, !13} !13 = !{i32 0, i64 32} -!14 = !{void (%struct.Payload*)* @Miss, !"Miss", null, null, !15} +!14 = !{void (%struct.PAQPayload*)* @MissPAQ, !"MissPAQ", null, null, !15} !15 = !{i32 8, i32 11, i32 6, i32 24, i32 5, !16} !16 = !{i32 0} !17 = !{!"function", !"void", !18} -!18 = !{i32 0, %struct.Payload poison} +!18 = !{i32 0, %struct.PAQPayload poison} !19 = !{!"function", %struct.BuiltInTriangleIntersectionAttributes poison, !20} !20 = !{i32 0, %struct.SystemData poison} !21 = !{!"function", !"void", !20, %struct.BuiltInTriangleIntersectionAttributes poison} @@ -103,51 +118,121 @@ attributes #3 = { nounwind memory(none) } !28 = !{!"function", !"void", !29} !29 = !{i32 0, %struct.AnyHitTraversalData poison} !30 = !{!"function", i32 poison, !27} -; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @Miss( -; LOWERRAYTRACINGPIPELINE-SAME: [[STRUCT_SYSTEMDATA:%.*]] 
[[TMP0:%.*]]) #[[ATTR0:[0-9]+]] !lgc.rt.shaderstage [[META20:![0-9]+]] !continuation.registercount [[META18:![0-9]+]] !continuation [[META21:![0-9]+]] { -; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_PAYLOAD:%.*]], align 8 -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_SYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_PAYLOAD]], ptr [[TMP2]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP7]], ptr [[TMP4]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP10]], ptr [[TMP8]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP14]], ptr [[TMP13]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 3 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 10), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP16]], ptr [[TMP15]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_PAYLOAD]], ptr [[TMP2]], i32 0, i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: store i16 17, ptr [[TMP17]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_PAYLOAD]], ptr [[TMP2]], i32 0, i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = load i8, ptr [[TMP18]], align 1 -; LOWERRAYTRACINGPIPELINE-NEXT: store i8 [[TMP21]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 1), align 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[TMP18]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = load i8, ptr [[TMP22]], align 1 -; LOWERRAYTRACINGPIPELINE-NEXT: store i8 [[TMP23]], ptr addrspace(20) getelementptr (i8, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 1), i32 1), align 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_PAYLOAD]], ptr [[TMP2]], i32 0, i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP24]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP27]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 2), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr 
[[TMP24]], i32 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1 -; LOWERRAYTRACINGPIPELINE-NEXT: store i8 [[TMP29]], ptr addrspace(20) getelementptr (i8, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 2), i32 4), align 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[TMP24]], i32 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1 -; LOWERRAYTRACINGPIPELINE-NEXT: store i8 [[TMP31]], ptr addrspace(20) getelementptr (i8, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 2), i32 5), align 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP32]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP33]], !continuation.registercount [[META18]] +!31 = !{!"function", !"void", !32} +!32 = !{i32 0, %struct.NoPAQPayload poison} +!33 = !{void (%struct.NoPAQPayload*)* @MissNoPAQ, !"MissNoPAQ", null, null, !34} +!34 = !{i32 8, i32 11, i32 6, i32 24, i32 5, !35} +!35 = !{i32 0} + +; CHECK-LABEL: define %struct.DispatchSystemData @MissPAQ( +; CHECK-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] !lgc.rt.shaderstage [[META21:![0-9]+]] !continuation.registercount [[META22:![0-9]+]] !continuation [[META23:![0-9]+]] { +; CHECK-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_PAYLOAD:%.*]], align 8 +; CHECK-NEXT: store [[STRUCT_SYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_PAYLOAD]], ptr [[TMP2]], i32 0, i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 +; CHECK-NEXT: store i32 [[TMP7]], ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 +; CHECK-NEXT: store i32 [[TMP10]], ptr [[TMP8]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 +; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 +; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP13]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 3 +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 10), align 4 +; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP15]], align 4 +; CHECK-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_PAYLOAD]], ptr [[TMP2]], i32 0, i32 1 +; CHECK-NEXT: store i16 17, ptr [[TMP17]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_PAYLOAD]], ptr [[TMP2]], i32 0, i32 1 +; CHECK-NEXT: [[TMP21:%.*]] = load i8, ptr 
[[TMP18]], align 1 +; CHECK-NEXT: store i8 [[TMP21]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 1), align 1 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[TMP18]], i32 1 +; CHECK-NEXT: [[TMP23:%.*]] = load i8, ptr [[TMP22]], align 1 +; CHECK-NEXT: store i8 [[TMP23]], ptr addrspace(20) getelementptr (i8, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 1), i32 1), align 1 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_PAYLOAD]], ptr [[TMP2]], i32 0, i32 2 +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP24]], align 4 +; CHECK-NEXT: store i32 [[TMP27]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 2), align 4 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[TMP24]], i32 4 +; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1 +; CHECK-NEXT: store i8 [[TMP29]], ptr addrspace(20) getelementptr (i8, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 2), i32 4), align 1 +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[TMP24]], i32 5 +; CHECK-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1 +; CHECK-NEXT: store i8 [[TMP31]], ptr addrspace(20) getelementptr (i8, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 2), i32 5), align 1 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 +; CHECK-NEXT: [[TMP33:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP32]], align 4 +; CHECK-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP33]]), !continuation.registercount [[META22]] +; CHECK-NEXT: unreachable +; +; +; CHECK-LABEL: define %struct.DispatchSystemData @MissNoPAQ( +; CHECK-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR0]] !lgc.rt.shaderstage [[META21]] !continuation.registercount [[META19:![0-9]+]] !continuation [[META24:![0-9]+]] { +; CHECK-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_NOPAQPAYLOAD:%.*]], align 8 +; CHECK-NEXT: store [[STRUCT_SYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_NOPAQPAYLOAD]], ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 +; CHECK-NEXT: store i32 [[TMP5]], ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 +; CHECK-NEXT: store i32 [[TMP7]], ptr [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 +; CHECK-NEXT: store i32 [[TMP9]], ptr [[TMP8]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 2 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 +; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP10]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 3 +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(20) 
getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 10), align 4 +; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 4 +; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 11), align 4 +; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 5 +; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 12), align 4 +; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 6 +; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 13), align 4 +; CHECK-NEXT: store i32 [[TMP21]], ptr [[TMP20]], align 4 +; CHECK-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_NOPAQPAYLOAD]], ptr [[TMP2]], i32 0, i32 1 +; CHECK-NEXT: store i16 17, ptr [[TMP24]], align 4 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_NOPAQPAYLOAD]], ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4 +; CHECK-NEXT: store i32 [[TMP26]], ptr addrspace(20) @PAYLOAD, align 4 +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 1 +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4 +; CHECK-NEXT: store i32 [[TMP28]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 1 +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP29]], align 4 +; CHECK-NEXT: store i32 [[TMP30]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 2 +; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4 +; CHECK-NEXT: store i32 [[TMP32]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 3 +; CHECK-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP33]], align 4 +; CHECK-NEXT: store i32 [[TMP34]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 10), align 4 +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 4 +; CHECK-NEXT: [[TMP40:%.*]] = load i32, ptr [[TMP39]], align 4 +; CHECK-NEXT: store i32 [[TMP40]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 11), align 4 +; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 5 +; CHECK-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP37]], align 4 +; CHECK-NEXT: store i32 [[TMP38]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 12), align 4 +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 6 +; CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4 +; CHECK-NEXT: store i32 [[TMP36]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 13), align 4 +; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 +; CHECK-NEXT: [[TMP46:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP45]], align 4 +; CHECK-NEXT: call void (...) 
@lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP46]]), !continuation.registercount [[META19]] +; CHECK-NEXT: unreachable ; ; -; LOWERRAYTRACINGPIPELINE-LABEL: define i32 @_cont_GetLocalRootIndex( -; LOWERRAYTRACINGPIPELINE-SAME: ptr [[DATA:%.*]]) #[[ATTR1:[0-9]+]] { -; LOWERRAYTRACINGPIPELINE-NEXT: ret i32 5 +; CHECK-LABEL: define i32 @_cont_GetLocalRootIndex( +; CHECK-SAME: ptr [[DATA:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: ret i32 5 ; diff --git a/llvmraytracing/test/dx/lower-rt-pipeline-small-payload-field.ll.hlsl b/llvmraytracing/test/dx/lower-rt-pipeline-small-payload-field.ll.hlsl deleted file mode 100644 index fad4f37c43..0000000000 --- a/llvmraytracing/test/dx/lower-rt-pipeline-small-payload-field.ll.hlsl +++ /dev/null @@ -1,9 +0,0 @@ -// This file is not a test itself, but used to generate the .ll test file. - -struct[raypayload] Payload { - int v[5] : write(caller) : read(miss, caller); - min16uint smallField : write(miss) : read(caller); - min16uint3 smallFieldVec : write(miss) : read(caller); -}; - -[shader("miss")] void Miss(inout Payload payload) { payload.smallField = 17; } diff --git a/llvmraytracing/test/dx/lower-rt-pipeline.ll b/llvmraytracing/test/dx/lower-rt-pipeline.ll index dc2d8e0a00..f097e2e7f6 100644 --- a/llvmraytracing/test/dx/lower-rt-pipeline.ll +++ b/llvmraytracing/test/dx/lower-rt-pipeline.ll @@ -1,16 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3 -; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S 2> %t0.stderr | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s -; RUN: count 0 < %t0.stderr -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S %s 2> %t1.stderr | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE-CPS %s -; RUN: count 0 < %t1.stderr -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,register-buffer,lint,remove-types-metadata' -S %s 2> %t2.stderr | FileCheck -check-prefix=REGISTERBUFFER-CPS %s -; RUN: count 0 < %t2.stderr -; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S 2> %t3.stderr | FileCheck -check-prefix=POSTPROCESS %s -; RUN: count 0 < %t3.stderr -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S %s 2> %t4.stderr | FileCheck -check-prefix=POSTPROCESS-CPS %s -; RUN: count 0 < %t4.stderr +; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s +; RUN: opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata" -S %s --lint-abort-on-error | FileCheck 
-check-prefix=LOWERRAYTRACINGPIPELINE-CPS %s +; RUN: opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,register-buffer,lint,remove-types-metadata" -S %s --lint-abort-on-error | FileCheck -check-prefix=REGISTERBUFFER-CPS %s +; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck -check-prefix=POSTPROCESS %s +; RUN: opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata" -S %s --lint-abort-on-error | FileCheck -check-prefix=POSTPROCESS-CPS %s -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %dx.types.Handle = type { i8* } %struct.DispatchSystemData = type { <3 x i32> } @@ -32,6 +27,10 @@ define i32 @_cont_GetContinuationStackAddr() #0 { ret i32 0 } +define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !types !{!"function", !"void", !{i32 0, %struct.DispatchSystemData poison}} { + ret void +} + declare %struct.DispatchSystemData @_cont_SetupRayGen() #0 declare %struct.DispatchSystemData @_AmdAwaitTraversal(i64, %struct.TraversalData) #0 @@ -500,7 +499,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define void @MyRayGen( -; LOWERRAYTRACINGPIPELINE-SAME: [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation.registercount [[META22]] !continuation [[META35:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation.registercount [[META22]] !continuation [[META35:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) @@ -519,113 +518,114 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[ADDR_I:%.*]] = call i64 @_AmdGetResumePointAddr() #[[ATTR3:[0-9]+]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] 
[[TRAV_DATA_I]], i64 [[ADDR_I]], 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP10]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP13]], ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP14]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP16]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP18]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP20]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = call ptr inttoptr (i64 4 to ptr)([[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]]), !continuation.registercount [[META33:![0-9]+]], !continuation.returnedRegistercount !33 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] [[AWAIT_STRUCT_DISPATCHSYSTEMDATA:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP21]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP11]], ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP13]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP15]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP17]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = call ptr inttoptr (i64 4 to ptr)([[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], i64 poison), !continuation.registercount [[META33:![0-9]+]], !continuation.returnedRegistercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] [[AWAIT:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP18]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_RAYPAYLOAD]] poison, ptr [[TMP4]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = getelementptr inbounds 
[[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP26]], ptr [[TMP23]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP29]], ptr [[TMP27]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP31]], ptr [[TMP30]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP33]], ptr [[TMP32]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP22]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP21]], ptr [[TMP20]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP23]], ptr [[TMP22]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP25]], ptr [[TMP24]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP27]], ptr [[TMP26]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP19]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; LOWERRAYTRACINGPIPELINE-NEXT: br label [[DOTSPLIT:%.*]] ; LOWERRAYTRACINGPIPELINE: .split: -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP34:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA36]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() -; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP35]], i8 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP36:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() -; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP36]], i8 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = call 
[[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP37]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP39:%.*]] = extractelement <4 x float> [[TMP34]], i64 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = extractelement <4 x float> [[TMP34]], i64 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = extractelement <4 x float> [[TMP34]], i64 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = extractelement <4 x float> [[TMP34]], i64 3 -; LOWERRAYTRACINGPIPELINE-NEXT: call void @dx.op.textureStore.f32(i32 67, [[DX_TYPES_HANDLE]] [[TMP38]], i32 [[EXTRACT]], i32 [[EXTRACT1]], i32 undef, float [[TMP39]], float [[TMP40]], float [[TMP41]], float [[TMP42]], i8 15) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA36]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() +; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP29]], i8 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() +; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP30]], i8 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP31]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = extractelement <4 x float> [[TMP28]], i64 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP34:%.*]] = extractelement <4 x float> [[TMP28]], i64 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = extractelement <4 x float> [[TMP28]], i64 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP36:%.*]] = extractelement <4 x float> [[TMP28]], i64 3 +; LOWERRAYTRACINGPIPELINE-NEXT: call void @dx.op.textureStore.f32(i32 67, [[DX_TYPES_HANDLE]] [[TMP32]], i32 [[EXTRACT]], i32 [[EXTRACT1]], i32 undef, float [[TMP33]], float [[TMP34]], float [[TMP35]], float [[TMP36]], i8 15) ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[TMP5]]) #[[ATTR1]] -; LOWERRAYTRACINGPIPELINE-NEXT: ret void, !continuation.registercount [[META32:![0-9]+]] +; LOWERRAYTRACINGPIPELINE-NEXT: ret void ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @MyClosestHitShader( -; LOWERRAYTRACINGPIPELINE-SAME: [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META39:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META40:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META39:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META40:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[HITATTRS:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_SYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr 
[[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP8]], ptr [[TMP5]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP11]], ptr [[TMP9]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP13]], ptr [[TMP12]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP16]], ptr [[TMP2]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP2]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP17]], ptr [[HITATTRS]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[HITATTRS]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP22]], ptr [[TMP20]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP6]], ptr [[TMP5]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP8]], ptr [[TMP7]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP10]], ptr [[TMP9]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] 
[[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) +; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP13]], ptr [[TMP2]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP2]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP14]], ptr [[HITATTRS]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[HITATTRS]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[HITATTRS]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = load <2 x float>, ptr [[TMP23]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = extractelement <2 x float> [[TMP24]], i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = fsub fast float 1.000000e+00, [[TMP25]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = extractelement <2 x float> [[TMP24]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = fsub fast float [[TMP26]], [[TMP27]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = insertelement <4 x float> undef, float [[TMP28]], i64 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[TMP25]], i64 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[TMP27]], i64 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = insertelement <4 x float> [[TMP31]], float 1.000000e+00, i64 3 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> [[TMP32]], ptr [[TMP33]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP34]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP37]], ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP34]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = load i32, ptr [[TMP38]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP40]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr [[TMP38]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = load i32, ptr [[TMP41]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP42]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[TMP38]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = load i32, ptr [[TMP43]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP44]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = load 
[[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP45]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP46]], !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[HITATTRS]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = load <2 x float>, ptr [[TMP18]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP19]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = fsub fast float 1.000000e+00, [[TMP20]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP19]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = fsub fast float [[TMP21]], [[TMP22]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> undef, float [[TMP23]], i64 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP20]], i64 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP22]], i64 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float 1.000000e+00, i64 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> [[TMP27]], ptr [[TMP28]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP29]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP30]], ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP29]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP32]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP33]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP34]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP36]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP37]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP38]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.AnyHitTraversalData @MyAnyHitShader( -; LOWERRAYTRACINGPIPELINE-SAME: [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META41:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META42:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META41:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META42:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_HITDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = alloca [[STRUCT_HITDATA]], align 8 @@ -641,32 +641,32 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[HITATTRSALLOCA:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP12]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP17]], ptr [[TMP14]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP20]], ptr [[TMP19]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP22]], ptr [[TMP21]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP24]], ptr [[TMP23]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP25]]) -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP26]], ptr [[TMP11]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP11]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP27]], ptr [[ORIGHITATTRS]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = getelementptr 
inbounds i32, ptr [[ORIGHITATTRS]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP28]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP32]], ptr [[TMP30]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP12]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP17]], ptr [[TMP16]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP21]], ptr [[TMP20]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP22]]) +; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP23]], ptr [[TMP11]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP11]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP24]], ptr [[ORIGHITATTRS]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP26]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP27]], ptr [[TMP25]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP1]], ptr [[HITATTRSALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP12]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP34:%.*]] = load <4 x float>, ptr [[TMP33]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP12]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = load <4 x float>, ptr [[TMP28]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: 
[[RES_I1:%.*]] = load [[STRUCT_HITDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_HITDATA]] [[RES_I1]], ptr [[TMP4]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[RES_1_I2:%.*]] = load float, ptr [[TMP4]], align 4 @@ -678,7 +678,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[VAL_1_I8:%.*]] = insertelement <3 x float> [[VAL_0_I7]], float [[RES_2_I4]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[VAL_2_I9:%.*]] = insertelement <3 x float> [[VAL_1_I8]], float [[RES_3_I6]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x float> [[VAL_2_I9]], i8 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[RES_I:%.*]] = load [[STRUCT_HITDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_HITDATA]] [[RES_I]], ptr [[TMP5]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[RESPTR_1_I:%.*]] = getelementptr [[STRUCT_HITDATA]], ptr [[TMP5]], i32 0, i32 1, i32 0 @@ -691,158 +691,163 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[VAL_1_I:%.*]] = insertelement <3 x float> [[VAL_0_I]], float [[RES_2_I]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[VAL_2_I:%.*]] = insertelement <3 x float> [[VAL_1_I]], float [[RES_3_I]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT:%.*]] = extractelement <3 x float> [[VAL_2_I]], i8 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[RES_I10:%.*]] = load [[STRUCT_HITDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_HITDATA]] [[RES_I10]], ptr [[TMP3]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[RESPTR_I:%.*]] = getelementptr [[STRUCT_HITDATA]], ptr [[TMP3]], i32 0, i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[RES_I11:%.*]] = load float, ptr [[RESPTR_I]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = fmul fast float [[RES_I11]], [[EXTRACT]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP39:%.*]] = fadd fast float [[TMP38]], [[EXTRACT1]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = fcmp fast ogt float [[TMP39]], 0.000000e+00 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = fcmp fast ogt float [[TMP39]], 1.000000e+00 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = fcmp fast ogt float [[TMP39]], -1.000000e+00 -; LOWERRAYTRACINGPIPELINE-NEXT: br i1 [[TMP40]], label [[TMP43:%.*]], label [[TMP88:%.*]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = fmul fast float [[RES_I11]], [[EXTRACT]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP34:%.*]] = fadd fast float [[TMP33]], [[EXTRACT1]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = fcmp fast ogt float [[TMP34]], 0.000000e+00 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP36:%.*]] = fcmp fast ogt float [[TMP34]], 1.000000e+00 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP34]], -1.000000e+00 +; LOWERRAYTRACINGPIPELINE-NEXT: br i1 [[TMP35]], label [[TMP38:%.*]], 
label [[TMP73:%.*]] ; LOWERRAYTRACINGPIPELINE: 38: -; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> [[TMP34]], ptr [[TMP33]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: br i1 [[TMP41]], label [[TMP44:%.*]], label [[TMP66:%.*]] +; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> [[TMP29]], ptr [[TMP28]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: br i1 [[TMP36]], label [[TMP39:%.*]], label [[TMP56:%.*]] ; LOWERRAYTRACINGPIPELINE: 39: -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: call void @_cont_AcceptHitAndEndSearch(ptr [[TMP45]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP12]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP46]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP49]], ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, ptr [[TMP46]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: call void @_cont_AcceptHitAndEndSearch(ptr [[TMP40]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP12]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = load i32, ptr [[TMP41]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP42]], ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[TMP41]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = load i32, ptr [[TMP43]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP44]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[TMP43]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = load i32, ptr [[TMP45]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP46]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, ptr [[TMP43]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP48:%.*]] = load i32, ptr [[TMP47]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP48]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP49:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP49]], ptr [[TMP10]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP52:%.*]] = load i32, ptr [[TMP50]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP52]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, ptr [[TMP50]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP54:%.*]] = load i32, ptr [[TMP47]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP54]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP55:%.*]] = 
getelementptr inbounds i32, ptr [[TMP50]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP56:%.*]] = load i32, ptr [[TMP55]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP56]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP51:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP51]], ptr [[TMP10]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP62:%.*]] = load i32, ptr [[TMP53]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP62]], ptr [[TMP61]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP63:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP10]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP64:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP63]], ptr [[TMP64]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP65:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP65]], !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP52]], ptr [[TMP51]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP53:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP10]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP54:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP53]], ptr [[TMP54]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP55:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP55]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 56: -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: call void @_cont_AcceptHitAndEndSearch(ptr [[TMP67]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP68:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP12]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP71:%.*]] = load i32, ptr [[TMP68]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP71]], ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, ptr [[TMP68]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP74:%.*]] = load i32, ptr [[TMP72]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP74]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, ptr [[TMP72]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP76:%.*]] = load i32, ptr [[TMP69]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP76]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP77:%.*]] = getelementptr inbounds i32, ptr [[TMP72]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP78:%.*]] = load i32, ptr [[TMP77]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP78]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP70:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP70]], ptr [[TMP9]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP83:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP84:%.*]] = load i32, ptr [[TMP73]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP84]], ptr [[TMP83]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP85:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP9]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP86:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP85]], ptr [[TMP86]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP87:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP87]], !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: call void @_cont_AcceptHitAndEndSearch(ptr [[TMP57]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP58:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP12]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP59:%.*]] = load i32, ptr [[TMP58]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP59]], ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, ptr [[TMP58]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: 
[[TMP61:%.*]] = load i32, ptr [[TMP60]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP61]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, ptr [[TMP60]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP63:%.*]] = load i32, ptr [[TMP62]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP63]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, ptr [[TMP60]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP65:%.*]] = load i32, ptr [[TMP64]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP65]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP66:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP66]], ptr [[TMP9]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP69:%.*]] = load i32, ptr [[TMP67]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP69]], ptr [[TMP68]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP70:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP9]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP71:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP70]], ptr [[TMP71]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP72:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP72]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 73: -; LOWERRAYTRACINGPIPELINE-NEXT: br i1 [[TMP42]], label [[TMP89:%.*]], label [[TMP134:%.*]] +; LOWERRAYTRACINGPIPELINE-NEXT: br i1 [[TMP37]], label [[TMP74:%.*]], label [[TMP109:%.*]] ; LOWERRAYTRACINGPIPELINE: 74: -; LOWERRAYTRACINGPIPELINE-NEXT: br i1 [[TMP41]], label [[TMP90:%.*]], label [[TMP112:%.*]] +; LOWERRAYTRACINGPIPELINE-NEXT: br i1 [[TMP36]], label [[TMP75:%.*]], label [[TMP92:%.*]] ; LOWERRAYTRACINGPIPELINE: 75: -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP91:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: call void @_cont_IgnoreHit(ptr [[TMP91]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP92:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP12]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP95:%.*]] = load i32, ptr [[TMP92]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP95]], ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP96:%.*]] = getelementptr inbounds i32, ptr [[TMP92]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP98:%.*]] = load i32, ptr [[TMP96]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP98]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP81:%.*]] = getelementptr inbounds i32, ptr [[TMP96]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP100:%.*]] = load i32, ptr [[TMP81]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP100]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP101:%.*]] = getelementptr inbounds i32, ptr [[TMP96]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP102:%.*]] = load i32, ptr [[TMP101]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP102]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP93:%.*]] = load i32, ptr [[ORIGHITATTRS]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP93]], ptr [[TMP8]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP94:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP107:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP108:%.*]] = load i32, ptr [[TMP94]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP108]], ptr [[TMP107]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP109:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP8]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP110:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP109]], ptr [[TMP110]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP111:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP111]], !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP76:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: call void @_cont_IgnoreHit(ptr [[TMP76]]) +; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP12]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP78:%.*]] = load i32, ptr [[TMP77]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP78]], ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP79:%.*]] = getelementptr inbounds i32, ptr [[TMP77]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP80:%.*]] = load i32, ptr [[TMP79]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP80]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP81:%.*]] = getelementptr inbounds i32, ptr [[TMP79]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP82:%.*]] = load i32, ptr [[TMP81]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP82]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP83:%.*]] = getelementptr inbounds i32, ptr [[TMP79]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP84:%.*]] = load i32, ptr [[TMP83]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP84]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP85:%.*]] = load i32, ptr [[ORIGHITATTRS]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP85]], ptr [[TMP8]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP86:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP87:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP88:%.*]] = load i32, ptr [[TMP86]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP88]], ptr [[TMP87]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP89:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP8]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP90:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP89]], ptr [[TMP90]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP91:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP91]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 92: -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP113:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: call void @_cont_IgnoreHit(ptr [[TMP113]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP114:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP12]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP117:%.*]] = load i32, ptr [[TMP114]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP117]], ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP118:%.*]] = getelementptr inbounds i32, ptr [[TMP114]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP120:%.*]] = load i32, ptr [[TMP118]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP120]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP99:%.*]] = getelementptr inbounds i32, ptr [[TMP118]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP122:%.*]] = load i32, ptr [[TMP99]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP122]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP123:%.*]] = getelementptr inbounds i32, ptr [[TMP118]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP124:%.*]] = load i32, ptr [[TMP123]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP124]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP104:%.*]] = load i32, ptr [[ORIGHITATTRS]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP104]], ptr [[TMP7]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP93:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: call void @_cont_IgnoreHit(ptr [[TMP93]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP94:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP12]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP95:%.*]] = load i32, ptr [[TMP94]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP95]], ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP96:%.*]] = getelementptr inbounds i32, ptr [[TMP94]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP97:%.*]] = load i32, ptr [[TMP96]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP97]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP98:%.*]] = getelementptr inbounds i32, ptr [[TMP96]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP99:%.*]] = load i32, ptr [[TMP98]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP99]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP100:%.*]] = getelementptr inbounds i32, ptr [[TMP96]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP101:%.*]] = load i32, ptr [[TMP100]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP101]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP102:%.*]] = load i32, ptr [[ORIGHITATTRS]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP102]], ptr [[TMP7]], align 4 
; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP103:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP129:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP130:%.*]] = load i32, ptr [[TMP103]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP130]], ptr [[TMP129]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP131:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP7]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP132:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP131]], ptr [[TMP132]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP133:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP133]], !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP104:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP105:%.*]] = load i32, ptr [[TMP103]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP105]], ptr [[TMP104]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP106:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP7]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP107:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP106]], ptr [[TMP107]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP108:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP108]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 109: -; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> [[TMP34]], ptr [[TMP33]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> [[TMP29]], ptr [[TMP28]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP135:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP12]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP138:%.*]] = load i32, ptr [[TMP135]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP138]], ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP139:%.*]] = getelementptr inbounds i32, ptr [[TMP135]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP141:%.*]] = load i32, ptr [[TMP139]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP141]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP115:%.*]] = getelementptr inbounds i32, ptr [[TMP139]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP143:%.*]] = load i32, ptr [[TMP115]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP143]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP144:%.*]] = getelementptr inbounds i32, ptr [[TMP139]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP145:%.*]] = load i32, ptr [[TMP144]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP145]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP121:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP121]], ptr [[TMP6]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP110:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP12]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP111:%.*]] = load i32, ptr [[TMP110]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP111]], ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP112:%.*]] = getelementptr inbounds i32, ptr [[TMP110]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP113:%.*]] = load i32, ptr [[TMP112]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP113]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP114:%.*]] = getelementptr inbounds i32, ptr [[TMP112]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP115:%.*]] = load i32, ptr [[TMP114]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP115]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP116:%.*]] = getelementptr inbounds i32, ptr [[TMP112]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP117:%.*]] = load i32, ptr [[TMP116]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP117]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP118:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP118]], ptr [[TMP6]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP119:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP150:%.*]] = getelementptr inbounds i32, ptr 
[[TMP6]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP151:%.*]] = load i32, ptr [[TMP119]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP151]], ptr [[TMP150]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP152:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP6]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP153:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP152]], ptr [[TMP153]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP154:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP154]], !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP120:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP121:%.*]] = load i32, ptr [[TMP119]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP121]], ptr [[TMP120]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP122:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP6]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP123:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP122]], ptr [[TMP123]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP124:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP124]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.AnyHitTraversalData @MyIntersectionShader( -; LOWERRAYTRACINGPIPELINE-SAME: [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META43:![0-9]+]] !continuation.registercount [[META32]] !continuation [[META44:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META43:![0-9]+]] !continuation.registercount [[META32:![0-9]+]] !continuation [[META44:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 4 @@ -864,8 +869,8 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE: callAHit.i: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA_I:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP4]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = call ptr inttoptr (i64 3 to ptr)([[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I]], float [[RES_I2]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP8]]), !continuation.registercount [[META32]], !continuation.returnedRegistercount !32 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = call [[STRUCT_ANYHITTRAVERSALDATA]] [[AWAIT_STRUCT_ANYHITTRAVERSALDATA:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr 
[[TMP9]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = call ptr inttoptr (i64 3 to ptr)([[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I]], float [[RES_I2]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP8]]), !continuation.registercount [[META32]], !continuation.returnedRegistercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = call [[STRUCT_ANYHITTRAVERSALDATA]] [[AWAIT_1:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP9]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP10]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) @@ -873,29 +878,31 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE: isEnd.i: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP4]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP12]], ptr [[TMP3]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP17]], ptr [[TMP16]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP3]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP18]], ptr [[TMP19]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP3]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP16]], ptr [[TMP17]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: br label [[_CONT_REPORTHIT_EXIT]] ; LOWERRAYTRACINGPIPELINE: _cont_ReportHit.exit: -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[ISEND_I1:%.*]] = call i1 @opaqueIsEnd() -; LOWERRAYTRACINGPIPELINE-NEXT: br i1 [[ISEND_I1]], label [[TMP21:%.*]], label [[TMP23:%.*]] +; LOWERRAYTRACINGPIPELINE-NEXT: br i1 [[ISEND_I1]], label [[TMP19:%.*]], label [[TMP21:%.*]] ; LOWERRAYTRACINGPIPELINE: 19: -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], 
!continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP20]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 21: ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP7]]) #[[ATTR1]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP24]], !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.AnyHitTraversalData @MyIntersectionShader2( -; LOWERRAYTRACINGPIPELINE-SAME: [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META43]] !continuation.registercount [[META32]] !continuation [[META45:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META43]] !continuation.registercount [[META32]] !continuation [[META45:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2:%.*]], align 4 @@ -917,8 +924,8 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE: callAHit.i: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA_I:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]], ptr [[TMP4]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = call ptr inttoptr (i64 3 to ptr)([[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I]], float [[RES_I2]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[TMP8]]), !continuation.registercount [[META32]], !continuation.returnedRegistercount !32 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = call [[STRUCT_ANYHITTRAVERSALDATA]] [[AWAIT_STRUCT_ANYHITTRAVERSALDATA:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP9]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = call ptr inttoptr (i64 3 to ptr)([[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I]], float [[RES_I2]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[TMP8]]), !continuation.registercount [[META32]], !continuation.returnedRegistercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = call [[STRUCT_ANYHITTRAVERSALDATA]] [[AWAIT_2:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP9]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP10]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; 
LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) @@ -926,63 +933,66 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE: isEnd.i: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP4]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP12]], ptr [[TMP3]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP17]], ptr [[TMP16]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP3]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP18]], ptr [[TMP19]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP3]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP16]], ptr [[TMP17]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: br label [[_CONT_REPORTHIT_EXIT]] ; LOWERRAYTRACINGPIPELINE: _cont_ReportHit.exit: -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[ISEND_I1:%.*]] = call i1 @opaqueIsEnd() -; LOWERRAYTRACINGPIPELINE-NEXT: br i1 [[ISEND_I1]], label [[TMP21:%.*]], label [[TMP23:%.*]] +; LOWERRAYTRACINGPIPELINE-NEXT: br i1 [[ISEND_I1]], label [[TMP19:%.*]], label [[TMP21:%.*]] ; LOWERRAYTRACINGPIPELINE: 19: -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP20]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 21: ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP7]]) #[[ATTR1]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP24]], !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @MyMissShader( -; LOWERRAYTRACINGPIPELINE-SAME: [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META46:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META47:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META46:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META47:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_SYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP7]], ptr [[TMP4]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP10]], ptr [[TMP8]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP14]], ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP5]], ptr [[TMP4]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr 
addrspace(20) @PAYLOAD, i32 7), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP7]], ptr [[TMP6]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP9]], ptr [[TMP8]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP11]], ptr [[TMP10]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> , ptr [[TMP15]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP16]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP19]], ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP20]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP22]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP24]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP26]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP27]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP28]], !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> , ptr [[TMP12]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP14]], ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP16]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, 
ptr [[TMP15]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP18]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP20]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP21]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP22]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define i32 @_cont_GetContinuationStackAddr( @@ -1065,7 +1075,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyRayGen( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META22]] !continuation [[META35:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META35:![0-9]+]] !continuation [[META36:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_ALLOCA:%.*]] = alloca [30 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 @@ -1076,7 +1086,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP5:%.*]] = bitcast ptr [[TMP4]] to ptr ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP5]]) #[[ATTR1:[0-9]+]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> zeroinitializer, ptr [[TMP6]], align 4, !tbaa [[TBAA36:![0-9]+]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> zeroinitializer, ptr [[TMP6]], align 4, !tbaa [[TBAA37:![0-9]+]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP7]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP8]]) @@ -1085,64 +1095,64 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: 
[[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[ADDR_I:%.*]] = call i64 @_AmdGetResumePointAddr() #[[ATTR3:[0-9]+]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[ADDR_I]], 5 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP11]], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP17]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP14]], ptr [[TMP16]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP23]], ptr [[TMP21]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP26]], ptr [[TMP24]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP27:%.*]] = load [10 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP28:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } (...) 
@lgc.cps.await__sl_s_struct.DispatchSystemDatasa33i32a10i32s(i32 4, i32 4, i32 5, [36 x i32] poison, [10 x i32] [[TMP27]]), !continuation.returnedRegistercount !33, !continuation.registercount [[META33:![0-9]+]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP29:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } [[TMP28]], 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [10 x i32] [[TMP29]], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP30:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } [[TMP28]], 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP14]], ptr [[TMP12]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP20]], ptr [[TMP18]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = load [10 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } (...) 
@lgc.cps.await__sl_s_struct.DispatchSystemDatasa33i32a10i32s(i32 4, i32 8, i32 5, [36 x i32] poison, [10 x i32] [[TMP21]]), !continuation.returnedRegistercount [[META33:![0-9]+]], !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } [[TMP22]], 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [10 x i32] [[TMP23]], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } [[TMP22]], 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_RAYPAYLOAD]] poison, ptr [[TMP4]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP32:%.*]] = load i32, ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP32]], ptr [[TMP31]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP37]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP33]], ptr [[TMP38]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[TMP38]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[TMP37]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP44:%.*]] = load i32, ptr [[TMP43]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP44]], ptr [[TMP42]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[TMP38]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr [[TMP37]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP47:%.*]] = load i32, ptr [[TMP46]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP47]], ptr [[TMP45]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP30]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = load i32, ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP26]], ptr [[TMP25]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP27]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP29]], ptr [[TMP28]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP32]], ptr [[TMP30]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP35:%.*]] = load i32, ptr 
[[TMP34]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP35]], ptr [[TMP33]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP24]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br label [[DOTSPLIT:%.*]] ; LOWERRAYTRACINGPIPELINE-CPS: .split: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP48:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA36]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP49:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP49]], i8 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP50:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP50]], i8 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP51:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP52:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP51]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP53:%.*]] = extractelement <4 x float> [[TMP48]], i64 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP54:%.*]] = extractelement <4 x float> [[TMP48]], i64 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP55:%.*]] = extractelement <4 x float> [[TMP48]], i64 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP56:%.*]] = extractelement <4 x float> [[TMP48]], i64 3 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @dx.op.textureStore.f32(i32 67, [[DX_TYPES_HANDLE]] [[TMP52]], i32 [[EXTRACT]], i32 [[EXTRACT1]], i32 undef, float [[TMP53]], float [[TMP54]], float [[TMP55]], float [[TMP56]], i8 15) +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP36:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA37]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP37:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP37]], i8 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP38]], i8 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP39:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP40:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP39]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP41:%.*]] = extractelement <4 x float> [[TMP36]], i64 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP42:%.*]] = extractelement <4 x float> [[TMP36]], i64 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP43:%.*]] = extractelement <4 x float> [[TMP36]], i64 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP44:%.*]] = extractelement <4 x float> [[TMP36]], i64 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @dx.op.textureStore.f32(i32 67, [[DX_TYPES_HANDLE]] [[TMP40]], i32 [[EXTRACT]], i32 [[EXTRACT1]], i32 undef, float [[TMP41]], float [[TMP42]], float [[TMP43]], float [[TMP44]], i8 15) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[TMP5]]) #[[ATTR1]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: ret void ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define 
void @MyClosestHitShader( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META39:![0-9]+]] !lgc.cps [[META40:![0-9]+]] !continuation [[META41:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META40:![0-9]+]] !lgc.cps [[META41:![0-9]+]] !continuation [[META42:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_ALLOCA:%.*]] = alloca [30 x i32], align 4 @@ -1150,66 +1160,66 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[HITATTRS:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [10 x i32] [[PAYLOAD]], ptr [[PAYLOAD_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_SYSTEMDATA]] [[SYSTEM_DATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP4:%.*]] = load i32, ptr [[PAYLOAD_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP4]], ptr [[TMP3]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP9]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP7]], ptr [[TMP10]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP16]], ptr [[TMP14]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP19]], ptr [[TMP17]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP20]], ptr [[TMP1]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP1]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP21]], ptr [[HITATTRS]], align 4 -; 
LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[HITATTRS]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP26]], ptr [[TMP24]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP7]], ptr [[TMP6]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP10]], ptr [[TMP8]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP13]], ptr [[TMP11]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[SYSTEM_DATA_ALLOCA]]) +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP14]], ptr [[TMP1]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP1]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP15]], ptr [[HITATTRS]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[HITATTRS]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP18]], ptr [[TMP16]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[SHADER_INDEX]]) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[HITATTRS]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP28:%.*]] = load <2 x float>, ptr [[TMP27]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP29:%.*]] = extractelement <2 x float> [[TMP28]], i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP30:%.*]] = fsub fast float 1.000000e+00, [[TMP29]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP31:%.*]] = extractelement <2 x float> [[TMP28]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP32:%.*]] = fsub fast float [[TMP30]], [[TMP31]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP33:%.*]] = insertelement <4 x float> undef, float [[TMP32]], i64 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[TMP29]], i64 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[TMP31]], i64 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP36:%.*]] = insertelement <4 x float> [[TMP35]], float 1.000000e+00, i64 3 -; 
LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> [[TMP36]], ptr [[TMP37]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP39:%.*]] = load i32, ptr [[TMP38]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP39]], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[TMP38]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP40:%.*]] = load i32, ptr [[TMP45]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP40]], ptr [[TMP44]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP49:%.*]] = getelementptr inbounds i32, ptr [[TMP44]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, ptr [[TMP45]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP51:%.*]] = load i32, ptr [[TMP50]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP51]], ptr [[TMP49]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, ptr [[TMP44]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, ptr [[TMP45]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP54:%.*]] = load i32, ptr [[TMP53]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP54]], ptr [[TMP52]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP55:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP56:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP55]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP57:%.*]] = load [10 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURN_ADDR]], i32 3, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP56]], [33 x i32] poison, [10 x i32] [[TMP57]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[HITATTRS]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = load <2 x float>, ptr [[TMP19]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[TMP20]], i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = fsub fast float 1.000000e+00, [[TMP21]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = extractelement <2 x float> [[TMP20]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = fsub fast float [[TMP22]], [[TMP23]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = insertelement <4 x float> undef, float [[TMP24]], i64 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP21]], i64 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float [[TMP23]], i64 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP28:%.*]] = insertelement <4 x float> [[TMP27]], float 1.000000e+00, i64 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> [[TMP28]], ptr [[TMP29]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP30]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP31]], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP30]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP33]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP34]], ptr [[TMP32]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP33]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP36]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP37]], ptr [[TMP35]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP33]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP40:%.*]] = load i32, ptr [[TMP39]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP40]], ptr [[TMP38]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP42:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP41]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP43:%.*]] = load [10 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP42]], [33 x i32] poison, [10 x i32] [[TMP43]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyAnyHitShader( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META42:![0-9]+]] !lgc.cps [[META39]] !continuation [[META43:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META41]] !lgc.cps [[META43:![0-9]+]] !continuation [[META44:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_HITDATA]], align 8 @@ -1226,36 +1236,36 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[HITATTRSALLOCA:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [10 x i32] [[PAYLOAD]], ptr [[PAYLOAD_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_ANYHITTRAVERSALDATA]] [[SYSTEM_DATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP10]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP10]], i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP12:%.*]] = load i32, ptr [[PAYLOAD_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP17]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP15]], ptr [[TMP18]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP20]], ptr [[TMP16]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP26]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP21]], ptr [[TMP25]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; 
LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP29:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP28]]) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP29]], ptr [[TMP9]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN:%.*]], ptr [[PAYLOAD_ALLOCA]], i32 0, i32 0, i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP9]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP31]], ptr [[ORIGHITATTRS]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP18]], ptr [[TMP16]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP21]], ptr [[TMP19]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = call [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[_CONT_GETTRIANGLEHITATTRIBUTES:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP22]]) +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP23]], ptr [[TMP9]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN:%.*]], ptr [[PAYLOAD_ALLOCA]], i32 0, i32 0, i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP9]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP25]], ptr [[ORIGHITATTRS]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP27]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP33]], ptr [[TMP32]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP28]], ptr [[TMP26]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[HIT_ATTRS]], ptr [[HITATTRSALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[SHADER_INDEX]]) -; 
LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP10]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = load <4 x float>, ptr [[TMP37]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP10]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP30:%.*]] = load <4 x float>, ptr [[TMP29]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[RES_I1:%.*]] = load [[STRUCT_HITDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_HITDATA]] [[RES_I1]], ptr [[TMP2]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[RES_1_I2:%.*]] = load float, ptr [[TMP2]], align 4 @@ -1267,7 +1277,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[VAL_1_I8:%.*]] = insertelement <3 x float> [[VAL_0_I7]], float [[RES_2_I4]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[VAL_2_I9:%.*]] = insertelement <3 x float> [[VAL_1_I8]], float [[RES_3_I6]], i32 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x float> [[VAL_2_I9]], i8 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[RES_I:%.*]] = load [[STRUCT_HITDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_HITDATA]] [[RES_I]], ptr [[TMP3]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[RESPTR_1_I:%.*]] = getelementptr [[STRUCT_HITDATA]], ptr [[TMP3]], i32 0, i32 1, i32 0 @@ -1280,185 +1290,185 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[VAL_1_I:%.*]] = insertelement <3 x float> [[VAL_0_I]], float [[RES_2_I]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[VAL_2_I:%.*]] = insertelement <3 x float> [[VAL_1_I]], float [[RES_3_I]], i32 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[EXTRACT:%.*]] = extractelement <3 x float> [[VAL_2_I]], i8 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[RES_I10:%.*]] = load [[STRUCT_HITDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_HITDATA]] [[RES_I10]], ptr [[TMP1]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[RESPTR_I:%.*]] = getelementptr [[STRUCT_HITDATA]], ptr [[TMP1]], i32 0, i32 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[RES_I11:%.*]] = load float, ptr [[RESPTR_I]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP42:%.*]] = fmul fast float [[RES_I11]], [[EXTRACT]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP43:%.*]] = fadd fast float [[TMP42]], [[EXTRACT1]] -; 
LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP44:%.*]] = fcmp fast ogt float [[TMP43]], 0.000000e+00 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP45:%.*]] = fcmp fast ogt float [[TMP43]], 1.000000e+00 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP46:%.*]] = fcmp fast ogt float [[TMP43]], -1.000000e+00 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[TMP44]], label [[TMP47:%.*]], label [[TMP106:%.*]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP34:%.*]] = fmul fast float [[RES_I11]], [[EXTRACT]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP35:%.*]] = fadd fast float [[TMP34]], [[EXTRACT1]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP36:%.*]] = fcmp fast ogt float [[TMP35]], 0.000000e+00 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP35]], 1.000000e+00 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = fcmp fast ogt float [[TMP35]], -1.000000e+00 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[TMP36]], label [[TMP39:%.*]], label [[TMP82:%.*]] ; LOWERRAYTRACINGPIPELINE-CPS: 39: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> [[TMP38]], ptr [[TMP37]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[TMP45]], label [[TMP48:%.*]], label [[TMP77:%.*]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> [[TMP30]], ptr [[TMP29]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[TMP37]], label [[TMP40:%.*]], label [[TMP61:%.*]] ; LOWERRAYTRACINGPIPELINE-CPS: 40: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @_cont_AcceptHitAndEndSearch(ptr [[TMP49]]) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP10]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP51:%.*]] = load i32, ptr [[TMP50]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP51]], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP57:%.*]] = getelementptr inbounds i32, ptr [[TMP50]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP58:%.*]] = load i32, ptr [[TMP57]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP58]], ptr [[TMP56]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[TMP56]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, ptr [[TMP57]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP61:%.*]] = load i32, ptr [[TMP60]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP61]], ptr [[TMP59]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, ptr [[TMP56]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[TMP57]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP52:%.*]] = load i32, ptr [[TMP65]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP52]], ptr [[TMP64]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @_cont_AcceptHitAndEndSearch(ptr [[TMP41]]) +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP10]], i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP43:%.*]] = load i32, ptr [[TMP42]], align 4 +; 
LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP43]], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[TMP42]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP46:%.*]] = load i32, ptr [[TMP45]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP46]], ptr [[TMP44]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, ptr [[TMP44]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, ptr [[TMP45]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP48]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP49]], ptr [[TMP47]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, ptr [[TMP44]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, ptr [[TMP45]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP52:%.*]] = load i32, ptr [[TMP51]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP52]], ptr [[TMP50]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP53:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP53]], ptr [[TMP8]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP54:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP62:%.*]] = load i32, ptr [[TMP54]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP62]], ptr [[TMP55]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP73:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP8]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP74:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP73]], ptr [[TMP74]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP75:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP76:%.*]] = load [10 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR]], i32 20, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP75]], [8 x i32] poison, [10 x i32] [[TMP76]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP56:%.*]] = load i32, ptr [[TMP54]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP56]], ptr [[TMP55]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP57:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP8]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP58:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP57]], ptr [[TMP58]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP59:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP60:%.*]] = load [10 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP59]], [8 x i32] poison, [10 x i32] [[TMP60]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE-CPS: 61: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP78:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @_cont_AcceptHitAndEndSearch(ptr [[TMP78]]) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP79:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP10]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP66:%.*]] = load i32, ptr [[TMP79]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP66]], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP85:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP86:%.*]] = getelementptr inbounds i32, ptr [[TMP79]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP67:%.*]] = load i32, ptr [[TMP86]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP67]], ptr [[TMP85]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, ptr [[TMP85]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, ptr [[TMP86]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @_cont_AcceptHitAndEndSearch(ptr [[TMP62]]) +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP63:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP10]], i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP64:%.*]] = load i32, ptr [[TMP63]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP64]], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, ptr [[TMP63]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP67:%.*]] = load i32, ptr [[TMP66]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP67]], ptr [[TMP65]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, ptr [[TMP65]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, ptr [[TMP66]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP70:%.*]] = load i32, ptr [[TMP69]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP70]], ptr [[TMP68]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP93:%.*]] = getelementptr inbounds i32, ptr [[TMP85]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP94:%.*]] = getelementptr inbounds i32, ptr [[TMP86]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP80:%.*]] = load i32, ptr [[TMP94]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP80]], ptr [[TMP93]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP81:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP81]], ptr [[TMP7]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP82:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP83:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP84:%.*]] = load i32, ptr 
[[TMP82]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP84]], ptr [[TMP83]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP102:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP7]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP103:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP102]], ptr [[TMP103]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP104:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP105:%.*]] = load [10 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR]], i32 20, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP104]], [8 x i32] poison, [10 x i32] [[TMP105]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[TMP65]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, ptr [[TMP66]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP73:%.*]] = load i32, ptr [[TMP72]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP73]], ptr [[TMP71]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP74:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP74]], ptr [[TMP7]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP77:%.*]] = load i32, ptr [[TMP75]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP77]], ptr [[TMP76]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP78:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP7]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP79:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP78]], ptr [[TMP79]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP80:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP81:%.*]] = load [10 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP80]], [8 x i32] poison, [10 x i32] [[TMP81]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE-CPS: 82: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[TMP46]], label [[TMP107:%.*]], label [[TMP168:%.*]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[TMP38]], label [[TMP83:%.*]], label [[TMP128:%.*]] ; LOWERRAYTRACINGPIPELINE-CPS: 83: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[TMP45]], label [[TMP108:%.*]], label [[TMP138:%.*]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[TMP37]], label [[TMP84:%.*]], label [[TMP106:%.*]] ; LOWERRAYTRACINGPIPELINE-CPS: 84: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP109:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @_cont_IgnoreHit(ptr [[TMP109]]) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP110:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP10]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP87:%.*]] = load i32, ptr [[TMP110]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP85:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @_cont_IgnoreHit(ptr [[TMP85]]) +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP86:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP10]], i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP87:%.*]] = load i32, ptr [[TMP86]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP87]], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP116:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP117:%.*]] = getelementptr inbounds i32, ptr [[TMP110]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP90:%.*]] = load i32, ptr [[TMP117]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP90]], ptr [[TMP116]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP91:%.*]] = getelementptr inbounds i32, ptr [[TMP116]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP92:%.*]] = getelementptr inbounds i32, ptr [[TMP117]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP95:%.*]] = load i32, ptr [[TMP92]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP95]], ptr [[TMP91]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP124:%.*]] = getelementptr inbounds i32, ptr [[TMP116]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP125:%.*]] = getelementptr inbounds i32, ptr [[TMP117]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP125]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP96]], ptr [[TMP124]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP127:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN]], ptr [[PAYLOAD_ALLOCA]], i32 0, i32 0, i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP88:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP89:%.*]] = getelementptr inbounds i32, ptr [[TMP86]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP90:%.*]] = load i32, ptr [[TMP89]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP90]], ptr [[TMP88]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP91:%.*]] = getelementptr inbounds i32, ptr [[TMP88]], i32 1 +; 
LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP92:%.*]] = getelementptr inbounds i32, ptr [[TMP89]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP93:%.*]] = load i32, ptr [[TMP92]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP93]], ptr [[TMP91]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP94:%.*]] = getelementptr inbounds i32, ptr [[TMP88]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP95:%.*]] = getelementptr inbounds i32, ptr [[TMP89]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP95]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP96]], ptr [[TMP94]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP97:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN]], ptr [[PAYLOAD_ALLOCA]], i32 0, i32 0, i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP98:%.*]] = load i32, ptr [[ORIGHITATTRS]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP98]], ptr [[TMP6]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP99:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP100:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP101:%.*]] = load i32, ptr [[TMP99]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP101]], ptr [[TMP100]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP134:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP6]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP135:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP134]], ptr [[TMP135]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP136:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP137:%.*]] = load [10 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR]], i32 20, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP136]], [8 x i32] poison, [10 x i32] [[TMP137]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP102:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP6]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP103:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP102]], ptr [[TMP103]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP104:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP105:%.*]] = load [10 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP104]], [8 x i32] poison, [10 x i32] [[TMP105]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE-CPS: 106: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP139:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @_cont_IgnoreHit(ptr [[TMP139]]) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP140:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP10]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP111:%.*]] = load i32, ptr [[TMP140]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP111]], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP146:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP147:%.*]] = getelementptr inbounds i32, ptr [[TMP140]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP112:%.*]] = load i32, ptr [[TMP147]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP112]], ptr [[TMP146]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP113:%.*]] = getelementptr inbounds i32, ptr [[TMP146]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP114:%.*]] = getelementptr inbounds i32, ptr [[TMP147]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP107:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @_cont_IgnoreHit(ptr [[TMP107]]) +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP108:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP10]], i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP109:%.*]] = load i32, ptr [[TMP108]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP109]], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP110:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP111:%.*]] = getelementptr inbounds i32, ptr [[TMP108]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP112:%.*]] = load i32, ptr [[TMP111]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP112]], ptr [[TMP110]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP113:%.*]] = getelementptr inbounds i32, ptr [[TMP110]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP114:%.*]] = getelementptr inbounds i32, ptr [[TMP111]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP115:%.*]] = load i32, ptr [[TMP114]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP115]], ptr [[TMP113]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP154:%.*]] = getelementptr inbounds i32, ptr [[TMP146]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP155:%.*]] = getelementptr inbounds i32, ptr [[TMP147]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP118:%.*]] = load i32, ptr [[TMP155]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP118]], ptr [[TMP154]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP157:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN]], ptr [[PAYLOAD_ALLOCA]], i32 0, i32 0, i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP116:%.*]] = getelementptr inbounds i32, ptr [[TMP110]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP117:%.*]] = getelementptr inbounds i32, ptr [[TMP111]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: 
[[TMP118:%.*]] = load i32, ptr [[TMP117]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP118]], ptr [[TMP116]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP119:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN]], ptr [[PAYLOAD_ALLOCA]], i32 0, i32 0, i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP120:%.*]] = load i32, ptr [[ORIGHITATTRS]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP120]], ptr [[TMP5]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP121:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP122:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP123:%.*]] = load i32, ptr [[TMP121]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP123]], ptr [[TMP122]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP164:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP5]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP165:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP164]], ptr [[TMP165]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP166:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP167:%.*]] = load [10 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR]], i32 20, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP166]], [8 x i32] poison, [10 x i32] [[TMP167]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP124:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP5]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP125:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP124]], ptr [[TMP125]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP126:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP127:%.*]] = load [10 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP126]], [8 x i32] poison, [10 x i32] [[TMP127]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE-CPS: 128: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> [[TMP38]], ptr [[TMP37]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> [[TMP30]], ptr [[TMP29]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP169:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP10]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP130:%.*]] = load i32, ptr [[TMP169]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP129:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP10]], i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP130:%.*]] = load i32, ptr [[TMP129]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP130]], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP175:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP176:%.*]] = getelementptr inbounds i32, ptr [[TMP169]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP133:%.*]] = load i32, ptr [[TMP176]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP133]], ptr [[TMP175]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP144:%.*]] = getelementptr inbounds i32, ptr [[TMP175]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP145:%.*]] = getelementptr inbounds i32, ptr [[TMP176]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP148:%.*]] = load i32, ptr [[TMP145]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP148]], ptr [[TMP144]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP183:%.*]] = getelementptr inbounds i32, ptr [[TMP175]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP184:%.*]] = getelementptr inbounds i32, ptr [[TMP176]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP149:%.*]] = load i32, ptr [[TMP184]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP149]], ptr [[TMP183]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP150:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP150]], ptr [[TMP4]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP131:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP132:%.*]] = getelementptr inbounds i32, ptr [[TMP129]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP133:%.*]] = load i32, ptr [[TMP132]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP133]], ptr [[TMP131]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP134:%.*]] = getelementptr inbounds i32, ptr [[TMP131]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP135:%.*]] = getelementptr inbounds i32, ptr [[TMP132]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP136:%.*]] = load i32, ptr [[TMP135]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP136]], ptr [[TMP134]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP137:%.*]] = getelementptr inbounds i32, ptr [[TMP131]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP138:%.*]] = getelementptr inbounds i32, ptr [[TMP132]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP139:%.*]] = load i32, ptr [[TMP138]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP139]], ptr [[TMP137]], align 4 +; 
LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP140:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP140]], ptr [[TMP4]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP141:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP142:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP143:%.*]] = load i32, ptr [[TMP141]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP143]], ptr [[TMP142]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP192:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP4]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP193:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP192]], ptr [[TMP193]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP194:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP195:%.*]] = load [10 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR]], i32 20, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP194]], [8 x i32] poison, [10 x i32] [[TMP195]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP144:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP4]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP145:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP144]], ptr [[TMP145]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP146:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP147:%.*]] = load [10 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP146]], [8 x i32] poison, [10 x i32] [[TMP147]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyIntersectionShader( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META40]] !lgc.cps [[META44:![0-9]+]] !continuation [[META45:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META35]] !lgc.cps [[META45:![0-9]+]] !continuation [[META46:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 4 @@ -1483,7 +1493,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP3]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = load [30 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } (...) @lgc.cps.await__sl_s_struct.AnyHitTraversalDatasa8i32a30i32s(i32 3, i32 8, i32 5, float [[RES_I2]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP7]], [32 x i32] poison, [30 x i32] [[TMP8]]), !continuation.returnedRegistercount !32, !continuation.registercount [[META32:![0-9]+]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } (...) 
@lgc.cps.await__sl_s_struct.AnyHitTraversalDatasa8i32a30i32s(i32 3, i32 16, i32 5, float [[RES_I2]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP7]], [32 x i32] poison, [30 x i32] [[TMP8]]), !continuation.returnedRegistercount [[META32:![0-9]+]], !continuation.registercount [[META32]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP9]], 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [30 x i32] [[TMP10]], ptr [[PAYLOAD_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP9]], 0 @@ -1495,33 +1505,33 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[PAYLOAD_ALLOCA]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP3]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP14]], ptr [[TMP2]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP2]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP20]], ptr [[TMP21]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP17]], ptr [[TMP16]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP2]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP18]], ptr [[TMP19]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br label [[_CONT_REPORTHIT_EXIT]] ; LOWERRAYTRACINGPIPELINE-CPS: _cont_ReportHit.exit: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[ISEND_I1:%.*]] = call i1 @opaqueIsEnd() -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[ISEND_I1]], label [[TMP23:%.*]], label [[TMP26:%.*]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[ISEND_I1]], label [[TMP21:%.*]], label [[TMP24:%.*]] ; LOWERRAYTRACINGPIPELINE-CPS: 21: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = load [30 x i32], ptr [[PAYLOAD_ALLOCA]], 
align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR]], i32 4, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP24]], [8 x i32] poison, [30 x i32] [[TMP25]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = load [30 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], [8 x i32] poison, [30 x i32] [[TMP23]]), !continuation.registercount [[META32]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE-CPS: 24: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP5]]) #[[ATTR1]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP27:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP28:%.*]] = load [30 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR]], i32 4, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP27]], [8 x i32] poison, [30 x i32] [[TMP28]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = load [30 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP25]], [8 x i32] poison, [30 x i32] [[TMP26]]), !continuation.registercount [[META32]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyIntersectionShader2( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META40]] !lgc.cps [[META44]] !continuation [[META46:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META35]] !lgc.cps [[META45]] !continuation [[META47:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2:%.*]], align 4 @@ -1546,7 +1556,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]], ptr [[TMP3]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = load [30 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: 
[[TMP9:%.*]] = call { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } (...) @lgc.cps.await__sl_s_struct.AnyHitTraversalDatasa8i32a30i32s(i32 3, i32 8, i32 5, float [[RES_I2]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[TMP7]], [32 x i32] poison, [30 x i32] [[TMP8]]), !continuation.returnedRegistercount !32, !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } (...) @lgc.cps.await__sl_s_struct.AnyHitTraversalDatasa8i32a30i32s(i32 3, i32 16, i32 5, float [[RES_I2]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[TMP7]], [32 x i32] poison, [30 x i32] [[TMP8]]), !continuation.returnedRegistercount [[META32]], !continuation.registercount [[META32]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP9]], 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [30 x i32] [[TMP10]], ptr [[PAYLOAD_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP9]], 0 @@ -1558,75 +1568,75 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[PAYLOAD_ALLOCA]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP3]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP14]], ptr [[TMP2]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP2]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP20]], ptr [[TMP21]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP17]], ptr [[TMP16]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP2]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP18]], ptr [[TMP19]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br label [[_CONT_REPORTHIT_EXIT]] ; LOWERRAYTRACINGPIPELINE-CPS: _cont_ReportHit.exit: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: 
[[ISEND_I1:%.*]] = call i1 @opaqueIsEnd() -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[ISEND_I1]], label [[TMP23:%.*]], label [[TMP26:%.*]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[ISEND_I1]], label [[TMP21:%.*]], label [[TMP24:%.*]] ; LOWERRAYTRACINGPIPELINE-CPS: 21: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = load [30 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR]], i32 4, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP24]], [8 x i32] poison, [30 x i32] [[TMP25]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = load [30 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP22]], [8 x i32] poison, [30 x i32] [[TMP23]]), !continuation.registercount [[META32]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE-CPS: 24: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP5]]) #[[ATTR1]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP27:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP28:%.*]] = load [30 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR]], i32 4, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP27]], [8 x i32] poison, [30 x i32] [[TMP28]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = load [30 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP25]], [8 x i32] poison, [30 x i32] [[TMP26]]), !continuation.registercount [[META32]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyMissShader( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META44]] !lgc.cps [[META40]] !continuation [[META47:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META43]] !lgc.cps [[META41]] !continuation [[META48:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_ALLOCA:%.*]] = alloca [30 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [10 x i32] [[PAYLOAD]], ptr [[PAYLOAD_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_SYSTEMDATA]] [[SYSTEM_DATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP1]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP1]], i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = load i32, ptr [[PAYLOAD_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP3]], ptr [[TMP2]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP8]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP6]], ptr [[TMP9]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP6]], ptr [[TMP5]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP12]], ptr 
[[TMP10]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[SHADER_INDEX]]) +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP1]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> , ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP1]], i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP15]], ptr [[TMP13]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP15]], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP18]], ptr [[TMP16]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[SHADER_INDEX]]) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP1]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> , ptr [[TMP19]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP1]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP21]], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP27]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP22]], ptr [[TMP26]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP33]], ptr [[TMP31]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP36]], ptr [[TMP34]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP37]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP39:%.*]] = load [10 x i32], ptr [[PAYLOAD_ALLOCA]], 
align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR]], i32 3, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP38]], [33 x i32] poison, [10 x i32] [[TMP39]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP21]], ptr [[TMP19]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP24]], ptr [[TMP22]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP25]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP27:%.*]] = load [10 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP26]], [33 x i32] poison, [10 x i32] [[TMP27]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; @@ -1710,7 +1720,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; REGISTERBUFFER-CPS-LABEL: define void @MyRayGen( -; REGISTERBUFFER-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META22]] !continuation [[META35:![0-9]+]] { +; REGISTERBUFFER-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META35:![0-9]+]] !continuation [[META36:![0-9]+]] { ; REGISTERBUFFER-CPS-NEXT: AllocaSpillBB: ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_0_EXTRACT20:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 ; REGISTERBUFFER-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) @@ -1742,12 +1752,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP8]], 7 ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP9]], 8 ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP10]], 9 -; REGISTERBUFFER-CPS-NEXT: call void (...) @lgc.cps.jump(i32 4, i32 4, {} poison, i64 [[TMP6]], i32 5, [36 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.returnedRegistercount !33, !continuation.registercount [[META33:![0-9]+]] +; REGISTERBUFFER-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 4, i32 8, {} poison, i64 [[TMP6]], i32 5, [36 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.returnedRegistercount [[META33:![0-9]+]], !continuation.registercount [[META33]] ; REGISTERBUFFER-CPS-NEXT: unreachable ; ; ; REGISTERBUFFER-CPS-LABEL: define dso_local void @MyRayGen.resume.0( -; REGISTERBUFFER-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [33 x i32], [10 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META22]] !lgc.cps [[META22]] !continuation [[META35]] { +; REGISTERBUFFER-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [33 x i32], [10 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META22]] !lgc.cps [[META35]] !continuation [[META36]] { ; REGISTERBUFFER-CPS-NEXT: entryresume.0: ; REGISTERBUFFER-CPS-NEXT: [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] }, align 8 ; REGISTERBUFFER-CPS-NEXT: store { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] } [[TMP3]], ptr [[TMP4]], align 4 @@ -1805,7 +1815,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; REGISTERBUFFER-CPS-LABEL: define void @MyClosestHitShader( -; REGISTERBUFFER-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META36:![0-9]+]] !lgc.cps [[META37:![0-9]+]] !continuation [[META38:![0-9]+]] { +; REGISTERBUFFER-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META37:![0-9]+]] !lgc.cps [[META38:![0-9]+]] !continuation [[META39:![0-9]+]] { ; REGISTERBUFFER-CPS-NEXT: AllocaSpillBB: ; REGISTERBUFFER-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; REGISTERBUFFER-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0 @@ -1871,12 +1881,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP19]], 8 ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP20]], 9 ; REGISTERBUFFER-CPS-NEXT: call void @lgc.cps.free(i32 0) -; REGISTERBUFFER-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR]], i32 3, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT10]], [33 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META33]] +; REGISTERBUFFER-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT10]], [33 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META33]] ; REGISTERBUFFER-CPS-NEXT: unreachable ; ; ; REGISTERBUFFER-CPS-LABEL: define void @MyAnyHitShader( -; REGISTERBUFFER-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META39:![0-9]+]] !lgc.cps [[META36]] !continuation [[META40:![0-9]+]] { +; REGISTERBUFFER-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META38]] !lgc.cps [[META40:![0-9]+]] !continuation [[META41:![0-9]+]] { ; REGISTERBUFFER-CPS-NEXT: AllocaSpillBB: ; REGISTERBUFFER-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8 ; REGISTERBUFFER-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0 @@ -2029,12 +2039,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; REGISTERBUFFER-CPS-NEXT: [[TMP21:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT]] to i32 ; REGISTERBUFFER-CPS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 3 ; REGISTERBUFFER-CPS-NEXT: [[TMP22:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT]] to i32 -; REGISTERBUFFER-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 0 -; REGISTERBUFFER-CPS-NEXT: [[TMP23:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT]] to i32 +; REGISTERBUFFER-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT15:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 0 +; REGISTERBUFFER-CPS-NEXT: [[TMP23:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT15]] to i32 ; REGISTERBUFFER-CPS-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float ; REGISTERBUFFER-CPS-NEXT: [[DOTSROA_0392_0_VEC_INSERT:%.*]] = insertelement <2 x float> undef, float [[TMP24]], i32 0 -; REGISTERBUFFER-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 1 -; REGISTERBUFFER-CPS-NEXT: [[TMP25:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT]] to i32 +; REGISTERBUFFER-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT19:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 1 +; REGISTERBUFFER-CPS-NEXT: [[TMP25:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT19]] to i32 ; REGISTERBUFFER-CPS-NEXT: [[TMP26:%.*]] = bitcast i32 [[TMP25]] to float ; REGISTERBUFFER-CPS-NEXT: [[DOTSROA_0392_4_VEC_INSERT:%.*]] = insertelement <2 x float> [[DOTSROA_0392_0_VEC_INSERT]], float [[TMP26]], i32 1 ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_0_INSERT391:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] poison, <2 x float> [[DOTSROA_0392_4_VEC_INSERT]], 0 @@ -2092,7 +2102,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP21]], 8 ; REGISTERBUFFER-CPS-NEXT: 
[[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP22]], 9 ; REGISTERBUFFER-CPS-NEXT: call void @lgc.cps.free(i32 0) -; REGISTERBUFFER-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR]], i32 20, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META33]] +; REGISTERBUFFER-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META33]] ; REGISTERBUFFER-CPS-NEXT: unreachable ; REGISTERBUFFER-CPS: 28: ; REGISTERBUFFER-CPS-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 @@ -2101,16 +2111,16 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; REGISTERBUFFER-CPS-NEXT: [[TMP30:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT25]] to i32 ; REGISTERBUFFER-CPS-NEXT: [[DOTSROA_0_4_VEC_EXTRACT34:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 1 ; REGISTERBUFFER-CPS-NEXT: [[TMP31:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT34]] to i32 -; REGISTERBUFFER-CPS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT43:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 -; REGISTERBUFFER-CPS-NEXT: [[TMP32:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT43]] to i32 +; REGISTERBUFFER-CPS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT42:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 +; REGISTERBUFFER-CPS-NEXT: [[TMP32:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT42]] to i32 ; REGISTERBUFFER-CPS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT52:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 3 ; REGISTERBUFFER-CPS-NEXT: [[TMP33:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT52]] to i32 -; REGISTERBUFFER-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT13:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 0 -; REGISTERBUFFER-CPS-NEXT: [[TMP34:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT13]] to i32 +; REGISTERBUFFER-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 0 +; REGISTERBUFFER-CPS-NEXT: [[TMP34:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT]] to i32 ; REGISTERBUFFER-CPS-NEXT: [[TMP35:%.*]] = bitcast i32 [[TMP34]] to float ; REGISTERBUFFER-CPS-NEXT: [[DOTSROA_0396_0_VEC_INSERT:%.*]] = insertelement <2 x float> undef, float [[TMP35]], i32 0 -; REGISTERBUFFER-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT17:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 1 -; REGISTERBUFFER-CPS-NEXT: [[TMP36:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT17]] to i32 +; REGISTERBUFFER-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 1 +; REGISTERBUFFER-CPS-NEXT: [[TMP36:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT]] to i32 ; REGISTERBUFFER-CPS-NEXT: [[TMP37:%.*]] = bitcast i32 [[TMP36]] to float ; REGISTERBUFFER-CPS-NEXT: [[DOTSROA_0396_4_VEC_INSERT:%.*]] = insertelement <2 x float> [[DOTSROA_0396_0_VEC_INSERT]], float [[TMP37]], i32 1 ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_0_INSERT395:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] poison, <2 x float> [[DOTSROA_0396_4_VEC_INSERT]], 0 @@ -2168,7 +2178,7 @@ 
attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_8_INSERT85:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT82]], i32 [[TMP32]], 8 ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_9_INSERT88:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT85]], i32 [[TMP33]], 9 ; REGISTERBUFFER-CPS-NEXT: call void @lgc.cps.free(i32 0) -; REGISTERBUFFER-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR]], i32 20, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT263]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT88]]), !continuation.registercount [[META33]] +; REGISTERBUFFER-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT263]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT88]]), !continuation.registercount [[META33]] ; REGISTERBUFFER-CPS-NEXT: unreachable ; REGISTERBUFFER-CPS: 39: ; REGISTERBUFFER-CPS-NEXT: br i1 [[TMP15]], label [[TMP40:%.*]], label [[TMP59:%.*]] @@ -2181,8 +2191,8 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; REGISTERBUFFER-CPS-NEXT: [[TMP43:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT27]] to i32 ; REGISTERBUFFER-CPS-NEXT: [[DOTSROA_0_4_VEC_EXTRACT36:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 1 ; REGISTERBUFFER-CPS-NEXT: [[TMP44:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT36]] to i32 -; REGISTERBUFFER-CPS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT45:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 -; REGISTERBUFFER-CPS-NEXT: [[TMP45:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT45]] to i32 +; REGISTERBUFFER-CPS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT44:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 +; REGISTERBUFFER-CPS-NEXT: [[TMP45:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT44]] to i32 ; REGISTERBUFFER-CPS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT54:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 3 ; REGISTERBUFFER-CPS-NEXT: [[TMP46:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT54]] to i32 ; REGISTERBUFFER-CPS-NEXT: [[TMP47:%.*]] = bitcast i32 [[TMP6]] to float @@ -2244,7 +2254,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_8_INSERT115:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT112]], i32 [[TMP45]], 8 ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_9_INSERT118:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT115]], i32 [[TMP46]], 9 ; REGISTERBUFFER-CPS-NEXT: call void @lgc.cps.free(i32 0) -; REGISTERBUFFER-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR]], i32 20, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT304]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT118]]), !continuation.registercount [[META33]] +; REGISTERBUFFER-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT304]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT118]]), !continuation.registercount [[META33]] ; REGISTERBUFFER-CPS-NEXT: unreachable ; REGISTERBUFFER-CPS: 50: ; REGISTERBUFFER-CPS-NEXT: [[TMP51:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 @@ -2253,8 +2263,8 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; REGISTERBUFFER-CPS-NEXT: [[TMP52:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT29]] to i32 ; REGISTERBUFFER-CPS-NEXT: [[DOTSROA_0_4_VEC_EXTRACT38:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 1 ; REGISTERBUFFER-CPS-NEXT: [[TMP53:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT38]] to i32 -; REGISTERBUFFER-CPS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT47:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 -; REGISTERBUFFER-CPS-NEXT: [[TMP54:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT47]] to i32 +; REGISTERBUFFER-CPS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT46:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 +; REGISTERBUFFER-CPS-NEXT: [[TMP54:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT46]] to i32 ; REGISTERBUFFER-CPS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT56:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 3 ; REGISTERBUFFER-CPS-NEXT: [[TMP55:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT56]] to i32 ; REGISTERBUFFER-CPS-NEXT: [[TMP56:%.*]] = bitcast i32 [[TMP6]] to float @@ -2316,7 +2326,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_8_INSERT145:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT142]], i32 [[TMP54]], 8 ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_9_INSERT148:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT145]], i32 [[TMP55]], 9 ; REGISTERBUFFER-CPS-NEXT: call void @lgc.cps.free(i32 0) -; REGISTERBUFFER-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR]], i32 20, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT345]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT148]]), !continuation.registercount [[META33]] +; REGISTERBUFFER-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT345]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT148]]), !continuation.registercount [[META33]] ; REGISTERBUFFER-CPS-NEXT: unreachable ; REGISTERBUFFER-CPS: 59: ; REGISTERBUFFER-CPS-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) @@ -2324,16 +2334,16 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; REGISTERBUFFER-CPS-NEXT: [[TMP60:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT31]] to i32 ; REGISTERBUFFER-CPS-NEXT: [[DOTSROA_0_4_VEC_EXTRACT40:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 1 ; REGISTERBUFFER-CPS-NEXT: [[TMP61:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT40]] to i32 -; REGISTERBUFFER-CPS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT49:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 -; REGISTERBUFFER-CPS-NEXT: [[TMP62:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT49]] to i32 +; REGISTERBUFFER-CPS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT48:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 +; REGISTERBUFFER-CPS-NEXT: [[TMP62:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT48]] to i32 ; REGISTERBUFFER-CPS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT58:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 3 ; REGISTERBUFFER-CPS-NEXT: [[TMP63:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT58]] to i32 -; REGISTERBUFFER-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT15:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 0 -; REGISTERBUFFER-CPS-NEXT: [[TMP64:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT15]] to i32 +; REGISTERBUFFER-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT13:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 0 +; REGISTERBUFFER-CPS-NEXT: [[TMP64:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT13]] to i32 ; REGISTERBUFFER-CPS-NEXT: [[TMP65:%.*]] = bitcast i32 [[TMP64]] to float ; REGISTERBUFFER-CPS-NEXT: [[DOTSROA_0408_0_VEC_INSERT:%.*]] = insertelement <2 x float> undef, float [[TMP65]], i32 0 -; REGISTERBUFFER-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT19:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 1 -; REGISTERBUFFER-CPS-NEXT: [[TMP66:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT19]] to i32 +; REGISTERBUFFER-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT17:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 1 +; REGISTERBUFFER-CPS-NEXT: [[TMP66:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT17]] to i32 ; REGISTERBUFFER-CPS-NEXT: [[TMP67:%.*]] = bitcast i32 [[TMP66]] to float ; REGISTERBUFFER-CPS-NEXT: [[DOTSROA_0408_4_VEC_INSERT:%.*]] = insertelement <2 x float> [[DOTSROA_0408_0_VEC_INSERT]], float [[TMP67]], i32 1 ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_0_INSERT407:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] poison, <2 x float> [[DOTSROA_0408_4_VEC_INSERT]], 0 @@ -2391,16 +2401,16 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_8_INSERT175:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT172]], i32 [[TMP62]], 8 ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_9_INSERT178:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT175]], i32 [[TMP63]], 9 ; REGISTERBUFFER-CPS-NEXT: call void @lgc.cps.free(i32 0) -; REGISTERBUFFER-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURN_ADDR]], i32 20, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT386]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT178]]), !continuation.registercount [[META33]] +; REGISTERBUFFER-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT386]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT178]]), !continuation.registercount [[META33]] ; REGISTERBUFFER-CPS-NEXT: unreachable ; ; ; REGISTERBUFFER-CPS-LABEL: define void @MyIntersectionShader( -; REGISTERBUFFER-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META37]] !lgc.cps [[META41:![0-9]+]] !continuation [[META42:![0-9]+]] { +; REGISTERBUFFER-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META35]] !lgc.cps [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] { ; REGISTERBUFFER-CPS-NEXT: AllocaSpillBB: ; REGISTERBUFFER-CPS-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8) ; REGISTERBUFFER-CPS-NEXT: [[RETURN_ADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[MYINTERSECTIONSHADER_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 -; REGISTERBUFFER-CPS-NEXT: store i32 [[RETURN_ADDR]], ptr addrspace(32) [[RETURN_ADDR_SPILL_ADDR]], align 4 +; REGISTERBUFFER-CPS-NEXT: store i32 [[RETURNADDR]], ptr addrspace(32) [[RETURN_ADDR_SPILL_ADDR]], align 4 ; REGISTERBUFFER-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 0 ; REGISTERBUFFER-CPS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 1 ; REGISTERBUFFER-CPS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 2 @@ -2505,7 +2515,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_28_INSERT89:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT86]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_29_INSERT92:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT89]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; REGISTERBUFFER-CPS-NEXT: [[TMP2:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @MyIntersectionShader.resume.0) -; REGISTERBUFFER-CPS-NEXT: call void (...) @lgc.cps.jump(i32 3, i32 8, {} poison, i64 [[TMP2]], i32 5, float [[RES_I_FCA_3_INSERT_FCA_2_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT350]], [32 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount !32, !continuation.registercount [[META32:![0-9]+]] +; REGISTERBUFFER-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 3, i32 16, {} poison, i64 [[TMP2]], i32 5, float [[RES_I_FCA_3_INSERT_FCA_2_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT350]], [32 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount [[META32:![0-9]+]], !continuation.registercount [[META32]] ; REGISTERBUFFER-CPS-NEXT: unreachable ; REGISTERBUFFER-CPS: isEnd.i: ; REGISTERBUFFER-CPS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0 @@ -2519,7 +2529,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_0_INSERT352:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] poison, <2 x float> [[DOTSROA_0353_4_VEC_INSERT]], 0 ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_0_EXTRACT286:%.*]] = extractvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT352]], 0 ; REGISTERBUFFER-CPS-NEXT: [[TMP7:%.*]] = bitcast <2 x float> [[DOTFCA_0_EXTRACT286]] to <2 x i32> -; REGISTERBUFFER-CPS-NEXT: [[SYSTEM_DATA_ALLOCA_SROA_0_0_VEC_EXPAND:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <3 x i32> +; REGISTERBUFFER-CPS-NEXT: [[SYSTEM_DATA_ALLOCA_SROA_0_0_VEC_EXPAND:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <3 x i32> ; REGISTERBUFFER-CPS-NEXT: [[SYSTEM_DATA_ALLOCA_SROA_0_0_VECBLEND:%.*]] = select <3 x i1> , <3 x i32> [[SYSTEM_DATA_ALLOCA_SROA_0_0_VEC_EXPAND]], <3 x i32> [[SYSTEM_DATA_FCA_0_0_0_0_EXTRACT]] ; REGISTERBUFFER-CPS-NEXT: [[ISEND_I1:%.*]] = call i1 @opaqueIsEnd() ; REGISTERBUFFER-CPS-NEXT: br i1 [[ISEND_I1]], label [[TMP8:%.*]], label [[TMP9:%.*]] @@ -2568,7 +2578,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_28_INSERT209:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT206]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_29_INSERT212:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT209]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; REGISTERBUFFER-CPS-NEXT: call void @lgc.cps.free(i32 8) -; REGISTERBUFFER-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR]], i32 4, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META32]] +; REGISTERBUFFER-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META32]] ; REGISTERBUFFER-CPS-NEXT: unreachable ; REGISTERBUFFER-CPS: 9: ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_0_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] poison, <3 x i32> [[SYSTEM_DATA_ALLOCA_SROA_0_0_VECBLEND]], 0, 0, 0, 0 @@ -2615,12 +2625,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; REGISTERBUFFER-CPS-NEXT: call void @lgc.cps.free(i32 8) -; REGISTERBUFFER-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURN_ADDR]], i32 4, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META32]] +; REGISTERBUFFER-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META32]] ; REGISTERBUFFER-CPS-NEXT: unreachable ; ; ; REGISTERBUFFER-CPS-LABEL: define dso_local void @MyIntersectionShader.resume.0( -; REGISTERBUFFER-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META37]] !lgc.cps [[META41]] !continuation [[META42]] { +; REGISTERBUFFER-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META35]] !lgc.cps [[META42]] !continuation [[META43]] { ; REGISTERBUFFER-CPS-NEXT: entryresume.0: ; REGISTERBUFFER-CPS-NEXT: [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8) ; REGISTERBUFFER-CPS-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP3]], 2 @@ -2718,7 +2728,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_28_INSERT209:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT206]], i32 [[DOTFCA_28_EXTRACT]], 28 ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_29_INSERT212:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT209]], i32 [[DOTFCA_29_EXTRACT]], 29 ; REGISTERBUFFER-CPS-NEXT: call void @lgc.cps.free(i32 8) -; REGISTERBUFFER-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR_RELOAD2]], i32 4, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META32]] +; REGISTERBUFFER-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR_RELOAD2]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META32]] ; REGISTERBUFFER-CPS-NEXT: unreachable ; REGISTERBUFFER-CPS: 8: ; REGISTERBUFFER-CPS-NEXT: [[RETURN_ADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[MYINTERSECTIONSHADER_FRAME]], ptr addrspace(32) [[TMP4]], i32 0, i32 0 @@ -2767,16 +2777,16 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[DOTFCA_28_EXTRACT]], 28 ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[DOTFCA_29_EXTRACT]], 29 ; REGISTERBUFFER-CPS-NEXT: call void @lgc.cps.free(i32 8) -; REGISTERBUFFER-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR_RELOAD]], i32 4, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META32]] +; REGISTERBUFFER-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURN_ADDR_RELOAD]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META32]] ; REGISTERBUFFER-CPS-NEXT: unreachable ; ; ; REGISTERBUFFER-CPS-LABEL: define void @MyIntersectionShader2( -; REGISTERBUFFER-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META37]] !lgc.cps [[META41]] !continuation [[META43:![0-9]+]] { +; REGISTERBUFFER-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META35]] !lgc.cps [[META42]] !continuation [[META44:![0-9]+]] { ; REGISTERBUFFER-CPS-NEXT: AllocaSpillBB: ; REGISTERBUFFER-CPS-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 8) ; REGISTERBUFFER-CPS-NEXT: [[RETURN_ADDR_SPILL_ADDR:%.*]] = getelementptr inbounds [[MYINTERSECTIONSHADER2_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 -; REGISTERBUFFER-CPS-NEXT: store i32 [[RETURN_ADDR]], ptr addrspace(32) [[RETURN_ADDR_SPILL_ADDR]], align 4 +; REGISTERBUFFER-CPS-NEXT: store i32 [[RETURNADDR]], ptr addrspace(32) [[RETURN_ADDR_SPILL_ADDR]], align 4 ; REGISTERBUFFER-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 0 ; REGISTERBUFFER-CPS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 1 ; REGISTERBUFFER-CPS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 2 @@ -2881,7 +2891,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_28_INSERT89:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT86]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_29_INSERT92:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT89]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; REGISTERBUFFER-CPS-NEXT: [[TMP2:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @MyIntersectionShader2.resume.0) -; REGISTERBUFFER-CPS-NEXT: call void (...) @lgc.cps.jump(i32 3, i32 8, {} poison, i64 [[TMP2]], i32 5, float [[RES_I_FCA_3_INSERT_FCA_2_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[DOTFCA_0_INSERT350]], [32 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount !32, !continuation.registercount [[META32]] +; REGISTERBUFFER-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 3, i32 16, {} poison, i64 [[TMP2]], i32 5, float [[RES_I_FCA_3_INSERT_FCA_2_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[DOTFCA_0_INSERT350]], [32 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount [[META32]], !continuation.registercount [[META32]] ; REGISTERBUFFER-CPS-NEXT: unreachable ; REGISTERBUFFER-CPS: isEnd.i: ; REGISTERBUFFER-CPS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0 @@ -2895,7 +2905,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_0_INSERT352:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] poison, <2 x float> [[DOTSROA_0353_4_VEC_INSERT]], 0 ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_0_EXTRACT286:%.*]] = extractvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT352]], 0 ; REGISTERBUFFER-CPS-NEXT: [[TMP7:%.*]] = bitcast <2 x float> [[DOTFCA_0_EXTRACT286]] to <2 x i32> -; REGISTERBUFFER-CPS-NEXT: [[SYSTEM_DATA_ALLOCA_SROA_0_0_VEC_EXPAND:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <3 x i32> +; REGISTERBUFFER-CPS-NEXT: [[SYSTEM_DATA_ALLOCA_SROA_0_0_VEC_EXPAND:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <3 x i32> ; REGISTERBUFFER-CPS-NEXT: [[SYSTEM_DATA_ALLOCA_SROA_0_0_VECBLEND:%.*]] = select <3 x i1> , <3 x i32> [[SYSTEM_DATA_ALLOCA_SROA_0_0_VEC_EXPAND]], <3 x i32> [[SYSTEM_DATA_FCA_0_0_0_0_EXTRACT]] ; REGISTERBUFFER-CPS-NEXT: [[ISEND_I1:%.*]] = call i1 @opaqueIsEnd() ; REGISTERBUFFER-CPS-NEXT: br i1 [[ISEND_I1]], label [[TMP8:%.*]], label [[TMP9:%.*]] @@ -2944,7 +2954,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_28_INSERT209:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT206]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_29_INSERT212:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT209]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; REGISTERBUFFER-CPS-NEXT: call void @lgc.cps.free(i32 8) -; REGISTERBUFFER-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR]], i32 4, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META32]] +; REGISTERBUFFER-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META32]] ; REGISTERBUFFER-CPS-NEXT: unreachable ; REGISTERBUFFER-CPS: 9: ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_0_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] poison, <3 x i32> [[SYSTEM_DATA_ALLOCA_SROA_0_0_VECBLEND]], 0, 0, 0, 0 @@ -2991,12 +3001,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 ; REGISTERBUFFER-CPS-NEXT: call void @lgc.cps.free(i32 8) -; REGISTERBUFFER-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURN_ADDR]], i32 4, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META32]] +; REGISTERBUFFER-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META32]] ; REGISTERBUFFER-CPS-NEXT: unreachable ; ; ; REGISTERBUFFER-CPS-LABEL: define dso_local void @MyIntersectionShader2.resume.0( -; REGISTERBUFFER-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META37]] !lgc.cps [[META41]] !continuation [[META43]] { +; REGISTERBUFFER-CPS-SAME: {} [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META35]] !lgc.cps [[META42]] !continuation [[META44]] { ; REGISTERBUFFER-CPS-NEXT: entryresume.0: ; REGISTERBUFFER-CPS-NEXT: [[TMP4:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 8) ; REGISTERBUFFER-CPS-NEXT: [[TMP5:%.*]] = extractvalue { [[STRUCT_ANYHITTRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP3]], 2 @@ -3094,7 +3104,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_28_INSERT209:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT206]], i32 [[DOTFCA_28_EXTRACT]], 28 ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_29_INSERT212:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT209]], i32 [[DOTFCA_29_EXTRACT]], 29 ; REGISTERBUFFER-CPS-NEXT: call void @lgc.cps.free(i32 8) -; REGISTERBUFFER-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR_RELOAD2]], i32 4, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META32]] +; REGISTERBUFFER-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR_RELOAD2]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META32]] ; REGISTERBUFFER-CPS-NEXT: unreachable ; REGISTERBUFFER-CPS: 8: ; REGISTERBUFFER-CPS-NEXT: [[RETURN_ADDR_RELOAD_ADDR:%.*]] = getelementptr inbounds [[MYINTERSECTIONSHADER2_FRAME]], ptr addrspace(32) [[TMP4]], i32 0, i32 0 @@ -3143,12 +3153,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_28_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT]], i32 [[DOTFCA_28_EXTRACT]], 28 ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_29_INSERT:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT]], i32 [[DOTFCA_29_EXTRACT]], 29 ; REGISTERBUFFER-CPS-NEXT: call void @lgc.cps.free(i32 8) -; REGISTERBUFFER-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR_RELOAD]], i32 4, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META32]] +; REGISTERBUFFER-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURN_ADDR_RELOAD]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META32]] ; REGISTERBUFFER-CPS-NEXT: unreachable ; ; ; REGISTERBUFFER-CPS-LABEL: define void @MyMissShader( -; REGISTERBUFFER-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META41]] !lgc.cps [[META37]] !continuation [[META44:![0-9]+]] { +; REGISTERBUFFER-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META40]] !lgc.cps [[META38]] !continuation [[META45:![0-9]+]] { ; REGISTERBUFFER-CPS-NEXT: AllocaSpillBB: ; REGISTERBUFFER-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 0 ; REGISTERBUFFER-CPS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [10 x i32] [[PAYLOAD]], 1 @@ -3190,7 +3200,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP6]], 8 ; REGISTERBUFFER-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP7]], 9 ; REGISTERBUFFER-CPS-NEXT: call void @lgc.cps.free(i32 0) -; REGISTERBUFFER-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR]], i32 3, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [33 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META33]] +; REGISTERBUFFER-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [33 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META33]] ; REGISTERBUFFER-CPS-NEXT: unreachable ; ; @@ -3274,49 +3284,48 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; POSTPROCESS-LABEL: define void @MyRayGen( -; POSTPROCESS-SAME: ) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation.registercount [[META22]] !continuation [[META35:![0-9]+]] !continuation.state [[META22]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation.registercount [[META22]] !continuation [[META35:![0-9]+]] !continuation.state [[META22]] { ; POSTPROCESS-NEXT: AllocaSpillBB: ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; POSTPROCESS-NEXT: store i32 0, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: [[SYSTEM_DATA:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[_CONT_SETUPRAYGEN:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]() -; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[SYSTEM_DATA]], 0 +; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 ; POSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) -; POSTPROCESS-NEXT: [[TMP0:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 -; POSTPROCESS-NEXT: [[TMP1:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; POSTPROCESS-NEXT: [[TMP2:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP0]]) -; POSTPROCESS-NEXT: [[TMP3:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP2]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) -; POSTPROCESS-NEXT: [[TMP4:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP3]]) +; POSTPROCESS-NEXT: [[TMP1:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 +; POSTPROCESS-NEXT: [[TMP2:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 +; POSTPROCESS-NEXT: [[TMP3:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP1]]) +; POSTPROCESS-NEXT: [[TMP4:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP3]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; POSTPROCESS-NEXT: [[TMP5:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP4]]) ; POSTPROCESS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT]], 0 ; POSTPROCESS-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 ; POSTPROCESS-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue 
[[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 -; POSTPROCESS-NEXT: [[TMP5:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @MyRayGen.resume.0 to i64)) -; POSTPROCESS-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[TMP5]], 5 +; POSTPROCESS-NEXT: [[TMP6:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyRayGen.resume.0) +; POSTPROCESS-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[TMP6]], 5 ; POSTPROCESS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <4 x float> zeroinitializer, i32 0 -; POSTPROCESS-NEXT: [[TMP6:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT]] to i32 -; POSTPROCESS-NEXT: store i32 [[TMP6]], ptr addrspace(20) @REGISTERS, align 4 +; POSTPROCESS-NEXT: [[TMP7:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT]] to i32 +; POSTPROCESS-NEXT: store i32 [[TMP7]], ptr addrspace(20) @REGISTERS, align 4 ; POSTPROCESS-NEXT: [[DOTSROA_0_4_VEC_EXTRACT:%.*]] = extractelement <4 x float> zeroinitializer, i32 1 -; POSTPROCESS-NEXT: [[TMP7:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT]] to i32 -; POSTPROCESS-NEXT: store i32 [[TMP7]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 7), align 4 +; POSTPROCESS-NEXT: [[TMP8:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT]] to i32 +; POSTPROCESS-NEXT: store i32 [[TMP8]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 7), align 4 ; POSTPROCESS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT:%.*]] = extractelement <4 x float> zeroinitializer, i32 2 -; POSTPROCESS-NEXT: [[TMP8:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT]] to i32 -; POSTPROCESS-NEXT: store i32 [[TMP8]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 8), align 4 +; POSTPROCESS-NEXT: [[TMP9:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT]] to i32 +; POSTPROCESS-NEXT: store i32 [[TMP9]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 8), align 4 ; POSTPROCESS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT:%.*]] = extractelement <4 x float> zeroinitializer, i32 3 -; POSTPROCESS-NEXT: [[TMP9:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT]] to i32 -; POSTPROCESS-NEXT: store i32 [[TMP9]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 9), align 4 -; POSTPROCESS-NEXT: [[TMP10:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 4, i32 [[TMP10]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]]), !continuation.registercount [[META33:![0-9]+]], !continuation.returnedRegistercount !33 +; POSTPROCESS-NEXT: [[TMP10:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT]] to i32 +; POSTPROCESS-NEXT: store i32 [[TMP10]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 9), align 4 +; POSTPROCESS-NEXT: [[TMP11:%.*]] = load i32, ptr [[CSP]], align 4 +; POSTPROCESS-NEXT: call void (i64, ...) 
@continuation.continue(i64 4, i32 [[TMP11]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], i64 poison), !continuation.registercount [[META33:![0-9]+]], !continuation.returnedRegistercount [[META33]] ; POSTPROCESS-NEXT: unreachable ; ; ; POSTPROCESS-LABEL: define dso_local void @MyRayGen.resume.0( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META22]] !continuation.registercount [[META33]] !continuation [[META35]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP1:%.*]]) !lgc.rt.shaderstage [[META22]] !continuation.registercount [[META33]] !continuation [[META35]] { ; POSTPROCESS-NEXT: entryresume.0: ; POSTPROCESS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; POSTPROCESS-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(20) @REGISTERS, align 4 -; POSTPROCESS-NEXT: [[TMP2:%.*]] = bitcast i32 [[TMP1]] to float +; POSTPROCESS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; POSTPROCESS-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(20) @REGISTERS, align 4 +; POSTPROCESS-NEXT: [[TMP2:%.*]] = bitcast i32 [[TMP10]] to float ; POSTPROCESS-NEXT: [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 ; POSTPROCESS-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 7), align 4 ; POSTPROCESS-NEXT: [[TMP4:%.*]] = bitcast i32 [[TMP3]] to float @@ -3327,7 +3336,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 9), align 4 ; POSTPROCESS-NEXT: [[TMP8:%.*]] = bitcast i32 [[TMP7]] to float ; POSTPROCESS-NEXT: [[DOTSROA_0_12_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTSROA_0_8_VEC_INSERT]], float [[TMP8]], i32 3 -; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT6:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 +; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT14:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], 0 ; POSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; POSTPROCESS-NEXT: [[TMP9:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 ; POSTPROCESS-NEXT: [[RES_1_I1:%.*]] = load i32, ptr [[SYSTEM_DATA_ALLOCA]], align 4 @@ -3348,8 +3357,8 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[VAL_1_I:%.*]] = insertelement <3 x i32> [[VAL_0_I]], i32 [[RES_2_I]], i32 1 ; POSTPROCESS-NEXT: [[VAL_2_I:%.*]] = insertelement <3 x i32> [[VAL_1_I]], i32 [[RES_3_I]], i32 2 ; POSTPROCESS-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x i32> [[VAL_2_I]], i8 1 -; POSTPROCESS-NEXT: [[TMP10:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP9]]) -; POSTPROCESS-NEXT: [[TMP11:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP10]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 }) +; POSTPROCESS-NEXT: [[TMP16:%.*]] = call 
[[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP9]]) +; POSTPROCESS-NEXT: [[TMP11:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP16]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 }) ; POSTPROCESS-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 0 ; POSTPROCESS-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 1 ; POSTPROCESS-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 2 @@ -3418,7 +3427,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTFCA_0_LOAD:%.*]] = load <3 x i32>, ptr [[DOTFCA_0_GEP]], align 4 ; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_LOAD]], 0 ; POSTPROCESS-NEXT: [[TMP28:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP28]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META33]] +; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP28]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META33]] ; POSTPROCESS-NEXT: unreachable ; ; @@ -3577,12 +3586,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 3 ; POSTPROCESS-NEXT: [[TMP29:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT]] to i32 ; POSTPROCESS-NEXT: store i32 [[TMP29]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 9), align 4 -; POSTPROCESS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 0 -; POSTPROCESS-NEXT: [[TMP30:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT]] to i32 +; POSTPROCESS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT15:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 0 +; POSTPROCESS-NEXT: [[TMP30:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT15]] to i32 ; POSTPROCESS-NEXT: [[TMP31:%.*]] = bitcast i32 [[TMP30]] to float ; POSTPROCESS-NEXT: [[DOTSROA_0241_0_VEC_INSERT:%.*]] = insertelement <2 x float> undef, float [[TMP31]], i32 0 -; POSTPROCESS-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 1 -; POSTPROCESS-NEXT: [[TMP32:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT]] to i32 +; POSTPROCESS-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT19:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 1 +; POSTPROCESS-NEXT: [[TMP32:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT19]] to i32 ; POSTPROCESS-NEXT: [[TMP33:%.*]] = bitcast i32 [[TMP32]] to float ; POSTPROCESS-NEXT: [[DOTSROA_0241_4_VEC_INSERT:%.*]] = insertelement <2 x float> [[DOTSROA_0241_0_VEC_INSERT]], float [[TMP33]], i32 1 ; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] poison, <2 x float> [[DOTSROA_0241_4_VEC_INSERT]], 0 @@ -3630,7 +3639,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTFCA_1_3_LOAD:%.*]] = load i32, ptr 
[[DOTFCA_1_3_GEP72]], align 4 ; POSTPROCESS-NEXT: [[DOTFCA_1_3_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT]], i32 [[DOTFCA_1_3_LOAD]], 1, 3 ; POSTPROCESS-NEXT: [[TMP35:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP35]], [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]]), !continuation.registercount [[META33]] +; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP35]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]]), !continuation.registercount [[META33]] ; POSTPROCESS-NEXT: unreachable ; POSTPROCESS: 36: ; POSTPROCESS-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 @@ -3641,18 +3650,18 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTSROA_0_4_VEC_EXTRACT34:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 1 ; POSTPROCESS-NEXT: [[TMP39:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT34]] to i32 ; POSTPROCESS-NEXT: store i32 [[TMP39]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 7), align 4 -; POSTPROCESS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT43:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 -; POSTPROCESS-NEXT: [[TMP40:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT43]] to i32 +; POSTPROCESS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT42:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 +; POSTPROCESS-NEXT: [[TMP40:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT42]] to i32 ; POSTPROCESS-NEXT: store i32 [[TMP40]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 8), align 4 ; POSTPROCESS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT52:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 3 ; POSTPROCESS-NEXT: [[TMP41:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT52]] to i32 ; POSTPROCESS-NEXT: store i32 [[TMP41]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 9), align 4 -; POSTPROCESS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT13:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 0 -; POSTPROCESS-NEXT: [[TMP42:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT13]] to i32 +; POSTPROCESS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 0 +; POSTPROCESS-NEXT: [[TMP42:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT]] to i32 ; POSTPROCESS-NEXT: [[TMP43:%.*]] = bitcast i32 [[TMP42]] to float ; POSTPROCESS-NEXT: [[DOTSROA_0245_0_VEC_INSERT:%.*]] = insertelement <2 x float> undef, float [[TMP43]], i32 0 -; POSTPROCESS-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT17:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 1 -; POSTPROCESS-NEXT: [[TMP44:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT17]] to i32 +; POSTPROCESS-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 1 +; POSTPROCESS-NEXT: [[TMP44:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT]] to i32 ; POSTPROCESS-NEXT: [[TMP45:%.*]] = bitcast i32 [[TMP44]] to float ; POSTPROCESS-NEXT: [[DOTSROA_0245_4_VEC_INSERT:%.*]] = insertelement <2 x float> [[DOTSROA_0245_0_VEC_INSERT]], float [[TMP45]], i32 1 ; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT244:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] 
poison, <2 x float> [[DOTSROA_0245_4_VEC_INSERT]], 0 @@ -3700,7 +3709,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTFCA_1_3_LOAD112:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP111]], align 4 ; POSTPROCESS-NEXT: [[DOTFCA_1_3_INSERT113:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT110]], i32 [[DOTFCA_1_3_LOAD112]], 1, 3 ; POSTPROCESS-NEXT: [[TMP47:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP47]], [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT113]]), !continuation.registercount [[META33]] +; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP47]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT113]]), !continuation.registercount [[META33]] ; POSTPROCESS-NEXT: unreachable ; POSTPROCESS: 48: ; POSTPROCESS-NEXT: br i1 [[TMP22]], label [[TMP49:%.*]], label [[TMP70:%.*]] @@ -3715,8 +3724,8 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTSROA_0_4_VEC_EXTRACT36:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 1 ; POSTPROCESS-NEXT: [[TMP53:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT36]] to i32 ; POSTPROCESS-NEXT: store i32 [[TMP53]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 7), align 4 -; POSTPROCESS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT45:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 -; POSTPROCESS-NEXT: [[TMP54:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT45]] to i32 +; POSTPROCESS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT44:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 +; POSTPROCESS-NEXT: [[TMP54:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT44]] to i32 ; POSTPROCESS-NEXT: store i32 [[TMP54]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 8), align 4 ; POSTPROCESS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT54:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 3 ; POSTPROCESS-NEXT: [[TMP55:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT54]] to i32 @@ -3770,7 +3779,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTFCA_1_3_LOAD153:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP152]], align 4 ; POSTPROCESS-NEXT: [[DOTFCA_1_3_INSERT154:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT151]], i32 [[DOTFCA_1_3_LOAD153]], 1, 3 ; POSTPROCESS-NEXT: [[TMP59:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP59]], [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT154]]), !continuation.registercount [[META33]] +; POSTPROCESS-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[RETURNADDR]], i32 [[TMP59]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT154]]), !continuation.registercount [[META33]] ; POSTPROCESS-NEXT: unreachable ; POSTPROCESS: 60: ; POSTPROCESS-NEXT: [[TMP61:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0, i32 0 @@ -3781,8 +3790,8 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTSROA_0_4_VEC_EXTRACT38:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 1 ; POSTPROCESS-NEXT: [[TMP63:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT38]] to i32 ; POSTPROCESS-NEXT: store i32 [[TMP63]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 7), align 4 -; POSTPROCESS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT47:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 -; POSTPROCESS-NEXT: [[TMP64:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT47]] to i32 +; POSTPROCESS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT46:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 +; POSTPROCESS-NEXT: [[TMP64:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT46]] to i32 ; POSTPROCESS-NEXT: store i32 [[TMP64]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 8), align 4 ; POSTPROCESS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT56:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 3 ; POSTPROCESS-NEXT: [[TMP65:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT56]] to i32 @@ -3836,7 +3845,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTFCA_1_3_LOAD194:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP193]], align 4 ; POSTPROCESS-NEXT: [[DOTFCA_1_3_INSERT195:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT192]], i32 [[DOTFCA_1_3_LOAD194]], 1, 3 ; POSTPROCESS-NEXT: [[TMP69:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP69]], [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT195]]), !continuation.registercount [[META33]] +; POSTPROCESS-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[RETURNADDR]], i32 [[TMP69]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT195]]), !continuation.registercount [[META33]] ; POSTPROCESS-NEXT: unreachable ; POSTPROCESS: 70: ; POSTPROCESS-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) @@ -3846,18 +3855,18 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTSROA_0_4_VEC_EXTRACT40:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 1 ; POSTPROCESS-NEXT: [[TMP72:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT40]] to i32 ; POSTPROCESS-NEXT: store i32 [[TMP72]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 7), align 4 -; POSTPROCESS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT49:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 -; POSTPROCESS-NEXT: [[TMP73:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT49]] to i32 +; POSTPROCESS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT48:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 +; POSTPROCESS-NEXT: [[TMP73:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT48]] to i32 ; POSTPROCESS-NEXT: store i32 [[TMP73]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 8), align 4 ; POSTPROCESS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT58:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 3 ; POSTPROCESS-NEXT: [[TMP74:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT58]] to i32 ; POSTPROCESS-NEXT: store i32 [[TMP74]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 9), align 4 -; POSTPROCESS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT15:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 0 -; POSTPROCESS-NEXT: [[TMP75:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT15]] to i32 +; POSTPROCESS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT13:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 0 +; POSTPROCESS-NEXT: [[TMP75:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT13]] to i32 ; POSTPROCESS-NEXT: [[TMP76:%.*]] = bitcast i32 [[TMP75]] to float ; POSTPROCESS-NEXT: [[DOTSROA_0257_0_VEC_INSERT:%.*]] = insertelement <2 x float> undef, float [[TMP76]], i32 0 -; POSTPROCESS-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT19:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 1 -; POSTPROCESS-NEXT: [[TMP77:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT19]] to i32 +; POSTPROCESS-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT17:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 1 +; POSTPROCESS-NEXT: [[TMP77:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT17]] to i32 ; POSTPROCESS-NEXT: [[TMP78:%.*]] = bitcast i32 [[TMP77]] to float ; POSTPROCESS-NEXT: [[DOTSROA_0257_4_VEC_INSERT:%.*]] = insertelement <2 x float> [[DOTSROA_0257_0_VEC_INSERT]], float [[TMP78]], i32 1 ; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT256:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] poison, <2 x float> [[DOTSROA_0257_4_VEC_INSERT]], 0 @@ -3905,7 +3914,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTFCA_1_3_LOAD235:%.*]] = load i32, ptr [[DOTFCA_1_3_GEP234]], align 4 ; POSTPROCESS-NEXT: [[DOTFCA_1_3_INSERT236:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_2_INSERT233]], i32 [[DOTFCA_1_3_LOAD235]], 1, 3 ; POSTPROCESS-NEXT: [[TMP80:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[RETURNADDR]], i32 [[TMP80]], [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT236]]), !continuation.registercount [[META33]] +; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP80]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT236]]), !continuation.registercount [[META33]] ; POSTPROCESS-NEXT: unreachable ; ; @@ -3963,9 +3972,9 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[TRAV_DATA_I_FCA_1_2_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_1_INSERT]], float [[DOTFCA_1_2_EXTRACT]], 1, 2 ; POSTPROCESS-NEXT: [[TRAV_DATA_I_FCA_1_3_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_2_INSERT]], i32 [[DOTFCA_1_3_EXTRACT]], 1, 3 ; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] poison, <2 x float> undef, 0 +; POSTPROCESS-NEXT: [[TMP8:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyIntersectionShader.resume.0) ; POSTPROCESS-NEXT: [[TMP7:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: [[TMP8:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @MyIntersectionShader.resume.0 to i64)) -; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 3, i32 [[TMP7]], i64 [[TMP8]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_3_INSERT]], float [[RES_I_FCA_3_INSERT_FCA_2_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META32]], !continuation.returnedRegistercount !32 +; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 3, i32 [[TMP7]], i64 [[TMP8]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_3_INSERT]], float [[RES_I_FCA_3_INSERT_FCA_2_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META32]], !continuation.returnedRegistercount [[META32]] ; POSTPROCESS-NEXT: unreachable ; POSTPROCESS: isEnd.i: ; POSTPROCESS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0 @@ -3979,7 +3988,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT106:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] poison, <2 x float> [[DOTSROA_0107_4_VEC_INSERT]], 0 ; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT106]], 0 ; POSTPROCESS-NEXT: [[TMP13:%.*]] = bitcast <2 x float> [[DOTFCA_0_EXTRACT]] to <2 x i32> -; POSTPROCESS-NEXT: [[SYSTEM_DATA_ALLOCA_SROA_0_0_VEC_EXPAND:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> poison, <3 x i32> +; POSTPROCESS-NEXT: [[SYSTEM_DATA_ALLOCA_SROA_0_0_VEC_EXPAND:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> poison, <3 x i32> ; POSTPROCESS-NEXT: [[SYSTEM_DATA_ALLOCA_SROA_0_0_VECBLEND:%.*]] = select <3 x i1> , <3 x i32> [[SYSTEM_DATA_ALLOCA_SROA_0_0_VEC_EXPAND]], <3 x i32> [[DOTFCA_0_0_0_0_EXTRACT]] ; POSTPROCESS-NEXT: [[ISEND_I1:%.*]] = call i1 @opaqueIsEnd() ; POSTPROCESS-NEXT: br i1 [[ISEND_I1]], label [[TMP14:%.*]], label [[TMP18:%.*]] @@ -4001,7 +4010,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], -8 ; POSTPROCESS-NEXT: store i32 [[TMP16]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP17:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: 
call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP17]], [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT80]]), !continuation.registercount [[META32]] +; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP17]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT80]]), !continuation.registercount [[META32]] ; POSTPROCESS-NEXT: unreachable ; POSTPROCESS: 18: ; POSTPROCESS-NEXT: [[DOTFCA_0_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] poison, <3 x i32> [[SYSTEM_DATA_ALLOCA_SROA_0_0_VECBLEND]], 0, 0, 0, 0 @@ -4021,34 +4030,34 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], -8 ; POSTPROCESS-NEXT: store i32 [[TMP20]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP21:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP21]], [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]]), !continuation.registercount [[META32]] +; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP21]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]]), !continuation.registercount [[META32]] ; POSTPROCESS-NEXT: unreachable ; ; ; POSTPROCESS-LABEL: define dso_local void @MyIntersectionShader.resume.0( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META40]] !continuation.registercount [[META32]] !continuation [[META41]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP1:%.*]]) !lgc.rt.shaderstage [[META40]] !continuation.registercount [[META32]] !continuation [[META41]] { ; POSTPROCESS-NEXT: entryresume.0: ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: [[TMP1:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], -8 -; POSTPROCESS-NEXT: [[DOTFCA_0_0_0_0_EXTRACT16:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 0, 0, 0, 0 -; POSTPROCESS-NEXT: [[DOTFCA_0_1_0_EXTRACT18:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 0, 1, 0 -; POSTPROCESS-NEXT: [[DOTFCA_0_1_1_EXTRACT20:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 0, 1, 1 -; POSTPROCESS-NEXT: [[DOTFCA_0_1_2_EXTRACT22:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 0, 1, 2 -; POSTPROCESS-NEXT: [[DOTFCA_0_1_3_EXTRACT24:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 0, 1, 3 -; POSTPROCESS-NEXT: [[DOTFCA_0_2_EXTRACT26:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 0, 2 -; POSTPROCESS-NEXT: [[DOTFCA_0_3_EXTRACT28:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 0, 3 -; POSTPROCESS-NEXT: [[DOTFCA_0_4_EXTRACT30:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 0, 4 -; POSTPROCESS-NEXT: [[DOTFCA_0_5_EXTRACT32:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 0, 5 -; POSTPROCESS-NEXT: [[DOTFCA_1_0_EXTRACT34:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 1, 0 -; POSTPROCESS-NEXT: [[DOTFCA_1_1_EXTRACT36:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 1, 1 -; POSTPROCESS-NEXT: [[DOTFCA_1_2_EXTRACT38:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 1, 2 -; POSTPROCESS-NEXT: [[DOTFCA_1_3_EXTRACT40:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 1, 3 +; 
POSTPROCESS-NEXT: [[TMP15:%.*]] = load i32, ptr [[CSP]], align 4 +; POSTPROCESS-NEXT: [[TMP2:%.*]] = add i32 [[TMP15]], -8 +; POSTPROCESS-NEXT: [[DOTFCA_0_0_0_0_EXTRACT16:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP1]], 0, 0, 0, 0 +; POSTPROCESS-NEXT: [[DOTFCA_0_1_0_EXTRACT18:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP1]], 0, 1, 0 +; POSTPROCESS-NEXT: [[DOTFCA_0_1_1_EXTRACT20:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP1]], 0, 1, 1 +; POSTPROCESS-NEXT: [[DOTFCA_0_1_2_EXTRACT22:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP1]], 0, 1, 2 +; POSTPROCESS-NEXT: [[DOTFCA_0_1_3_EXTRACT24:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP1]], 0, 1, 3 +; POSTPROCESS-NEXT: [[DOTFCA_0_2_EXTRACT26:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP1]], 0, 2 +; POSTPROCESS-NEXT: [[DOTFCA_0_3_EXTRACT28:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP1]], 0, 3 +; POSTPROCESS-NEXT: [[DOTFCA_0_4_EXTRACT30:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP1]], 0, 4 +; POSTPROCESS-NEXT: [[DOTFCA_0_5_EXTRACT32:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP1]], 0, 5 +; POSTPROCESS-NEXT: [[DOTFCA_1_0_EXTRACT34:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP1]], 1, 0 +; POSTPROCESS-NEXT: [[DOTFCA_1_1_EXTRACT36:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP1]], 1, 1 +; POSTPROCESS-NEXT: [[DOTFCA_1_2_EXTRACT38:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP1]], 1, 2 +; POSTPROCESS-NEXT: [[DOTFCA_1_3_EXTRACT40:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP1]], 1, 3 ; POSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; POSTPROCESS-NEXT: [[ISEND_I1:%.*]] = call i1 @opaqueIsEnd() ; POSTPROCESS-NEXT: br i1 [[ISEND_I1]], label [[TMP3:%.*]], label [[TMP9:%.*]] -; POSTPROCESS: 3: +; POSTPROCESS: 4: ; POSTPROCESS-NEXT: [[TMP4:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(21) ; POSTPROCESS-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP4]], i32 0 ; POSTPROCESS-NEXT: [[RETURNADDR_RELOAD2:%.*]] = load i64, ptr addrspace(21) [[TMP5]], align 4 @@ -4069,9 +4078,9 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], -8 ; POSTPROCESS-NEXT: store i32 [[TMP7]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP8:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD2]], i32 [[TMP8]], [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT80]]), !continuation.registercount [[META32]] +; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD2]], i32 [[TMP8]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT80]]), !continuation.registercount [[META32]] ; POSTPROCESS-NEXT: unreachable -; POSTPROCESS: 9: +; POSTPROCESS: 10: ; POSTPROCESS-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(21) ; POSTPROCESS-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP10]], i32 0 ; POSTPROCESS-NEXT: [[RETURNADDR_RELOAD:%.*]] = load i64, ptr addrspace(21) [[TMP11]], align 4 @@ -4092,7 +4101,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], -8 ; POSTPROCESS-NEXT: store i32 [[TMP13]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP14:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP14]], [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]]), !continuation.registercount [[META32]] +; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP14]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]]), !continuation.registercount [[META32]] ; POSTPROCESS-NEXT: unreachable ; ; @@ -4150,9 +4159,9 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[TRAV_DATA_I_FCA_1_2_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_1_INSERT]], float [[DOTFCA_1_2_EXTRACT]], 1, 2 ; POSTPROCESS-NEXT: [[TRAV_DATA_I_FCA_1_3_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_2_INSERT]], i32 [[DOTFCA_1_3_EXTRACT]], 1, 3 ; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2:%.*]] poison, <2 x float> undef, 0 +; POSTPROCESS-NEXT: [[TMP8:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyIntersectionShader2.resume.0) ; POSTPROCESS-NEXT: [[TMP7:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: [[TMP8:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @MyIntersectionShader2.resume.0 to i64)) -; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 3, i32 [[TMP7]], i64 [[TMP8]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_3_INSERT]], float [[RES_I_FCA_3_INSERT_FCA_2_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META32]], !continuation.returnedRegistercount !32 +; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 3, i32 [[TMP7]], i64 [[TMP8]], [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA_I_FCA_1_3_INSERT]], float [[RES_I_FCA_3_INSERT_FCA_2_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META32]], !continuation.returnedRegistercount [[META32]] ; POSTPROCESS-NEXT: unreachable ; POSTPROCESS: isEnd.i: ; POSTPROCESS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0 @@ -4166,7 +4175,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT106:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] poison, <2 x float> [[DOTSROA_0107_4_VEC_INSERT]], 0 ; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT106]], 0 ; POSTPROCESS-NEXT: [[TMP13:%.*]] = bitcast <2 x float> [[DOTFCA_0_EXTRACT]] to <2 x i32> -; POSTPROCESS-NEXT: [[SYSTEM_DATA_ALLOCA_SROA_0_0_VEC_EXPAND:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> poison, <3 x i32> +; POSTPROCESS-NEXT: [[SYSTEM_DATA_ALLOCA_SROA_0_0_VEC_EXPAND:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> poison, <3 x i32> ; POSTPROCESS-NEXT: [[SYSTEM_DATA_ALLOCA_SROA_0_0_VECBLEND:%.*]] = select <3 x i1> , <3 x i32> [[SYSTEM_DATA_ALLOCA_SROA_0_0_VEC_EXPAND]], <3 x i32> [[DOTFCA_0_0_0_0_EXTRACT]] ; POSTPROCESS-NEXT: [[ISEND_I1:%.*]] = call i1 @opaqueIsEnd() ; POSTPROCESS-NEXT: br i1 [[ISEND_I1]], label [[TMP14:%.*]], label [[TMP18:%.*]] @@ -4188,7 +4197,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], -8 ; POSTPROCESS-NEXT: store i32 [[TMP16]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP17:%.*]] = load i32, ptr [[CSP]], align 4 -; 
POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP17]], [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT80]]), !continuation.registercount [[META32]] +; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP17]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT80]]), !continuation.registercount [[META32]] ; POSTPROCESS-NEXT: unreachable ; POSTPROCESS: 18: ; POSTPROCESS-NEXT: [[DOTFCA_0_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] poison, <3 x i32> [[SYSTEM_DATA_ALLOCA_SROA_0_0_VECBLEND]], 0, 0, 0, 0 @@ -4208,34 +4217,34 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], -8 ; POSTPROCESS-NEXT: store i32 [[TMP20]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP21:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP21]], [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]]), !continuation.registercount [[META32]] +; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP21]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]]), !continuation.registercount [[META32]] ; POSTPROCESS-NEXT: unreachable ; ; ; POSTPROCESS-LABEL: define dso_local void @MyIntersectionShader2.resume.0( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META40]] !continuation.registercount [[META32]] !continuation [[META43]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP1:%.*]]) !lgc.rt.shaderstage [[META40]] !continuation.registercount [[META32]] !continuation [[META43]] { ; POSTPROCESS-NEXT: entryresume.0: ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: [[TMP1:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], -8 -; POSTPROCESS-NEXT: [[DOTFCA_0_0_0_0_EXTRACT16:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 0, 0, 0, 0 -; POSTPROCESS-NEXT: [[DOTFCA_0_1_0_EXTRACT18:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 0, 1, 0 -; POSTPROCESS-NEXT: [[DOTFCA_0_1_1_EXTRACT20:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 0, 1, 1 -; POSTPROCESS-NEXT: [[DOTFCA_0_1_2_EXTRACT22:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 0, 1, 2 -; POSTPROCESS-NEXT: [[DOTFCA_0_1_3_EXTRACT24:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 0, 1, 3 -; POSTPROCESS-NEXT: [[DOTFCA_0_2_EXTRACT26:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 0, 2 -; POSTPROCESS-NEXT: [[DOTFCA_0_3_EXTRACT28:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 0, 3 -; POSTPROCESS-NEXT: [[DOTFCA_0_4_EXTRACT30:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 0, 4 -; POSTPROCESS-NEXT: [[DOTFCA_0_5_EXTRACT32:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 0, 5 -; POSTPROCESS-NEXT: [[DOTFCA_1_0_EXTRACT34:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 1, 0 -; POSTPROCESS-NEXT: [[DOTFCA_1_1_EXTRACT36:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 1, 1 -; POSTPROCESS-NEXT: [[DOTFCA_1_2_EXTRACT38:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 1, 2 -; POSTPROCESS-NEXT: [[DOTFCA_1_3_EXTRACT40:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] 
[[TMP0]], 1, 3 +; POSTPROCESS-NEXT: [[TMP15:%.*]] = load i32, ptr [[CSP]], align 4 +; POSTPROCESS-NEXT: [[TMP2:%.*]] = add i32 [[TMP15]], -8 +; POSTPROCESS-NEXT: [[DOTFCA_0_0_0_0_EXTRACT16:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP1]], 0, 0, 0, 0 +; POSTPROCESS-NEXT: [[DOTFCA_0_1_0_EXTRACT18:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP1]], 0, 1, 0 +; POSTPROCESS-NEXT: [[DOTFCA_0_1_1_EXTRACT20:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP1]], 0, 1, 1 +; POSTPROCESS-NEXT: [[DOTFCA_0_1_2_EXTRACT22:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP1]], 0, 1, 2 +; POSTPROCESS-NEXT: [[DOTFCA_0_1_3_EXTRACT24:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP1]], 0, 1, 3 +; POSTPROCESS-NEXT: [[DOTFCA_0_2_EXTRACT26:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP1]], 0, 2 +; POSTPROCESS-NEXT: [[DOTFCA_0_3_EXTRACT28:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP1]], 0, 3 +; POSTPROCESS-NEXT: [[DOTFCA_0_4_EXTRACT30:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP1]], 0, 4 +; POSTPROCESS-NEXT: [[DOTFCA_0_5_EXTRACT32:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP1]], 0, 5 +; POSTPROCESS-NEXT: [[DOTFCA_1_0_EXTRACT34:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP1]], 1, 0 +; POSTPROCESS-NEXT: [[DOTFCA_1_1_EXTRACT36:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP1]], 1, 1 +; POSTPROCESS-NEXT: [[DOTFCA_1_2_EXTRACT38:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP1]], 1, 2 +; POSTPROCESS-NEXT: [[DOTFCA_1_3_EXTRACT40:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP1]], 1, 3 ; POSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; POSTPROCESS-NEXT: [[ISEND_I1:%.*]] = call i1 @opaqueIsEnd() ; POSTPROCESS-NEXT: br i1 [[ISEND_I1]], label [[TMP3:%.*]], label [[TMP9:%.*]] -; POSTPROCESS: 3: +; POSTPROCESS: 4: ; POSTPROCESS-NEXT: [[TMP4:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(21) ; POSTPROCESS-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP4]], i32 0 ; POSTPROCESS-NEXT: [[RETURNADDR_RELOAD2:%.*]] = load i64, ptr addrspace(21) [[TMP5]], align 4 @@ -4256,9 +4265,9 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], -8 ; POSTPROCESS-NEXT: store i32 [[TMP7]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP8:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD2]], i32 [[TMP8]], [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT80]]), !continuation.registercount [[META32]] +; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD2]], i32 [[TMP8]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT80]]), !continuation.registercount [[META32]] ; POSTPROCESS-NEXT: unreachable -; POSTPROCESS: 9: +; POSTPROCESS: 10: ; POSTPROCESS-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(21) ; POSTPROCESS-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP10]], i32 0 ; POSTPROCESS-NEXT: [[RETURNADDR_RELOAD:%.*]] = load i64, ptr addrspace(21) [[TMP11]], align 4 @@ -4279,7 +4288,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], -8 ; POSTPROCESS-NEXT: store i32 [[TMP13]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP14:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP14]], [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]]), !continuation.registercount [[META32]] +; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP14]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]]), !continuation.registercount [[META32]] ; POSTPROCESS-NEXT: unreachable ; ; @@ -4316,7 +4325,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-NEXT: store i32 [[TMP12]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 9), align 4 ; POSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison, <3 x i32> [[DOTFCA_0_0_EXTRACT]], 0 ; POSTPROCESS-NEXT: [[TMP13:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP13]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META33]] +; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP13]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META33]] ; POSTPROCESS-NEXT: unreachable ; ; @@ -4400,12 +4409,11 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; POSTPROCESS-CPS-LABEL: define void @MyRayGen( -; POSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META22]] !continuation [[META35:![0-9]+]] { +; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !lgc.cps [[META35:![0-9]+]] !continuation [[META36:![0-9]+]] { ; POSTPROCESS-CPS-NEXT: AllocaSpillBB: ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: [[SYSTEM_DATA:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[_CONT_SETUPRAYGEN:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]() -; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_EXTRACT20:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[SYSTEM_DATA]], 0 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_EXTRACT20:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 ; POSTPROCESS-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; POSTPROCESS-CPS-NEXT: [[TMP3:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 ; POSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 @@ -4415,34 +4423,33 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT20]], 0 ; POSTPROCESS-CPS-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 ; POSTPROCESS-CPS-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 -; POSTPROCESS-CPS-NEXT: [[TMP8:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @MyRayGen.resume.0 to i64)) +; POSTPROCESS-CPS-NEXT: [[TMP8:%.*]] = call i64 
@continuation.getAddrAndMD(ptr @MyRayGen.resume.0) ; POSTPROCESS-CPS-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[TMP8]], 5 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <4 x float> zeroinitializer, i32 0 -; POSTPROCESS-CPS-NEXT: [[TMP9:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT]] to i32 +; POSTPROCESS-CPS-NEXT: [[TMP12:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_4_VEC_EXTRACT:%.*]] = extractelement <4 x float> zeroinitializer, i32 1 -; POSTPROCESS-CPS-NEXT: [[TMP10:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT]] to i32 +; POSTPROCESS-CPS-NEXT: [[TMP11:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT:%.*]] = extractelement <4 x float> zeroinitializer, i32 2 -; POSTPROCESS-CPS-NEXT: [[TMP11:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT]] to i32 +; POSTPROCESS-CPS-NEXT: [[TMP9:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT:%.*]] = extractelement <4 x float> zeroinitializer, i32 3 -; POSTPROCESS-CPS-NEXT: [[TMP12:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT]] to i32 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP9]], 0 +; POSTPROCESS-CPS-NEXT: [[TMP10:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT]] to i32 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP12]], 0 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT]], i32 undef, 1 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT]], i32 undef, 2 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT]], i32 undef, 3 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT]], i32 undef, 4 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT]], i32 undef, 5 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT]], i32 undef, 6 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP10]], 7 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP11]], 8 -; POSTPROCESS-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP12]], 9 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP11]], 7 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP9]], 8 +; POSTPROCESS-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP10]], 9 ; POSTPROCESS-CPS-NEXT: [[TMP13:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: [[TMP14:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @MyRayGen.resume.0 to i64)) -; POSTPROCESS-CPS-NEXT: call void (i64, ...) @continuation.continue(i64 4, i32 [[TMP13]], i64 [[TMP14]], i32 5, [36 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.returnedRegistercount !33, !continuation.registercount [[META33:![0-9]+]] +; POSTPROCESS-CPS-NEXT: call void (i64, ...) 
@continuation.continue(i64 4, i32 [[TMP13]], i64 [[TMP8]], i32 5, [36 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.returnedRegistercount [[META33:![0-9]+]], !continuation.registercount [[META33]] ; POSTPROCESS-CPS-NEXT: unreachable ; ; ; POSTPROCESS-CPS-LABEL: define dso_local void @MyRayGen.resume.0( -; POSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [33 x i32], [10 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META22]] !lgc.cps [[META22]] !continuation [[META35]] { +; POSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [33 x i32], [10 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META22]] !lgc.cps [[META35]] !continuation [[META36]] { ; POSTPROCESS-CPS-NEXT: entryresume.0: ; POSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [33 x i32], [10 x i32] }, align 8 ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 @@ -4502,7 +4509,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; POSTPROCESS-CPS-LABEL: define void @MyClosestHitShader( -; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META36:![0-9]+]] !lgc.cps [[META37:![0-9]+]] !continuation [[META38:![0-9]+]] { +; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META37:![0-9]+]] !lgc.cps [[META38:![0-9]+]] !continuation [[META39:![0-9]+]] { ; POSTPROCESS-CPS-NEXT: AllocaSpillBB: ; POSTPROCESS-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 @@ -4572,14 +4579,14 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[TMP22:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 0 ; POSTPROCESS-CPS-NEXT: store i32 [[TMP23]], ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: [[TMP24:%.*]] = zext i32 [[RETURN_ADDR]] to i64 +; POSTPROCESS-CPS-NEXT: [[TMP24:%.*]] = zext i32 [[RETURNADDR]] to i64 ; POSTPROCESS-CPS-NEXT: [[TMP25:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[TMP24]], i32 [[TMP25]], i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT10]], [33 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META33]] ; POSTPROCESS-CPS-NEXT: unreachable ; ; ; POSTPROCESS-CPS-LABEL: define void @MyAnyHitShader( -; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META39:![0-9]+]] !lgc.cps [[META36]] !continuation [[META40:![0-9]+]] { +; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META38]] !lgc.cps [[META40:![0-9]+]] !continuation [[META41:![0-9]+]] { ; POSTPROCESS-CPS-NEXT: AllocaSpillBB: ; POSTPROCESS-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8 ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 @@ -4734,12 +4741,12 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[TMP21:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 3 ; POSTPROCESS-CPS-NEXT: [[TMP22:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT]] to i32 -; POSTPROCESS-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 0 -; POSTPROCESS-CPS-NEXT: [[TMP23:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT]] to i32 +; POSTPROCESS-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT15:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 0 +; POSTPROCESS-CPS-NEXT: [[TMP23:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT15]] to i32 ; POSTPROCESS-CPS-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0392_0_VEC_INSERT:%.*]] = insertelement <2 x float> undef, float [[TMP24]], i32 0 -; POSTPROCESS-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 1 -; POSTPROCESS-CPS-NEXT: [[TMP25:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT]] to i32 +; POSTPROCESS-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT19:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 1 +; POSTPROCESS-CPS-NEXT: [[TMP25:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT19]] to i32 ; POSTPROCESS-CPS-NEXT: [[TMP26:%.*]] = bitcast i32 [[TMP25]] to float ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0392_4_VEC_INSERT:%.*]] = insertelement <2 x float> [[DOTSROA_0392_0_VEC_INSERT]], float [[TMP26]], i32 1 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT391:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] poison, <2 x float> [[DOTSROA_0392_4_VEC_INSERT]], 0 @@ -4799,7 +4806,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[TMP28:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP29:%.*]] = add i32 [[TMP28]], 0 ; POSTPROCESS-CPS-NEXT: store i32 [[TMP29]], ptr [[CSP]], align 4 -; 
POSTPROCESS-CPS-NEXT: [[TMP30:%.*]] = zext i32 [[RETURN_ADDR]] to i64 +; POSTPROCESS-CPS-NEXT: [[TMP30:%.*]] = zext i32 [[RETURNADDR]] to i64 ; POSTPROCESS-CPS-NEXT: [[TMP31:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: call void (i64, ...) @continuation.continue(i64 [[TMP30]], i32 [[TMP31]], i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META33]] ; POSTPROCESS-CPS-NEXT: unreachable @@ -4810,16 +4817,16 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[TMP34:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT25]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_4_VEC_EXTRACT34:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 1 ; POSTPROCESS-CPS-NEXT: [[TMP35:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT34]] to i32 -; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT43:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 -; POSTPROCESS-CPS-NEXT: [[TMP36:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT43]] to i32 +; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT42:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 +; POSTPROCESS-CPS-NEXT: [[TMP36:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT42]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT52:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 3 ; POSTPROCESS-CPS-NEXT: [[TMP37:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT52]] to i32 -; POSTPROCESS-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT13:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 0 -; POSTPROCESS-CPS-NEXT: [[TMP38:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT13]] to i32 +; POSTPROCESS-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 0 +; POSTPROCESS-CPS-NEXT: [[TMP38:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT]] to i32 ; POSTPROCESS-CPS-NEXT: [[TMP39:%.*]] = bitcast i32 [[TMP38]] to float ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0396_0_VEC_INSERT:%.*]] = insertelement <2 x float> undef, float [[TMP39]], i32 0 -; POSTPROCESS-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT17:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 1 -; POSTPROCESS-CPS-NEXT: [[TMP40:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT17]] to i32 +; POSTPROCESS-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 1 +; POSTPROCESS-CPS-NEXT: [[TMP40:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT]] to i32 ; POSTPROCESS-CPS-NEXT: [[TMP41:%.*]] = bitcast i32 [[TMP40]] to float ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0396_4_VEC_INSERT:%.*]] = insertelement <2 x float> [[DOTSROA_0396_0_VEC_INSERT]], float [[TMP41]], i32 1 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT395:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] poison, <2 x float> [[DOTSROA_0396_4_VEC_INSERT]], 0 @@ -4879,7 +4886,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[TMP43:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP44:%.*]] = add i32 [[TMP43]], 0 ; POSTPROCESS-CPS-NEXT: store i32 [[TMP44]], ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: [[TMP45:%.*]] = zext i32 [[RETURN_ADDR]] to i64 +; POSTPROCESS-CPS-NEXT: [[TMP45:%.*]] = zext i32 [[RETURNADDR]] to i64 ; POSTPROCESS-CPS-NEXT: 
[[TMP46:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: call void (i64, ...) @continuation.continue(i64 [[TMP45]], i32 [[TMP46]], i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT263]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT88]]), !continuation.registercount [[META33]] ; POSTPROCESS-CPS-NEXT: unreachable @@ -4894,8 +4901,8 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[TMP51:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT27]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_4_VEC_EXTRACT36:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 1 ; POSTPROCESS-CPS-NEXT: [[TMP52:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT36]] to i32 -; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT45:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 -; POSTPROCESS-CPS-NEXT: [[TMP53:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT45]] to i32 +; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT44:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 +; POSTPROCESS-CPS-NEXT: [[TMP53:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT44]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT54:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 3 ; POSTPROCESS-CPS-NEXT: [[TMP54:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT54]] to i32 ; POSTPROCESS-CPS-NEXT: [[TMP55:%.*]] = bitcast i32 [[TMP6]] to float @@ -4959,7 +4966,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[TMP58:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP59:%.*]] = add i32 [[TMP58]], 0 ; POSTPROCESS-CPS-NEXT: store i32 [[TMP59]], ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: [[TMP60:%.*]] = zext i32 [[RETURN_ADDR]] to i64 +; POSTPROCESS-CPS-NEXT: [[TMP60:%.*]] = zext i32 [[RETURNADDR]] to i64 ; POSTPROCESS-CPS-NEXT: [[TMP61:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[TMP60]], i32 [[TMP61]], i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT304]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT118]]), !continuation.registercount [[META33]] ; POSTPROCESS-CPS-NEXT: unreachable @@ -4970,8 +4977,8 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[TMP64:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT29]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_4_VEC_EXTRACT38:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 1 ; POSTPROCESS-CPS-NEXT: [[TMP65:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT38]] to i32 -; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT47:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 -; POSTPROCESS-CPS-NEXT: [[TMP66:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT47]] to i32 +; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT46:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 +; POSTPROCESS-CPS-NEXT: [[TMP66:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT46]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT56:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 3 ; POSTPROCESS-CPS-NEXT: [[TMP67:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT56]] to i32 ; POSTPROCESS-CPS-NEXT: [[TMP68:%.*]] = bitcast i32 [[TMP6]] to float @@ -5035,7 +5042,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[TMP71:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP72:%.*]] = add i32 [[TMP71]], 0 ; POSTPROCESS-CPS-NEXT: store i32 [[TMP72]], ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: [[TMP73:%.*]] = zext i32 [[RETURN_ADDR]] to i64 +; POSTPROCESS-CPS-NEXT: [[TMP73:%.*]] = zext i32 [[RETURNADDR]] to i64 ; POSTPROCESS-CPS-NEXT: [[TMP74:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[TMP73]], i32 [[TMP74]], i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT345]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT148]]), !continuation.registercount [[META33]] ; POSTPROCESS-CPS-NEXT: unreachable @@ -5045,16 +5052,16 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[TMP76:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT31]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_4_VEC_EXTRACT40:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 1 ; POSTPROCESS-CPS-NEXT: [[TMP77:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT40]] to i32 -; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT49:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 -; POSTPROCESS-CPS-NEXT: [[TMP78:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT49]] to i32 +; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT48:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 +; POSTPROCESS-CPS-NEXT: [[TMP78:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT48]] to i32 ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT58:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 3 ; POSTPROCESS-CPS-NEXT: [[TMP79:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT58]] to i32 -; POSTPROCESS-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT15:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 0 -; POSTPROCESS-CPS-NEXT: [[TMP80:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT15]] to i32 +; POSTPROCESS-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT13:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 0 +; POSTPROCESS-CPS-NEXT: [[TMP80:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT13]] to i32 ; POSTPROCESS-CPS-NEXT: [[TMP81:%.*]] = bitcast i32 [[TMP80]] to float ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0408_0_VEC_INSERT:%.*]] = insertelement <2 x float> undef, float [[TMP81]], i32 0 -; POSTPROCESS-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT19:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 1 -; POSTPROCESS-CPS-NEXT: [[TMP82:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT19]] to i32 +; POSTPROCESS-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT17:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 1 +; POSTPROCESS-CPS-NEXT: [[TMP82:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT17]] to i32 ; POSTPROCESS-CPS-NEXT: [[TMP83:%.*]] = bitcast i32 [[TMP82]] to float ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0408_4_VEC_INSERT:%.*]] = insertelement <2 x float> [[DOTSROA_0408_0_VEC_INSERT]], float [[TMP83]], i32 1 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT407:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] poison, <2 x float> [[DOTSROA_0408_4_VEC_INSERT]], 0 @@ -5114,14 +5121,14 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[TMP85:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP86:%.*]] = add i32 [[TMP85]], 0 ; POSTPROCESS-CPS-NEXT: store i32 [[TMP86]], ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: [[TMP87:%.*]] = zext i32 [[RETURN_ADDR]] to i64 +; POSTPROCESS-CPS-NEXT: [[TMP87:%.*]] = zext i32 [[RETURNADDR]] to i64 ; POSTPROCESS-CPS-NEXT: [[TMP88:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[TMP87]], i32 [[TMP88]], i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT386]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT178]]), !continuation.registercount [[META33]] ; POSTPROCESS-CPS-NEXT: unreachable ; ; ; POSTPROCESS-CPS-LABEL: define void @MyIntersectionShader( -; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META37]] !lgc.cps [[META41:![0-9]+]] !continuation [[META42:![0-9]+]] { +; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META35]] !lgc.cps [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] { ; POSTPROCESS-CPS-NEXT: AllocaSpillBB: ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -5130,7 +5137,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: store i32 [[TMP1]], ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP2:%.*]] = inttoptr i32 [[TMP0]] to ptr addrspace(21) ; POSTPROCESS-CPS-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP2]], i32 0 -; POSTPROCESS-CPS-NEXT: store i32 [[RETURN_ADDR]], ptr addrspace(21) [[TMP3]], align 4 +; POSTPROCESS-CPS-NEXT: store i32 [[RETURNADDR]], ptr addrspace(21) [[TMP3]], align 4 ; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 0 ; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 1 ; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 2 @@ -5234,9 +5241,9 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[DOTFCA_27_INSERT86:%.*]] = insertvalue [30 x i32] [[DOTFCA_26_INSERT83]], i32 [[PAYLOAD_FCA_27_EXTRACT]], 27 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_28_INSERT89:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT86]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_29_INSERT92:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT89]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 +; POSTPROCESS-CPS-NEXT: [[TMP7:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyIntersectionShader.resume.0) ; POSTPROCESS-CPS-NEXT: [[TMP6:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: [[TMP7:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @MyIntersectionShader.resume.0 to i64)) -; POSTPROCESS-CPS-NEXT: call void (i64, ...) @continuation.continue(i64 3, i32 [[TMP6]], i64 [[TMP7]], i32 5, float [[RES_I_FCA_3_INSERT_FCA_2_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT350]], [32 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount !32, !continuation.registercount [[META32:![0-9]+]] +; POSTPROCESS-CPS-NEXT: call void (i64, ...) 
@continuation.continue(i64 3, i32 [[TMP6]], i64 [[TMP7]], i32 5, float [[RES_I_FCA_3_INSERT_FCA_2_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT350]], [32 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount [[META32:![0-9]+]], !continuation.registercount [[META32]] ; POSTPROCESS-CPS-NEXT: unreachable ; POSTPROCESS-CPS: isEnd.i: ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0 @@ -5250,7 +5257,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT352:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] poison, <2 x float> [[DOTSROA_0353_4_VEC_INSERT]], 0 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_EXTRACT286:%.*]] = extractvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT352]], 0 ; POSTPROCESS-CPS-NEXT: [[TMP12:%.*]] = bitcast <2 x float> [[DOTFCA_0_EXTRACT286]] to <2 x i32> -; POSTPROCESS-CPS-NEXT: [[SYSTEM_DATA_ALLOCA_SROA_0_0_VEC_EXPAND:%.*]] = shufflevector <2 x i32> [[TMP12]], <2 x i32> poison, <3 x i32> +; POSTPROCESS-CPS-NEXT: [[SYSTEM_DATA_ALLOCA_SROA_0_0_VEC_EXPAND:%.*]] = shufflevector <2 x i32> [[TMP12]], <2 x i32> poison, <3 x i32> ; POSTPROCESS-CPS-NEXT: [[SYSTEM_DATA_ALLOCA_SROA_0_0_VECBLEND:%.*]] = select <3 x i1> , <3 x i32> [[SYSTEM_DATA_ALLOCA_SROA_0_0_VEC_EXPAND]], <3 x i32> [[SYSTEM_DATA_FCA_0_0_0_0_EXTRACT]] ; POSTPROCESS-CPS-NEXT: [[ISEND_I1:%.*]] = call i1 @opaqueIsEnd() ; POSTPROCESS-CPS-NEXT: br i1 [[ISEND_I1]], label [[TMP13:%.*]], label [[TMP18:%.*]] @@ -5301,7 +5308,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[TMP14:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], -8 ; POSTPROCESS-CPS-NEXT: store i32 [[TMP15]], ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: [[TMP16:%.*]] = zext i32 [[RETURN_ADDR]] to i64 +; POSTPROCESS-CPS-NEXT: [[TMP16:%.*]] = zext i32 [[RETURNADDR]] to i64 ; POSTPROCESS-CPS-NEXT: [[TMP17:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: call void (i64, ...) @continuation.continue(i64 [[TMP16]], i32 [[TMP17]], i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META32]] ; POSTPROCESS-CPS-NEXT: unreachable @@ -5352,14 +5359,14 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[TMP19:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], -8 ; POSTPROCESS-CPS-NEXT: store i32 [[TMP20]], ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: [[TMP21:%.*]] = zext i32 [[RETURN_ADDR]] to i64 +; POSTPROCESS-CPS-NEXT: [[TMP21:%.*]] = zext i32 [[RETURNADDR]] to i64 ; POSTPROCESS-CPS-NEXT: [[TMP22:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[TMP21]], i32 [[TMP22]], i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META32]] ; POSTPROCESS-CPS-NEXT: unreachable ; ; ; POSTPROCESS-CPS-LABEL: define dso_local void @MyIntersectionShader.resume.0( -; POSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META37]] !lgc.cps [[META41]] !continuation [[META42]] { +; POSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META35]] !lgc.cps [[META42]] !continuation [[META43]] { ; POSTPROCESS-CPS-NEXT: entryresume.0: ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -5524,7 +5531,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; POSTPROCESS-CPS-LABEL: define void @MyIntersectionShader2( -; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META37]] !lgc.cps [[META41]] !continuation [[META43:![0-9]+]] { +; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META35]] !lgc.cps [[META42]] !continuation [[META44:![0-9]+]] { ; POSTPROCESS-CPS-NEXT: AllocaSpillBB: ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -5533,7 +5540,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: store i32 [[TMP1]], ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP2:%.*]] = inttoptr i32 [[TMP0]] to ptr addrspace(21) ; POSTPROCESS-CPS-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP2]], i32 0 -; POSTPROCESS-CPS-NEXT: store i32 [[RETURN_ADDR]], ptr addrspace(21) [[TMP3]], align 4 +; POSTPROCESS-CPS-NEXT: store i32 [[RETURNADDR]], ptr addrspace(21) [[TMP3]], align 4 ; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 0 ; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 1 ; POSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 2 @@ -5637,9 +5644,9 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[DOTFCA_27_INSERT86:%.*]] = insertvalue [30 x i32] [[DOTFCA_26_INSERT83]], i32 [[PAYLOAD_FCA_27_EXTRACT]], 27 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_28_INSERT89:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT86]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_29_INSERT92:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT89]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 +; POSTPROCESS-CPS-NEXT: [[TMP7:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyIntersectionShader2.resume.0) ; POSTPROCESS-CPS-NEXT: [[TMP6:%.*]] = load i32, ptr [[CSP]], align 4 -; 
POSTPROCESS-CPS-NEXT: [[TMP7:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @MyIntersectionShader2.resume.0 to i64)) -; POSTPROCESS-CPS-NEXT: call void (i64, ...) @continuation.continue(i64 3, i32 [[TMP6]], i64 [[TMP7]], i32 5, float [[RES_I_FCA_3_INSERT_FCA_2_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[DOTFCA_0_INSERT350]], [32 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount !32, !continuation.registercount [[META32]] +; POSTPROCESS-CPS-NEXT: call void (i64, ...) @continuation.continue(i64 3, i32 [[TMP6]], i64 [[TMP7]], i32 5, float [[RES_I_FCA_3_INSERT_FCA_2_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES2]] [[DOTFCA_0_INSERT350]], [32 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT92]]), !continuation.returnedRegistercount [[META32]], !continuation.registercount [[META32]] ; POSTPROCESS-CPS-NEXT: unreachable ; POSTPROCESS-CPS: isEnd.i: ; POSTPROCESS-CPS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0 @@ -5653,7 +5660,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT352:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] poison, <2 x float> [[DOTSROA_0353_4_VEC_INSERT]], 0 ; POSTPROCESS-CPS-NEXT: [[DOTFCA_0_EXTRACT286:%.*]] = extractvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT352]], 0 ; POSTPROCESS-CPS-NEXT: [[TMP12:%.*]] = bitcast <2 x float> [[DOTFCA_0_EXTRACT286]] to <2 x i32> -; POSTPROCESS-CPS-NEXT: [[SYSTEM_DATA_ALLOCA_SROA_0_0_VEC_EXPAND:%.*]] = shufflevector <2 x i32> [[TMP12]], <2 x i32> poison, <3 x i32> +; POSTPROCESS-CPS-NEXT: [[SYSTEM_DATA_ALLOCA_SROA_0_0_VEC_EXPAND:%.*]] = shufflevector <2 x i32> [[TMP12]], <2 x i32> poison, <3 x i32> ; POSTPROCESS-CPS-NEXT: [[SYSTEM_DATA_ALLOCA_SROA_0_0_VECBLEND:%.*]] = select <3 x i1> , <3 x i32> [[SYSTEM_DATA_ALLOCA_SROA_0_0_VEC_EXPAND]], <3 x i32> [[SYSTEM_DATA_FCA_0_0_0_0_EXTRACT]] ; POSTPROCESS-CPS-NEXT: [[ISEND_I1:%.*]] = call i1 @opaqueIsEnd() ; POSTPROCESS-CPS-NEXT: br i1 [[ISEND_I1]], label [[TMP13:%.*]], label [[TMP18:%.*]] @@ -5704,7 +5711,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[TMP14:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], -8 ; POSTPROCESS-CPS-NEXT: store i32 [[TMP15]], ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: [[TMP16:%.*]] = zext i32 [[RETURN_ADDR]] to i64 +; POSTPROCESS-CPS-NEXT: [[TMP16:%.*]] = zext i32 [[RETURNADDR]] to i64 ; POSTPROCESS-CPS-NEXT: [[TMP17:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[TMP16]], i32 [[TMP17]], i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT325]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT212]]), !continuation.registercount [[META32]] ; POSTPROCESS-CPS-NEXT: unreachable @@ -5755,14 +5762,14 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[TMP19:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], -8 ; POSTPROCESS-CPS-NEXT: store i32 [[TMP20]], ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: [[TMP21:%.*]] = zext i32 [[RETURN_ADDR]] to i64 +; POSTPROCESS-CPS-NEXT: [[TMP21:%.*]] = zext i32 [[RETURNADDR]] to i64 ; POSTPROCESS-CPS-NEXT: [[TMP22:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: call void (i64, ...) @continuation.continue(i64 [[TMP21]], i32 [[TMP22]], i32 poison, i32 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_3_INSERT]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META32]] ; POSTPROCESS-CPS-NEXT: unreachable ; ; ; POSTPROCESS-CPS-LABEL: define dso_local void @MyIntersectionShader2.resume.0( -; POSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META37]] !lgc.cps [[META41]] !continuation [[META43]] { +; POSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_ANYHITTRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META35]] !lgc.cps [[META42]] !continuation [[META44]] { ; POSTPROCESS-CPS-NEXT: entryresume.0: ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -5927,7 +5934,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; POSTPROCESS-CPS-LABEL: define void @MyMissShader( -; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META41]] !lgc.cps [[META37]] !continuation [[META44:![0-9]+]] { +; POSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [33 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META40]] !lgc.cps [[META38]] !continuation [[META45:![0-9]+]] { ; POSTPROCESS-CPS-NEXT: AllocaSpillBB: ; POSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -5973,7 +5980,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; POSTPROCESS-CPS-NEXT: [[TMP8:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], 0 ; POSTPROCESS-CPS-NEXT: store i32 [[TMP9]], ptr [[CSP]], align 4 -; POSTPROCESS-CPS-NEXT: [[TMP10:%.*]] = zext i32 [[RETURN_ADDR]] to i64 +; POSTPROCESS-CPS-NEXT: [[TMP10:%.*]] = zext i32 [[RETURNADDR]] to i64 ; POSTPROCESS-CPS-NEXT: [[TMP11:%.*]] = load i32, ptr [[CSP]], align 4 ; POSTPROCESS-CPS-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[TMP10]], i32 [[TMP11]], i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [33 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META33]] ; POSTPROCESS-CPS-NEXT: unreachable diff --git a/llvmraytracing/test/dx/paq-hit-attribute-size.ll b/llvmraytracing/test/dx/paq-hit-attribute-size.ll index 6ae6569450..37da0d7d2b 100644 --- a/llvmraytracing/test/dx/paq-hit-attribute-size.ll +++ b/llvmraytracing/test/dx/paq-hit-attribute-size.ll @@ -2,20 +2,25 @@ ; size metadata. ; ; Default run checking serialization layouts and their usage: -; RUN: grep -v INVALID %s | opt -debug-only=lower-raytracing-pipeline --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S 2>&1 | FileCheck %s -; Check that hit attributes violating the max size are detected and crash: -; RUN: not --crash opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S %s 2>&1 | FileCheck %s --check-prefix INVALID +; RUN: grep -v 'NOT-1' %s | opt -debug-only=lower-raytracing-pipeline --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S --lint-abort-on-error 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-MAX-1 +; RUN: grep -v 'NOT-2' %s | opt -debug-only=lower-raytracing-pipeline --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S --lint-abort-on-error 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-MAX-2 +; RUN: grep -v 'NOT-4' %s | opt -debug-only=lower-raytracing-pipeline --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S --lint-abort-on-error 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-MAX-4 +; RUN: grep -v 'NOT-8' %s | opt -debug-only=lower-raytracing-pipeline --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S --lint-abort-on-error 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-MAX-8 + +; Check that hit attributes violating the max size (here: 2 Dwords, set by removing lines containing NOT-2) are detected and crash: +; RUN: grep -v 'NOT-INVALID' %s | not --crash opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S --lint-abort-on-error 2>&1 | FileCheck %s --check-prefix INVALID ; REQUIRES: assertions ; INVALID: Hit attributes are too large! 
-target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %dx.types.Handle = type { i8* } %struct.MyPayload = type { float, i32, double } %struct.Attributes1DWords = type { [1 x i32] } %struct.Attributes2DWords = type { [2 x i32] } %struct.Attributes4DWords = type { [4 x i32] } +%struct.Attributes8DWords = type { [8 x i32] } %dx.types.ResourceProperties = type { i32, i32 } %struct.DispatchSystemData = type { i32 } %struct.TraversalData = type { %struct.SystemData } @@ -31,69 +36,30 @@ target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16: @"\01?myAccelerationStructure@@3URaytracingAccelerationStructure@@A" = external constant %dx.types.Handle, align 4 @"\01?gOutput@@3V?$RWTexture2D@V?$vector@M$03@@@@A" = external constant %dx.types.Handle, align 4 -; CHECK-DAG: %struct.MyPayload.attr_max_2_i32s.layout_0_caller_out = type { [4 x i32] } -; CHECK-DAG: %struct.MyPayload.attr_max_4_i32s.layout_0_caller_out = type { [6 x i32] } -; CHECK-DAG: %struct.MyPayload.attr_max_8_i32s.layout_0_caller_out = type { [10 x i32] } ; If the app uses only 1 DWord for hit attributes, then the layout does not get smaller. ; Instead, one 1 DWord in system data is unused. -; CHECK-DAG: %struct.MyPayload.attr_max_1_i32s.layout_0_caller_out = type { [4 x i32] } - -; CHECK-LABEL: define {{.*}} @AnyHit1DWordsMax1DWords( -define void @AnyHit1DWordsMax1DWords(%struct.MyPayload* %payload, %struct.Attributes1DWords* %attrs) !lgc.rt.attribute.size !49 !types !60 { - ret void -} - -; CHECK-LABEL: define {{.*}} @AnyHit1DWordsMax2DWords( -define void @AnyHit1DWordsMax2DWords(%struct.MyPayload* %payload, %struct.Attributes1DWords* %attrs) !lgc.rt.attribute.size !22 !types !60 { - ret void -} - -; CHECK-LABEL: define {{.*}} @AnyHit1DWordsMax8DWords( -define void @AnyHit1DWordsMax8DWords(%struct.MyPayload* %payload, %struct.Attributes1DWords* %attrs) !lgc.rt.attribute.size !27 !types !60 { - ret void -} - -; The actual size matches the max size for this one, so the layout_2_anyhit_out_accept layout -; is not specialized, thus no payload_attr_N_i32s suffix. -; CHECK-LABEL: define {{.*}} @AnyHit2DWordsMax2DWords( -define void @AnyHit2DWordsMax2DWords(%struct.MyPayload* %payload, %struct.Attributes2DWords* %attrs) !lgc.rt.attribute.size !22 !types !23 { - ret void -} - -; The actual size is 2 DWords smaller than the max size. -; There are 2 unused DWords in the layout. 
-; CHECK-LABEL: define {{.*}} @AnyHit2DWordsMax4DWords( -define void @AnyHit2DWordsMax4DWords(%struct.MyPayload* %payload, %struct.Attributes2DWords* %attrs) !lgc.rt.attribute.size !26 !types !23 { - ret void -} - -; CHECK-LABEL: define {{.*}} @AnyHit2DWordsMax8DWords( -define void @AnyHit2DWordsMax8DWords(%struct.MyPayload* %payload, %struct.Attributes2DWords* %attrs) !lgc.rt.attribute.size !27 !types !23 { - ret void -} - -; CHECK-LABEL: define {{.*}} @AnyHit2DWordsNoLimit( -define void @AnyHit2DWordsNoLimit(%struct.MyPayload* %payload, %struct.Attributes2DWords* %attrs) !types !23 { - ret void -} +; CHECK-MAX-1-DAG: %struct.MyPayload.attr_max_1_i32s.layout_0_caller_out = type { [4 x i32] } +; CHECK-MAX-2-DAG: %struct.MyPayload.attr_max_2_i32s.layout_0_caller_out = type { [4 x i32] } +; CHECK-MAX-4-DAG: %struct.MyPayload.attr_max_4_i32s.layout_0_caller_out = type { [6 x i32] } +; CHECK-MAX-8-DAG: %struct.MyPayload.attr_max_8_i32s.layout_0_caller_out = type { [10 x i32] } -; CHECK-LABEL: define {{.*}} @AnyHit4DWordsMax4DWords( -define void @AnyHit4DWordsMax4DWords(%struct.MyPayload* %payload, %struct.Attributes4DWords* %attrs) !lgc.rt.attribute.size !26 !types !28 { +; CHECK-LABEL: define {{.*}} @AnyHit1DWords( +define void @AnyHit1DWords(%struct.MyPayload* %payload, %struct.Attributes1DWords* %attrs) !types !60 { ret void } -; CHECK-LABEL: define {{.*}} @AnyHit4DWordsMax8DWords( -define void @AnyHit4DWordsMax8DWords(%struct.MyPayload* %payload, %struct.Attributes4DWords* %attrs) !lgc.rt.attribute.size !27 !types !28 { +; CHECK-LABEL: define {{.*}} @AnyHit2DWords( +define void @AnyHit2DWords(%struct.MyPayload* %payload, %struct.Attributes2DWords* %attrs) !types !23 { ret void } -; CHECK-LABEL: define {{.*}} @AnyHit4DWordsNoLimit( -define void @AnyHit4DWordsNoLimit(%struct.MyPayload* %payload, %struct.Attributes4DWords* %attrs) !types !28 { +; CHECK-LABEL: define {{.*}} @AnyHit4DWords( +define void @AnyHit4DWords(%struct.MyPayload* %payload, %struct.Attributes4DWords* %attrs) !types !28 { ret void } -; The following one violates the limit and should crash: -define void @AnyHit4DWordsMax2DWords(%struct.MyPayload* %payload, %struct.Attributes4DWords* %attrs) !lgc.rt.attribute.size !22 !types !28 { +; CHECK-LABEL: define {{.*}} @AnyHit8DWords( +define void @AnyHit8DWords(%struct.MyPayload* %payload, %struct.Attributes8DWords* %attrs) !types !63 { ret void } @@ -179,9 +145,24 @@ attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: re !dx.valver = !{!1} !dx.shaderModel = !{!2} !dx.resources = !{!3} -!dx.entryPoints = !{!10, - !12, ; INVALID - !15, !16, !17, !18, !19, !20, !21, !50, !51, !52} +; DX entry points. We use grep filters on NOT-{maxSize} to only enable compatible shaders. +!dx.entryPoints = !{ + !10 + , !14 ; AHS using 1 Dword attributes. + , !15 ; AHS using 2 Dword attributes. NOT-1 + , !16 ; AHS using 4 Dword attributes. NOT-1 NOT-2 + , !17 ; AHS using 8 Dword attributes. NOT-1 NOT-2 NOT-4 +} + +; We filter out all but one of the following lines using a grep in the RUN line. +; The NOT-{maxSize} patterns are used to run a test with the max hit attribute size set to +; maxSize, and only enabling compatible shaders. +; The NOT-INVALID pattern is used to run all shaders with a max attribute size of 2 dwords, +; which is expected to fail. +!lgc.rt.max.attribute.size = !{!49} ; 1 DWord(s). NOT-2 NOT-4 NOT-8 NOT-INVALID +!lgc.rt.max.attribute.size = !{!22} ; 2 DWord(s). NOT-1 NOT-4 NOT-8 +!lgc.rt.max.attribute.size = !{!26} ; 4 DWord(s). 
NOT-1 NOT-2 NOT-8 NOT-INVALID +!lgc.rt.max.attribute.size = !{!27} ; 8 DWord(s). NOT-1 NOT-2 NOT-4 NOT-INVALID !0 = !{!"dxcoob 2019.05.00"} !1 = !{i32 1, i32 7} @@ -195,16 +176,12 @@ attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: re !9 = !{i32 0, i32 9} !10 = !{null, !"", null, !3, !11} !11 = !{i32 0, i64 65540} -!12 = !{void (%struct.MyPayload*, %struct.Attributes4DWords*)* @AnyHit4DWordsMax2DWords, !"AnyHit4DWordsMax2DWords", null, null, !13} -!13 = !{i32 8, i32 9, i32 5, !14} -!14 = !{i32 0} -!15 = !{void (%struct.MyPayload*, %struct.Attributes4DWords*)* @AnyHit4DWordsMax4DWords, !"AnyHit4DWordsMax4DWords", null, null, !13} -!16 = !{void (%struct.MyPayload*, %struct.Attributes4DWords*)* @AnyHit4DWordsMax8DWords, !"AnyHit4DWordsMax8DWords", null, null, !13} -!17 = !{void (%struct.MyPayload*, %struct.Attributes4DWords*)* @AnyHit4DWordsNoLimit, !"AnyHit4DWordsNoLimit", null, null, !13} -!18 = !{void (%struct.MyPayload*, %struct.Attributes2DWords*)* @AnyHit2DWordsMax2DWords, !"AnyHit2DWordsMax2DWords", null, null, !13} -!19 = !{void (%struct.MyPayload*, %struct.Attributes2DWords*)* @AnyHit2DWordsMax4DWords, !"AnyHit2DWordsMax4DWords", null, null, !13} -!20 = !{void (%struct.MyPayload*, %struct.Attributes2DWords*)* @AnyHit2DWordsMax8DWords, !"AnyHit2DWordsMax8DWords", null, null, !13} -!21 = !{void (%struct.MyPayload*, %struct.Attributes2DWords*)* @AnyHit2DWordsNoLimit, !"AnyHit2DWordsNoLimit", null, null, !13} +!12 = !{i32 8, i32 9, i32 5, !13} +!13 = !{i32 0} +!14 = !{void (%struct.MyPayload*, %struct.Attributes1DWords*)* @AnyHit1DWords, !"AnyHit1DWords", null, null, !12} +!15 = !{void (%struct.MyPayload*, %struct.Attributes4DWords*)* @AnyHit2DWords, !"AnyHit2DWords", null, null, !12} +!16 = !{void (%struct.MyPayload*, %struct.Attributes4DWords*)* @AnyHit4DWords, !"AnyHit4DWords", null, null, !12} +!17 = !{void (%struct.MyPayload*, %struct.Attributes4DWords*)* @AnyHit8DWords, !"AnyHit8DWords", null, null, !12} !22 = !{i32 8} !23 = !{!"function", !"void", !24, !25} !24 = !{i32 0, %struct.MyPayload poison} @@ -233,9 +210,8 @@ attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: re !47 = !{!"function", !"void", i64 poison, !48} !48 = !{i32 0, i8 poison} !49 = !{i32 4} -!50 = !{void (%struct.MyPayload* , %struct.Attributes1DWords*)* @AnyHit1DWordsMax1DWords, !"AnyHit1DWordsMax1DWords", null, null, !13} -!51 = !{void (%struct.MyPayload* , %struct.Attributes1DWords*)* @AnyHit1DWordsMax2DWords, !"AnyHit1DWordsMax2DWords", null, null, !13} -!52 = !{void (%struct.MyPayload* , %struct.Attributes1DWords*)* @AnyHit1DWordsMax8DWords, !"AnyHit1DWordsMax8DWords", null, null, !13} !60 = !{!"function", !"void", !61, !62} !61 = !{i32 0, %struct.MyPayload poison} !62 = !{i32 0, %struct.Attributes1DWords poison} +!63 = !{!"function", !"void", !24, !64} +!64 = !{i32 0, %struct.Attributes8DWords poison} diff --git a/llvmraytracing/test/dx/payload-caller-in-paq.ll b/llvmraytracing/test/dx/payload-caller-in-paq.ll index 1243000692..affcf301ac 100644 --- a/llvmraytracing/test/dx/payload-caller-in-paq.ll +++ b/llvmraytracing/test/dx/payload-caller-in-paq.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function RayGen --version 3 -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,inline,lint,remove-types-metadata' -S %s 2> %t.stderr | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s -; RUN: count 0 < %t.stderr +; RUN: opt --verify-each 
-passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,inline,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s ; Test the special case of payload import in the caller after TraceRay. Here, we cast the ; payload storage both to the ClosestHitOut layout and the MissOut layout and import both, @@ -11,13 +10,13 @@ ; function name regex. ; Note that the payload has nontrivial payload access qualifiers set. -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %dx.types.Handle = type { i8* } %struct.MyPayload = type { float, i32, double } %dx.types.ResourceProperties = type { i32, i32 } %struct.DispatchSystemData = type { i32 } -%struct.TraversalData = type { %struct.SystemData } +%struct.TraversalData = type { %struct.SystemData, i64 } %struct.SystemData = type { %struct.DispatchSystemData } %struct.AnyHitTraversalData = type { %struct.TraversalData, %struct.HitData } %struct.HitData = type { float, i32 } @@ -28,10 +27,14 @@ target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16: @"\01?myAccelerationStructure@@3URaytracingAccelerationStructure@@A" = external constant %dx.types.Handle, align 4 @"\01?gOutput@@3V?$RWTexture2D@V?$vector@M$03@@@@A" = external constant %dx.types.Handle, align 4 +define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !types !{!"function", !"void", !{i32 0, %struct.DispatchSystemData poison}} { + ret void +} + ; Function Attrs: nounwind define void @RayGen() #0 { ; LOWERRAYTRACINGPIPELINE-LABEL: define void @RayGen( -; LOWERRAYTRACINGPIPELINE-SAME: [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] !lgc.rt.shaderstage [[META23:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation.registercount [[META23]] !continuation [[META27:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] !lgc.rt.shaderstage [[META23:![0-9]+]] !continuation.entry [[META13:![0-9]+]] !continuation.registercount [[META23]] !continuation [[META27:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) @@ -48,11 +51,13 @@ define void @RayGen() #0 { ; LOWERRAYTRACINGPIPELINE-NEXT: [[DIS_DATA_I:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[ADDR_I:%.*]] = call i64 @_AmdGetResumePointAddr() #[[ATTR3:[0-9]+]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 
[[ADDR_I]], 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP10]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP33]], ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = call ptr inttoptr (i64 4 to ptr)([[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]]), !continuation.registercount [[META32:![0-9]+]], !continuation.returnedRegistercount !25 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] [[AWAIT_STRUCT_DISPATCHSYSTEMDATA:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP41]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 -1, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]]), !continuation.registercount [[META32:![0-9]+]], !continuation.wait.await [[META13]], !continuation.returnedRegistercount [[META25:![0-9]+]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] [[AWAIT:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP12]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_MYPAYLOAD]] poison, ptr [[TMP4]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_MYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 @@ -85,7 +90,7 @@ define void @RayGen() #0 { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP34:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP39]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) ; LOWERRAYTRACINGPIPELINE-NEXT: call void @dx.op.textureStore.f32(i32 67, [[DX_TYPES_HANDLE]] [[TMP34]], i32 [[EXTRACT]], i32 [[EXTRACT1]], i32 undef, float [[TMP24]], float [[TMP27]], float [[TMP30]], float 0.000000e+00, i8 15) ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[TMP5]]) #[[ATTR0]] -; LOWERRAYTRACINGPIPELINE-NEXT: ret void, !continuation.registercount [[META24:![0-9]+]] +; LOWERRAYTRACINGPIPELINE-NEXT: ret void ; %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?myAccelerationStructure@@3URaytracingAccelerationStructure@@A", align 4 %2 = load %dx.types.Handle, %dx.types.Handle* @"\01?gOutput@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 @@ -132,7 +137,7 @@ declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.type declare %struct.DispatchSystemData @_cont_SetupRayGen() #3 ; Function Attrs: alwaysinline -declare %struct.DispatchSystemData @_AmdAwaitTraversal(i64, %struct.TraversalData) #3 +declare %struct.DispatchSystemData @_AmdWaitAwaitTraversal(i64, i64, %struct.TraversalData) #3 ; Function Attrs: alwaysinline declare %struct.DispatchSystemData @_AmdAwaitShader(i64, %struct.DispatchSystemData) #3 @@ -161,6 +166,9 @@ declare !types !43 void @_AmdRestoreSystemData(%struct.DispatchSystemData*) #1 ; Function Attrs: nounwind memory(none) declare !types !44 void @_AmdRestoreSystemDataAnyHit(%struct.AnyHitTraversalData*) #1 +; Function Attrs: nounwind +declare i64 @_AmdGetResumePointAddr() #3 + ; Function Attrs: alwaysinline define i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData* %data) #3 !types !46 { ret i32 5 @@ -171,7 +179,9 @@ define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64 %0, i32 %1, i %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 %sys_data = insertvalue %struct.SystemData undef, 
%struct.DispatchSystemData %dis_data, 0 %trav_data = insertvalue %struct.TraversalData undef, %struct.SystemData %sys_data, 0 - %newdata = call %struct.DispatchSystemData @_AmdAwaitTraversal(i64 4, %struct.TraversalData %trav_data) + %addr = call i64 @_AmdGetResumePointAddr() #3 + %trav_data2 = insertvalue %struct.TraversalData %trav_data, i64 %addr, 1 + %newdata = call %struct.DispatchSystemData @_AmdWaitAwaitTraversal(i64 4, i64 -1, %struct.TraversalData %trav_data2) store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) ret void diff --git a/llvmraytracing/test/dx/payload-save-registers.ll b/llvmraytracing/test/dx/payload-save-registers.ll index 423e5ad43e..c307c2ed18 100644 --- a/llvmraytracing/test/dx/payload-save-registers.ll +++ b/llvmraytracing/test/dx/payload-save-registers.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S %s 2> %t.stderr | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s -; RUN: count 0 < %t.stderr +; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s ; Test that we correctly save and restore registers before/after recursive ; TraceRay or CallShader if there are live values in payload registers that @@ -11,7 +10,7 @@ ; Also, function name mangling was removed. ; Note that the payload has payload access qualifiers set. -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %dx.types.Handle = type { i8* } %struct.OuterPayload = type { [15 x float], [15 x float] } @@ -30,7 +29,7 @@ target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16: ; Function Attrs: nounwind define void @Miss(%struct.OuterPayload* noalias nocapture %outerPayload) #0 !types !23 { ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @Miss( -; LOWERRAYTRACINGPIPELINE-SAME: [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] !lgc.rt.shaderstage [[META25:![0-9]+]] !continuation.registercount [[META23:![0-9]+]] !continuation [[META26:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] !lgc.rt.shaderstage [[META25:![0-9]+]] !continuation.registercount [[META23:![0-9]+]] !continuation [[META26:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_OUTERPAYLOAD:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_SYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 @@ -99,20 +98,20 @@ define void @Miss(%struct.OuterPayload* noalias nocapture %outerPayload) #0 !typ ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP51:%.*]] = load float, ptr [[TMP48]], align 4, !tbaa [[TBAA27:![0-9]+]] ; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT_INNERPAYLOAD]], ptr [[TMP46]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP51]], ptr [[TMP50]], align 4, !tbaa [[TBAA27]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP56:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP45]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP52:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP56]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP55:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP45]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP52:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP55]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP53:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP52]]) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP54:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[DIS_DATA_I:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP54]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP55:%.*]] = getelementptr inbounds [[STRUCT_INNERPAYLOAD]], ptr [[TMP46]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP61:%.*]] = load i32, ptr [[TMP55]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_INNERPAYLOAD]], ptr [[TMP46]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP61:%.*]] = load i32, ptr [[TMP57]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP61]], ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP65:%.*]] = call ptr inttoptr (i64 4 to ptr)([[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]]), !continuation.registercount [[META31:![0-9]+]], !continuation.returnedRegistercount !31 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP58:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] [[AWAIT_STRUCT_DISPATCHSYSTEMDATA:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP65]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP56:%.*]] = call ptr inttoptr (i64 4 to ptr)([[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 poison), !continuation.registercount [[META31:![0-9]+]], !continuation.returnedRegistercount [[META31]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP58:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] [[AWAIT:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP56]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_INNERPAYLOAD]] poison, ptr [[TMP46]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP59:%.*]] = getelementptr inbounds [[STRUCT_INNERPAYLOAD]], ptr [[TMP46]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP59:%.*]] = getelementptr inbounds [[STRUCT_INNERPAYLOAD]], ptr [[TMP46]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP97:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 ; 
LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP97]], ptr [[TMP59]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP58]], ptr [[TMP54]], align 4 @@ -179,7 +178,8 @@ define void @Miss(%struct.OuterPayload* noalias nocapture %outerPayload) #0 !typ ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP101]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 21), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP94:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP95:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[TMP94]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP95]], !continuation.registercount [[META23]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP95]]), !continuation.registercount [[META23]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?myAccelerationStructure@@3URaytracingAccelerationStructure@@A", align 4 %2 = alloca %struct.InnerPayload, align 4 @@ -201,12 +201,12 @@ define void @Miss(%struct.OuterPayload* noalias nocapture %outerPayload) #0 !typ ; Function Attrs: nounwind define void @Callable(%struct.OuterPayload* noalias %outerPayload) #0 !types !23 { ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @Callable( -; LOWERRAYTRACINGPIPELINE-SAME: [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR0]] !lgc.rt.shaderstage [[META32:![0-9]+]] !continuation.registercount [[META23]] !continuation [[META33:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR0]] !lgc.rt.shaderstage [[META32:![0-9]+]] !continuation.registercount [[META23]] !continuation [[META33:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_OUTERPAYLOAD:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_OUTERPAYLOAD]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP7]], ptr [[TMP4]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 @@ -251,49 +251,49 @@ define void @Callable(%struct.OuterPayload* noalias %outerPayload) #0 !types !23 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 14 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 14), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP35]], ptr [[TMP32]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 15 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP39:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr 
addrspace(20) @PAYLOAD, i32 15), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP39]], ptr [[TMP36]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[TMP36]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP39]], ptr [[TMP34]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 16 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 16), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP41]], ptr [[TMP37]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP36]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP41]], ptr [[TMP36]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 17 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 17), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP43]], ptr [[TMP38]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[TMP36]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 18 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 18), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP45]], ptr [[TMP40]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[TMP36]], i32 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 19 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 19), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP47]], ptr [[TMP42]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TMP36]], i32 5 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 20 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP49:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 20), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP49]], ptr [[TMP44]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr [[TMP36]], i32 6 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 21 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP51:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 21), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP51]], ptr [[TMP46]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, ptr [[TMP36]], i32 7 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 22 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP53:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 22), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP53]], ptr [[TMP48]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, ptr [[TMP36]], i32 8 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 23 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP55:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) 
@PAYLOAD, i32 23), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP55]], ptr [[TMP50]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, ptr [[TMP36]], i32 9 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 24 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP57:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 24), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP57]], ptr [[TMP52]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP54:%.*]] = getelementptr inbounds i32, ptr [[TMP36]], i32 10 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP54:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 25 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP59:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 25), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP59]], ptr [[TMP54]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, ptr [[TMP36]], i32 11 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 26 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP61:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 26), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP61]], ptr [[TMP56]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, ptr [[TMP36]], i32 12 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 27 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP63:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 27), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP63]], ptr [[TMP58]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, ptr [[TMP36]], i32 13 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 28 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP65:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 28), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP65]], ptr [[TMP60]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, ptr [[TMP36]], i32 14 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 29 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP67:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 29), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP67]], ptr [[TMP62]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) @@ -418,7 +418,7 @@ define void @Callable(%struct.OuterPayload* noalias %outerPayload) #0 !types !23 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP157:%.*]] = load float, ptr [[TMP156]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP157]], ptr [[TMP155]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[DIS_DATA_I:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP158:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP158:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP161:%.*]] = load i32, ptr [[TMP158]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP161]], ptr addrspace(20) @PAYLOAD, align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: 
[[TMP159:%.*]] = getelementptr inbounds i32, ptr [[TMP158]], i32 1 @@ -463,55 +463,55 @@ define void @Callable(%struct.OuterPayload* noalias %outerPayload) #0 !types !23 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP182:%.*]] = getelementptr inbounds i32, ptr [[TMP158]], i32 14 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP189:%.*]] = load i32, ptr [[TMP182]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP189]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 14), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP190:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP193:%.*]] = load i32, ptr [[TMP190]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP191:%.*]] = getelementptr inbounds i32, ptr [[TMP158]], i32 15 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP193:%.*]] = load i32, ptr [[TMP191]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP193]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 15), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP186:%.*]] = getelementptr inbounds i32, ptr [[TMP190]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP186:%.*]] = getelementptr inbounds i32, ptr [[TMP158]], i32 16 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP195:%.*]] = load i32, ptr [[TMP186]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP195]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 16), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP188:%.*]] = getelementptr inbounds i32, ptr [[TMP190]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP188:%.*]] = getelementptr inbounds i32, ptr [[TMP158]], i32 17 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP197:%.*]] = load i32, ptr [[TMP188]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP197]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 17), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP191:%.*]] = getelementptr inbounds i32, ptr [[TMP190]], i32 3 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP199:%.*]] = load i32, ptr [[TMP191]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP190:%.*]] = getelementptr inbounds i32, ptr [[TMP158]], i32 18 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP199:%.*]] = load i32, ptr [[TMP190]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP199]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 18), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP192:%.*]] = getelementptr inbounds i32, ptr [[TMP190]], i32 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP192:%.*]] = getelementptr inbounds i32, ptr [[TMP158]], i32 19 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP201:%.*]] = load i32, ptr [[TMP192]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP201]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 19), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP194:%.*]] = getelementptr inbounds i32, ptr [[TMP190]], i32 5 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP194:%.*]] = getelementptr inbounds i32, ptr [[TMP158]], i32 20 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP203:%.*]] = load i32, ptr [[TMP194]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP203]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 20), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP196:%.*]] = getelementptr inbounds i32, ptr [[TMP190]], i32 6 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP196:%.*]] = getelementptr inbounds i32, ptr [[TMP158]], i32 21 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP205:%.*]] = load i32, ptr [[TMP196]], 
align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP205]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 21), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP198:%.*]] = getelementptr inbounds i32, ptr [[TMP190]], i32 7 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP198:%.*]] = getelementptr inbounds i32, ptr [[TMP158]], i32 22 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP207:%.*]] = load i32, ptr [[TMP198]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP207]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 22), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP200:%.*]] = getelementptr inbounds i32, ptr [[TMP190]], i32 8 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP200:%.*]] = getelementptr inbounds i32, ptr [[TMP158]], i32 23 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP209:%.*]] = load i32, ptr [[TMP200]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP209]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 23), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP202:%.*]] = getelementptr inbounds i32, ptr [[TMP190]], i32 9 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP202:%.*]] = getelementptr inbounds i32, ptr [[TMP158]], i32 24 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP211:%.*]] = load i32, ptr [[TMP202]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP211]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 24), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP204:%.*]] = getelementptr inbounds i32, ptr [[TMP190]], i32 10 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP204:%.*]] = getelementptr inbounds i32, ptr [[TMP158]], i32 25 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP213:%.*]] = load i32, ptr [[TMP204]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP213]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 25), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP206:%.*]] = getelementptr inbounds i32, ptr [[TMP190]], i32 11 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP206:%.*]] = getelementptr inbounds i32, ptr [[TMP158]], i32 26 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP215:%.*]] = load i32, ptr [[TMP206]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP215]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 26), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP208:%.*]] = getelementptr inbounds i32, ptr [[TMP190]], i32 12 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP208:%.*]] = getelementptr inbounds i32, ptr [[TMP158]], i32 27 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP217:%.*]] = load i32, ptr [[TMP208]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP217]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 27), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP210:%.*]] = getelementptr inbounds i32, ptr [[TMP190]], i32 13 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP210:%.*]] = getelementptr inbounds i32, ptr [[TMP158]], i32 28 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP219:%.*]] = load i32, ptr [[TMP210]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP219]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 28), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP212:%.*]] = getelementptr inbounds i32, ptr [[TMP190]], i32 14 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP212:%.*]] = getelementptr inbounds i32, ptr [[TMP158]], i32 29 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP221:%.*]] = load i32, ptr [[TMP212]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP221]], ptr addrspace(20) getelementptr inbounds (i32, 
ptr addrspace(20) @PAYLOAD, i32 29), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP214:%.*]] = call ptr inttoptr (i64 2 to ptr)([[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]]), !continuation.registercount [[META23]], !continuation.returnedRegistercount !23 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP223:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] [[AWAIT_STRUCT_DISPATCHSYSTEMDATA:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP214]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP214:%.*]] = call ptr inttoptr (i64 2 to ptr)([[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I]]), !continuation.registercount [[META23]], !continuation.returnedRegistercount [[META23]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP223:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] [[AWAIT_1:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP214]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_OUTERPAYLOAD]] poison, ptr [[TMP2]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP224:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP224:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP227:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP227]], ptr [[TMP224]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP218:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 1 @@ -556,49 +556,49 @@ define void @Callable(%struct.OuterPayload* noalias %outerPayload) #0 !types !23 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP244:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 14 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP255:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 14), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP255]], ptr [[TMP244]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP256:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP2]], i32 0, i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP246:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 15 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP259:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 15), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP259]], ptr [[TMP256]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP248:%.*]] = getelementptr inbounds i32, ptr [[TMP256]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP259]], ptr [[TMP246]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP248:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 16 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP261:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 16), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP261]], ptr [[TMP248]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP250:%.*]] = getelementptr inbounds i32, ptr [[TMP256]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP250:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 17 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP263:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 17), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP263]], ptr [[TMP250]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP252:%.*]] = getelementptr inbounds i32, ptr [[TMP256]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP252:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 18 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP265:%.*]] = load i32, ptr addrspace(20) 
getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 18), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP265]], ptr [[TMP252]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP254:%.*]] = getelementptr inbounds i32, ptr [[TMP256]], i32 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP254:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 19 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP267:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 19), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP267]], ptr [[TMP254]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP257:%.*]] = getelementptr inbounds i32, ptr [[TMP256]], i32 5 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP256:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 20 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP269:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 20), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP269]], ptr [[TMP257]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP258:%.*]] = getelementptr inbounds i32, ptr [[TMP256]], i32 6 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP269]], ptr [[TMP256]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP258:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 21 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP270:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 21), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP270]], ptr [[TMP258]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP260:%.*]] = getelementptr inbounds i32, ptr [[TMP256]], i32 7 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP260:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 22 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP272:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 22), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP272]], ptr [[TMP260]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP262:%.*]] = getelementptr inbounds i32, ptr [[TMP256]], i32 8 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP262:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 23 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP274:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 23), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP274]], ptr [[TMP262]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP264:%.*]] = getelementptr inbounds i32, ptr [[TMP256]], i32 9 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP264:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 24 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP276:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 24), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP276]], ptr [[TMP264]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP266:%.*]] = getelementptr inbounds i32, ptr [[TMP256]], i32 10 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP266:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 25 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP278:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 25), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP278]], ptr [[TMP266]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP268:%.*]] = getelementptr inbounds i32, ptr [[TMP256]], i32 11 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP268:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 26 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP280:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr 
addrspace(20) @PAYLOAD, i32 26), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP280]], ptr [[TMP268]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP271:%.*]] = getelementptr inbounds i32, ptr [[TMP256]], i32 12 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP271:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 27 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP282:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 27), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP282]], ptr [[TMP271]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP273:%.*]] = getelementptr inbounds i32, ptr [[TMP256]], i32 13 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP273:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 28 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP284:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 28), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP284]], ptr [[TMP273]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP275:%.*]] = getelementptr inbounds i32, ptr [[TMP256]], i32 14 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP275:%.*]] = getelementptr inbounds i32, ptr [[TMP224]], i32 29 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP286:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 29), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP286]], ptr [[TMP275]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP223]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 @@ -665,7 +665,7 @@ define void @Callable(%struct.OuterPayload* noalias %outerPayload) #0 !types !23 ; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP304]], ptr [[TMP153]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP305:%.*]] = load float, ptr [[TMP155]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store float [[TMP305]], ptr [[TMP156]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP318:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP318:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP321:%.*]] = load i32, ptr [[TMP318]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP321]], ptr addrspace(20) @PAYLOAD, align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP308:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 1 @@ -710,53 +710,54 @@ define void @Callable(%struct.OuterPayload* noalias %outerPayload) #0 !types !23 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP334:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 14 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP349:%.*]] = load i32, ptr [[TMP334]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP349]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 14), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP350:%.*]] = getelementptr inbounds [[STRUCT_OUTERPAYLOAD]], ptr [[TMP3]], i32 0, i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP353:%.*]] = load i32, ptr [[TMP350]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP336:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 15 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP353:%.*]] = load i32, ptr [[TMP336]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP353]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 15), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP338:%.*]] = getelementptr inbounds i32, ptr [[TMP350]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP338:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], 
i32 16 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP355:%.*]] = load i32, ptr [[TMP338]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP355]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 16), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP340:%.*]] = getelementptr inbounds i32, ptr [[TMP350]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP340:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 17 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP357:%.*]] = load i32, ptr [[TMP340]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP357]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 17), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP342:%.*]] = getelementptr inbounds i32, ptr [[TMP350]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP342:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 18 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP359:%.*]] = load i32, ptr [[TMP342]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP359]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 18), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP344:%.*]] = getelementptr inbounds i32, ptr [[TMP350]], i32 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP344:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 19 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP361:%.*]] = load i32, ptr [[TMP344]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP361]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 19), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP346:%.*]] = getelementptr inbounds i32, ptr [[TMP350]], i32 5 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP346:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 20 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP363:%.*]] = load i32, ptr [[TMP346]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP363]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 20), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP348:%.*]] = getelementptr inbounds i32, ptr [[TMP350]], i32 6 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP348:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 21 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP365:%.*]] = load i32, ptr [[TMP348]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP365]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 21), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP351:%.*]] = getelementptr inbounds i32, ptr [[TMP350]], i32 7 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP367:%.*]] = load i32, ptr [[TMP351]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP350:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 22 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP367:%.*]] = load i32, ptr [[TMP350]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP367]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 22), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP352:%.*]] = getelementptr inbounds i32, ptr [[TMP350]], i32 8 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP352:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 23 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP369:%.*]] = load i32, ptr [[TMP352]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP369]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 23), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP354:%.*]] = getelementptr inbounds i32, ptr [[TMP350]], i32 9 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP354:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 24 ; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP371:%.*]] = load i32, ptr [[TMP354]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP371]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 24), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP356:%.*]] = getelementptr inbounds i32, ptr [[TMP350]], i32 10 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP356:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 25 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP373:%.*]] = load i32, ptr [[TMP356]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP373]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 25), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP358:%.*]] = getelementptr inbounds i32, ptr [[TMP350]], i32 11 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP358:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 26 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP375:%.*]] = load i32, ptr [[TMP358]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP375]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 26), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP360:%.*]] = getelementptr inbounds i32, ptr [[TMP350]], i32 12 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP360:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 27 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP377:%.*]] = load i32, ptr [[TMP360]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP377]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 27), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP362:%.*]] = getelementptr inbounds i32, ptr [[TMP350]], i32 13 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP362:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 28 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP379:%.*]] = load i32, ptr [[TMP362]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP379]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 28), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP364:%.*]] = getelementptr inbounds i32, ptr [[TMP350]], i32 14 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP364:%.*]] = getelementptr inbounds i32, ptr [[TMP318]], i32 29 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP381:%.*]] = load i32, ptr [[TMP364]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP381]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 29), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP382:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP382]], !continuation.registercount [[META23]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP382]]), !continuation.registercount [[META23]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; %1 = alloca %struct.OuterPayload, align 8 %2 = getelementptr inbounds %struct.OuterPayload, %struct.OuterPayload* %1, i32 0, i32 0, i32 0 diff --git a/llvmraytracing/test/dx/payload.ll b/llvmraytracing/test/dx/payload.ll index 4bd2b8c855..a1e7a5b953 100644 --- a/llvmraytracing/test/dx/payload.ll +++ b/llvmraytracing/test/dx/payload.ll @@ -1,14 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3 -; RUN: grep -v SKIP_GLOBAL_ADDRSPACE %s | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,remove-types-metadata' -S 2> %t0.stderr | FileCheck -check-prefix=CLEANUP %s -; RUN: count 0 < %t0.stderr +; RUN: grep -v SKIP_GLOBAL_ADDRSPACE %s | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,remove-types-metadata' -S --lint-abort-on-error | FileCheck -check-prefix=CLEANUP %s ; RUN: grep -v SKIP_GLOBAL_ADDRSPACE %s | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' \ -; RUN: -S 2> %t1.stderr | FileCheck -check-prefix=POST-PROCESS %s -; RUN: count 0 < %t1.stderr +; RUN: -S --lint-abort-on-error | FileCheck -check-prefix=POST-PROCESS %s ; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' \ -; RUN: -S %s 2> %t2.stderr | FileCheck -check-prefix=POST-PROCESS-GLOBAL %s -; RUN: count 0 < %t2.stderr +; RUN: -S %s --lint-abort-on-error | FileCheck -check-prefix=POST-PROCESS-GLOBAL %s -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %dx.types.Handle = type { i8* } %struct.DispatchSystemData = type { <3 x i32> } @@ -31,7 +28,7 @@ declare i32 @_cont_GetContinuationStackAddr() #0 declare %struct.DispatchSystemData @_cont_SetupRayGen() #0 ; Function Attrs: alwaysinline -declare %struct.DispatchSystemData @_AmdAwaitTraversal(i64, %struct.TraversalData) #0 +declare %struct.DispatchSystemData @_AmdWaitAwaitTraversal(i64, i64, %struct.TraversalData) #0 ; Function Attrs: alwaysinline declare %struct.DispatchSystemData @_AmdAwaitShader(i64, %struct.DispatchSystemData) #0 @@ -39,6 +36,10 @@ declare %struct.DispatchSystemData @_AmdAwaitShader(i64, %struct.DispatchSystemD ; Function Attrs: alwaysinline declare %struct.AnyHitTraversalData 
@_AmdAwaitAnyHit(i64, %struct.AnyHitTraversalData, float, i32) #0 +define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !types !{!"function", !"void", !{i32 0, %struct.DispatchSystemData poison}} { + ret void +} + define %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData* %data) #0 !types !17 { %addr = getelementptr %struct.SystemData, %struct.SystemData* %data, i32 0, i32 1 %val = load %struct.BuiltInTriangleIntersectionAttributes, %struct.BuiltInTriangleIntersectionAttributes* %addr, align 4 @@ -60,6 +61,9 @@ declare !types !24 void @_AmdRestoreSystemData(%struct.DispatchSystemData*) #2 ; Function Attrs: nounwind memory(none) declare !types !26 void @_AmdRestoreSystemDataAnyHit(%struct.AnyHitTraversalData*) #2 +; Function Attrs: nounwind +declare i64 @_AmdGetResumePointAddr() #3 + ; Function Attrs: nounwind memory(none) declare !types !26 void @_cont_AcceptHit(%struct.AnyHitTraversalData* nocapture readnone) #2 @@ -75,7 +79,9 @@ define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64 %0, i32 %1, i %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data, align 4 %sys_data = insertvalue %struct.SystemData undef, %struct.DispatchSystemData %dis_data, 0 %trav_data = insertvalue %struct.TraversalData undef, %struct.SystemData %sys_data, 0 - %newdata = call %struct.DispatchSystemData @_AmdAwaitTraversal(i64 4, %struct.TraversalData %trav_data) + %addr = call i64 @_AmdGetResumePointAddr() #3 + %trav_data2 = insertvalue %struct.TraversalData %trav_data, i64 %addr, 5 + %newdata = call %struct.DispatchSystemData @_AmdWaitAwaitTraversal(i64 4, i64 -1, %struct.TraversalData %trav_data2) store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data, align 4 call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) ret void @@ -196,7 +202,7 @@ attributes #3 = { nounwind } ; ; ; CLEANUP-LABEL: define void @main( -; CLEANUP-SAME: [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !continuation.entry [[META19:![0-9]+]] !continuation.registercount [[META8]] !continuation [[META20:![0-9]+]] !continuation.stacksize [[META21:![0-9]+]] !continuation.state [[META8]] { +; CLEANUP-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !continuation.entry [[META19:![0-9]+]] !continuation.registercount [[META8]] !continuation [[META20:![0-9]+]] !continuation.stacksize [[META21:![0-9]+]] !continuation.state [[META8]] { ; CLEANUP-NEXT: AllocaSpillBB: ; CLEANUP-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.alloc(i32 108) ; CLEANUP-NEXT: [[PAYLOAD_SPILL_ALLOCA:%.*]] = getelementptr inbounds [[MAIN_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 @@ -210,6 +216,8 @@ attributes #3 = { nounwind } ; CLEANUP-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT]], 0 ; CLEANUP-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 ; CLEANUP-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 +; CLEANUP-NEXT: [[ADDR_I:%.*]] = call i64 @_AmdGetResumePointAddr() #[[ATTR3:[0-9]+]] +; CLEANUP-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[ADDR_I]], 5 ; CLEANUP-NEXT: [[TMP6:%.*]] = ptrtoint ptr addrspace(32) 
[[PAYLOAD_SPILL_ALLOCA]] to i32 ; CLEANUP-NEXT: store i32 [[TMP6]], ptr addrspace(20) @PAYLOAD, align 4 ; CLEANUP-NEXT: [[TMP7:%.*]] = load ptr addrspace(32), ptr addrspace(20) @PAYLOAD, align 4 @@ -289,16 +297,17 @@ attributes #3 = { nounwind } ; CLEANUP-NEXT: store i32 undef, ptr addrspace(32) [[TMP32]], align 4 ; CLEANUP-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP7]], i32 26 ; CLEANUP-NEXT: store i32 undef, ptr addrspace(32) [[TMP33]], align 4 -; CLEANUP-NEXT: call void (i64, ...) @continuation.continue(i64 4, i64 ptrtoint (ptr @main.resume.0 to i64), [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]]), !continuation.registercount [[META17:![0-9]+]], !continuation.returnedRegistercount !17 +; CLEANUP-NEXT: [[TMP34:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @main.resume.0) +; CLEANUP-NEXT: call void (i64, i64, ...) @continuation.waitContinue(i64 4, i64 -1, i64 [[TMP34]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]]), !continuation.registercount [[META17:![0-9]+]], !continuation.returnedRegistercount [[META17]] ; CLEANUP-NEXT: unreachable ; ; ; CLEANUP-LABEL: define dso_local void @main.resume.0( -; CLEANUP-SAME: [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.registercount [[META17]] !continuation [[META20]] { +; CLEANUP-SAME: i64 [[TMP0:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP1:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.registercount [[META17]] !continuation [[META20]] { ; CLEANUP-NEXT: entryresume.0: ; CLEANUP-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 108) ; CLEANUP-NEXT: [[PAYLOAD_SPILL_ALLOCA:%.*]] = getelementptr inbounds [[MAIN_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 -; CLEANUP-NEXT: [[TMP1:%.*]] = load ptr addrspace(32), ptr addrspace(20) @PAYLOAD, align 4 +; CLEANUP-NEXT: [[TMP2:%.*]] = load ptr addrspace(32), ptr addrspace(20) @PAYLOAD, align 4 ; CLEANUP-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 ; CLEANUP-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 ; CLEANUP-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 @@ -322,61 +331,61 @@ attributes #3 = { nounwind } ; CLEANUP-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 27), align 4 ; CLEANUP-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 28), align 4 ; CLEANUP-NEXT: [[TMP25:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 29), align 4 -; CLEANUP-NEXT: [[TMP78:%.*]] = load i32, ptr addrspace(32) [[TMP1]], align 4 -; CLEANUP-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 1 -; CLEANUP-NEXT: [[TMP27:%.*]] = load i32, ptr addrspace(32) [[TMP26]], align 4 -; CLEANUP-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 2 -; CLEANUP-NEXT: [[TMP29:%.*]] = load i32, ptr addrspace(32) [[TMP28]], align 4 -; CLEANUP-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 3 -; CLEANUP-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(32) [[TMP30]], align 4 -; CLEANUP-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 4 -; CLEANUP-NEXT: [[TMP33:%.*]] = 
load i32, ptr addrspace(32) [[TMP32]], align 4 -; CLEANUP-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 5 -; CLEANUP-NEXT: [[TMP35:%.*]] = load i32, ptr addrspace(32) [[TMP34]], align 4 -; CLEANUP-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 6 -; CLEANUP-NEXT: [[TMP37:%.*]] = load i32, ptr addrspace(32) [[TMP36]], align 4 -; CLEANUP-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 7 -; CLEANUP-NEXT: [[TMP39:%.*]] = load i32, ptr addrspace(32) [[TMP38]], align 4 -; CLEANUP-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 8 -; CLEANUP-NEXT: [[TMP41:%.*]] = load i32, ptr addrspace(32) [[TMP40]], align 4 -; CLEANUP-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 9 -; CLEANUP-NEXT: [[TMP43:%.*]] = load i32, ptr addrspace(32) [[TMP42]], align 4 -; CLEANUP-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 10 -; CLEANUP-NEXT: [[TMP45:%.*]] = load i32, ptr addrspace(32) [[TMP44]], align 4 -; CLEANUP-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 11 -; CLEANUP-NEXT: [[TMP47:%.*]] = load i32, ptr addrspace(32) [[TMP46]], align 4 -; CLEANUP-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 12 -; CLEANUP-NEXT: [[TMP49:%.*]] = load i32, ptr addrspace(32) [[TMP48]], align 4 -; CLEANUP-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 13 -; CLEANUP-NEXT: [[TMP51:%.*]] = load i32, ptr addrspace(32) [[TMP50]], align 4 -; CLEANUP-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 14 -; CLEANUP-NEXT: [[TMP53:%.*]] = load i32, ptr addrspace(32) [[TMP52]], align 4 -; CLEANUP-NEXT: [[TMP54:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 15 -; CLEANUP-NEXT: [[TMP55:%.*]] = load i32, ptr addrspace(32) [[TMP54]], align 4 -; CLEANUP-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 16 -; CLEANUP-NEXT: [[TMP57:%.*]] = load i32, ptr addrspace(32) [[TMP56]], align 4 -; CLEANUP-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 17 -; CLEANUP-NEXT: [[TMP59:%.*]] = load i32, ptr addrspace(32) [[TMP58]], align 4 -; CLEANUP-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 18 -; CLEANUP-NEXT: [[TMP61:%.*]] = load i32, ptr addrspace(32) [[TMP60]], align 4 -; CLEANUP-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 19 -; CLEANUP-NEXT: [[TMP63:%.*]] = load i32, ptr addrspace(32) [[TMP62]], align 4 -; CLEANUP-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 20 -; CLEANUP-NEXT: [[TMP65:%.*]] = load i32, ptr addrspace(32) [[TMP64]], align 4 -; CLEANUP-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 21 -; CLEANUP-NEXT: [[TMP67:%.*]] = load i32, ptr addrspace(32) [[TMP66]], align 4 -; CLEANUP-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 22 -; CLEANUP-NEXT: [[TMP69:%.*]] = load i32, ptr addrspace(32) [[TMP68]], align 4 -; CLEANUP-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 23 -; CLEANUP-NEXT: [[TMP71:%.*]] = load i32, ptr addrspace(32) [[TMP70]], align 4 -; CLEANUP-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 24 -; CLEANUP-NEXT: [[TMP73:%.*]] = load i32, ptr addrspace(32) [[TMP72]], 
align 4 -; CLEANUP-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 25 -; CLEANUP-NEXT: [[TMP75:%.*]] = load i32, ptr addrspace(32) [[TMP74]], align 4 -; CLEANUP-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 26 -; CLEANUP-NEXT: [[TMP77:%.*]] = load i32, ptr addrspace(32) [[TMP76]], align 4 +; CLEANUP-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(32) [[TMP2]], align 4 +; CLEANUP-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 1 +; CLEANUP-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(32) [[TMP27]], align 4 +; CLEANUP-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 2 +; CLEANUP-NEXT: [[TMP30:%.*]] = load i32, ptr addrspace(32) [[TMP29]], align 4 +; CLEANUP-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 3 +; CLEANUP-NEXT: [[TMP32:%.*]] = load i32, ptr addrspace(32) [[TMP31]], align 4 +; CLEANUP-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 4 +; CLEANUP-NEXT: [[TMP34:%.*]] = load i32, ptr addrspace(32) [[TMP33]], align 4 +; CLEANUP-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 5 +; CLEANUP-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(32) [[TMP35]], align 4 +; CLEANUP-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 6 +; CLEANUP-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(32) [[TMP37]], align 4 +; CLEANUP-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 7 +; CLEANUP-NEXT: [[TMP40:%.*]] = load i32, ptr addrspace(32) [[TMP39]], align 4 +; CLEANUP-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 8 +; CLEANUP-NEXT: [[TMP42:%.*]] = load i32, ptr addrspace(32) [[TMP41]], align 4 +; CLEANUP-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 9 +; CLEANUP-NEXT: [[TMP44:%.*]] = load i32, ptr addrspace(32) [[TMP43]], align 4 +; CLEANUP-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 10 +; CLEANUP-NEXT: [[TMP46:%.*]] = load i32, ptr addrspace(32) [[TMP45]], align 4 +; CLEANUP-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 11 +; CLEANUP-NEXT: [[TMP48:%.*]] = load i32, ptr addrspace(32) [[TMP47]], align 4 +; CLEANUP-NEXT: [[TMP49:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 12 +; CLEANUP-NEXT: [[TMP50:%.*]] = load i32, ptr addrspace(32) [[TMP49]], align 4 +; CLEANUP-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 13 +; CLEANUP-NEXT: [[TMP52:%.*]] = load i32, ptr addrspace(32) [[TMP51]], align 4 +; CLEANUP-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 14 +; CLEANUP-NEXT: [[TMP54:%.*]] = load i32, ptr addrspace(32) [[TMP53]], align 4 +; CLEANUP-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 15 +; CLEANUP-NEXT: [[TMP56:%.*]] = load i32, ptr addrspace(32) [[TMP55]], align 4 +; CLEANUP-NEXT: [[TMP57:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 16 +; CLEANUP-NEXT: [[TMP58:%.*]] = load i32, ptr addrspace(32) [[TMP57]], align 4 +; CLEANUP-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 17 +; CLEANUP-NEXT: [[TMP60:%.*]] = load i32, ptr addrspace(32) [[TMP59]], align 4 +; CLEANUP-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 18 +; CLEANUP-NEXT: [[TMP62:%.*]] = 
load i32, ptr addrspace(32) [[TMP61]], align 4 +; CLEANUP-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 19 +; CLEANUP-NEXT: [[TMP64:%.*]] = load i32, ptr addrspace(32) [[TMP63]], align 4 +; CLEANUP-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 20 +; CLEANUP-NEXT: [[TMP66:%.*]] = load i32, ptr addrspace(32) [[TMP65]], align 4 +; CLEANUP-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 21 +; CLEANUP-NEXT: [[TMP68:%.*]] = load i32, ptr addrspace(32) [[TMP67]], align 4 +; CLEANUP-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 22 +; CLEANUP-NEXT: [[TMP70:%.*]] = load i32, ptr addrspace(32) [[TMP69]], align 4 +; CLEANUP-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 23 +; CLEANUP-NEXT: [[TMP72:%.*]] = load i32, ptr addrspace(32) [[TMP71]], align 4 +; CLEANUP-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 24 +; CLEANUP-NEXT: [[TMP74:%.*]] = load i32, ptr addrspace(32) [[TMP73]], align 4 +; CLEANUP-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 25 +; CLEANUP-NEXT: [[TMP76:%.*]] = load i32, ptr addrspace(32) [[TMP75]], align 4 +; CLEANUP-NEXT: [[TMP77:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 26 +; CLEANUP-NEXT: [[TMP78:%.*]] = load i32, ptr addrspace(32) [[TMP77]], align 4 ; CLEANUP-NEXT: [[TMP80:%.*]] = load ptr addrspace(32), ptr addrspace(20) @PAYLOAD, align 4 -; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT1:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 +; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT1:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], 0 ; CLEANUP-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; CLEANUP-NEXT: call void @lgc.cps.free(i32 108) ; CLEANUP-NEXT: ret void @@ -385,7 +394,7 @@ attributes #3 = { nounwind } ; ; ; CLEANUP-LABEL: define void @AnyHit( -; CLEANUP-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation.registercount [[META17]] !continuation [[META23:![0-9]+]] !continuation.state [[META8]] { +; CLEANUP-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_ANYHITTRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META22:![0-9]+]] !continuation.registercount [[META17]] !continuation [[META23:![0-9]+]] !continuation.state [[META8]] { ; CLEANUP-NEXT: AllocaSpillBB: ; CLEANUP-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_ANYHITTRAVERSALDATA]], align 8 ; CLEANUP-NEXT: [[DOTFCA_0_0_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[TMP0]], 0, 0, 0, 0 @@ -627,7 +636,7 @@ attributes #3 = { nounwind } ; CLEANUP-NEXT: [[DOTFCA_1_1_GEP10:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 1 ; CLEANUP-NEXT: [[DOTFCA_1_1_LOAD:%.*]] = load i32, ptr [[DOTFCA_1_1_GEP10]], align 4 ; CLEANUP-NEXT: [[DOTFCA_1_1_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_0_INSERT]], i32 [[DOTFCA_1_1_LOAD]], 1, 1 -; CLEANUP-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_1_INSERT]]), !continuation.registercount [[META17]] +; CLEANUP-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[RETURNADDR]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_1_INSERT]]), !continuation.registercount [[META17]] ; CLEANUP-NEXT: unreachable ; ; @@ -735,6 +744,8 @@ attributes #3 = { nounwind } ; CLEANUP-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison, <3 x i32> [[DOTFCA_0_0_EXTRACT]], 0 ; CLEANUP-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 ; CLEANUP-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 +; CLEANUP-NEXT: [[ADDR_I1:%.*]] = call i64 @_AmdGetResumePointAddr() #[[ATTR3]] +; CLEANUP-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[ADDR_I1]], 5 ; CLEANUP-NEXT: [[TMP88:%.*]] = ptrtoint ptr addrspace(32) [[PAYLOAD_SPILL_ALLOCA]] to i32 ; CLEANUP-NEXT: store i32 [[TMP88]], ptr addrspace(20) @PAYLOAD, align 4 ; CLEANUP-NEXT: [[TMP89:%.*]] = load ptr addrspace(32), ptr addrspace(20) @PAYLOAD, align 4 @@ -814,16 +825,17 @@ attributes #3 = { nounwind } ; CLEANUP-NEXT: store i32 [[TMP75]], ptr addrspace(32) [[TMP112]], align 4 ; CLEANUP-NEXT: [[TMP113:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP89]], i32 26 ; CLEANUP-NEXT: store i32 [[TMP77]], ptr addrspace(32) [[TMP113]], align 4 -; CLEANUP-NEXT: call void (i64, ...) @continuation.continue(i64 4, i64 ptrtoint (ptr @ClosestHit.resume.0 to i64), [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]]), !continuation.registercount [[META17]], !continuation.returnedRegistercount !17 +; CLEANUP-NEXT: [[TMP116:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @ClosestHit.resume.0) +; CLEANUP-NEXT: call void (i64, i64, ...) 
@continuation.waitContinue(i64 4, i64 -1, i64 [[TMP116]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]]), !continuation.registercount [[META17]], !continuation.returnedRegistercount [[META17]] ; CLEANUP-NEXT: unreachable ; ; ; CLEANUP-LABEL: define dso_local void @ClosestHit.resume.0( -; CLEANUP-SAME: [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META24]] !continuation.registercount [[META17]] !continuation [[META25]] { +; CLEANUP-SAME: i64 [[TMP0:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP1:%.*]]) !lgc.rt.shaderstage [[META24]] !continuation.registercount [[META17]] !continuation [[META25]] { ; CLEANUP-NEXT: entryresume.0: ; CLEANUP-NEXT: [[CONT_STATE_STACK_SEGMENT:%.*]] = call ptr addrspace(32) @lgc.cps.peek(i32 120) ; CLEANUP-NEXT: [[PAYLOAD_SPILL_ALLOCA:%.*]] = getelementptr inbounds [[CLOSESTHIT_FRAME:%.*]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 0 -; CLEANUP-NEXT: [[TMP1:%.*]] = load ptr addrspace(32), ptr addrspace(20) @PAYLOAD, align 4 +; CLEANUP-NEXT: [[TMP2:%.*]] = load ptr addrspace(32), ptr addrspace(20) @PAYLOAD, align 4 ; CLEANUP-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 ; CLEANUP-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 ; CLEANUP-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 @@ -847,61 +859,61 @@ attributes #3 = { nounwind } ; CLEANUP-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 27), align 4 ; CLEANUP-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 28), align 4 ; CLEANUP-NEXT: [[TMP25:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 29), align 4 -; CLEANUP-NEXT: [[TMP78:%.*]] = load i32, ptr addrspace(32) [[TMP1]], align 4 -; CLEANUP-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 1 -; CLEANUP-NEXT: [[TMP27:%.*]] = load i32, ptr addrspace(32) [[TMP26]], align 4 -; CLEANUP-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 2 -; CLEANUP-NEXT: [[TMP29:%.*]] = load i32, ptr addrspace(32) [[TMP28]], align 4 -; CLEANUP-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 3 -; CLEANUP-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(32) [[TMP30]], align 4 -; CLEANUP-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 4 -; CLEANUP-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(32) [[TMP32]], align 4 -; CLEANUP-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 5 -; CLEANUP-NEXT: [[TMP35:%.*]] = load i32, ptr addrspace(32) [[TMP34]], align 4 -; CLEANUP-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 6 -; CLEANUP-NEXT: [[TMP37:%.*]] = load i32, ptr addrspace(32) [[TMP36]], align 4 -; CLEANUP-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 7 -; CLEANUP-NEXT: [[TMP39:%.*]] = load i32, ptr addrspace(32) [[TMP38]], align 4 -; CLEANUP-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 8 -; CLEANUP-NEXT: [[TMP41:%.*]] = load i32, ptr addrspace(32) [[TMP40]], align 4 -; CLEANUP-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 9 -; CLEANUP-NEXT: 
[[TMP43:%.*]] = load i32, ptr addrspace(32) [[TMP42]], align 4 -; CLEANUP-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 10 -; CLEANUP-NEXT: [[TMP45:%.*]] = load i32, ptr addrspace(32) [[TMP44]], align 4 -; CLEANUP-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 11 -; CLEANUP-NEXT: [[TMP47:%.*]] = load i32, ptr addrspace(32) [[TMP46]], align 4 -; CLEANUP-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 12 -; CLEANUP-NEXT: [[TMP49:%.*]] = load i32, ptr addrspace(32) [[TMP48]], align 4 -; CLEANUP-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 13 -; CLEANUP-NEXT: [[TMP51:%.*]] = load i32, ptr addrspace(32) [[TMP50]], align 4 -; CLEANUP-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 14 -; CLEANUP-NEXT: [[TMP53:%.*]] = load i32, ptr addrspace(32) [[TMP52]], align 4 -; CLEANUP-NEXT: [[TMP54:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 15 -; CLEANUP-NEXT: [[TMP55:%.*]] = load i32, ptr addrspace(32) [[TMP54]], align 4 -; CLEANUP-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 16 -; CLEANUP-NEXT: [[TMP57:%.*]] = load i32, ptr addrspace(32) [[TMP56]], align 4 -; CLEANUP-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 17 -; CLEANUP-NEXT: [[TMP59:%.*]] = load i32, ptr addrspace(32) [[TMP58]], align 4 -; CLEANUP-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 18 -; CLEANUP-NEXT: [[TMP61:%.*]] = load i32, ptr addrspace(32) [[TMP60]], align 4 -; CLEANUP-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 19 -; CLEANUP-NEXT: [[TMP63:%.*]] = load i32, ptr addrspace(32) [[TMP62]], align 4 -; CLEANUP-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 20 -; CLEANUP-NEXT: [[TMP65:%.*]] = load i32, ptr addrspace(32) [[TMP64]], align 4 -; CLEANUP-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 21 -; CLEANUP-NEXT: [[TMP67:%.*]] = load i32, ptr addrspace(32) [[TMP66]], align 4 -; CLEANUP-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 22 -; CLEANUP-NEXT: [[TMP69:%.*]] = load i32, ptr addrspace(32) [[TMP68]], align 4 -; CLEANUP-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 23 -; CLEANUP-NEXT: [[TMP71:%.*]] = load i32, ptr addrspace(32) [[TMP70]], align 4 -; CLEANUP-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 24 -; CLEANUP-NEXT: [[TMP73:%.*]] = load i32, ptr addrspace(32) [[TMP72]], align 4 -; CLEANUP-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 25 -; CLEANUP-NEXT: [[TMP75:%.*]] = load i32, ptr addrspace(32) [[TMP74]], align 4 -; CLEANUP-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP1]], i32 26 -; CLEANUP-NEXT: [[TMP77:%.*]] = load i32, ptr addrspace(32) [[TMP76]], align 4 +; CLEANUP-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(32) [[TMP2]], align 4 +; CLEANUP-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 1 +; CLEANUP-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(32) [[TMP27]], align 4 +; CLEANUP-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 2 +; CLEANUP-NEXT: [[TMP30:%.*]] = load i32, ptr addrspace(32) [[TMP29]], align 4 +; CLEANUP-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr 
addrspace(32) [[TMP2]], i32 3 +; CLEANUP-NEXT: [[TMP32:%.*]] = load i32, ptr addrspace(32) [[TMP31]], align 4 +; CLEANUP-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 4 +; CLEANUP-NEXT: [[TMP34:%.*]] = load i32, ptr addrspace(32) [[TMP33]], align 4 +; CLEANUP-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 5 +; CLEANUP-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(32) [[TMP35]], align 4 +; CLEANUP-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 6 +; CLEANUP-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(32) [[TMP37]], align 4 +; CLEANUP-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 7 +; CLEANUP-NEXT: [[TMP40:%.*]] = load i32, ptr addrspace(32) [[TMP39]], align 4 +; CLEANUP-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 8 +; CLEANUP-NEXT: [[TMP42:%.*]] = load i32, ptr addrspace(32) [[TMP41]], align 4 +; CLEANUP-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 9 +; CLEANUP-NEXT: [[TMP44:%.*]] = load i32, ptr addrspace(32) [[TMP43]], align 4 +; CLEANUP-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 10 +; CLEANUP-NEXT: [[TMP46:%.*]] = load i32, ptr addrspace(32) [[TMP45]], align 4 +; CLEANUP-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 11 +; CLEANUP-NEXT: [[TMP48:%.*]] = load i32, ptr addrspace(32) [[TMP47]], align 4 +; CLEANUP-NEXT: [[TMP49:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 12 +; CLEANUP-NEXT: [[TMP50:%.*]] = load i32, ptr addrspace(32) [[TMP49]], align 4 +; CLEANUP-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 13 +; CLEANUP-NEXT: [[TMP52:%.*]] = load i32, ptr addrspace(32) [[TMP51]], align 4 +; CLEANUP-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 14 +; CLEANUP-NEXT: [[TMP54:%.*]] = load i32, ptr addrspace(32) [[TMP53]], align 4 +; CLEANUP-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 15 +; CLEANUP-NEXT: [[TMP56:%.*]] = load i32, ptr addrspace(32) [[TMP55]], align 4 +; CLEANUP-NEXT: [[TMP57:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 16 +; CLEANUP-NEXT: [[TMP58:%.*]] = load i32, ptr addrspace(32) [[TMP57]], align 4 +; CLEANUP-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 17 +; CLEANUP-NEXT: [[TMP60:%.*]] = load i32, ptr addrspace(32) [[TMP59]], align 4 +; CLEANUP-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 18 +; CLEANUP-NEXT: [[TMP62:%.*]] = load i32, ptr addrspace(32) [[TMP61]], align 4 +; CLEANUP-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 19 +; CLEANUP-NEXT: [[TMP64:%.*]] = load i32, ptr addrspace(32) [[TMP63]], align 4 +; CLEANUP-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 20 +; CLEANUP-NEXT: [[TMP66:%.*]] = load i32, ptr addrspace(32) [[TMP65]], align 4 +; CLEANUP-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 21 +; CLEANUP-NEXT: [[TMP68:%.*]] = load i32, ptr addrspace(32) [[TMP67]], align 4 +; CLEANUP-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 22 +; CLEANUP-NEXT: [[TMP70:%.*]] = load i32, ptr addrspace(32) [[TMP69]], align 4 +; CLEANUP-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 23 +; 
CLEANUP-NEXT: [[TMP72:%.*]] = load i32, ptr addrspace(32) [[TMP71]], align 4 +; CLEANUP-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 24 +; CLEANUP-NEXT: [[TMP74:%.*]] = load i32, ptr addrspace(32) [[TMP73]], align 4 +; CLEANUP-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 25 +; CLEANUP-NEXT: [[TMP76:%.*]] = load i32, ptr addrspace(32) [[TMP75]], align 4 +; CLEANUP-NEXT: [[TMP77:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP2]], i32 26 +; CLEANUP-NEXT: [[TMP78:%.*]] = load i32, ptr addrspace(32) [[TMP77]], align 4 ; CLEANUP-NEXT: [[TMP80:%.*]] = load ptr addrspace(32), ptr addrspace(20) @PAYLOAD, align 4 -; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 +; CLEANUP-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], 0 ; CLEANUP-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; CLEANUP-NEXT: [[DOTRELOAD_ADDR:%.*]] = getelementptr inbounds [[CLOSESTHIT_FRAME]], ptr addrspace(32) [[CONT_STATE_STACK_SEGMENT]], i32 0, i32 2 ; CLEANUP-NEXT: [[DOTRELOAD:%.*]] = load i32, ptr addrspace(32) [[DOTRELOAD_ADDR]], align 4 @@ -932,62 +944,62 @@ attributes #3 = { nounwind } ; CLEANUP-NEXT: store i32 [[TMP23]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 27), align 4 ; CLEANUP-NEXT: store i32 [[TMP24]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 28), align 4 ; CLEANUP-NEXT: store i32 [[TMP25]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 29), align 4 -; CLEANUP-NEXT: store i32 [[TMP78]], ptr addrspace(32) [[TMP81]], align 4 +; CLEANUP-NEXT: store i32 [[TMP26]], ptr addrspace(32) [[TMP81]], align 4 ; CLEANUP-NEXT: [[TMP106:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 1 -; CLEANUP-NEXT: store i32 [[TMP27]], ptr addrspace(32) [[TMP106]], align 4 +; CLEANUP-NEXT: store i32 [[TMP28]], ptr addrspace(32) [[TMP106]], align 4 ; CLEANUP-NEXT: [[TMP107:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 2 -; CLEANUP-NEXT: store i32 [[TMP29]], ptr addrspace(32) [[TMP107]], align 4 +; CLEANUP-NEXT: store i32 [[TMP30]], ptr addrspace(32) [[TMP107]], align 4 ; CLEANUP-NEXT: [[TMP82:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 3 -; CLEANUP-NEXT: store i32 [[TMP31]], ptr addrspace(32) [[TMP82]], align 4 +; CLEANUP-NEXT: store i32 [[TMP32]], ptr addrspace(32) [[TMP82]], align 4 ; CLEANUP-NEXT: [[TMP83:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 4 -; CLEANUP-NEXT: store i32 [[TMP33]], ptr addrspace(32) [[TMP83]], align 4 +; CLEANUP-NEXT: store i32 [[TMP34]], ptr addrspace(32) [[TMP83]], align 4 ; CLEANUP-NEXT: [[TMP84:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 5 -; CLEANUP-NEXT: store i32 [[TMP35]], ptr addrspace(32) [[TMP84]], align 4 +; CLEANUP-NEXT: store i32 [[TMP36]], ptr addrspace(32) [[TMP84]], align 4 ; CLEANUP-NEXT: [[TMP85:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 6 -; CLEANUP-NEXT: store i32 [[TMP37]], ptr addrspace(32) [[TMP85]], align 4 +; CLEANUP-NEXT: store i32 [[TMP38]], ptr addrspace(32) [[TMP85]], align 4 ; CLEANUP-NEXT: [[TMP86:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 7 -; CLEANUP-NEXT: store i32 [[TMP39]], ptr addrspace(32) [[TMP86]], align 4 +; CLEANUP-NEXT: store i32 [[TMP40]], ptr addrspace(32) [[TMP86]], align 4 ; CLEANUP-NEXT: [[TMP87:%.*]] = getelementptr 
inbounds i32, ptr addrspace(32) [[TMP81]], i32 8 -; CLEANUP-NEXT: store i32 [[TMP41]], ptr addrspace(32) [[TMP87]], align 4 +; CLEANUP-NEXT: store i32 [[TMP42]], ptr addrspace(32) [[TMP87]], align 4 ; CLEANUP-NEXT: [[TMP88:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 9 -; CLEANUP-NEXT: store i32 [[TMP43]], ptr addrspace(32) [[TMP88]], align 4 +; CLEANUP-NEXT: store i32 [[TMP44]], ptr addrspace(32) [[TMP88]], align 4 ; CLEANUP-NEXT: [[TMP89:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 10 -; CLEANUP-NEXT: store i32 [[TMP45]], ptr addrspace(32) [[TMP89]], align 4 +; CLEANUP-NEXT: store i32 [[TMP46]], ptr addrspace(32) [[TMP89]], align 4 ; CLEANUP-NEXT: [[TMP90:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 11 -; CLEANUP-NEXT: store i32 [[TMP47]], ptr addrspace(32) [[TMP90]], align 4 +; CLEANUP-NEXT: store i32 [[TMP48]], ptr addrspace(32) [[TMP90]], align 4 ; CLEANUP-NEXT: [[TMP91:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 12 -; CLEANUP-NEXT: store i32 [[TMP49]], ptr addrspace(32) [[TMP91]], align 4 +; CLEANUP-NEXT: store i32 [[TMP50]], ptr addrspace(32) [[TMP91]], align 4 ; CLEANUP-NEXT: [[TMP92:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 13 -; CLEANUP-NEXT: store i32 [[TMP51]], ptr addrspace(32) [[TMP92]], align 4 +; CLEANUP-NEXT: store i32 [[TMP52]], ptr addrspace(32) [[TMP92]], align 4 ; CLEANUP-NEXT: [[TMP93:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 14 -; CLEANUP-NEXT: store i32 [[TMP53]], ptr addrspace(32) [[TMP93]], align 4 +; CLEANUP-NEXT: store i32 [[TMP54]], ptr addrspace(32) [[TMP93]], align 4 ; CLEANUP-NEXT: [[TMP94:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 15 -; CLEANUP-NEXT: store i32 [[TMP55]], ptr addrspace(32) [[TMP94]], align 4 +; CLEANUP-NEXT: store i32 [[TMP56]], ptr addrspace(32) [[TMP94]], align 4 ; CLEANUP-NEXT: [[TMP95:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 16 -; CLEANUP-NEXT: store i32 [[TMP57]], ptr addrspace(32) [[TMP95]], align 4 +; CLEANUP-NEXT: store i32 [[TMP58]], ptr addrspace(32) [[TMP95]], align 4 ; CLEANUP-NEXT: [[TMP96:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 17 -; CLEANUP-NEXT: store i32 [[TMP59]], ptr addrspace(32) [[TMP96]], align 4 +; CLEANUP-NEXT: store i32 [[TMP60]], ptr addrspace(32) [[TMP96]], align 4 ; CLEANUP-NEXT: [[TMP97:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 18 -; CLEANUP-NEXT: store i32 [[TMP61]], ptr addrspace(32) [[TMP97]], align 4 +; CLEANUP-NEXT: store i32 [[TMP62]], ptr addrspace(32) [[TMP97]], align 4 ; CLEANUP-NEXT: [[TMP98:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 19 -; CLEANUP-NEXT: store i32 [[TMP63]], ptr addrspace(32) [[TMP98]], align 4 +; CLEANUP-NEXT: store i32 [[TMP64]], ptr addrspace(32) [[TMP98]], align 4 ; CLEANUP-NEXT: [[TMP99:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 20 -; CLEANUP-NEXT: store i32 [[TMP65]], ptr addrspace(32) [[TMP99]], align 4 +; CLEANUP-NEXT: store i32 [[TMP66]], ptr addrspace(32) [[TMP99]], align 4 ; CLEANUP-NEXT: [[TMP100:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 21 -; CLEANUP-NEXT: store i32 [[TMP67]], ptr addrspace(32) [[TMP100]], align 4 +; CLEANUP-NEXT: store i32 [[TMP68]], ptr addrspace(32) [[TMP100]], align 4 ; CLEANUP-NEXT: [[TMP101:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 22 -; CLEANUP-NEXT: store i32 [[TMP69]], ptr addrspace(32) 
[[TMP101]], align 4 +; CLEANUP-NEXT: store i32 [[TMP70]], ptr addrspace(32) [[TMP101]], align 4 ; CLEANUP-NEXT: [[TMP102:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 23 -; CLEANUP-NEXT: store i32 [[TMP71]], ptr addrspace(32) [[TMP102]], align 4 +; CLEANUP-NEXT: store i32 [[TMP72]], ptr addrspace(32) [[TMP102]], align 4 ; CLEANUP-NEXT: [[TMP103:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 24 -; CLEANUP-NEXT: store i32 [[TMP73]], ptr addrspace(32) [[TMP103]], align 4 +; CLEANUP-NEXT: store i32 [[TMP74]], ptr addrspace(32) [[TMP103]], align 4 ; CLEANUP-NEXT: [[TMP104:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 25 -; CLEANUP-NEXT: store i32 [[TMP75]], ptr addrspace(32) [[TMP104]], align 4 +; CLEANUP-NEXT: store i32 [[TMP76]], ptr addrspace(32) [[TMP104]], align 4 ; CLEANUP-NEXT: [[TMP105:%.*]] = getelementptr inbounds i32, ptr addrspace(32) [[TMP81]], i32 26 -; CLEANUP-NEXT: store i32 [[TMP77]], ptr addrspace(32) [[TMP105]], align 4 +; CLEANUP-NEXT: store i32 [[TMP78]], ptr addrspace(32) [[TMP105]], align 4 ; CLEANUP-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT]], 0 ; CLEANUP-NEXT: call void @lgc.cps.free(i32 120) -; CLEANUP-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META17]] +; CLEANUP-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META17]] ; CLEANUP-NEXT: unreachable ; ; @@ -1004,16 +1016,14 @@ attributes #3 = { nounwind } ; ; ; POST-PROCESS-LABEL: define void @main( -; POST-PROCESS-SAME: ) !lgc.rt.shaderstage [[META8:![0-9]+]] !continuation.entry [[META19:![0-9]+]] !continuation.registercount [[META8]] !continuation [[META20:![0-9]+]] !continuation.stacksize [[META21:![0-9]+]] !continuation.state [[META8]] { +; POST-PROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !continuation.entry [[META19:![0-9]+]] !continuation.registercount [[META8]] !continuation [[META20:![0-9]+]] !continuation.stacksize [[META21:![0-9]+]] !continuation.state [[META8]] { ; POST-PROCESS-NEXT: AllocaSpillBB: ; POST-PROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; POST-PROCESS-NEXT: [[TMP0:%.*]] = call i32 @_cont_GetContinuationStackAddr() -; POST-PROCESS-NEXT: store i32 [[TMP0]], ptr [[CSP]], align 4 -; POST-PROCESS-NEXT: [[SYSTEM_DATA:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[_CONT_SETUPRAYGEN:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]() +; POST-PROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 ; POST-PROCESS-NEXT: [[TMP1:%.*]] = load i32, ptr [[CSP]], align 4 ; POST-PROCESS-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 108 ; POST-PROCESS-NEXT: store i32 [[TMP2]], ptr [[CSP]], align 4 -; POST-PROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[SYSTEM_DATA]], 0 +; POST-PROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 ; POST-PROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; POST-PROCESS-NEXT: [[TMP3:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 ; POST-PROCESS-NEXT: [[TMP4:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 @@ 
-1023,6 +1033,8 @@ attributes #3 = { nounwind } ; POST-PROCESS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT]], 0 ; POST-PROCESS-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 ; POST-PROCESS-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 +; POST-PROCESS-NEXT: [[TMP9:%.*]] = call i64 @continuation.getAddrAndMD(ptr @main.resume.0) +; POST-PROCESS-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[TMP9]], 5 ; POST-PROCESS-NEXT: store i32 [[TMP1]], ptr addrspace(20) @REGISTERS, align 4 ; POST-PROCESS-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(20) @REGISTERS, align 4 ; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 7), align 4 @@ -1048,126 +1060,125 @@ attributes #3 = { nounwind } ; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 27), align 4 ; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 28), align 4 ; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 29), align 4 -; POST-PROCESS-NEXT: [[TMP9:%.*]] = inttoptr i32 [[TMP8]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP9]], i32 0 -; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP10]], align 4 +; POST-PROCESS-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP8]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP10]], i32 0 +; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP12]], align 4 ; POST-PROCESS-NEXT: [[TMP11:%.*]] = add i32 [[TMP8]], 4 -; POST-PROCESS-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP11]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP12]], i32 0 -; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP13]], align 4 +; POST-PROCESS-NEXT: [[TMP13:%.*]] = inttoptr i32 [[TMP11]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP13]], i32 0 +; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP15]], align 4 ; POST-PROCESS-NEXT: [[TMP14:%.*]] = add i32 [[TMP8]], 8 -; POST-PROCESS-NEXT: [[TMP15:%.*]] = inttoptr i32 [[TMP14]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP15]], i32 0 -; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP16]], align 4 +; POST-PROCESS-NEXT: [[TMP16:%.*]] = inttoptr i32 [[TMP14]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP16]], i32 0 +; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP18]], align 4 ; POST-PROCESS-NEXT: [[TMP17:%.*]] = add i32 [[TMP8]], 12 -; POST-PROCESS-NEXT: [[TMP18:%.*]] = inttoptr i32 [[TMP17]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP18]], i32 0 -; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP19]], align 4 +; POST-PROCESS-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP17]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP19]], i32 0 +; POST-PROCESS-NEXT: store i32 undef, ptr 
addrspace(21) [[TMP21]], align 4 ; POST-PROCESS-NEXT: [[TMP20:%.*]] = add i32 [[TMP8]], 16 -; POST-PROCESS-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP20]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP21]], i32 0 -; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP22]], align 4 +; POST-PROCESS-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP20]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP22]], i32 0 +; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP24]], align 4 ; POST-PROCESS-NEXT: [[TMP23:%.*]] = add i32 [[TMP8]], 20 -; POST-PROCESS-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP23]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP24]], i32 0 -; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP25]], align 4 +; POST-PROCESS-NEXT: [[TMP25:%.*]] = inttoptr i32 [[TMP23]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP25]], i32 0 +; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP27]], align 4 ; POST-PROCESS-NEXT: [[TMP26:%.*]] = add i32 [[TMP8]], 24 -; POST-PROCESS-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP26]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP27]], i32 0 -; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP28]], align 4 +; POST-PROCESS-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP26]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP28]], i32 0 +; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP30]], align 4 ; POST-PROCESS-NEXT: [[TMP29:%.*]] = add i32 [[TMP8]], 28 -; POST-PROCESS-NEXT: [[TMP30:%.*]] = inttoptr i32 [[TMP29]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP30]], i32 0 -; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP31]], align 4 +; POST-PROCESS-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP29]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP31]], i32 0 +; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP33]], align 4 ; POST-PROCESS-NEXT: [[TMP32:%.*]] = add i32 [[TMP8]], 32 -; POST-PROCESS-NEXT: [[TMP33:%.*]] = inttoptr i32 [[TMP32]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP33]], i32 0 -; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP34]], align 4 +; POST-PROCESS-NEXT: [[TMP34:%.*]] = inttoptr i32 [[TMP32]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP34]], i32 0 +; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP36]], align 4 ; POST-PROCESS-NEXT: [[TMP35:%.*]] = add i32 [[TMP8]], 36 -; POST-PROCESS-NEXT: [[TMP36:%.*]] = inttoptr i32 [[TMP35]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP36]], i32 0 -; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP37]], align 4 +; POST-PROCESS-NEXT: [[TMP37:%.*]] = inttoptr i32 [[TMP35]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP37]], i32 0 +; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP39]], align 4 ; POST-PROCESS-NEXT: [[TMP38:%.*]] = add i32 [[TMP8]], 40 -; POST-PROCESS-NEXT: [[TMP39:%.*]] = inttoptr i32 [[TMP38]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP40:%.*]] = 
getelementptr i8, ptr addrspace(21) [[TMP39]], i32 0 -; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP40]], align 4 +; POST-PROCESS-NEXT: [[TMP40:%.*]] = inttoptr i32 [[TMP38]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP40]], i32 0 +; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP42]], align 4 ; POST-PROCESS-NEXT: [[TMP41:%.*]] = add i32 [[TMP8]], 44 -; POST-PROCESS-NEXT: [[TMP42:%.*]] = inttoptr i32 [[TMP41]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP42]], i32 0 -; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP43]], align 4 +; POST-PROCESS-NEXT: [[TMP43:%.*]] = inttoptr i32 [[TMP41]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP43]], i32 0 +; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP45]], align 4 ; POST-PROCESS-NEXT: [[TMP44:%.*]] = add i32 [[TMP8]], 48 -; POST-PROCESS-NEXT: [[TMP45:%.*]] = inttoptr i32 [[TMP44]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP45]], i32 0 -; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP46]], align 4 +; POST-PROCESS-NEXT: [[TMP46:%.*]] = inttoptr i32 [[TMP44]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP46]], i32 0 +; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP48]], align 4 ; POST-PROCESS-NEXT: [[TMP47:%.*]] = add i32 [[TMP8]], 52 -; POST-PROCESS-NEXT: [[TMP48:%.*]] = inttoptr i32 [[TMP47]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP48]], i32 0 -; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP49]], align 4 +; POST-PROCESS-NEXT: [[TMP49:%.*]] = inttoptr i32 [[TMP47]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP49]], i32 0 +; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP51]], align 4 ; POST-PROCESS-NEXT: [[TMP50:%.*]] = add i32 [[TMP8]], 56 -; POST-PROCESS-NEXT: [[TMP51:%.*]] = inttoptr i32 [[TMP50]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP51]], i32 0 -; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP52]], align 4 +; POST-PROCESS-NEXT: [[TMP52:%.*]] = inttoptr i32 [[TMP50]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP52]], i32 0 +; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP54]], align 4 ; POST-PROCESS-NEXT: [[TMP53:%.*]] = add i32 [[TMP8]], 60 -; POST-PROCESS-NEXT: [[TMP54:%.*]] = inttoptr i32 [[TMP53]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP55:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP54]], i32 0 -; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP55]], align 4 +; POST-PROCESS-NEXT: [[TMP55:%.*]] = inttoptr i32 [[TMP53]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP55]], i32 0 +; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP57]], align 4 ; POST-PROCESS-NEXT: [[TMP56:%.*]] = add i32 [[TMP8]], 64 -; POST-PROCESS-NEXT: [[TMP57:%.*]] = inttoptr i32 [[TMP56]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP57]], i32 0 -; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP58]], align 4 +; POST-PROCESS-NEXT: [[TMP58:%.*]] = inttoptr i32 [[TMP56]] to ptr addrspace(21) +; 
POST-PROCESS-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP58]], i32 0 +; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP60]], align 4 ; POST-PROCESS-NEXT: [[TMP59:%.*]] = add i32 [[TMP8]], 68 -; POST-PROCESS-NEXT: [[TMP60:%.*]] = inttoptr i32 [[TMP59]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP60]], i32 0 -; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP61]], align 4 +; POST-PROCESS-NEXT: [[TMP61:%.*]] = inttoptr i32 [[TMP59]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP63:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP61]], i32 0 +; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP63]], align 4 ; POST-PROCESS-NEXT: [[TMP62:%.*]] = add i32 [[TMP8]], 72 -; POST-PROCESS-NEXT: [[TMP63:%.*]] = inttoptr i32 [[TMP62]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP63]], i32 0 -; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP64]], align 4 +; POST-PROCESS-NEXT: [[TMP64:%.*]] = inttoptr i32 [[TMP62]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP66:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP64]], i32 0 +; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP66]], align 4 ; POST-PROCESS-NEXT: [[TMP65:%.*]] = add i32 [[TMP8]], 76 -; POST-PROCESS-NEXT: [[TMP66:%.*]] = inttoptr i32 [[TMP65]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP67:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP66]], i32 0 -; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP67]], align 4 +; POST-PROCESS-NEXT: [[TMP67:%.*]] = inttoptr i32 [[TMP65]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP69:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP67]], i32 0 +; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP69]], align 4 ; POST-PROCESS-NEXT: [[TMP68:%.*]] = add i32 [[TMP8]], 80 -; POST-PROCESS-NEXT: [[TMP69:%.*]] = inttoptr i32 [[TMP68]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP70:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP69]], i32 0 -; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP70]], align 4 +; POST-PROCESS-NEXT: [[TMP70:%.*]] = inttoptr i32 [[TMP68]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP72:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP70]], i32 0 +; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP72]], align 4 ; POST-PROCESS-NEXT: [[TMP71:%.*]] = add i32 [[TMP8]], 84 -; POST-PROCESS-NEXT: [[TMP72:%.*]] = inttoptr i32 [[TMP71]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP73:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP72]], i32 0 -; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP73]], align 4 +; POST-PROCESS-NEXT: [[TMP73:%.*]] = inttoptr i32 [[TMP71]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP75:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP73]], i32 0 +; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP75]], align 4 ; POST-PROCESS-NEXT: [[TMP74:%.*]] = add i32 [[TMP8]], 88 -; POST-PROCESS-NEXT: [[TMP75:%.*]] = inttoptr i32 [[TMP74]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP76:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP75]], i32 0 -; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP76]], align 4 +; POST-PROCESS-NEXT: [[TMP76:%.*]] = inttoptr i32 [[TMP74]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP78:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP76]], i32 0 +; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP78]], align 4 ; POST-PROCESS-NEXT: [[TMP77:%.*]] = add i32 [[TMP8]], 
92 -; POST-PROCESS-NEXT: [[TMP78:%.*]] = inttoptr i32 [[TMP77]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP79:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP78]], i32 0 -; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP79]], align 4 +; POST-PROCESS-NEXT: [[TMP79:%.*]] = inttoptr i32 [[TMP77]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP81:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP79]], i32 0 +; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP81]], align 4 ; POST-PROCESS-NEXT: [[TMP80:%.*]] = add i32 [[TMP8]], 96 -; POST-PROCESS-NEXT: [[TMP81:%.*]] = inttoptr i32 [[TMP80]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP82:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP81]], i32 0 -; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP82]], align 4 +; POST-PROCESS-NEXT: [[TMP82:%.*]] = inttoptr i32 [[TMP80]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP84:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP82]], i32 0 +; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP84]], align 4 ; POST-PROCESS-NEXT: [[TMP83:%.*]] = add i32 [[TMP8]], 100 -; POST-PROCESS-NEXT: [[TMP84:%.*]] = inttoptr i32 [[TMP83]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP85:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP84]], i32 0 -; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP85]], align 4 +; POST-PROCESS-NEXT: [[TMP85:%.*]] = inttoptr i32 [[TMP83]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP87:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP85]], i32 0 +; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP87]], align 4 ; POST-PROCESS-NEXT: [[TMP86:%.*]] = add i32 [[TMP8]], 104 -; POST-PROCESS-NEXT: [[TMP87:%.*]] = inttoptr i32 [[TMP86]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP88:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP87]], i32 0 -; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP88]], align 4 +; POST-PROCESS-NEXT: [[TMP88:%.*]] = inttoptr i32 [[TMP86]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP90:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP88]], i32 0 +; POST-PROCESS-NEXT: store i32 undef, ptr addrspace(21) [[TMP90]], align 4 ; POST-PROCESS-NEXT: [[TMP89:%.*]] = load i32, ptr [[CSP]], align 4 -; POST-PROCESS-NEXT: [[TMP90:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @main.resume.0 to i64)) -; POST-PROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 4, i32 [[TMP89]], i64 [[TMP90]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]]), !continuation.registercount [[META17:![0-9]+]], !continuation.returnedRegistercount !17 +; POST-PROCESS-NEXT: call void (i64, i64, ...) 
@continuation.waitContinue(i64 4, i64 -1, i32 [[TMP89]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]]), !continuation.registercount [[META17:![0-9]+]], !continuation.returnedRegistercount [[META17]] ; POST-PROCESS-NEXT: unreachable ; ; ; POST-PROCESS-LABEL: define dso_local void @main.resume.0( -; POST-PROCESS-SAME: i32 [[CSPINIT:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.registercount [[META17]] !continuation [[META20]] { +; POST-PROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP1:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.registercount [[META17]] !continuation [[META20]] { ; POST-PROCESS-NEXT: entryresume.0: ; POST-PROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POST-PROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POST-PROCESS-NEXT: [[TMP1:%.*]] = load i32, ptr [[CSP]], align 4 -; POST-PROCESS-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], -108 +; POST-PROCESS-NEXT: [[TMP2:%.*]] = load i32, ptr [[CSP]], align 4 +; POST-PROCESS-NEXT: [[TMP27:%.*]] = add i32 [[TMP2]], -108 ; POST-PROCESS-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(20) @REGISTERS, align 4 ; POST-PROCESS-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 7), align 4 ; POST-PROCESS-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 8), align 4 @@ -1192,115 +1203,115 @@ attributes #3 = { nounwind } ; POST-PROCESS-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 27), align 4 ; POST-PROCESS-NEXT: [[TMP25:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 28), align 4 ; POST-PROCESS-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 29), align 4 -; POST-PROCESS-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP27]], i32 0 -; POST-PROCESS-NEXT: [[TMP29:%.*]] = load i32, ptr addrspace(21) [[TMP28]], align 4 +; POST-PROCESS-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP28]], i32 0 +; POST-PROCESS-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(21) [[TMP29]], align 4 ; POST-PROCESS-NEXT: [[TMP30:%.*]] = add i32 [[TMP3]], 4 -; POST-PROCESS-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP30]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP31]], i32 0 -; POST-PROCESS-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(21) [[TMP32]], align 4 +; POST-PROCESS-NEXT: [[TMP32:%.*]] = inttoptr i32 [[TMP30]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP32]], i32 0 +; POST-PROCESS-NEXT: [[TMP35:%.*]] = load i32, ptr addrspace(21) [[TMP33]], align 4 ; POST-PROCESS-NEXT: [[TMP34:%.*]] = add i32 [[TMP3]], 8 -; POST-PROCESS-NEXT: [[TMP35:%.*]] = inttoptr i32 [[TMP34]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP35]], i32 0 -; POST-PROCESS-NEXT: [[TMP37:%.*]] = load i32, ptr addrspace(21) [[TMP36]], align 4 +; POST-PROCESS-NEXT: [[TMP36:%.*]] = inttoptr i32 [[TMP34]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP36]], i32 0 +; POST-PROCESS-NEXT: 
[[TMP39:%.*]] = load i32, ptr addrspace(21) [[TMP37]], align 4 ; POST-PROCESS-NEXT: [[TMP38:%.*]] = add i32 [[TMP3]], 12 -; POST-PROCESS-NEXT: [[TMP39:%.*]] = inttoptr i32 [[TMP38]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP39]], i32 0 -; POST-PROCESS-NEXT: [[TMP41:%.*]] = load i32, ptr addrspace(21) [[TMP40]], align 4 +; POST-PROCESS-NEXT: [[TMP40:%.*]] = inttoptr i32 [[TMP38]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP40]], i32 0 +; POST-PROCESS-NEXT: [[TMP43:%.*]] = load i32, ptr addrspace(21) [[TMP41]], align 4 ; POST-PROCESS-NEXT: [[TMP42:%.*]] = add i32 [[TMP3]], 16 -; POST-PROCESS-NEXT: [[TMP43:%.*]] = inttoptr i32 [[TMP42]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP43]], i32 0 -; POST-PROCESS-NEXT: [[TMP45:%.*]] = load i32, ptr addrspace(21) [[TMP44]], align 4 +; POST-PROCESS-NEXT: [[TMP44:%.*]] = inttoptr i32 [[TMP42]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP44]], i32 0 +; POST-PROCESS-NEXT: [[TMP47:%.*]] = load i32, ptr addrspace(21) [[TMP45]], align 4 ; POST-PROCESS-NEXT: [[TMP46:%.*]] = add i32 [[TMP3]], 20 -; POST-PROCESS-NEXT: [[TMP47:%.*]] = inttoptr i32 [[TMP46]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP47]], i32 0 -; POST-PROCESS-NEXT: [[TMP49:%.*]] = load i32, ptr addrspace(21) [[TMP48]], align 4 +; POST-PROCESS-NEXT: [[TMP48:%.*]] = inttoptr i32 [[TMP46]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP48]], i32 0 +; POST-PROCESS-NEXT: [[TMP51:%.*]] = load i32, ptr addrspace(21) [[TMP49]], align 4 ; POST-PROCESS-NEXT: [[TMP50:%.*]] = add i32 [[TMP3]], 24 -; POST-PROCESS-NEXT: [[TMP51:%.*]] = inttoptr i32 [[TMP50]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP51]], i32 0 -; POST-PROCESS-NEXT: [[TMP53:%.*]] = load i32, ptr addrspace(21) [[TMP52]], align 4 +; POST-PROCESS-NEXT: [[TMP52:%.*]] = inttoptr i32 [[TMP50]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP52]], i32 0 +; POST-PROCESS-NEXT: [[TMP55:%.*]] = load i32, ptr addrspace(21) [[TMP53]], align 4 ; POST-PROCESS-NEXT: [[TMP54:%.*]] = add i32 [[TMP3]], 28 -; POST-PROCESS-NEXT: [[TMP55:%.*]] = inttoptr i32 [[TMP54]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP55]], i32 0 -; POST-PROCESS-NEXT: [[TMP57:%.*]] = load i32, ptr addrspace(21) [[TMP56]], align 4 +; POST-PROCESS-NEXT: [[TMP56:%.*]] = inttoptr i32 [[TMP54]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP56]], i32 0 +; POST-PROCESS-NEXT: [[TMP59:%.*]] = load i32, ptr addrspace(21) [[TMP57]], align 4 ; POST-PROCESS-NEXT: [[TMP58:%.*]] = add i32 [[TMP3]], 32 -; POST-PROCESS-NEXT: [[TMP59:%.*]] = inttoptr i32 [[TMP58]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP59]], i32 0 -; POST-PROCESS-NEXT: [[TMP61:%.*]] = load i32, ptr addrspace(21) [[TMP60]], align 4 +; POST-PROCESS-NEXT: [[TMP60:%.*]] = inttoptr i32 [[TMP58]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP60]], i32 0 +; POST-PROCESS-NEXT: [[TMP63:%.*]] = load i32, ptr addrspace(21) [[TMP61]], align 4 ; POST-PROCESS-NEXT: [[TMP62:%.*]] = add i32 
[[TMP3]], 36 -; POST-PROCESS-NEXT: [[TMP63:%.*]] = inttoptr i32 [[TMP62]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP63]], i32 0 -; POST-PROCESS-NEXT: [[TMP65:%.*]] = load i32, ptr addrspace(21) [[TMP64]], align 4 +; POST-PROCESS-NEXT: [[TMP64:%.*]] = inttoptr i32 [[TMP62]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP65:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP64]], i32 0 +; POST-PROCESS-NEXT: [[TMP67:%.*]] = load i32, ptr addrspace(21) [[TMP65]], align 4 ; POST-PROCESS-NEXT: [[TMP66:%.*]] = add i32 [[TMP3]], 40 -; POST-PROCESS-NEXT: [[TMP67:%.*]] = inttoptr i32 [[TMP66]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP67]], i32 0 -; POST-PROCESS-NEXT: [[TMP69:%.*]] = load i32, ptr addrspace(21) [[TMP68]], align 4 +; POST-PROCESS-NEXT: [[TMP68:%.*]] = inttoptr i32 [[TMP66]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP69:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP68]], i32 0 +; POST-PROCESS-NEXT: [[TMP71:%.*]] = load i32, ptr addrspace(21) [[TMP69]], align 4 ; POST-PROCESS-NEXT: [[TMP70:%.*]] = add i32 [[TMP3]], 44 -; POST-PROCESS-NEXT: [[TMP71:%.*]] = inttoptr i32 [[TMP70]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP72:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP71]], i32 0 -; POST-PROCESS-NEXT: [[TMP73:%.*]] = load i32, ptr addrspace(21) [[TMP72]], align 4 +; POST-PROCESS-NEXT: [[TMP72:%.*]] = inttoptr i32 [[TMP70]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP73:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP72]], i32 0 +; POST-PROCESS-NEXT: [[TMP75:%.*]] = load i32, ptr addrspace(21) [[TMP73]], align 4 ; POST-PROCESS-NEXT: [[TMP74:%.*]] = add i32 [[TMP3]], 48 -; POST-PROCESS-NEXT: [[TMP75:%.*]] = inttoptr i32 [[TMP74]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP76:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP75]], i32 0 -; POST-PROCESS-NEXT: [[TMP77:%.*]] = load i32, ptr addrspace(21) [[TMP76]], align 4 +; POST-PROCESS-NEXT: [[TMP76:%.*]] = inttoptr i32 [[TMP74]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP77:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP76]], i32 0 +; POST-PROCESS-NEXT: [[TMP79:%.*]] = load i32, ptr addrspace(21) [[TMP77]], align 4 ; POST-PROCESS-NEXT: [[TMP78:%.*]] = add i32 [[TMP3]], 52 -; POST-PROCESS-NEXT: [[TMP79:%.*]] = inttoptr i32 [[TMP78]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP80:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP79]], i32 0 -; POST-PROCESS-NEXT: [[TMP81:%.*]] = load i32, ptr addrspace(21) [[TMP80]], align 4 +; POST-PROCESS-NEXT: [[TMP80:%.*]] = inttoptr i32 [[TMP78]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP81:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP80]], i32 0 +; POST-PROCESS-NEXT: [[TMP83:%.*]] = load i32, ptr addrspace(21) [[TMP81]], align 4 ; POST-PROCESS-NEXT: [[TMP82:%.*]] = add i32 [[TMP3]], 56 -; POST-PROCESS-NEXT: [[TMP83:%.*]] = inttoptr i32 [[TMP82]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP84:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP83]], i32 0 -; POST-PROCESS-NEXT: [[TMP85:%.*]] = load i32, ptr addrspace(21) [[TMP84]], align 4 +; POST-PROCESS-NEXT: [[TMP84:%.*]] = inttoptr i32 [[TMP82]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP85:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP84]], i32 0 +; POST-PROCESS-NEXT: [[TMP87:%.*]] = load i32, ptr addrspace(21) [[TMP85]], align 4 ; POST-PROCESS-NEXT: [[TMP86:%.*]] = add i32 [[TMP3]], 60 -; POST-PROCESS-NEXT: [[TMP87:%.*]] = inttoptr i32 [[TMP86]] to ptr addrspace(21) -; 
POST-PROCESS-NEXT: [[TMP88:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP87]], i32 0 -; POST-PROCESS-NEXT: [[TMP89:%.*]] = load i32, ptr addrspace(21) [[TMP88]], align 4 +; POST-PROCESS-NEXT: [[TMP88:%.*]] = inttoptr i32 [[TMP86]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP89:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP88]], i32 0 +; POST-PROCESS-NEXT: [[TMP91:%.*]] = load i32, ptr addrspace(21) [[TMP89]], align 4 ; POST-PROCESS-NEXT: [[TMP90:%.*]] = add i32 [[TMP3]], 64 -; POST-PROCESS-NEXT: [[TMP91:%.*]] = inttoptr i32 [[TMP90]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP92:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP91]], i32 0 -; POST-PROCESS-NEXT: [[TMP93:%.*]] = load i32, ptr addrspace(21) [[TMP92]], align 4 +; POST-PROCESS-NEXT: [[TMP92:%.*]] = inttoptr i32 [[TMP90]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP93:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP92]], i32 0 +; POST-PROCESS-NEXT: [[TMP95:%.*]] = load i32, ptr addrspace(21) [[TMP93]], align 4 ; POST-PROCESS-NEXT: [[TMP94:%.*]] = add i32 [[TMP3]], 68 -; POST-PROCESS-NEXT: [[TMP95:%.*]] = inttoptr i32 [[TMP94]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP96:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP95]], i32 0 -; POST-PROCESS-NEXT: [[TMP97:%.*]] = load i32, ptr addrspace(21) [[TMP96]], align 4 +; POST-PROCESS-NEXT: [[TMP96:%.*]] = inttoptr i32 [[TMP94]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP97:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP96]], i32 0 +; POST-PROCESS-NEXT: [[TMP99:%.*]] = load i32, ptr addrspace(21) [[TMP97]], align 4 ; POST-PROCESS-NEXT: [[TMP98:%.*]] = add i32 [[TMP3]], 72 -; POST-PROCESS-NEXT: [[TMP99:%.*]] = inttoptr i32 [[TMP98]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP100:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP99]], i32 0 -; POST-PROCESS-NEXT: [[TMP101:%.*]] = load i32, ptr addrspace(21) [[TMP100]], align 4 +; POST-PROCESS-NEXT: [[TMP100:%.*]] = inttoptr i32 [[TMP98]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP101:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP100]], i32 0 +; POST-PROCESS-NEXT: [[TMP103:%.*]] = load i32, ptr addrspace(21) [[TMP101]], align 4 ; POST-PROCESS-NEXT: [[TMP102:%.*]] = add i32 [[TMP3]], 76 -; POST-PROCESS-NEXT: [[TMP103:%.*]] = inttoptr i32 [[TMP102]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP104:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP103]], i32 0 -; POST-PROCESS-NEXT: [[TMP105:%.*]] = load i32, ptr addrspace(21) [[TMP104]], align 4 +; POST-PROCESS-NEXT: [[TMP104:%.*]] = inttoptr i32 [[TMP102]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP105:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP104]], i32 0 +; POST-PROCESS-NEXT: [[TMP107:%.*]] = load i32, ptr addrspace(21) [[TMP105]], align 4 ; POST-PROCESS-NEXT: [[TMP106:%.*]] = add i32 [[TMP3]], 80 -; POST-PROCESS-NEXT: [[TMP107:%.*]] = inttoptr i32 [[TMP106]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP108:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP107]], i32 0 -; POST-PROCESS-NEXT: [[TMP109:%.*]] = load i32, ptr addrspace(21) [[TMP108]], align 4 +; POST-PROCESS-NEXT: [[TMP108:%.*]] = inttoptr i32 [[TMP106]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP109:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP108]], i32 0 +; POST-PROCESS-NEXT: [[TMP111:%.*]] = load i32, ptr addrspace(21) [[TMP109]], align 4 ; POST-PROCESS-NEXT: [[TMP110:%.*]] = add i32 [[TMP3]], 84 -; POST-PROCESS-NEXT: [[TMP111:%.*]] = inttoptr i32 [[TMP110]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP112:%.*]] = getelementptr i8, ptr addrspace(21) 
[[TMP111]], i32 0 -; POST-PROCESS-NEXT: [[TMP113:%.*]] = load i32, ptr addrspace(21) [[TMP112]], align 4 +; POST-PROCESS-NEXT: [[TMP112:%.*]] = inttoptr i32 [[TMP110]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP113:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP112]], i32 0 +; POST-PROCESS-NEXT: [[TMP115:%.*]] = load i32, ptr addrspace(21) [[TMP113]], align 4 ; POST-PROCESS-NEXT: [[TMP114:%.*]] = add i32 [[TMP3]], 88 -; POST-PROCESS-NEXT: [[TMP115:%.*]] = inttoptr i32 [[TMP114]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP116:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP115]], i32 0 -; POST-PROCESS-NEXT: [[TMP117:%.*]] = load i32, ptr addrspace(21) [[TMP116]], align 4 +; POST-PROCESS-NEXT: [[TMP116:%.*]] = inttoptr i32 [[TMP114]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP117:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP116]], i32 0 +; POST-PROCESS-NEXT: [[TMP119:%.*]] = load i32, ptr addrspace(21) [[TMP117]], align 4 ; POST-PROCESS-NEXT: [[TMP118:%.*]] = add i32 [[TMP3]], 92 -; POST-PROCESS-NEXT: [[TMP119:%.*]] = inttoptr i32 [[TMP118]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP120:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP119]], i32 0 -; POST-PROCESS-NEXT: [[TMP121:%.*]] = load i32, ptr addrspace(21) [[TMP120]], align 4 +; POST-PROCESS-NEXT: [[TMP120:%.*]] = inttoptr i32 [[TMP118]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP121:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP120]], i32 0 +; POST-PROCESS-NEXT: [[TMP123:%.*]] = load i32, ptr addrspace(21) [[TMP121]], align 4 ; POST-PROCESS-NEXT: [[TMP122:%.*]] = add i32 [[TMP3]], 96 -; POST-PROCESS-NEXT: [[TMP123:%.*]] = inttoptr i32 [[TMP122]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP124:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP123]], i32 0 -; POST-PROCESS-NEXT: [[TMP125:%.*]] = load i32, ptr addrspace(21) [[TMP124]], align 4 +; POST-PROCESS-NEXT: [[TMP124:%.*]] = inttoptr i32 [[TMP122]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP125:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP124]], i32 0 +; POST-PROCESS-NEXT: [[TMP127:%.*]] = load i32, ptr addrspace(21) [[TMP125]], align 4 ; POST-PROCESS-NEXT: [[TMP126:%.*]] = add i32 [[TMP3]], 100 -; POST-PROCESS-NEXT: [[TMP127:%.*]] = inttoptr i32 [[TMP126]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP128:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP127]], i32 0 -; POST-PROCESS-NEXT: [[TMP129:%.*]] = load i32, ptr addrspace(21) [[TMP128]], align 4 +; POST-PROCESS-NEXT: [[TMP128:%.*]] = inttoptr i32 [[TMP126]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP129:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP128]], i32 0 +; POST-PROCESS-NEXT: [[TMP131:%.*]] = load i32, ptr addrspace(21) [[TMP129]], align 4 ; POST-PROCESS-NEXT: [[TMP130:%.*]] = add i32 [[TMP3]], 104 -; POST-PROCESS-NEXT: [[TMP131:%.*]] = inttoptr i32 [[TMP130]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP132:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP131]], i32 0 -; POST-PROCESS-NEXT: [[TMP133:%.*]] = load i32, ptr addrspace(21) [[TMP132]], align 4 +; POST-PROCESS-NEXT: [[TMP132:%.*]] = inttoptr i32 [[TMP130]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP133:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP132]], i32 0 +; POST-PROCESS-NEXT: [[TMP137:%.*]] = load i32, ptr addrspace(21) [[TMP133]], align 4 ; POST-PROCESS-NEXT: [[TMP134:%.*]] = load i32, ptr addrspace(20) @REGISTERS, align 4 -; POST-PROCESS-NEXT: [[DOTFCA_0_EXTRACT1:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 +; POST-PROCESS-NEXT: [[DOTFCA_0_EXTRACT1:%.*]] = 
extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], 0 ; POST-PROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; POST-PROCESS-NEXT: [[TMP135:%.*]] = load i32, ptr [[CSP]], align 4 ; POST-PROCESS-NEXT: [[TMP136:%.*]] = add i32 [[TMP135]], -108 @@ -1664,7 +1675,7 @@ attributes #3 = { nounwind } ; POST-PROCESS-NEXT: [[DOTFCA_1_1_LOAD:%.*]] = load i32, ptr [[DOTFCA_1_1_GEP10]], align 4 ; POST-PROCESS-NEXT: [[DOTFCA_1_1_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_0_INSERT]], i32 [[DOTFCA_1_1_LOAD]], 1, 1 ; POST-PROCESS-NEXT: [[TMP223:%.*]] = load i32, ptr [[CSP]], align 4 -; POST-PROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP223]], [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_1_INSERT]]), !continuation.registercount [[META17]] +; POST-PROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP223]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_1_INSERT]]), !continuation.registercount [[META17]] ; POST-PROCESS-NEXT: unreachable ; ; @@ -1833,6 +1844,8 @@ attributes #3 = { nounwind } ; POST-PROCESS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison, <3 x i32> [[DOTFCA_0_0_EXTRACT]], 0 ; POST-PROCESS-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 ; POST-PROCESS-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 +; POST-PROCESS-NEXT: [[TMP149:%.*]] = call i64 @continuation.getAddrAndMD(ptr @ClosestHit.resume.0) +; POST-PROCESS-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[TMP149]], 5 ; POST-PROCESS-NEXT: store i32 [[TMP1]], ptr addrspace(20) @REGISTERS, align 4 ; POST-PROCESS-NEXT: [[TMP148:%.*]] = load i32, ptr addrspace(20) @REGISTERS, align 4 ; POST-PROCESS-NEXT: store i32 [[TMP7]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 7), align 4 @@ -1858,126 +1871,125 @@ attributes #3 = { nounwind } ; POST-PROCESS-NEXT: store i32 [[TMP27]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 27), align 4 ; POST-PROCESS-NEXT: store i32 [[TMP28]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 28), align 4 ; POST-PROCESS-NEXT: store i32 [[TMP29]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 29), align 4 -; POST-PROCESS-NEXT: [[TMP149:%.*]] = inttoptr i32 [[TMP148]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP150:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP149]], i32 0 -; POST-PROCESS-NEXT: store i32 [[TMP32]], ptr addrspace(21) [[TMP150]], align 4 +; POST-PROCESS-NEXT: [[TMP150:%.*]] = inttoptr i32 [[TMP148]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP152:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP150]], i32 0 +; POST-PROCESS-NEXT: store i32 [[TMP32]], ptr addrspace(21) [[TMP152]], align 4 ; POST-PROCESS-NEXT: [[TMP151:%.*]] = add i32 [[TMP148]], 4 -; POST-PROCESS-NEXT: [[TMP152:%.*]] = inttoptr i32 [[TMP151]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP153:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP152]], i32 0 -; POST-PROCESS-NEXT: store i32 [[TMP36]], ptr addrspace(21) [[TMP153]], align 4 +; POST-PROCESS-NEXT: [[TMP153:%.*]] = inttoptr i32 [[TMP151]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP155:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP153]], i32 0 +; POST-PROCESS-NEXT: store i32 [[TMP36]], 
ptr addrspace(21) [[TMP155]], align 4 ; POST-PROCESS-NEXT: [[TMP154:%.*]] = add i32 [[TMP148]], 8 -; POST-PROCESS-NEXT: [[TMP155:%.*]] = inttoptr i32 [[TMP154]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP156:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP155]], i32 0 -; POST-PROCESS-NEXT: store i32 [[TMP40]], ptr addrspace(21) [[TMP156]], align 4 +; POST-PROCESS-NEXT: [[TMP156:%.*]] = inttoptr i32 [[TMP154]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP158:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP156]], i32 0 +; POST-PROCESS-NEXT: store i32 [[TMP40]], ptr addrspace(21) [[TMP158]], align 4 ; POST-PROCESS-NEXT: [[TMP157:%.*]] = add i32 [[TMP148]], 12 -; POST-PROCESS-NEXT: [[TMP158:%.*]] = inttoptr i32 [[TMP157]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP159:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP158]], i32 0 -; POST-PROCESS-NEXT: store i32 [[TMP44]], ptr addrspace(21) [[TMP159]], align 4 +; POST-PROCESS-NEXT: [[TMP159:%.*]] = inttoptr i32 [[TMP157]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP161:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP159]], i32 0 +; POST-PROCESS-NEXT: store i32 [[TMP44]], ptr addrspace(21) [[TMP161]], align 4 ; POST-PROCESS-NEXT: [[TMP160:%.*]] = add i32 [[TMP148]], 16 -; POST-PROCESS-NEXT: [[TMP161:%.*]] = inttoptr i32 [[TMP160]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP162:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP161]], i32 0 -; POST-PROCESS-NEXT: store i32 [[TMP48]], ptr addrspace(21) [[TMP162]], align 4 +; POST-PROCESS-NEXT: [[TMP162:%.*]] = inttoptr i32 [[TMP160]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP164:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP162]], i32 0 +; POST-PROCESS-NEXT: store i32 [[TMP48]], ptr addrspace(21) [[TMP164]], align 4 ; POST-PROCESS-NEXT: [[TMP163:%.*]] = add i32 [[TMP148]], 20 -; POST-PROCESS-NEXT: [[TMP164:%.*]] = inttoptr i32 [[TMP163]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP165:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP164]], i32 0 -; POST-PROCESS-NEXT: store i32 [[TMP52]], ptr addrspace(21) [[TMP165]], align 4 +; POST-PROCESS-NEXT: [[TMP165:%.*]] = inttoptr i32 [[TMP163]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP167:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP165]], i32 0 +; POST-PROCESS-NEXT: store i32 [[TMP52]], ptr addrspace(21) [[TMP167]], align 4 ; POST-PROCESS-NEXT: [[TMP166:%.*]] = add i32 [[TMP148]], 24 -; POST-PROCESS-NEXT: [[TMP167:%.*]] = inttoptr i32 [[TMP166]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP168:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP167]], i32 0 -; POST-PROCESS-NEXT: store i32 [[TMP56]], ptr addrspace(21) [[TMP168]], align 4 +; POST-PROCESS-NEXT: [[TMP168:%.*]] = inttoptr i32 [[TMP166]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP170:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP168]], i32 0 +; POST-PROCESS-NEXT: store i32 [[TMP56]], ptr addrspace(21) [[TMP170]], align 4 ; POST-PROCESS-NEXT: [[TMP169:%.*]] = add i32 [[TMP148]], 28 -; POST-PROCESS-NEXT: [[TMP170:%.*]] = inttoptr i32 [[TMP169]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP171:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP170]], i32 0 -; POST-PROCESS-NEXT: store i32 [[TMP60]], ptr addrspace(21) [[TMP171]], align 4 +; POST-PROCESS-NEXT: [[TMP171:%.*]] = inttoptr i32 [[TMP169]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP173:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP171]], i32 0 +; POST-PROCESS-NEXT: store i32 [[TMP60]], ptr addrspace(21) [[TMP173]], align 4 ; POST-PROCESS-NEXT: [[TMP172:%.*]] = add i32 
[[TMP148]], 32 -; POST-PROCESS-NEXT: [[TMP173:%.*]] = inttoptr i32 [[TMP172]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP174:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP173]], i32 0 -; POST-PROCESS-NEXT: store i32 [[TMP64]], ptr addrspace(21) [[TMP174]], align 4 +; POST-PROCESS-NEXT: [[TMP174:%.*]] = inttoptr i32 [[TMP172]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP176:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP174]], i32 0 +; POST-PROCESS-NEXT: store i32 [[TMP64]], ptr addrspace(21) [[TMP176]], align 4 ; POST-PROCESS-NEXT: [[TMP175:%.*]] = add i32 [[TMP148]], 36 -; POST-PROCESS-NEXT: [[TMP176:%.*]] = inttoptr i32 [[TMP175]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP177:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP176]], i32 0 -; POST-PROCESS-NEXT: store i32 [[TMP68]], ptr addrspace(21) [[TMP177]], align 4 +; POST-PROCESS-NEXT: [[TMP177:%.*]] = inttoptr i32 [[TMP175]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP179:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP177]], i32 0 +; POST-PROCESS-NEXT: store i32 [[TMP68]], ptr addrspace(21) [[TMP179]], align 4 ; POST-PROCESS-NEXT: [[TMP178:%.*]] = add i32 [[TMP148]], 40 -; POST-PROCESS-NEXT: [[TMP179:%.*]] = inttoptr i32 [[TMP178]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP180:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP179]], i32 0 -; POST-PROCESS-NEXT: store i32 [[TMP72]], ptr addrspace(21) [[TMP180]], align 4 +; POST-PROCESS-NEXT: [[TMP180:%.*]] = inttoptr i32 [[TMP178]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP182:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP180]], i32 0 +; POST-PROCESS-NEXT: store i32 [[TMP72]], ptr addrspace(21) [[TMP182]], align 4 ; POST-PROCESS-NEXT: [[TMP181:%.*]] = add i32 [[TMP148]], 44 -; POST-PROCESS-NEXT: [[TMP182:%.*]] = inttoptr i32 [[TMP181]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP183:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP182]], i32 0 -; POST-PROCESS-NEXT: store i32 [[TMP76]], ptr addrspace(21) [[TMP183]], align 4 +; POST-PROCESS-NEXT: [[TMP183:%.*]] = inttoptr i32 [[TMP181]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP185:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP183]], i32 0 +; POST-PROCESS-NEXT: store i32 [[TMP76]], ptr addrspace(21) [[TMP185]], align 4 ; POST-PROCESS-NEXT: [[TMP184:%.*]] = add i32 [[TMP148]], 48 -; POST-PROCESS-NEXT: [[TMP185:%.*]] = inttoptr i32 [[TMP184]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP186:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP185]], i32 0 -; POST-PROCESS-NEXT: store i32 [[TMP80]], ptr addrspace(21) [[TMP186]], align 4 +; POST-PROCESS-NEXT: [[TMP186:%.*]] = inttoptr i32 [[TMP184]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP188:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP186]], i32 0 +; POST-PROCESS-NEXT: store i32 [[TMP80]], ptr addrspace(21) [[TMP188]], align 4 ; POST-PROCESS-NEXT: [[TMP187:%.*]] = add i32 [[TMP148]], 52 -; POST-PROCESS-NEXT: [[TMP188:%.*]] = inttoptr i32 [[TMP187]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP189:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP188]], i32 0 -; POST-PROCESS-NEXT: store i32 [[TMP84]], ptr addrspace(21) [[TMP189]], align 4 +; POST-PROCESS-NEXT: [[TMP189:%.*]] = inttoptr i32 [[TMP187]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP191:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP189]], i32 0 +; POST-PROCESS-NEXT: store i32 [[TMP84]], ptr addrspace(21) [[TMP191]], align 4 ; POST-PROCESS-NEXT: [[TMP190:%.*]] = add i32 [[TMP148]], 56 -; POST-PROCESS-NEXT: [[TMP191:%.*]] = inttoptr i32 [[TMP190]] to ptr 
addrspace(21) -; POST-PROCESS-NEXT: [[TMP192:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP191]], i32 0 -; POST-PROCESS-NEXT: store i32 [[TMP88]], ptr addrspace(21) [[TMP192]], align 4 +; POST-PROCESS-NEXT: [[TMP192:%.*]] = inttoptr i32 [[TMP190]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP194:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP192]], i32 0 +; POST-PROCESS-NEXT: store i32 [[TMP88]], ptr addrspace(21) [[TMP194]], align 4 ; POST-PROCESS-NEXT: [[TMP193:%.*]] = add i32 [[TMP148]], 60 -; POST-PROCESS-NEXT: [[TMP194:%.*]] = inttoptr i32 [[TMP193]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP195:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP194]], i32 0 -; POST-PROCESS-NEXT: store i32 [[TMP92]], ptr addrspace(21) [[TMP195]], align 4 +; POST-PROCESS-NEXT: [[TMP195:%.*]] = inttoptr i32 [[TMP193]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP197:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP195]], i32 0 +; POST-PROCESS-NEXT: store i32 [[TMP92]], ptr addrspace(21) [[TMP197]], align 4 ; POST-PROCESS-NEXT: [[TMP196:%.*]] = add i32 [[TMP148]], 64 -; POST-PROCESS-NEXT: [[TMP197:%.*]] = inttoptr i32 [[TMP196]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP198:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP197]], i32 0 -; POST-PROCESS-NEXT: store i32 [[TMP96]], ptr addrspace(21) [[TMP198]], align 4 +; POST-PROCESS-NEXT: [[TMP198:%.*]] = inttoptr i32 [[TMP196]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP200:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP198]], i32 0 +; POST-PROCESS-NEXT: store i32 [[TMP96]], ptr addrspace(21) [[TMP200]], align 4 ; POST-PROCESS-NEXT: [[TMP199:%.*]] = add i32 [[TMP148]], 68 -; POST-PROCESS-NEXT: [[TMP200:%.*]] = inttoptr i32 [[TMP199]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP201:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP200]], i32 0 -; POST-PROCESS-NEXT: store i32 [[TMP100]], ptr addrspace(21) [[TMP201]], align 4 +; POST-PROCESS-NEXT: [[TMP201:%.*]] = inttoptr i32 [[TMP199]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP203:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP201]], i32 0 +; POST-PROCESS-NEXT: store i32 [[TMP100]], ptr addrspace(21) [[TMP203]], align 4 ; POST-PROCESS-NEXT: [[TMP202:%.*]] = add i32 [[TMP148]], 72 -; POST-PROCESS-NEXT: [[TMP203:%.*]] = inttoptr i32 [[TMP202]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP204:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP203]], i32 0 -; POST-PROCESS-NEXT: store i32 [[TMP104]], ptr addrspace(21) [[TMP204]], align 4 +; POST-PROCESS-NEXT: [[TMP204:%.*]] = inttoptr i32 [[TMP202]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP206:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP204]], i32 0 +; POST-PROCESS-NEXT: store i32 [[TMP104]], ptr addrspace(21) [[TMP206]], align 4 ; POST-PROCESS-NEXT: [[TMP205:%.*]] = add i32 [[TMP148]], 76 -; POST-PROCESS-NEXT: [[TMP206:%.*]] = inttoptr i32 [[TMP205]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP207:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP206]], i32 0 -; POST-PROCESS-NEXT: store i32 [[TMP108]], ptr addrspace(21) [[TMP207]], align 4 +; POST-PROCESS-NEXT: [[TMP207:%.*]] = inttoptr i32 [[TMP205]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP209:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP207]], i32 0 +; POST-PROCESS-NEXT: store i32 [[TMP108]], ptr addrspace(21) [[TMP209]], align 4 ; POST-PROCESS-NEXT: [[TMP208:%.*]] = add i32 [[TMP148]], 80 -; POST-PROCESS-NEXT: [[TMP209:%.*]] = inttoptr i32 [[TMP208]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP210:%.*]] = getelementptr i8, ptr 
addrspace(21) [[TMP209]], i32 0 -; POST-PROCESS-NEXT: store i32 [[TMP112]], ptr addrspace(21) [[TMP210]], align 4 +; POST-PROCESS-NEXT: [[TMP210:%.*]] = inttoptr i32 [[TMP208]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP212:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP210]], i32 0 +; POST-PROCESS-NEXT: store i32 [[TMP112]], ptr addrspace(21) [[TMP212]], align 4 ; POST-PROCESS-NEXT: [[TMP211:%.*]] = add i32 [[TMP148]], 84 -; POST-PROCESS-NEXT: [[TMP212:%.*]] = inttoptr i32 [[TMP211]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP213:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP212]], i32 0 -; POST-PROCESS-NEXT: store i32 [[TMP116]], ptr addrspace(21) [[TMP213]], align 4 +; POST-PROCESS-NEXT: [[TMP213:%.*]] = inttoptr i32 [[TMP211]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP215:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP213]], i32 0 +; POST-PROCESS-NEXT: store i32 [[TMP116]], ptr addrspace(21) [[TMP215]], align 4 ; POST-PROCESS-NEXT: [[TMP214:%.*]] = add i32 [[TMP148]], 88 -; POST-PROCESS-NEXT: [[TMP215:%.*]] = inttoptr i32 [[TMP214]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP216:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP215]], i32 0 -; POST-PROCESS-NEXT: store i32 [[TMP120]], ptr addrspace(21) [[TMP216]], align 4 +; POST-PROCESS-NEXT: [[TMP216:%.*]] = inttoptr i32 [[TMP214]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP218:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP216]], i32 0 +; POST-PROCESS-NEXT: store i32 [[TMP120]], ptr addrspace(21) [[TMP218]], align 4 ; POST-PROCESS-NEXT: [[TMP217:%.*]] = add i32 [[TMP148]], 92 -; POST-PROCESS-NEXT: [[TMP218:%.*]] = inttoptr i32 [[TMP217]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP219:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP218]], i32 0 -; POST-PROCESS-NEXT: store i32 [[TMP124]], ptr addrspace(21) [[TMP219]], align 4 +; POST-PROCESS-NEXT: [[TMP219:%.*]] = inttoptr i32 [[TMP217]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP221:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP219]], i32 0 +; POST-PROCESS-NEXT: store i32 [[TMP124]], ptr addrspace(21) [[TMP221]], align 4 ; POST-PROCESS-NEXT: [[TMP220:%.*]] = add i32 [[TMP148]], 96 -; POST-PROCESS-NEXT: [[TMP221:%.*]] = inttoptr i32 [[TMP220]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP222:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP221]], i32 0 -; POST-PROCESS-NEXT: store i32 [[TMP128]], ptr addrspace(21) [[TMP222]], align 4 +; POST-PROCESS-NEXT: [[TMP222:%.*]] = inttoptr i32 [[TMP220]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP224:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP222]], i32 0 +; POST-PROCESS-NEXT: store i32 [[TMP128]], ptr addrspace(21) [[TMP224]], align 4 ; POST-PROCESS-NEXT: [[TMP223:%.*]] = add i32 [[TMP148]], 100 -; POST-PROCESS-NEXT: [[TMP224:%.*]] = inttoptr i32 [[TMP223]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP225:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP224]], i32 0 -; POST-PROCESS-NEXT: store i32 [[TMP132]], ptr addrspace(21) [[TMP225]], align 4 +; POST-PROCESS-NEXT: [[TMP225:%.*]] = inttoptr i32 [[TMP223]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP227:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP225]], i32 0 +; POST-PROCESS-NEXT: store i32 [[TMP132]], ptr addrspace(21) [[TMP227]], align 4 ; POST-PROCESS-NEXT: [[TMP226:%.*]] = add i32 [[TMP148]], 104 -; POST-PROCESS-NEXT: [[TMP227:%.*]] = inttoptr i32 [[TMP226]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP228:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP227]], i32 0 -; POST-PROCESS-NEXT: store i32 [[TMP136]], 
ptr addrspace(21) [[TMP228]], align 4 +; POST-PROCESS-NEXT: [[TMP228:%.*]] = inttoptr i32 [[TMP226]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP230:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP228]], i32 0 +; POST-PROCESS-NEXT: store i32 [[TMP136]], ptr addrspace(21) [[TMP230]], align 4 ; POST-PROCESS-NEXT: [[TMP229:%.*]] = load i32, ptr [[CSP]], align 4 -; POST-PROCESS-NEXT: [[TMP230:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @ClosestHit.resume.0 to i64)) -; POST-PROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 4, i32 [[TMP229]], i64 [[TMP230]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]]), !continuation.registercount [[META17]], !continuation.returnedRegistercount !17 +; POST-PROCESS-NEXT: call void (i64, i64, ...) @continuation.waitContinue(i64 4, i64 -1, i32 [[TMP229]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]]), !continuation.registercount [[META17]], !continuation.returnedRegistercount [[META17]] ; POST-PROCESS-NEXT: unreachable ; ; ; POST-PROCESS-LABEL: define dso_local void @ClosestHit.resume.0( -; POST-PROCESS-SAME: i32 [[CSPINIT:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META24]] !continuation.registercount [[META17]] !continuation [[META25]] { +; POST-PROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP1:%.*]]) !lgc.rt.shaderstage [[META24]] !continuation.registercount [[META17]] !continuation [[META25]] { ; POST-PROCESS-NEXT: entryresume.0: ; POST-PROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POST-PROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POST-PROCESS-NEXT: [[TMP1:%.*]] = load i32, ptr [[CSP]], align 4 -; POST-PROCESS-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], -120 +; POST-PROCESS-NEXT: [[TMP27:%.*]] = load i32, ptr [[CSP]], align 4 +; POST-PROCESS-NEXT: [[TMP2:%.*]] = add i32 [[TMP27]], -120 ; POST-PROCESS-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(20) @REGISTERS, align 4 ; POST-PROCESS-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 7), align 4 ; POST-PROCESS-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 8), align 4 @@ -2002,123 +2014,123 @@ attributes #3 = { nounwind } ; POST-PROCESS-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 27), align 4 ; POST-PROCESS-NEXT: [[TMP25:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 28), align 4 ; POST-PROCESS-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 29), align 4 -; POST-PROCESS-NEXT: [[TMP27:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP27]], i32 0 -; POST-PROCESS-NEXT: [[TMP29:%.*]] = load i32, ptr addrspace(21) [[TMP28]], align 4 +; POST-PROCESS-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP28]], i32 0 +; POST-PROCESS-NEXT: [[TMP29:%.*]] = load i32, ptr addrspace(21) [[TMP31]], align 4 ; POST-PROCESS-NEXT: [[TMP30:%.*]] = add i32 [[TMP3]], 4 -; POST-PROCESS-NEXT: [[TMP31:%.*]] = inttoptr i32 [[TMP30]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP31]], i32 0 -; POST-PROCESS-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(21) [[TMP32]], align 4 +; 
POST-PROCESS-NEXT: [[TMP32:%.*]] = inttoptr i32 [[TMP30]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP32]], i32 0 +; POST-PROCESS-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(21) [[TMP35]], align 4 ; POST-PROCESS-NEXT: [[TMP34:%.*]] = add i32 [[TMP3]], 8 -; POST-PROCESS-NEXT: [[TMP35:%.*]] = inttoptr i32 [[TMP34]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP35]], i32 0 -; POST-PROCESS-NEXT: [[TMP37:%.*]] = load i32, ptr addrspace(21) [[TMP36]], align 4 +; POST-PROCESS-NEXT: [[TMP36:%.*]] = inttoptr i32 [[TMP34]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP36]], i32 0 +; POST-PROCESS-NEXT: [[TMP37:%.*]] = load i32, ptr addrspace(21) [[TMP39]], align 4 ; POST-PROCESS-NEXT: [[TMP38:%.*]] = add i32 [[TMP3]], 12 -; POST-PROCESS-NEXT: [[TMP39:%.*]] = inttoptr i32 [[TMP38]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP39]], i32 0 -; POST-PROCESS-NEXT: [[TMP41:%.*]] = load i32, ptr addrspace(21) [[TMP40]], align 4 +; POST-PROCESS-NEXT: [[TMP40:%.*]] = inttoptr i32 [[TMP38]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP40]], i32 0 +; POST-PROCESS-NEXT: [[TMP41:%.*]] = load i32, ptr addrspace(21) [[TMP43]], align 4 ; POST-PROCESS-NEXT: [[TMP42:%.*]] = add i32 [[TMP3]], 16 -; POST-PROCESS-NEXT: [[TMP43:%.*]] = inttoptr i32 [[TMP42]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP43]], i32 0 -; POST-PROCESS-NEXT: [[TMP45:%.*]] = load i32, ptr addrspace(21) [[TMP44]], align 4 +; POST-PROCESS-NEXT: [[TMP44:%.*]] = inttoptr i32 [[TMP42]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP44]], i32 0 +; POST-PROCESS-NEXT: [[TMP45:%.*]] = load i32, ptr addrspace(21) [[TMP47]], align 4 ; POST-PROCESS-NEXT: [[TMP46:%.*]] = add i32 [[TMP3]], 20 -; POST-PROCESS-NEXT: [[TMP47:%.*]] = inttoptr i32 [[TMP46]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP47]], i32 0 -; POST-PROCESS-NEXT: [[TMP49:%.*]] = load i32, ptr addrspace(21) [[TMP48]], align 4 +; POST-PROCESS-NEXT: [[TMP48:%.*]] = inttoptr i32 [[TMP46]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP48]], i32 0 +; POST-PROCESS-NEXT: [[TMP49:%.*]] = load i32, ptr addrspace(21) [[TMP51]], align 4 ; POST-PROCESS-NEXT: [[TMP50:%.*]] = add i32 [[TMP3]], 24 -; POST-PROCESS-NEXT: [[TMP51:%.*]] = inttoptr i32 [[TMP50]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP51]], i32 0 -; POST-PROCESS-NEXT: [[TMP53:%.*]] = load i32, ptr addrspace(21) [[TMP52]], align 4 +; POST-PROCESS-NEXT: [[TMP52:%.*]] = inttoptr i32 [[TMP50]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP55:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP52]], i32 0 +; POST-PROCESS-NEXT: [[TMP53:%.*]] = load i32, ptr addrspace(21) [[TMP55]], align 4 ; POST-PROCESS-NEXT: [[TMP54:%.*]] = add i32 [[TMP3]], 28 -; POST-PROCESS-NEXT: [[TMP55:%.*]] = inttoptr i32 [[TMP54]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP55]], i32 0 -; POST-PROCESS-NEXT: [[TMP57:%.*]] = load i32, ptr addrspace(21) [[TMP56]], align 4 +; POST-PROCESS-NEXT: [[TMP56:%.*]] = inttoptr i32 [[TMP54]] to ptr addrspace(21) +; POST-PROCESS-NEXT: 
[[TMP59:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP56]], i32 0 +; POST-PROCESS-NEXT: [[TMP57:%.*]] = load i32, ptr addrspace(21) [[TMP59]], align 4 ; POST-PROCESS-NEXT: [[TMP58:%.*]] = add i32 [[TMP3]], 32 -; POST-PROCESS-NEXT: [[TMP59:%.*]] = inttoptr i32 [[TMP58]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP59]], i32 0 -; POST-PROCESS-NEXT: [[TMP61:%.*]] = load i32, ptr addrspace(21) [[TMP60]], align 4 +; POST-PROCESS-NEXT: [[TMP60:%.*]] = inttoptr i32 [[TMP58]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP63:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP60]], i32 0 +; POST-PROCESS-NEXT: [[TMP61:%.*]] = load i32, ptr addrspace(21) [[TMP63]], align 4 ; POST-PROCESS-NEXT: [[TMP62:%.*]] = add i32 [[TMP3]], 36 -; POST-PROCESS-NEXT: [[TMP63:%.*]] = inttoptr i32 [[TMP62]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP63]], i32 0 -; POST-PROCESS-NEXT: [[TMP65:%.*]] = load i32, ptr addrspace(21) [[TMP64]], align 4 +; POST-PROCESS-NEXT: [[TMP64:%.*]] = inttoptr i32 [[TMP62]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP67:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP64]], i32 0 +; POST-PROCESS-NEXT: [[TMP65:%.*]] = load i32, ptr addrspace(21) [[TMP67]], align 4 ; POST-PROCESS-NEXT: [[TMP66:%.*]] = add i32 [[TMP3]], 40 -; POST-PROCESS-NEXT: [[TMP67:%.*]] = inttoptr i32 [[TMP66]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP67]], i32 0 -; POST-PROCESS-NEXT: [[TMP69:%.*]] = load i32, ptr addrspace(21) [[TMP68]], align 4 +; POST-PROCESS-NEXT: [[TMP68:%.*]] = inttoptr i32 [[TMP66]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP71:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP68]], i32 0 +; POST-PROCESS-NEXT: [[TMP69:%.*]] = load i32, ptr addrspace(21) [[TMP71]], align 4 ; POST-PROCESS-NEXT: [[TMP70:%.*]] = add i32 [[TMP3]], 44 -; POST-PROCESS-NEXT: [[TMP71:%.*]] = inttoptr i32 [[TMP70]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP72:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP71]], i32 0 -; POST-PROCESS-NEXT: [[TMP73:%.*]] = load i32, ptr addrspace(21) [[TMP72]], align 4 +; POST-PROCESS-NEXT: [[TMP72:%.*]] = inttoptr i32 [[TMP70]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP75:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP72]], i32 0 +; POST-PROCESS-NEXT: [[TMP73:%.*]] = load i32, ptr addrspace(21) [[TMP75]], align 4 ; POST-PROCESS-NEXT: [[TMP74:%.*]] = add i32 [[TMP3]], 48 -; POST-PROCESS-NEXT: [[TMP75:%.*]] = inttoptr i32 [[TMP74]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP76:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP75]], i32 0 -; POST-PROCESS-NEXT: [[TMP77:%.*]] = load i32, ptr addrspace(21) [[TMP76]], align 4 +; POST-PROCESS-NEXT: [[TMP76:%.*]] = inttoptr i32 [[TMP74]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP79:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP76]], i32 0 +; POST-PROCESS-NEXT: [[TMP77:%.*]] = load i32, ptr addrspace(21) [[TMP79]], align 4 ; POST-PROCESS-NEXT: [[TMP78:%.*]] = add i32 [[TMP3]], 52 -; POST-PROCESS-NEXT: [[TMP79:%.*]] = inttoptr i32 [[TMP78]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP80:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP79]], i32 0 -; POST-PROCESS-NEXT: [[TMP81:%.*]] = load i32, ptr addrspace(21) [[TMP80]], align 4 +; POST-PROCESS-NEXT: [[TMP80:%.*]] = inttoptr i32 [[TMP78]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP83:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP80]], i32 0 +; POST-PROCESS-NEXT: [[TMP81:%.*]] = 
load i32, ptr addrspace(21) [[TMP83]], align 4 ; POST-PROCESS-NEXT: [[TMP82:%.*]] = add i32 [[TMP3]], 56 -; POST-PROCESS-NEXT: [[TMP83:%.*]] = inttoptr i32 [[TMP82]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP84:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP83]], i32 0 -; POST-PROCESS-NEXT: [[TMP85:%.*]] = load i32, ptr addrspace(21) [[TMP84]], align 4 +; POST-PROCESS-NEXT: [[TMP84:%.*]] = inttoptr i32 [[TMP82]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP87:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP84]], i32 0 +; POST-PROCESS-NEXT: [[TMP85:%.*]] = load i32, ptr addrspace(21) [[TMP87]], align 4 ; POST-PROCESS-NEXT: [[TMP86:%.*]] = add i32 [[TMP3]], 60 -; POST-PROCESS-NEXT: [[TMP87:%.*]] = inttoptr i32 [[TMP86]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP88:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP87]], i32 0 -; POST-PROCESS-NEXT: [[TMP89:%.*]] = load i32, ptr addrspace(21) [[TMP88]], align 4 +; POST-PROCESS-NEXT: [[TMP88:%.*]] = inttoptr i32 [[TMP86]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP91:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP88]], i32 0 +; POST-PROCESS-NEXT: [[TMP89:%.*]] = load i32, ptr addrspace(21) [[TMP91]], align 4 ; POST-PROCESS-NEXT: [[TMP90:%.*]] = add i32 [[TMP3]], 64 -; POST-PROCESS-NEXT: [[TMP91:%.*]] = inttoptr i32 [[TMP90]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP92:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP91]], i32 0 -; POST-PROCESS-NEXT: [[TMP93:%.*]] = load i32, ptr addrspace(21) [[TMP92]], align 4 +; POST-PROCESS-NEXT: [[TMP92:%.*]] = inttoptr i32 [[TMP90]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP95:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP92]], i32 0 +; POST-PROCESS-NEXT: [[TMP93:%.*]] = load i32, ptr addrspace(21) [[TMP95]], align 4 ; POST-PROCESS-NEXT: [[TMP94:%.*]] = add i32 [[TMP3]], 68 -; POST-PROCESS-NEXT: [[TMP95:%.*]] = inttoptr i32 [[TMP94]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP96:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP95]], i32 0 -; POST-PROCESS-NEXT: [[TMP97:%.*]] = load i32, ptr addrspace(21) [[TMP96]], align 4 +; POST-PROCESS-NEXT: [[TMP96:%.*]] = inttoptr i32 [[TMP94]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP99:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP96]], i32 0 +; POST-PROCESS-NEXT: [[TMP97:%.*]] = load i32, ptr addrspace(21) [[TMP99]], align 4 ; POST-PROCESS-NEXT: [[TMP98:%.*]] = add i32 [[TMP3]], 72 -; POST-PROCESS-NEXT: [[TMP99:%.*]] = inttoptr i32 [[TMP98]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP100:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP99]], i32 0 -; POST-PROCESS-NEXT: [[TMP101:%.*]] = load i32, ptr addrspace(21) [[TMP100]], align 4 +; POST-PROCESS-NEXT: [[TMP100:%.*]] = inttoptr i32 [[TMP98]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP103:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP100]], i32 0 +; POST-PROCESS-NEXT: [[TMP101:%.*]] = load i32, ptr addrspace(21) [[TMP103]], align 4 ; POST-PROCESS-NEXT: [[TMP102:%.*]] = add i32 [[TMP3]], 76 -; POST-PROCESS-NEXT: [[TMP103:%.*]] = inttoptr i32 [[TMP102]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP104:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP103]], i32 0 -; POST-PROCESS-NEXT: [[TMP105:%.*]] = load i32, ptr addrspace(21) [[TMP104]], align 4 +; POST-PROCESS-NEXT: [[TMP104:%.*]] = inttoptr i32 [[TMP102]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP107:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP104]], i32 0 +; POST-PROCESS-NEXT: [[TMP105:%.*]] = load i32, ptr addrspace(21) [[TMP107]], align 4 ; POST-PROCESS-NEXT: [[TMP106:%.*]] = add 
i32 [[TMP3]], 80 -; POST-PROCESS-NEXT: [[TMP107:%.*]] = inttoptr i32 [[TMP106]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP108:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP107]], i32 0 -; POST-PROCESS-NEXT: [[TMP109:%.*]] = load i32, ptr addrspace(21) [[TMP108]], align 4 +; POST-PROCESS-NEXT: [[TMP108:%.*]] = inttoptr i32 [[TMP106]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP111:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP108]], i32 0 +; POST-PROCESS-NEXT: [[TMP109:%.*]] = load i32, ptr addrspace(21) [[TMP111]], align 4 ; POST-PROCESS-NEXT: [[TMP110:%.*]] = add i32 [[TMP3]], 84 -; POST-PROCESS-NEXT: [[TMP111:%.*]] = inttoptr i32 [[TMP110]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP112:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP111]], i32 0 -; POST-PROCESS-NEXT: [[TMP113:%.*]] = load i32, ptr addrspace(21) [[TMP112]], align 4 +; POST-PROCESS-NEXT: [[TMP112:%.*]] = inttoptr i32 [[TMP110]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP115:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP112]], i32 0 +; POST-PROCESS-NEXT: [[TMP113:%.*]] = load i32, ptr addrspace(21) [[TMP115]], align 4 ; POST-PROCESS-NEXT: [[TMP114:%.*]] = add i32 [[TMP3]], 88 -; POST-PROCESS-NEXT: [[TMP115:%.*]] = inttoptr i32 [[TMP114]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP116:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP115]], i32 0 -; POST-PROCESS-NEXT: [[TMP117:%.*]] = load i32, ptr addrspace(21) [[TMP116]], align 4 +; POST-PROCESS-NEXT: [[TMP116:%.*]] = inttoptr i32 [[TMP114]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP119:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP116]], i32 0 +; POST-PROCESS-NEXT: [[TMP117:%.*]] = load i32, ptr addrspace(21) [[TMP119]], align 4 ; POST-PROCESS-NEXT: [[TMP118:%.*]] = add i32 [[TMP3]], 92 -; POST-PROCESS-NEXT: [[TMP119:%.*]] = inttoptr i32 [[TMP118]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP120:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP119]], i32 0 -; POST-PROCESS-NEXT: [[TMP121:%.*]] = load i32, ptr addrspace(21) [[TMP120]], align 4 +; POST-PROCESS-NEXT: [[TMP120:%.*]] = inttoptr i32 [[TMP118]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP123:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP120]], i32 0 +; POST-PROCESS-NEXT: [[TMP121:%.*]] = load i32, ptr addrspace(21) [[TMP123]], align 4 ; POST-PROCESS-NEXT: [[TMP122:%.*]] = add i32 [[TMP3]], 96 -; POST-PROCESS-NEXT: [[TMP123:%.*]] = inttoptr i32 [[TMP122]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP124:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP123]], i32 0 -; POST-PROCESS-NEXT: [[TMP125:%.*]] = load i32, ptr addrspace(21) [[TMP124]], align 4 +; POST-PROCESS-NEXT: [[TMP124:%.*]] = inttoptr i32 [[TMP122]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP127:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP124]], i32 0 +; POST-PROCESS-NEXT: [[TMP125:%.*]] = load i32, ptr addrspace(21) [[TMP127]], align 4 ; POST-PROCESS-NEXT: [[TMP126:%.*]] = add i32 [[TMP3]], 100 -; POST-PROCESS-NEXT: [[TMP127:%.*]] = inttoptr i32 [[TMP126]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP128:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP127]], i32 0 -; POST-PROCESS-NEXT: [[TMP129:%.*]] = load i32, ptr addrspace(21) [[TMP128]], align 4 +; POST-PROCESS-NEXT: [[TMP128:%.*]] = inttoptr i32 [[TMP126]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP131:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP128]], i32 0 +; POST-PROCESS-NEXT: [[TMP129:%.*]] = load i32, ptr addrspace(21) [[TMP131]], align 4 ; POST-PROCESS-NEXT: [[TMP130:%.*]] = add i32 [[TMP3]], 104 -; 
POST-PROCESS-NEXT: [[TMP131:%.*]] = inttoptr i32 [[TMP130]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP132:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP131]], i32 0 -; POST-PROCESS-NEXT: [[TMP133:%.*]] = load i32, ptr addrspace(21) [[TMP132]], align 4 +; POST-PROCESS-NEXT: [[TMP132:%.*]] = inttoptr i32 [[TMP130]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP136:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP132]], i32 0 +; POST-PROCESS-NEXT: [[TMP133:%.*]] = load i32, ptr addrspace(21) [[TMP136]], align 4 ; POST-PROCESS-NEXT: [[TMP134:%.*]] = load i32, ptr addrspace(20) @REGISTERS, align 4 -; POST-PROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 +; POST-PROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], 0 ; POST-PROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; POST-PROCESS-NEXT: [[TMP135:%.*]] = add i32 [[TMP2]], 116 -; POST-PROCESS-NEXT: [[TMP136:%.*]] = inttoptr i32 [[TMP135]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP137:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP136]], i32 0 -; POST-PROCESS-NEXT: [[DOTRELOAD:%.*]] = load i32, ptr addrspace(21) [[TMP137]], align 4 +; POST-PROCESS-NEXT: [[TMP137:%.*]] = inttoptr i32 [[TMP135]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP139:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP137]], i32 0 +; POST-PROCESS-NEXT: [[DOTRELOAD:%.*]] = load i32, ptr addrspace(21) [[TMP139]], align 4 ; POST-PROCESS-NEXT: [[TMP138:%.*]] = add i32 [[TMP2]], 108 -; POST-PROCESS-NEXT: [[TMP139:%.*]] = inttoptr i32 [[TMP138]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP140:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP139]], i32 0 +; POST-PROCESS-NEXT: [[TMP142:%.*]] = inttoptr i32 [[TMP138]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP140:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP142]], i32 0 ; POST-PROCESS-NEXT: [[RETURNADDR_RELOAD:%.*]] = load i64, ptr addrspace(21) [[TMP140]], align 4 ; POST-PROCESS-NEXT: store i32 [[DOTRELOAD]], ptr addrspace(20) @REGISTERS, align 4 ; POST-PROCESS-NEXT: [[TMP141:%.*]] = load i32, ptr addrspace(20) @REGISTERS, align 4 @@ -2145,119 +2157,119 @@ attributes #3 = { nounwind } ; POST-PROCESS-NEXT: store i32 [[TMP24]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 27), align 4 ; POST-PROCESS-NEXT: store i32 [[TMP25]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 28), align 4 ; POST-PROCESS-NEXT: store i32 [[TMP26]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 29), align 4 -; POST-PROCESS-NEXT: [[TMP142:%.*]] = inttoptr i32 [[TMP141]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP143:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP142]], i32 0 +; POST-PROCESS-NEXT: [[TMP145:%.*]] = inttoptr i32 [[TMP141]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP143:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP145]], i32 0 ; POST-PROCESS-NEXT: store i32 [[TMP29]], ptr addrspace(21) [[TMP143]], align 4 ; POST-PROCESS-NEXT: [[TMP144:%.*]] = add i32 [[TMP141]], 4 -; POST-PROCESS-NEXT: [[TMP145:%.*]] = inttoptr i32 [[TMP144]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP146:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP145]], i32 0 +; POST-PROCESS-NEXT: [[TMP148:%.*]] = inttoptr i32 [[TMP144]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP146:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP148]], i32 0 ; POST-PROCESS-NEXT: store i32 [[TMP33]], ptr addrspace(21) [[TMP146]], align 4 ; 
POST-PROCESS-NEXT: [[TMP147:%.*]] = add i32 [[TMP141]], 8 -; POST-PROCESS-NEXT: [[TMP148:%.*]] = inttoptr i32 [[TMP147]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP149:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP148]], i32 0 +; POST-PROCESS-NEXT: [[TMP151:%.*]] = inttoptr i32 [[TMP147]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP149:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP151]], i32 0 ; POST-PROCESS-NEXT: store i32 [[TMP37]], ptr addrspace(21) [[TMP149]], align 4 ; POST-PROCESS-NEXT: [[TMP150:%.*]] = add i32 [[TMP141]], 12 -; POST-PROCESS-NEXT: [[TMP151:%.*]] = inttoptr i32 [[TMP150]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP152:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP151]], i32 0 +; POST-PROCESS-NEXT: [[TMP154:%.*]] = inttoptr i32 [[TMP150]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP152:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP154]], i32 0 ; POST-PROCESS-NEXT: store i32 [[TMP41]], ptr addrspace(21) [[TMP152]], align 4 ; POST-PROCESS-NEXT: [[TMP153:%.*]] = add i32 [[TMP141]], 16 -; POST-PROCESS-NEXT: [[TMP154:%.*]] = inttoptr i32 [[TMP153]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP155:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP154]], i32 0 +; POST-PROCESS-NEXT: [[TMP157:%.*]] = inttoptr i32 [[TMP153]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP155:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP157]], i32 0 ; POST-PROCESS-NEXT: store i32 [[TMP45]], ptr addrspace(21) [[TMP155]], align 4 ; POST-PROCESS-NEXT: [[TMP156:%.*]] = add i32 [[TMP141]], 20 -; POST-PROCESS-NEXT: [[TMP157:%.*]] = inttoptr i32 [[TMP156]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP158:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP157]], i32 0 +; POST-PROCESS-NEXT: [[TMP160:%.*]] = inttoptr i32 [[TMP156]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP158:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP160]], i32 0 ; POST-PROCESS-NEXT: store i32 [[TMP49]], ptr addrspace(21) [[TMP158]], align 4 ; POST-PROCESS-NEXT: [[TMP159:%.*]] = add i32 [[TMP141]], 24 -; POST-PROCESS-NEXT: [[TMP160:%.*]] = inttoptr i32 [[TMP159]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP161:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP160]], i32 0 +; POST-PROCESS-NEXT: [[TMP163:%.*]] = inttoptr i32 [[TMP159]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP161:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP163]], i32 0 ; POST-PROCESS-NEXT: store i32 [[TMP53]], ptr addrspace(21) [[TMP161]], align 4 ; POST-PROCESS-NEXT: [[TMP162:%.*]] = add i32 [[TMP141]], 28 -; POST-PROCESS-NEXT: [[TMP163:%.*]] = inttoptr i32 [[TMP162]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP164:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP163]], i32 0 +; POST-PROCESS-NEXT: [[TMP166:%.*]] = inttoptr i32 [[TMP162]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP164:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP166]], i32 0 ; POST-PROCESS-NEXT: store i32 [[TMP57]], ptr addrspace(21) [[TMP164]], align 4 ; POST-PROCESS-NEXT: [[TMP165:%.*]] = add i32 [[TMP141]], 32 -; POST-PROCESS-NEXT: [[TMP166:%.*]] = inttoptr i32 [[TMP165]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP167:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP166]], i32 0 +; POST-PROCESS-NEXT: [[TMP169:%.*]] = inttoptr i32 [[TMP165]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP167:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP169]], i32 0 ; POST-PROCESS-NEXT: store i32 [[TMP61]], ptr addrspace(21) [[TMP167]], align 4 ; POST-PROCESS-NEXT: [[TMP168:%.*]] = add i32 [[TMP141]], 36 -; POST-PROCESS-NEXT: 
[[TMP169:%.*]] = inttoptr i32 [[TMP168]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP170:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP169]], i32 0 +; POST-PROCESS-NEXT: [[TMP172:%.*]] = inttoptr i32 [[TMP168]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP170:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP172]], i32 0 ; POST-PROCESS-NEXT: store i32 [[TMP65]], ptr addrspace(21) [[TMP170]], align 4 ; POST-PROCESS-NEXT: [[TMP171:%.*]] = add i32 [[TMP141]], 40 -; POST-PROCESS-NEXT: [[TMP172:%.*]] = inttoptr i32 [[TMP171]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP173:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP172]], i32 0 +; POST-PROCESS-NEXT: [[TMP175:%.*]] = inttoptr i32 [[TMP171]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP173:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP175]], i32 0 ; POST-PROCESS-NEXT: store i32 [[TMP69]], ptr addrspace(21) [[TMP173]], align 4 ; POST-PROCESS-NEXT: [[TMP174:%.*]] = add i32 [[TMP141]], 44 -; POST-PROCESS-NEXT: [[TMP175:%.*]] = inttoptr i32 [[TMP174]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP176:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP175]], i32 0 +; POST-PROCESS-NEXT: [[TMP178:%.*]] = inttoptr i32 [[TMP174]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP176:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP178]], i32 0 ; POST-PROCESS-NEXT: store i32 [[TMP73]], ptr addrspace(21) [[TMP176]], align 4 ; POST-PROCESS-NEXT: [[TMP177:%.*]] = add i32 [[TMP141]], 48 -; POST-PROCESS-NEXT: [[TMP178:%.*]] = inttoptr i32 [[TMP177]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP179:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP178]], i32 0 +; POST-PROCESS-NEXT: [[TMP181:%.*]] = inttoptr i32 [[TMP177]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP179:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP181]], i32 0 ; POST-PROCESS-NEXT: store i32 [[TMP77]], ptr addrspace(21) [[TMP179]], align 4 ; POST-PROCESS-NEXT: [[TMP180:%.*]] = add i32 [[TMP141]], 52 -; POST-PROCESS-NEXT: [[TMP181:%.*]] = inttoptr i32 [[TMP180]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP182:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP181]], i32 0 +; POST-PROCESS-NEXT: [[TMP184:%.*]] = inttoptr i32 [[TMP180]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP182:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP184]], i32 0 ; POST-PROCESS-NEXT: store i32 [[TMP81]], ptr addrspace(21) [[TMP182]], align 4 ; POST-PROCESS-NEXT: [[TMP183:%.*]] = add i32 [[TMP141]], 56 -; POST-PROCESS-NEXT: [[TMP184:%.*]] = inttoptr i32 [[TMP183]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP185:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP184]], i32 0 +; POST-PROCESS-NEXT: [[TMP187:%.*]] = inttoptr i32 [[TMP183]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP185:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP187]], i32 0 ; POST-PROCESS-NEXT: store i32 [[TMP85]], ptr addrspace(21) [[TMP185]], align 4 ; POST-PROCESS-NEXT: [[TMP186:%.*]] = add i32 [[TMP141]], 60 -; POST-PROCESS-NEXT: [[TMP187:%.*]] = inttoptr i32 [[TMP186]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP188:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP187]], i32 0 +; POST-PROCESS-NEXT: [[TMP190:%.*]] = inttoptr i32 [[TMP186]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP188:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP190]], i32 0 ; POST-PROCESS-NEXT: store i32 [[TMP89]], ptr addrspace(21) [[TMP188]], align 4 ; POST-PROCESS-NEXT: [[TMP189:%.*]] = add i32 [[TMP141]], 64 -; POST-PROCESS-NEXT: [[TMP190:%.*]] = inttoptr i32 [[TMP189]] to ptr addrspace(21) -; POST-PROCESS-NEXT: 
[[TMP191:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP190]], i32 0 +; POST-PROCESS-NEXT: [[TMP193:%.*]] = inttoptr i32 [[TMP189]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP191:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP193]], i32 0 ; POST-PROCESS-NEXT: store i32 [[TMP93]], ptr addrspace(21) [[TMP191]], align 4 ; POST-PROCESS-NEXT: [[TMP192:%.*]] = add i32 [[TMP141]], 68 -; POST-PROCESS-NEXT: [[TMP193:%.*]] = inttoptr i32 [[TMP192]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP194:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP193]], i32 0 +; POST-PROCESS-NEXT: [[TMP196:%.*]] = inttoptr i32 [[TMP192]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP194:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP196]], i32 0 ; POST-PROCESS-NEXT: store i32 [[TMP97]], ptr addrspace(21) [[TMP194]], align 4 ; POST-PROCESS-NEXT: [[TMP195:%.*]] = add i32 [[TMP141]], 72 -; POST-PROCESS-NEXT: [[TMP196:%.*]] = inttoptr i32 [[TMP195]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP197:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP196]], i32 0 +; POST-PROCESS-NEXT: [[TMP199:%.*]] = inttoptr i32 [[TMP195]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP197:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP199]], i32 0 ; POST-PROCESS-NEXT: store i32 [[TMP101]], ptr addrspace(21) [[TMP197]], align 4 ; POST-PROCESS-NEXT: [[TMP198:%.*]] = add i32 [[TMP141]], 76 -; POST-PROCESS-NEXT: [[TMP199:%.*]] = inttoptr i32 [[TMP198]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP200:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP199]], i32 0 +; POST-PROCESS-NEXT: [[TMP202:%.*]] = inttoptr i32 [[TMP198]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP200:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP202]], i32 0 ; POST-PROCESS-NEXT: store i32 [[TMP105]], ptr addrspace(21) [[TMP200]], align 4 ; POST-PROCESS-NEXT: [[TMP201:%.*]] = add i32 [[TMP141]], 80 -; POST-PROCESS-NEXT: [[TMP202:%.*]] = inttoptr i32 [[TMP201]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP203:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP202]], i32 0 +; POST-PROCESS-NEXT: [[TMP205:%.*]] = inttoptr i32 [[TMP201]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP203:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP205]], i32 0 ; POST-PROCESS-NEXT: store i32 [[TMP109]], ptr addrspace(21) [[TMP203]], align 4 ; POST-PROCESS-NEXT: [[TMP204:%.*]] = add i32 [[TMP141]], 84 -; POST-PROCESS-NEXT: [[TMP205:%.*]] = inttoptr i32 [[TMP204]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP206:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP205]], i32 0 +; POST-PROCESS-NEXT: [[TMP208:%.*]] = inttoptr i32 [[TMP204]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP206:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP208]], i32 0 ; POST-PROCESS-NEXT: store i32 [[TMP113]], ptr addrspace(21) [[TMP206]], align 4 ; POST-PROCESS-NEXT: [[TMP207:%.*]] = add i32 [[TMP141]], 88 -; POST-PROCESS-NEXT: [[TMP208:%.*]] = inttoptr i32 [[TMP207]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP209:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP208]], i32 0 +; POST-PROCESS-NEXT: [[TMP211:%.*]] = inttoptr i32 [[TMP207]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP209:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP211]], i32 0 ; POST-PROCESS-NEXT: store i32 [[TMP117]], ptr addrspace(21) [[TMP209]], align 4 ; POST-PROCESS-NEXT: [[TMP210:%.*]] = add i32 [[TMP141]], 92 -; POST-PROCESS-NEXT: [[TMP211:%.*]] = inttoptr i32 [[TMP210]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP212:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP211]], i32 0 +; 
POST-PROCESS-NEXT: [[TMP214:%.*]] = inttoptr i32 [[TMP210]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP212:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP214]], i32 0 ; POST-PROCESS-NEXT: store i32 [[TMP121]], ptr addrspace(21) [[TMP212]], align 4 ; POST-PROCESS-NEXT: [[TMP213:%.*]] = add i32 [[TMP141]], 96 -; POST-PROCESS-NEXT: [[TMP214:%.*]] = inttoptr i32 [[TMP213]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP215:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP214]], i32 0 +; POST-PROCESS-NEXT: [[TMP217:%.*]] = inttoptr i32 [[TMP213]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP215:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP217]], i32 0 ; POST-PROCESS-NEXT: store i32 [[TMP125]], ptr addrspace(21) [[TMP215]], align 4 ; POST-PROCESS-NEXT: [[TMP216:%.*]] = add i32 [[TMP141]], 100 -; POST-PROCESS-NEXT: [[TMP217:%.*]] = inttoptr i32 [[TMP216]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP218:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP217]], i32 0 +; POST-PROCESS-NEXT: [[TMP220:%.*]] = inttoptr i32 [[TMP216]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP218:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP220]], i32 0 ; POST-PROCESS-NEXT: store i32 [[TMP129]], ptr addrspace(21) [[TMP218]], align 4 ; POST-PROCESS-NEXT: [[TMP219:%.*]] = add i32 [[TMP141]], 104 -; POST-PROCESS-NEXT: [[TMP220:%.*]] = inttoptr i32 [[TMP219]] to ptr addrspace(21) -; POST-PROCESS-NEXT: [[TMP221:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP220]], i32 0 +; POST-PROCESS-NEXT: [[TMP225:%.*]] = inttoptr i32 [[TMP219]] to ptr addrspace(21) +; POST-PROCESS-NEXT: [[TMP221:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP225]], i32 0 ; POST-PROCESS-NEXT: store i32 [[TMP133]], ptr addrspace(21) [[TMP221]], align 4 ; POST-PROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT]], 0 ; POST-PROCESS-NEXT: [[TMP222:%.*]] = load i32, ptr [[CSP]], align 4 ; POST-PROCESS-NEXT: [[TMP223:%.*]] = add i32 [[TMP222]], -120 ; POST-PROCESS-NEXT: store i32 [[TMP223]], ptr [[CSP]], align 4 ; POST-PROCESS-NEXT: [[TMP224:%.*]] = load i32, ptr [[CSP]], align 4 -; POST-PROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP224]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META17]] +; POST-PROCESS-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP224]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META17]] ; POST-PROCESS-NEXT: unreachable ; ; @@ -2274,18 +2286,16 @@ attributes #3 = { nounwind } ; ; ; POST-PROCESS-GLOBAL-LABEL: define void @main( -; POST-PROCESS-GLOBAL-SAME: ) !lgc.rt.shaderstage [[META8:![0-9]+]] !continuation.entry [[META19:![0-9]+]] !continuation.registercount [[META8]] !continuation [[META20:![0-9]+]] !continuation.stacksize [[META21:![0-9]+]] !continuation.state [[META8]] { +; POST-PROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8:![0-9]+]] !continuation.entry [[META19:![0-9]+]] !continuation.registercount [[META8]] !continuation [[META20:![0-9]+]] !continuation.stacksize [[META21:![0-9]+]] !continuation.state [[META8]] { ; POST-PROCESS-GLOBAL-NEXT: AllocaSpillBB: ; POST-PROCESS-GLOBAL-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP0:%.*]] = call i32 @_cont_GetContinuationStackAddr() -; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP0]], ptr [[CSP]], align 4 +; POST-PROCESS-GLOBAL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP1:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() ; POST-PROCESS-GLOBAL-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(22) -; POST-PROCESS-GLOBAL-NEXT: [[SYSTEM_DATA:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[_CONT_SETUPRAYGEN:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]() ; POST-PROCESS-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr [[CSP]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 108 ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP4]], ptr [[CSP]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[SYSTEM_DATA]], 0 +; POST-PROCESS-GLOBAL-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 ; POST-PROCESS-GLOBAL-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; POST-PROCESS-GLOBAL-NEXT: [[TMP5:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP6:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 @@ -2295,6 +2305,8 @@ attributes #3 = { nounwind } ; POST-PROCESS-GLOBAL-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT]], 0 ; POST-PROCESS-GLOBAL-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 ; POST-PROCESS-GLOBAL-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 +; POST-PROCESS-GLOBAL-NEXT: [[TMP11:%.*]] = call i64 @continuation.getAddrAndMD(ptr @main.resume.0) +; POST-PROCESS-GLOBAL-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[TMP11]], 5 ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP3]], ptr addrspace(20) @REGISTERS, align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(20) @REGISTERS, align 4 ; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 7), align 4 @@ -2320,99 +2332,98 @@ attributes #3 = { nounwind } ; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(20) getelementptr inbounds 
(i32, ptr addrspace(20) @REGISTERS, i32 27), align 4 ; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 28), align 4 ; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 29), align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP10]] -; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(22) [[TMP11]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP12:%.*]] = add i32 [[TMP10]], 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP12]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP10]] ; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(22) [[TMP13]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP14:%.*]] = add i32 [[TMP10]], 8 -; POST-PROCESS-GLOBAL-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP14]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP12:%.*]] = add i32 [[TMP10]], 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP12]] ; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(22) [[TMP15]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP16:%.*]] = add i32 [[TMP10]], 12 -; POST-PROCESS-GLOBAL-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP16]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP14:%.*]] = add i32 [[TMP10]], 8 +; POST-PROCESS-GLOBAL-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP14]] ; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(22) [[TMP17]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP18:%.*]] = add i32 [[TMP10]], 16 -; POST-PROCESS-GLOBAL-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP18]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP16:%.*]] = add i32 [[TMP10]], 12 +; POST-PROCESS-GLOBAL-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP16]] ; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(22) [[TMP19]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP20:%.*]] = add i32 [[TMP10]], 20 -; POST-PROCESS-GLOBAL-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP20]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP18:%.*]] = add i32 [[TMP10]], 16 +; POST-PROCESS-GLOBAL-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP18]] ; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(22) [[TMP21]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP22:%.*]] = add i32 [[TMP10]], 24 -; POST-PROCESS-GLOBAL-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP22]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP20:%.*]] = add i32 [[TMP10]], 20 +; POST-PROCESS-GLOBAL-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP20]] ; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(22) [[TMP23]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP24:%.*]] = add i32 [[TMP10]], 28 -; POST-PROCESS-GLOBAL-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP24]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP22:%.*]] = add i32 [[TMP10]], 24 +; POST-PROCESS-GLOBAL-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP22]] ; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(22) [[TMP25]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP26:%.*]] = add i32 [[TMP10]], 32 -; POST-PROCESS-GLOBAL-NEXT: [[TMP27:%.*]] = 
getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP26]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP24:%.*]] = add i32 [[TMP10]], 28 +; POST-PROCESS-GLOBAL-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP24]] ; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(22) [[TMP27]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP28:%.*]] = add i32 [[TMP10]], 36 -; POST-PROCESS-GLOBAL-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP28]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP26:%.*]] = add i32 [[TMP10]], 32 +; POST-PROCESS-GLOBAL-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP26]] ; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(22) [[TMP29]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP30:%.*]] = add i32 [[TMP10]], 40 -; POST-PROCESS-GLOBAL-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP30]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP28:%.*]] = add i32 [[TMP10]], 36 +; POST-PROCESS-GLOBAL-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP28]] ; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(22) [[TMP31]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP32:%.*]] = add i32 [[TMP10]], 44 -; POST-PROCESS-GLOBAL-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP32]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP30:%.*]] = add i32 [[TMP10]], 40 +; POST-PROCESS-GLOBAL-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP30]] ; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(22) [[TMP33]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP34:%.*]] = add i32 [[TMP10]], 48 -; POST-PROCESS-GLOBAL-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP34]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP32:%.*]] = add i32 [[TMP10]], 44 +; POST-PROCESS-GLOBAL-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP32]] ; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(22) [[TMP35]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP36:%.*]] = add i32 [[TMP10]], 52 -; POST-PROCESS-GLOBAL-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP36]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP34:%.*]] = add i32 [[TMP10]], 48 +; POST-PROCESS-GLOBAL-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP34]] ; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(22) [[TMP37]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP38:%.*]] = add i32 [[TMP10]], 56 -; POST-PROCESS-GLOBAL-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP38]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP36:%.*]] = add i32 [[TMP10]], 52 +; POST-PROCESS-GLOBAL-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP36]] ; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(22) [[TMP39]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP40:%.*]] = add i32 [[TMP10]], 60 -; POST-PROCESS-GLOBAL-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP40]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP38:%.*]] = add i32 [[TMP10]], 56 +; POST-PROCESS-GLOBAL-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP38]] ; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(22) [[TMP41]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP42:%.*]] = add i32 [[TMP10]], 64 -; POST-PROCESS-GLOBAL-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP42]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP40:%.*]] = add i32 [[TMP10]], 60 +; 
POST-PROCESS-GLOBAL-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP40]] ; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(22) [[TMP43]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP44:%.*]] = add i32 [[TMP10]], 68 -; POST-PROCESS-GLOBAL-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP44]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP42:%.*]] = add i32 [[TMP10]], 64 +; POST-PROCESS-GLOBAL-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP42]] ; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(22) [[TMP45]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP46:%.*]] = add i32 [[TMP10]], 72 -; POST-PROCESS-GLOBAL-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP46]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP44:%.*]] = add i32 [[TMP10]], 68 +; POST-PROCESS-GLOBAL-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP44]] ; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(22) [[TMP47]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP48:%.*]] = add i32 [[TMP10]], 76 -; POST-PROCESS-GLOBAL-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP48]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP46:%.*]] = add i32 [[TMP10]], 72 +; POST-PROCESS-GLOBAL-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP46]] ; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(22) [[TMP49]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP50:%.*]] = add i32 [[TMP10]], 80 -; POST-PROCESS-GLOBAL-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP50]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP48:%.*]] = add i32 [[TMP10]], 76 +; POST-PROCESS-GLOBAL-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP48]] ; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(22) [[TMP51]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP52:%.*]] = add i32 [[TMP10]], 84 -; POST-PROCESS-GLOBAL-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP52]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP50:%.*]] = add i32 [[TMP10]], 80 +; POST-PROCESS-GLOBAL-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP50]] ; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(22) [[TMP53]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP54:%.*]] = add i32 [[TMP10]], 88 -; POST-PROCESS-GLOBAL-NEXT: [[TMP55:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP54]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP52:%.*]] = add i32 [[TMP10]], 84 +; POST-PROCESS-GLOBAL-NEXT: [[TMP55:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP52]] ; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(22) [[TMP55]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP56:%.*]] = add i32 [[TMP10]], 92 -; POST-PROCESS-GLOBAL-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP56]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP54:%.*]] = add i32 [[TMP10]], 88 +; POST-PROCESS-GLOBAL-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP54]] ; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(22) [[TMP57]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP58:%.*]] = add i32 [[TMP10]], 96 -; POST-PROCESS-GLOBAL-NEXT: [[TMP59:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP58]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP56:%.*]] = add i32 [[TMP10]], 92 +; POST-PROCESS-GLOBAL-NEXT: [[TMP59:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP56]] ; POST-PROCESS-GLOBAL-NEXT: store i32 undef, 
ptr addrspace(22) [[TMP59]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP60:%.*]] = add i32 [[TMP10]], 100 -; POST-PROCESS-GLOBAL-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP60]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP58:%.*]] = add i32 [[TMP10]], 96 +; POST-PROCESS-GLOBAL-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP58]] ; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(22) [[TMP61]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP62:%.*]] = add i32 [[TMP10]], 104 -; POST-PROCESS-GLOBAL-NEXT: [[TMP63:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP62]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP60:%.*]] = add i32 [[TMP10]], 100 +; POST-PROCESS-GLOBAL-NEXT: [[TMP63:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP60]] ; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(22) [[TMP63]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP62:%.*]] = add i32 [[TMP10]], 104 +; POST-PROCESS-GLOBAL-NEXT: [[TMP65:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP62]] +; POST-PROCESS-GLOBAL-NEXT: store i32 undef, ptr addrspace(22) [[TMP65]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP64:%.*]] = load i32, ptr [[CSP]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP65:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @main.resume.0 to i64)) -; POST-PROCESS-GLOBAL-NEXT: call void (i64, ...) @continuation.continue(i64 4, i32 [[TMP64]], i64 [[TMP65]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]]), !continuation.registercount [[META18:![0-9]+]], !continuation.returnedRegistercount !18 +; POST-PROCESS-GLOBAL-NEXT: call void (i64, i64, ...) @continuation.waitContinue(i64 4, i64 -1, i32 [[TMP64]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]]), !continuation.registercount [[META18:![0-9]+]], !continuation.returnedRegistercount [[META18]] ; POST-PROCESS-GLOBAL-NEXT: unreachable ; ; ; POST-PROCESS-GLOBAL-LABEL: define dso_local void @main.resume.0( -; POST-PROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.registercount [[META18]] !continuation [[META20]] { +; POST-PROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP1:%.*]]) !lgc.rt.shaderstage [[META8]] !continuation.registercount [[META18]] !continuation [[META20]] { ; POST-PROCESS-GLOBAL-NEXT: entryresume.0: ; POST-PROCESS-GLOBAL-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POST-PROCESS-GLOBAL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP1:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() -; POST-PROCESS-GLOBAL-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(22) +; POST-PROCESS-GLOBAL-NEXT: [[TMP2:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() +; POST-PROCESS-GLOBAL-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP2]] to ptr addrspace(22) ; POST-PROCESS-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr [[CSP]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], -108 ; POST-PROCESS-GLOBAL-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(20) @REGISTERS, align 4 @@ -2439,88 +2450,88 @@ attributes #3 = { nounwind } ; POST-PROCESS-GLOBAL-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 27), align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP27:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 28), align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP28:%.*]] = load i32, ptr 
addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 29), align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP5]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP30:%.*]] = load i32, ptr addrspace(22) [[TMP29]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP5]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP32:%.*]] = load i32, ptr addrspace(22) [[TMP30]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP31:%.*]] = add i32 [[TMP5]], 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP31]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(22) [[TMP32]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP31]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP35:%.*]] = load i32, ptr addrspace(22) [[TMP33]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP34:%.*]] = add i32 [[TMP5]], 8 -; POST-PROCESS-GLOBAL-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP34]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(22) [[TMP35]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP34]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(22) [[TMP36]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP37:%.*]] = add i32 [[TMP5]], 12 -; POST-PROCESS-GLOBAL-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP37]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP39:%.*]] = load i32, ptr addrspace(22) [[TMP38]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP37]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP41:%.*]] = load i32, ptr addrspace(22) [[TMP39]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP40:%.*]] = add i32 [[TMP5]], 16 -; POST-PROCESS-GLOBAL-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP40]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP42:%.*]] = load i32, ptr addrspace(22) [[TMP41]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP40]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP44:%.*]] = load i32, ptr addrspace(22) [[TMP42]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP43:%.*]] = add i32 [[TMP5]], 20 -; POST-PROCESS-GLOBAL-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP43]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP45:%.*]] = load i32, ptr addrspace(22) [[TMP44]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP43]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP47:%.*]] = load i32, ptr addrspace(22) [[TMP45]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP46:%.*]] = add i32 [[TMP5]], 24 -; POST-PROCESS-GLOBAL-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP46]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP48:%.*]] = load i32, ptr addrspace(22) [[TMP47]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP46]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP50:%.*]] = load i32, ptr addrspace(22) [[TMP48]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP49:%.*]] = add i32 [[TMP5]], 28 -; POST-PROCESS-GLOBAL-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP49]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP51:%.*]] = load i32, ptr addrspace(22) [[TMP50]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP51:%.*]] = 
getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP49]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP53:%.*]] = load i32, ptr addrspace(22) [[TMP51]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP52:%.*]] = add i32 [[TMP5]], 32 -; POST-PROCESS-GLOBAL-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP52]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP54:%.*]] = load i32, ptr addrspace(22) [[TMP53]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP52]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP56:%.*]] = load i32, ptr addrspace(22) [[TMP54]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP55:%.*]] = add i32 [[TMP5]], 36 -; POST-PROCESS-GLOBAL-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP55]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP57:%.*]] = load i32, ptr addrspace(22) [[TMP56]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP55]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP59:%.*]] = load i32, ptr addrspace(22) [[TMP57]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP58:%.*]] = add i32 [[TMP5]], 40 -; POST-PROCESS-GLOBAL-NEXT: [[TMP59:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP58]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP60:%.*]] = load i32, ptr addrspace(22) [[TMP59]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP58]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP62:%.*]] = load i32, ptr addrspace(22) [[TMP60]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP61:%.*]] = add i32 [[TMP5]], 44 -; POST-PROCESS-GLOBAL-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP61]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP63:%.*]] = load i32, ptr addrspace(22) [[TMP62]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP63:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP61]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP65:%.*]] = load i32, ptr addrspace(22) [[TMP63]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP64:%.*]] = add i32 [[TMP5]], 48 -; POST-PROCESS-GLOBAL-NEXT: [[TMP65:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP64]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP66:%.*]] = load i32, ptr addrspace(22) [[TMP65]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP66:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP64]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP68:%.*]] = load i32, ptr addrspace(22) [[TMP66]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP67:%.*]] = add i32 [[TMP5]], 52 -; POST-PROCESS-GLOBAL-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP67]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP69:%.*]] = load i32, ptr addrspace(22) [[TMP68]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP69:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP67]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP71:%.*]] = load i32, ptr addrspace(22) [[TMP69]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP70:%.*]] = add i32 [[TMP5]], 56 -; POST-PROCESS-GLOBAL-NEXT: [[TMP71:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP70]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP72:%.*]] = load i32, ptr addrspace(22) [[TMP71]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP72:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP70]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP74:%.*]] = load i32, ptr addrspace(22) [[TMP72]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP73:%.*]] = add i32 [[TMP5]], 60 -; POST-PROCESS-GLOBAL-NEXT: [[TMP74:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP73]] -; 
POST-PROCESS-GLOBAL-NEXT: [[TMP75:%.*]] = load i32, ptr addrspace(22) [[TMP74]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP75:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP73]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP77:%.*]] = load i32, ptr addrspace(22) [[TMP75]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP76:%.*]] = add i32 [[TMP5]], 64 -; POST-PROCESS-GLOBAL-NEXT: [[TMP77:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP76]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP78:%.*]] = load i32, ptr addrspace(22) [[TMP77]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP78:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP76]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP80:%.*]] = load i32, ptr addrspace(22) [[TMP78]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP79:%.*]] = add i32 [[TMP5]], 68 -; POST-PROCESS-GLOBAL-NEXT: [[TMP80:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP79]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP81:%.*]] = load i32, ptr addrspace(22) [[TMP80]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP81:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP79]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP83:%.*]] = load i32, ptr addrspace(22) [[TMP81]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP82:%.*]] = add i32 [[TMP5]], 72 -; POST-PROCESS-GLOBAL-NEXT: [[TMP83:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP82]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP84:%.*]] = load i32, ptr addrspace(22) [[TMP83]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP84:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP82]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP86:%.*]] = load i32, ptr addrspace(22) [[TMP84]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP85:%.*]] = add i32 [[TMP5]], 76 -; POST-PROCESS-GLOBAL-NEXT: [[TMP86:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP85]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP87:%.*]] = load i32, ptr addrspace(22) [[TMP86]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP87:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP85]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP89:%.*]] = load i32, ptr addrspace(22) [[TMP87]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP88:%.*]] = add i32 [[TMP5]], 80 -; POST-PROCESS-GLOBAL-NEXT: [[TMP89:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP88]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP90:%.*]] = load i32, ptr addrspace(22) [[TMP89]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP90:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP88]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP92:%.*]] = load i32, ptr addrspace(22) [[TMP90]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP91:%.*]] = add i32 [[TMP5]], 84 -; POST-PROCESS-GLOBAL-NEXT: [[TMP92:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP91]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP93:%.*]] = load i32, ptr addrspace(22) [[TMP92]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP93:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP91]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP95:%.*]] = load i32, ptr addrspace(22) [[TMP93]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP94:%.*]] = add i32 [[TMP5]], 88 -; POST-PROCESS-GLOBAL-NEXT: [[TMP95:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP94]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP96:%.*]] = load i32, ptr addrspace(22) [[TMP95]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP96:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP94]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP98:%.*]] = load i32, ptr addrspace(22) [[TMP96]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP97:%.*]] = add 
i32 [[TMP5]], 92 -; POST-PROCESS-GLOBAL-NEXT: [[TMP98:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP97]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP99:%.*]] = load i32, ptr addrspace(22) [[TMP98]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP99:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP97]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP101:%.*]] = load i32, ptr addrspace(22) [[TMP99]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP100:%.*]] = add i32 [[TMP5]], 96 -; POST-PROCESS-GLOBAL-NEXT: [[TMP101:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP100]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP102:%.*]] = load i32, ptr addrspace(22) [[TMP101]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP102:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP100]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP104:%.*]] = load i32, ptr addrspace(22) [[TMP102]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP103:%.*]] = add i32 [[TMP5]], 100 -; POST-PROCESS-GLOBAL-NEXT: [[TMP104:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP103]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP105:%.*]] = load i32, ptr addrspace(22) [[TMP104]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP105:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP103]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP107:%.*]] = load i32, ptr addrspace(22) [[TMP105]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP106:%.*]] = add i32 [[TMP5]], 104 -; POST-PROCESS-GLOBAL-NEXT: [[TMP107:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP106]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP108:%.*]] = load i32, ptr addrspace(22) [[TMP107]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP108:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP106]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP112:%.*]] = load i32, ptr addrspace(22) [[TMP108]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP109:%.*]] = load i32, ptr addrspace(20) @REGISTERS, align 4 -; POST-PROCESS-GLOBAL-NEXT: [[DOTFCA_0_EXTRACT1:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 +; POST-PROCESS-GLOBAL-NEXT: [[DOTFCA_0_EXTRACT1:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], 0 ; POST-PROCESS-GLOBAL-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; POST-PROCESS-GLOBAL-NEXT: [[TMP110:%.*]] = load i32, ptr [[CSP]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP111:%.*]] = add i32 [[TMP110]], -108 @@ -2832,7 +2843,7 @@ attributes #3 = { nounwind } ; POST-PROCESS-GLOBAL-NEXT: [[DOTFCA_1_1_LOAD:%.*]] = load i32, ptr [[DOTFCA_1_1_GEP10]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[DOTFCA_1_1_INSERT:%.*]] = insertvalue [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_0_INSERT]], i32 [[DOTFCA_1_1_LOAD]], 1, 1 ; POST-PROCESS-GLOBAL-NEXT: [[TMP171:%.*]] = load i32, ptr [[CSP]], align 4 -; POST-PROCESS-GLOBAL-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP171]], [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_1_INSERT]]), !continuation.registercount [[META18]] +; POST-PROCESS-GLOBAL-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[RETURNADDR]], i32 [[TMP171]], i64 poison, [[STRUCT_ANYHITTRAVERSALDATA]] [[DOTFCA_1_1_INSERT]]), !continuation.registercount [[META18]] ; POST-PROCESS-GLOBAL-NEXT: unreachable ; ; @@ -2974,6 +2985,8 @@ attributes #3 = { nounwind } ; POST-PROCESS-GLOBAL-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison, <3 x i32> [[DOTFCA_0_0_EXTRACT]], 0 ; POST-PROCESS-GLOBAL-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 ; POST-PROCESS-GLOBAL-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 +; POST-PROCESS-GLOBAL-NEXT: [[TMP122:%.*]] = call i64 @continuation.getAddrAndMD(ptr @ClosestHit.resume.0) +; POST-PROCESS-GLOBAL-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[TMP122]], 5 ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP3]], ptr addrspace(20) @REGISTERS, align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP121:%.*]] = load i32, ptr addrspace(20) @REGISTERS, align 4 ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP8]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 7), align 4 @@ -2999,99 +3012,98 @@ attributes #3 = { nounwind } ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP28]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 27), align 4 ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP29]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 28), align 4 ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP30]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 29), align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP122:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP121]] -; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP32]], ptr addrspace(22) [[TMP122]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP124:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP121]] +; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP32]], ptr addrspace(22) [[TMP124]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP123:%.*]] = add i32 [[TMP121]], 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP124:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP123]] -; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP35]], ptr addrspace(22) [[TMP124]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP126:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP123]] +; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP35]], ptr addrspace(22) [[TMP126]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP125:%.*]] = add i32 [[TMP121]], 8 -; POST-PROCESS-GLOBAL-NEXT: [[TMP126:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP125]] -; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP38]], ptr addrspace(22) [[TMP126]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP128:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP125]] +; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP38]], ptr addrspace(22) [[TMP128]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP127:%.*]] = add i32 [[TMP121]], 12 -; POST-PROCESS-GLOBAL-NEXT: [[TMP128:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP127]] -; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP41]], ptr addrspace(22) [[TMP128]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP130:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP127]] +; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP41]], ptr addrspace(22) [[TMP130]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP129:%.*]] = add i32 
[[TMP121]], 16 -; POST-PROCESS-GLOBAL-NEXT: [[TMP130:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP129]] -; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP44]], ptr addrspace(22) [[TMP130]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP132:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP129]] +; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP44]], ptr addrspace(22) [[TMP132]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP131:%.*]] = add i32 [[TMP121]], 20 -; POST-PROCESS-GLOBAL-NEXT: [[TMP132:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP131]] -; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP47]], ptr addrspace(22) [[TMP132]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP134:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP131]] +; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP47]], ptr addrspace(22) [[TMP134]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP133:%.*]] = add i32 [[TMP121]], 24 -; POST-PROCESS-GLOBAL-NEXT: [[TMP134:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP133]] -; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP50]], ptr addrspace(22) [[TMP134]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP136:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP133]] +; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP50]], ptr addrspace(22) [[TMP136]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP135:%.*]] = add i32 [[TMP121]], 28 -; POST-PROCESS-GLOBAL-NEXT: [[TMP136:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP135]] -; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP53]], ptr addrspace(22) [[TMP136]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP138:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP135]] +; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP53]], ptr addrspace(22) [[TMP138]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP137:%.*]] = add i32 [[TMP121]], 32 -; POST-PROCESS-GLOBAL-NEXT: [[TMP138:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP137]] -; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP56]], ptr addrspace(22) [[TMP138]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP140:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP137]] +; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP56]], ptr addrspace(22) [[TMP140]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP139:%.*]] = add i32 [[TMP121]], 36 -; POST-PROCESS-GLOBAL-NEXT: [[TMP140:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP139]] -; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP59]], ptr addrspace(22) [[TMP140]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP142:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP139]] +; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP59]], ptr addrspace(22) [[TMP142]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP141:%.*]] = add i32 [[TMP121]], 40 -; POST-PROCESS-GLOBAL-NEXT: [[TMP142:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP141]] -; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP62]], ptr addrspace(22) [[TMP142]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP144:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP141]] +; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP62]], ptr addrspace(22) [[TMP144]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP143:%.*]] = add i32 [[TMP121]], 44 -; POST-PROCESS-GLOBAL-NEXT: [[TMP144:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP143]] -; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP65]], ptr addrspace(22) [[TMP144]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP146:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP143]] +; POST-PROCESS-GLOBAL-NEXT: 
store i32 [[TMP65]], ptr addrspace(22) [[TMP146]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP145:%.*]] = add i32 [[TMP121]], 48 -; POST-PROCESS-GLOBAL-NEXT: [[TMP146:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP145]] -; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP68]], ptr addrspace(22) [[TMP146]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP148:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP145]] +; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP68]], ptr addrspace(22) [[TMP148]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP147:%.*]] = add i32 [[TMP121]], 52 -; POST-PROCESS-GLOBAL-NEXT: [[TMP148:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP147]] -; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP71]], ptr addrspace(22) [[TMP148]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP150:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP147]] +; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP71]], ptr addrspace(22) [[TMP150]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP149:%.*]] = add i32 [[TMP121]], 56 -; POST-PROCESS-GLOBAL-NEXT: [[TMP150:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP149]] -; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP74]], ptr addrspace(22) [[TMP150]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP152:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP149]] +; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP74]], ptr addrspace(22) [[TMP152]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP151:%.*]] = add i32 [[TMP121]], 60 -; POST-PROCESS-GLOBAL-NEXT: [[TMP152:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP151]] -; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP77]], ptr addrspace(22) [[TMP152]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP154:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP151]] +; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP77]], ptr addrspace(22) [[TMP154]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP153:%.*]] = add i32 [[TMP121]], 64 -; POST-PROCESS-GLOBAL-NEXT: [[TMP154:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP153]] -; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP80]], ptr addrspace(22) [[TMP154]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP156:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP153]] +; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP80]], ptr addrspace(22) [[TMP156]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP155:%.*]] = add i32 [[TMP121]], 68 -; POST-PROCESS-GLOBAL-NEXT: [[TMP156:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP155]] -; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP83]], ptr addrspace(22) [[TMP156]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP158:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP155]] +; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP83]], ptr addrspace(22) [[TMP158]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP157:%.*]] = add i32 [[TMP121]], 72 -; POST-PROCESS-GLOBAL-NEXT: [[TMP158:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP157]] -; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP86]], ptr addrspace(22) [[TMP158]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP160:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP157]] +; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP86]], ptr addrspace(22) [[TMP160]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP159:%.*]] = add i32 [[TMP121]], 76 -; POST-PROCESS-GLOBAL-NEXT: [[TMP160:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP159]] -; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP89]], ptr addrspace(22) [[TMP160]], align 4 +; 
POST-PROCESS-GLOBAL-NEXT: [[TMP162:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP159]] +; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP89]], ptr addrspace(22) [[TMP162]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP161:%.*]] = add i32 [[TMP121]], 80 -; POST-PROCESS-GLOBAL-NEXT: [[TMP162:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP161]] -; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP92]], ptr addrspace(22) [[TMP162]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP164:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP161]] +; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP92]], ptr addrspace(22) [[TMP164]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP163:%.*]] = add i32 [[TMP121]], 84 -; POST-PROCESS-GLOBAL-NEXT: [[TMP164:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP163]] -; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP95]], ptr addrspace(22) [[TMP164]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP166:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP163]] +; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP95]], ptr addrspace(22) [[TMP166]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP165:%.*]] = add i32 [[TMP121]], 88 -; POST-PROCESS-GLOBAL-NEXT: [[TMP166:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP165]] -; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP98]], ptr addrspace(22) [[TMP166]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP168:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP165]] +; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP98]], ptr addrspace(22) [[TMP168]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP167:%.*]] = add i32 [[TMP121]], 92 -; POST-PROCESS-GLOBAL-NEXT: [[TMP168:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP167]] -; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP101]], ptr addrspace(22) [[TMP168]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP170:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP167]] +; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP101]], ptr addrspace(22) [[TMP170]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP169:%.*]] = add i32 [[TMP121]], 96 -; POST-PROCESS-GLOBAL-NEXT: [[TMP170:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP169]] -; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP104]], ptr addrspace(22) [[TMP170]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP172:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP169]] +; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP104]], ptr addrspace(22) [[TMP172]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP171:%.*]] = add i32 [[TMP121]], 100 -; POST-PROCESS-GLOBAL-NEXT: [[TMP172:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP171]] -; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP107]], ptr addrspace(22) [[TMP172]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP174:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP171]] +; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP107]], ptr addrspace(22) [[TMP174]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP173:%.*]] = add i32 [[TMP121]], 104 -; POST-PROCESS-GLOBAL-NEXT: [[TMP174:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP173]] -; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP110]], ptr addrspace(22) [[TMP174]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP176:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP173]] +; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP110]], ptr addrspace(22) [[TMP176]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP175:%.*]] = load i32, ptr [[CSP]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP176:%.*]] = call i64 
@continuation.getAddrAndMD(i64 ptrtoint (ptr @ClosestHit.resume.0 to i64)) -; POST-PROCESS-GLOBAL-NEXT: call void (i64, ...) @continuation.continue(i64 4, i32 [[TMP175]], i64 [[TMP176]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]]), !continuation.registercount [[META18]], !continuation.returnedRegistercount !18 +; POST-PROCESS-GLOBAL-NEXT: call void (i64, i64, ...) @continuation.waitContinue(i64 4, i64 -1, i32 [[TMP175]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]]), !continuation.registercount [[META18]], !continuation.returnedRegistercount [[META18]] ; POST-PROCESS-GLOBAL-NEXT: unreachable ; ; ; POST-PROCESS-GLOBAL-LABEL: define dso_local void @ClosestHit.resume.0( -; POST-PROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META24]] !continuation.registercount [[META18]] !continuation [[META25]] { +; POST-PROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP1:%.*]]) !lgc.rt.shaderstage [[META24]] !continuation.registercount [[META18]] !continuation [[META25]] { ; POST-PROCESS-GLOBAL-NEXT: entryresume.0: ; POST-PROCESS-GLOBAL-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POST-PROCESS-GLOBAL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP1:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() -; POST-PROCESS-GLOBAL-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(22) +; POST-PROCESS-GLOBAL-NEXT: [[TMP2:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() +; POST-PROCESS-GLOBAL-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP2]] to ptr addrspace(22) ; POST-PROCESS-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr [[CSP]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], -120 ; POST-PROCESS-GLOBAL-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(20) @REGISTERS, align 4 @@ -3118,94 +3130,94 @@ attributes #3 = { nounwind } ; POST-PROCESS-GLOBAL-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 27), align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP27:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 28), align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 29), align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP5]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP30:%.*]] = load i32, ptr addrspace(22) [[TMP29]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP5]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP30:%.*]] = load i32, ptr addrspace(22) [[TMP32]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP31:%.*]] = add i32 [[TMP5]], 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP31]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(22) [[TMP32]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP31]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(22) [[TMP35]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP34:%.*]] = add i32 [[TMP5]], 8 -; POST-PROCESS-GLOBAL-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP34]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(22) [[TMP35]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr 
addrspace(22) [[TMP29]], i32 [[TMP34]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(22) [[TMP38]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP37:%.*]] = add i32 [[TMP5]], 12 -; POST-PROCESS-GLOBAL-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP37]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP39:%.*]] = load i32, ptr addrspace(22) [[TMP38]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP37]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP39:%.*]] = load i32, ptr addrspace(22) [[TMP41]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP40:%.*]] = add i32 [[TMP5]], 16 -; POST-PROCESS-GLOBAL-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP40]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP42:%.*]] = load i32, ptr addrspace(22) [[TMP41]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP40]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP42:%.*]] = load i32, ptr addrspace(22) [[TMP44]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP43:%.*]] = add i32 [[TMP5]], 20 -; POST-PROCESS-GLOBAL-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP43]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP45:%.*]] = load i32, ptr addrspace(22) [[TMP44]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP43]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP45:%.*]] = load i32, ptr addrspace(22) [[TMP47]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP46:%.*]] = add i32 [[TMP5]], 24 -; POST-PROCESS-GLOBAL-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP46]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP48:%.*]] = load i32, ptr addrspace(22) [[TMP47]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP46]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP48:%.*]] = load i32, ptr addrspace(22) [[TMP50]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP49:%.*]] = add i32 [[TMP5]], 28 -; POST-PROCESS-GLOBAL-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP49]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP51:%.*]] = load i32, ptr addrspace(22) [[TMP50]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP49]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP51:%.*]] = load i32, ptr addrspace(22) [[TMP53]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP52:%.*]] = add i32 [[TMP5]], 32 -; POST-PROCESS-GLOBAL-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP52]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP54:%.*]] = load i32, ptr addrspace(22) [[TMP53]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP52]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP54:%.*]] = load i32, ptr addrspace(22) [[TMP56]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP55:%.*]] = add i32 [[TMP5]], 36 -; POST-PROCESS-GLOBAL-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP55]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP57:%.*]] = load i32, ptr addrspace(22) [[TMP56]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP59:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP55]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP57:%.*]] = load i32, ptr addrspace(22) [[TMP59]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP58:%.*]] = add i32 [[TMP5]], 40 -; POST-PROCESS-GLOBAL-NEXT: [[TMP59:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP58]] -; POST-PROCESS-GLOBAL-NEXT: 
[[TMP60:%.*]] = load i32, ptr addrspace(22) [[TMP59]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP58]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP60:%.*]] = load i32, ptr addrspace(22) [[TMP62]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP61:%.*]] = add i32 [[TMP5]], 44 -; POST-PROCESS-GLOBAL-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP61]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP63:%.*]] = load i32, ptr addrspace(22) [[TMP62]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP65:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP61]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP63:%.*]] = load i32, ptr addrspace(22) [[TMP65]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP64:%.*]] = add i32 [[TMP5]], 48 -; POST-PROCESS-GLOBAL-NEXT: [[TMP65:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP64]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP66:%.*]] = load i32, ptr addrspace(22) [[TMP65]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP64]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP66:%.*]] = load i32, ptr addrspace(22) [[TMP68]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP67:%.*]] = add i32 [[TMP5]], 52 -; POST-PROCESS-GLOBAL-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP67]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP69:%.*]] = load i32, ptr addrspace(22) [[TMP68]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP71:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP67]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP69:%.*]] = load i32, ptr addrspace(22) [[TMP71]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP70:%.*]] = add i32 [[TMP5]], 56 -; POST-PROCESS-GLOBAL-NEXT: [[TMP71:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP70]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP72:%.*]] = load i32, ptr addrspace(22) [[TMP71]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP74:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP70]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP72:%.*]] = load i32, ptr addrspace(22) [[TMP74]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP73:%.*]] = add i32 [[TMP5]], 60 -; POST-PROCESS-GLOBAL-NEXT: [[TMP74:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP73]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP75:%.*]] = load i32, ptr addrspace(22) [[TMP74]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP77:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP73]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP75:%.*]] = load i32, ptr addrspace(22) [[TMP77]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP76:%.*]] = add i32 [[TMP5]], 64 -; POST-PROCESS-GLOBAL-NEXT: [[TMP77:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP76]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP78:%.*]] = load i32, ptr addrspace(22) [[TMP77]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP80:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP76]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP78:%.*]] = load i32, ptr addrspace(22) [[TMP80]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP79:%.*]] = add i32 [[TMP5]], 68 -; POST-PROCESS-GLOBAL-NEXT: [[TMP80:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP79]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP81:%.*]] = load i32, ptr addrspace(22) [[TMP80]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP83:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP79]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP81:%.*]] = load i32, ptr addrspace(22) [[TMP83]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP82:%.*]] = add i32 [[TMP5]], 72 -; 
POST-PROCESS-GLOBAL-NEXT: [[TMP83:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP82]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP84:%.*]] = load i32, ptr addrspace(22) [[TMP83]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP86:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP82]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP84:%.*]] = load i32, ptr addrspace(22) [[TMP86]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP85:%.*]] = add i32 [[TMP5]], 76 -; POST-PROCESS-GLOBAL-NEXT: [[TMP86:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP85]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP87:%.*]] = load i32, ptr addrspace(22) [[TMP86]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP89:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP85]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP87:%.*]] = load i32, ptr addrspace(22) [[TMP89]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP88:%.*]] = add i32 [[TMP5]], 80 -; POST-PROCESS-GLOBAL-NEXT: [[TMP89:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP88]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP90:%.*]] = load i32, ptr addrspace(22) [[TMP89]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP92:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP88]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP90:%.*]] = load i32, ptr addrspace(22) [[TMP92]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP91:%.*]] = add i32 [[TMP5]], 84 -; POST-PROCESS-GLOBAL-NEXT: [[TMP92:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP91]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP93:%.*]] = load i32, ptr addrspace(22) [[TMP92]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP95:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP91]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP93:%.*]] = load i32, ptr addrspace(22) [[TMP95]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP94:%.*]] = add i32 [[TMP5]], 88 -; POST-PROCESS-GLOBAL-NEXT: [[TMP95:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP94]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP96:%.*]] = load i32, ptr addrspace(22) [[TMP95]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP98:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP94]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP96:%.*]] = load i32, ptr addrspace(22) [[TMP98]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP97:%.*]] = add i32 [[TMP5]], 92 -; POST-PROCESS-GLOBAL-NEXT: [[TMP98:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP97]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP99:%.*]] = load i32, ptr addrspace(22) [[TMP98]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP101:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP97]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP99:%.*]] = load i32, ptr addrspace(22) [[TMP101]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP100:%.*]] = add i32 [[TMP5]], 96 -; POST-PROCESS-GLOBAL-NEXT: [[TMP101:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP100]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP102:%.*]] = load i32, ptr addrspace(22) [[TMP101]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP104:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP100]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP102:%.*]] = load i32, ptr addrspace(22) [[TMP104]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP103:%.*]] = add i32 [[TMP5]], 100 -; POST-PROCESS-GLOBAL-NEXT: [[TMP104:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP103]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP105:%.*]] = load i32, ptr addrspace(22) [[TMP104]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP107:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP103]] +; 
POST-PROCESS-GLOBAL-NEXT: [[TMP105:%.*]] = load i32, ptr addrspace(22) [[TMP107]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP106:%.*]] = add i32 [[TMP5]], 104 -; POST-PROCESS-GLOBAL-NEXT: [[TMP107:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP106]] -; POST-PROCESS-GLOBAL-NEXT: [[TMP108:%.*]] = load i32, ptr addrspace(22) [[TMP107]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP111:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP106]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP108:%.*]] = load i32, ptr addrspace(22) [[TMP111]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP109:%.*]] = load i32, ptr addrspace(20) @REGISTERS, align 4 -; POST-PROCESS-GLOBAL-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 +; POST-PROCESS-GLOBAL-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], 0 ; POST-PROCESS-GLOBAL-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; POST-PROCESS-GLOBAL-NEXT: [[TMP110:%.*]] = add i32 [[TMP4]], 116 -; POST-PROCESS-GLOBAL-NEXT: [[TMP111:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP110]] -; POST-PROCESS-GLOBAL-NEXT: [[DOTRELOAD:%.*]] = load i32, ptr addrspace(22) [[TMP111]], align 4 +; POST-PROCESS-GLOBAL-NEXT: [[TMP171:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP110]] +; POST-PROCESS-GLOBAL-NEXT: [[DOTRELOAD:%.*]] = load i32, ptr addrspace(22) [[TMP171]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP112:%.*]] = add i32 [[TMP4]], 108 -; POST-PROCESS-GLOBAL-NEXT: [[TMP113:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP112]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP113:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP112]] ; POST-PROCESS-GLOBAL-NEXT: [[RETURNADDR_RELOAD:%.*]] = load i64, ptr addrspace(22) [[TMP113]], align 4 ; POST-PROCESS-GLOBAL-NEXT: store i32 [[DOTRELOAD]], ptr addrspace(20) @REGISTERS, align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP114:%.*]] = load i32, ptr addrspace(20) @REGISTERS, align 4 @@ -3232,91 +3244,91 @@ attributes #3 = { nounwind } ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP26]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 27), align 4 ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP27]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 28), align 4 ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP28]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 29), align 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP115:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP114]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP115:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP114]] ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP30]], ptr addrspace(22) [[TMP115]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP116:%.*]] = add i32 [[TMP114]], 4 -; POST-PROCESS-GLOBAL-NEXT: [[TMP117:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP116]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP117:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP116]] ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP33]], ptr addrspace(22) [[TMP117]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP118:%.*]] = add i32 [[TMP114]], 8 -; POST-PROCESS-GLOBAL-NEXT: [[TMP119:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP118]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP119:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP118]] ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP36]], ptr addrspace(22) [[TMP119]], align 4 ; POST-PROCESS-GLOBAL-NEXT: 
[[TMP120:%.*]] = add i32 [[TMP114]], 12 -; POST-PROCESS-GLOBAL-NEXT: [[TMP121:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP120]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP121:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP120]] ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP39]], ptr addrspace(22) [[TMP121]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP122:%.*]] = add i32 [[TMP114]], 16 -; POST-PROCESS-GLOBAL-NEXT: [[TMP123:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP122]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP123:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP122]] ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP42]], ptr addrspace(22) [[TMP123]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP124:%.*]] = add i32 [[TMP114]], 20 -; POST-PROCESS-GLOBAL-NEXT: [[TMP125:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP124]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP125:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP124]] ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP45]], ptr addrspace(22) [[TMP125]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP126:%.*]] = add i32 [[TMP114]], 24 -; POST-PROCESS-GLOBAL-NEXT: [[TMP127:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP126]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP127:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP126]] ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP48]], ptr addrspace(22) [[TMP127]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP128:%.*]] = add i32 [[TMP114]], 28 -; POST-PROCESS-GLOBAL-NEXT: [[TMP129:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP128]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP129:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP128]] ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP51]], ptr addrspace(22) [[TMP129]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP130:%.*]] = add i32 [[TMP114]], 32 -; POST-PROCESS-GLOBAL-NEXT: [[TMP131:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP130]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP131:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP130]] ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP54]], ptr addrspace(22) [[TMP131]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP132:%.*]] = add i32 [[TMP114]], 36 -; POST-PROCESS-GLOBAL-NEXT: [[TMP133:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP132]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP133:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP132]] ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP57]], ptr addrspace(22) [[TMP133]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP134:%.*]] = add i32 [[TMP114]], 40 -; POST-PROCESS-GLOBAL-NEXT: [[TMP135:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP134]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP135:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP134]] ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP60]], ptr addrspace(22) [[TMP135]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP136:%.*]] = add i32 [[TMP114]], 44 -; POST-PROCESS-GLOBAL-NEXT: [[TMP137:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP136]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP137:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP136]] ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP63]], ptr addrspace(22) [[TMP137]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP138:%.*]] = add i32 [[TMP114]], 48 -; POST-PROCESS-GLOBAL-NEXT: [[TMP139:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP138]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP139:%.*]] = getelementptr i8, ptr 
addrspace(22) [[TMP29]], i32 [[TMP138]] ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP66]], ptr addrspace(22) [[TMP139]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP140:%.*]] = add i32 [[TMP114]], 52 -; POST-PROCESS-GLOBAL-NEXT: [[TMP141:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP140]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP141:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP140]] ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP69]], ptr addrspace(22) [[TMP141]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP142:%.*]] = add i32 [[TMP114]], 56 -; POST-PROCESS-GLOBAL-NEXT: [[TMP143:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP142]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP143:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP142]] ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP72]], ptr addrspace(22) [[TMP143]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP144:%.*]] = add i32 [[TMP114]], 60 -; POST-PROCESS-GLOBAL-NEXT: [[TMP145:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP144]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP145:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP144]] ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP75]], ptr addrspace(22) [[TMP145]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP146:%.*]] = add i32 [[TMP114]], 64 -; POST-PROCESS-GLOBAL-NEXT: [[TMP147:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP146]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP147:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP146]] ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP78]], ptr addrspace(22) [[TMP147]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP148:%.*]] = add i32 [[TMP114]], 68 -; POST-PROCESS-GLOBAL-NEXT: [[TMP149:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP148]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP149:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP148]] ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP81]], ptr addrspace(22) [[TMP149]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP150:%.*]] = add i32 [[TMP114]], 72 -; POST-PROCESS-GLOBAL-NEXT: [[TMP151:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP150]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP151:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP150]] ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP84]], ptr addrspace(22) [[TMP151]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP152:%.*]] = add i32 [[TMP114]], 76 -; POST-PROCESS-GLOBAL-NEXT: [[TMP153:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP152]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP153:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP152]] ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP87]], ptr addrspace(22) [[TMP153]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP154:%.*]] = add i32 [[TMP114]], 80 -; POST-PROCESS-GLOBAL-NEXT: [[TMP155:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP154]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP155:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP154]] ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP90]], ptr addrspace(22) [[TMP155]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP156:%.*]] = add i32 [[TMP114]], 84 -; POST-PROCESS-GLOBAL-NEXT: [[TMP157:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP156]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP157:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP156]] ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP93]], ptr addrspace(22) [[TMP157]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP158:%.*]] = add i32 [[TMP114]], 88 -; POST-PROCESS-GLOBAL-NEXT: 
[[TMP159:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP158]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP159:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP158]] ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP96]], ptr addrspace(22) [[TMP159]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP160:%.*]] = add i32 [[TMP114]], 92 -; POST-PROCESS-GLOBAL-NEXT: [[TMP161:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP160]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP161:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP160]] ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP99]], ptr addrspace(22) [[TMP161]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP162:%.*]] = add i32 [[TMP114]], 96 -; POST-PROCESS-GLOBAL-NEXT: [[TMP163:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP162]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP163:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP162]] ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP102]], ptr addrspace(22) [[TMP163]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP164:%.*]] = add i32 [[TMP114]], 100 -; POST-PROCESS-GLOBAL-NEXT: [[TMP165:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP164]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP165:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP164]] ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP105]], ptr addrspace(22) [[TMP165]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP166:%.*]] = add i32 [[TMP114]], 104 -; POST-PROCESS-GLOBAL-NEXT: [[TMP167:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP166]] +; POST-PROCESS-GLOBAL-NEXT: [[TMP167:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP29]], i32 [[TMP166]] ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP108]], ptr addrspace(22) [[TMP167]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT]], 0 ; POST-PROCESS-GLOBAL-NEXT: [[TMP168:%.*]] = load i32, ptr [[CSP]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP169:%.*]] = add i32 [[TMP168]], -120 ; POST-PROCESS-GLOBAL-NEXT: store i32 [[TMP169]], ptr [[CSP]], align 4 ; POST-PROCESS-GLOBAL-NEXT: [[TMP170:%.*]] = load i32, ptr [[CSP]], align 4 -; POST-PROCESS-GLOBAL-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP170]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META18]] +; POST-PROCESS-GLOBAL-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP170]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META18]] ; POST-PROCESS-GLOBAL-NEXT: unreachable ; diff --git a/llvmraytracing/test/dx/register-buffer.ll b/llvmraytracing/test/dx/register-buffer.ll index 5a73e2278a..1e75847e28 100644 --- a/llvmraytracing/test/dx/register-buffer.ll +++ b/llvmraytracing/test/dx/register-buffer.ll @@ -1,8 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --version 3 -; RUN: opt --verify-each -passes='register-buffer,lint,instsimplify' -S %s 2> %t.stderr | FileCheck %s -; RUN: count 0 < %t.stderr +; RUN: opt --verify-each -passes='register-buffer,lint,instsimplify' -S %s --lint-abort-on-error | FileCheck %s -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" @GLOBAL = external global [20 x i32], !registerbuffer !1 @GLOBAL_NO_REGS = external global [20 x i32], !registerbuffer !2 diff --git a/llvmraytracing/test/dx/remat-intrinsic.ll b/llvmraytracing/test/dx/remat-intrinsic.ll index c2e5314d43..51d909f62c 100644 --- a/llvmraytracing/test/dx/remat-intrinsic.ll +++ b/llvmraytracing/test/dx/remat-intrinsic.ll @@ -1,9 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3 ; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' \ -; RUN: -S %s 2> %t.stderr | FileCheck -check-prefix=POSTPROCESS %s -; RUN: count 0 < %t.stderr +; RUN: -S %s --lint-abort-on-error | FileCheck -check-prefix=POSTPROCESS %s -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %dx.types.Handle = type { i8* } %struct.DispatchSystemData = type { i32 } @@ -141,48 +140,48 @@ attributes #1 = { nounwind } ; POSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; POSTPROCESS-NEXT: [[TMP6:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[SYSTEM_DATA_ALLOCA]]) ; POSTPROCESS-NEXT: [[I:%.*]] = extractelement <3 x i32> [[TMP6]], i8 0 -; POSTPROCESS-NEXT: [[UNPACKED:%.*]] = call [[DX_TYPES_FOURI32:%.*]] @dx.op.unpack4x8.i32(i32 219, i8 1, i32 [[I]]) +; POSTPROCESS-NEXT: [[UNPACKED:%.*]] = call [[DX_TYPES_FOURI32:%.*]] [[DX_OP_UNPACK4X8_I32:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 219, i8 1, i32 [[I]]) ; POSTPROCESS-NEXT: [[HANDLE0:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr 
@"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; POSTPROCESS-NEXT: [[HANDLE1:%.*]] = call [[DX_TYPES_HANDLE]] @dx.op.createHandleForLib.dx.types.Handle(i32 160, [[DX_TYPES_HANDLE]] [[HANDLE0]]) -; POSTPROCESS-NEXT: [[HANDLE2:%.*]] = call [[DX_TYPES_HANDLE]] @dx.op.annotateHandle(i32 216, [[DX_TYPES_HANDLE]] [[HANDLE1]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; POSTPROCESS-NEXT: [[HANDLE1:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[HANDLE0]]) +; POSTPROCESS-NEXT: [[HANDLE2:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[HANDLE1]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; POSTPROCESS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT]], 0 ; POSTPROCESS-NEXT: store i32 [[TMP5]], ptr addrspace(20) @REGISTERS, align 4 +; POSTPROCESS-NEXT: [[TMP8:%.*]] = call i64 @continuation.getAddrAndMD(ptr @called.resume.0) ; POSTPROCESS-NEXT: [[TMP7:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: [[TMP8:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @called.resume.0 to i64)) -; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 2, i32 [[TMP7]], i64 [[TMP8]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]]), !continuation.registercount [[META14]], !continuation.returnedRegistercount !14 +; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 2, i32 [[TMP7]], i64 [[TMP8]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]]), !continuation.registercount [[META14]], !continuation.returnedRegistercount [[META14]] ; POSTPROCESS-NEXT: unreachable ; ; ; POSTPROCESS-LABEL: define dso_local void @called.resume.0( -; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !continuation [[META16]] !lgc.rt.shaderstage [[META17]] !continuation.registercount [[META14]] { +; POSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP1:%.*]]) !continuation [[META16]] !lgc.rt.shaderstage [[META17]] !continuation.registercount [[META14]] { ; POSTPROCESS-NEXT: entryresume.0: -; POSTPROCESS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 +; POSTPROCESS-NEXT: [[SYSTEM_DATA_ALLOCA1:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; POSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; POSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; POSTPROCESS-NEXT: [[TMP1:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], -8 +; POSTPROCESS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], ptr [[SYSTEM_DATA_ALLOCA1]], align 4 +; POSTPROCESS-NEXT: [[TMP13:%.*]] = load i32, ptr [[CSP]], align 4 +; POSTPROCESS-NEXT: [[TMP2:%.*]] = add i32 [[TMP13]], -8 ; POSTPROCESS-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(20) @REGISTERS, align 4 -; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT3:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 +; POSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT3:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], 0 ; POSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; POSTPROCESS-NEXT: [[TMP4:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(21) ; 
POSTPROCESS-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP4]], i32 0 ; POSTPROCESS-NEXT: [[RETURNADDR_RELOAD:%.*]] = load i64, ptr addrspace(21) [[TMP5]], align 4 ; POSTPROCESS-NEXT: [[HANDLE011:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; POSTPROCESS-NEXT: [[HANDLE110:%.*]] = call [[DX_TYPES_HANDLE]] @dx.op.createHandleForLib.dx.types.Handle(i32 160, [[DX_TYPES_HANDLE]] [[HANDLE011]]) -; POSTPROCESS-NEXT: [[HANDLE29:%.*]] = call [[DX_TYPES_HANDLE]] @dx.op.annotateHandle(i32 216, [[DX_TYPES_HANDLE]] [[HANDLE110]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) -; POSTPROCESS-NEXT: [[TMP6:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[SYSTEM_DATA_ALLOCA]]) +; POSTPROCESS-NEXT: [[HANDLE110:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[HANDLE011]]) +; POSTPROCESS-NEXT: [[HANDLE29:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[HANDLE110]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; POSTPROCESS-NEXT: [[TMP6:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[SYSTEM_DATA_ALLOCA1]]) ; POSTPROCESS-NEXT: [[I8:%.*]] = extractelement <3 x i32> [[TMP6]], i8 0 -; POSTPROCESS-NEXT: [[UNPACKED7:%.*]] = call [[DX_TYPES_FOURI32:%.*]] @dx.op.unpack4x8.i32(i32 219, i8 1, i32 [[I8]]) -; POSTPROCESS-NEXT: [[TMP7:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[SYSTEM_DATA_ALLOCA]]) +; POSTPROCESS-NEXT: [[UNPACKED7:%.*]] = call [[DX_TYPES_FOURI32:%.*]] [[DX_OP_UNPACK4X8_I32:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 219, i8 1, i32 [[I8]]) +; POSTPROCESS-NEXT: [[TMP7:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[SYSTEM_DATA_ALLOCA1]]) ; POSTPROCESS-NEXT: [[I6:%.*]] = extractelement <3 x i32> [[TMP7]], i8 0 -; POSTPROCESS-NEXT: [[UNPACKED5:%.*]] = call [[DX_TYPES_FOURI32]] @dx.op.unpack4x8.i32(i32 219, i8 1, i32 [[I6]]) -; POSTPROCESS-NEXT: [[TMP8:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[SYSTEM_DATA_ALLOCA]]) +; POSTPROCESS-NEXT: [[UNPACKED5:%.*]] = call [[DX_TYPES_FOURI32]] [[DX_OP_UNPACK4X8_I32]](i32 219, i8 1, i32 [[I6]]) +; POSTPROCESS-NEXT: [[TMP8:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[SYSTEM_DATA_ALLOCA1]]) ; POSTPROCESS-NEXT: [[I4:%.*]] = extractelement <3 x i32> [[TMP8]], i8 0 -; POSTPROCESS-NEXT: [[UNPACKED3:%.*]] = call [[DX_TYPES_FOURI32]] @dx.op.unpack4x8.i32(i32 219, i8 1, i32 [[I4]]) -; POSTPROCESS-NEXT: [[TMP9:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[SYSTEM_DATA_ALLOCA]]) +; POSTPROCESS-NEXT: [[UNPACKED3:%.*]] = call [[DX_TYPES_FOURI32]] [[DX_OP_UNPACK4X8_I32]](i32 219, i8 1, i32 [[I4]]) +; POSTPROCESS-NEXT: [[TMP9:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[SYSTEM_DATA_ALLOCA1]]) ; POSTPROCESS-NEXT: [[I2:%.*]] = extractelement <3 x i32> [[TMP9]], i8 0 -; POSTPROCESS-NEXT: [[UNPACKED1:%.*]] = call [[DX_TYPES_FOURI32]] @dx.op.unpack4x8.i32(i32 219, i8 1, i32 [[I2]]) +; POSTPROCESS-NEXT: [[UNPACKED1:%.*]] = call [[DX_TYPES_FOURI32]] [[DX_OP_UNPACK4X8_I32]](i32 219, i8 1, i32 [[I2]]) ; POSTPROCESS-NEXT: [[A:%.*]] = extractvalue [[DX_TYPES_FOURI32]] [[UNPACKED7]], 0 ; POSTPROCESS-NEXT: [[B:%.*]] = extractvalue [[DX_TYPES_FOURI32]] [[UNPACKED5]], 1 ; POSTPROCESS-NEXT: [[C:%.*]] = extractvalue [[DX_TYPES_FOURI32]] [[UNPACKED3]], 2 @@ -195,6 +194,6 @@ attributes #1 = { nounwind } ; 
POSTPROCESS-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -8 ; POSTPROCESS-NEXT: store i32 [[TMP11]], ptr [[CSP]], align 4 ; POSTPROCESS-NEXT: [[TMP12:%.*]] = load i32, ptr [[CSP]], align 4 -; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP12]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META14]] +; POSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP12]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META14]] ; POSTPROCESS-NEXT: unreachable ; diff --git a/llvmraytracing/test/dx/remove-types-metadata.ll b/llvmraytracing/test/dx/remove-types-metadata.ll index 36df137617..7694d52e5e 100644 --- a/llvmraytracing/test/dx/remove-types-metadata.ll +++ b/llvmraytracing/test/dx/remove-types-metadata.ll @@ -1,8 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals --version 3 -; RUN: opt --verify-each -passes='remove-types-metadata' -S %s 2> %t.stderr | FileCheck -check-prefix=METADATA %s -; RUN: count 0 < %t.stderr +; RUN: opt --verify-each -passes='remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=METADATA %s -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %dx.types.Handle = type { i8* } %struct.DispatchSystemData = type { <3 x i32> } @@ -72,7 +71,7 @@ define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64 %0, i32 %1, i ; METADATA-NEXT: [[TRAV_DATA:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA]], 0 ; METADATA-NEXT: [[ADDR:%.*]] = call i64 @_AmdGetResumePointAddr() #[[ATTR3:[0-9]+]] ; METADATA-NEXT: [[TRAV_DATA2:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA]], i64 [[ADDR]], 5 -; METADATA-NEXT: [[NEWDATA:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] @_AmdAwaitTraversal(i64 4, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2]]) +; METADATA-NEXT: [[NEWDATA:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] [[_AMDAWAITTRAVERSAL:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i64 4, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2]]) ; METADATA-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[NEWDATA]], ptr [[DATA]], align 4 ; METADATA-NEXT: call void @_AmdRestoreSystemData(ptr [[DATA]]) ; METADATA-NEXT: ret void @@ -92,7 +91,7 @@ define void @_cont_CallShader(%struct.DispatchSystemData* %data, i32 %0) #0 !typ ; METADATA-LABEL: define void @_cont_CallShader( ; METADATA-SAME: ptr [[DATA:%.*]], i32 [[TMP0:%.*]]) #[[ATTR0]] { ; METADATA-NEXT: [[DIS_DATA:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[DATA]], align 4 -; METADATA-NEXT: [[NEWDATA:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] @_AmdAwaitShader(i64 2, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA]]) +; METADATA-NEXT: [[NEWDATA:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] [[_AMDAWAITSHADER:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i64 2, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA]]) ; METADATA-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[NEWDATA]], ptr [[DATA]], align 4 ; 
METADATA-NEXT: call void @_AmdRestoreSystemData(ptr [[DATA]]) ; METADATA-NEXT: ret void @@ -113,7 +112,7 @@ define i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hi ; METADATA-NEXT: br i1 [[ISNOHIT]], label [[ISEND:%.*]], label [[CALLAHIT:%.*]] ; METADATA: callAHit: ; METADATA-NEXT: [[TRAV_DATA:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], ptr [[DATA]], align 4 -; METADATA-NEXT: [[NEWDATA:%.*]] = call [[STRUCT_ANYHITTRAVERSALDATA]] @_AmdAwaitAnyHit(i64 3, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA]], float [[T]], i32 [[HITKIND]]) +; METADATA-NEXT: [[NEWDATA:%.*]] = call [[STRUCT_ANYHITTRAVERSALDATA]] [[_AMDAWAITANYHIT:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i64 3, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA]], float [[T]], i32 [[HITKIND]]) ; METADATA-NEXT: store [[STRUCT_ANYHITTRAVERSALDATA]] [[NEWDATA]], ptr [[DATA]], align 4 ; METADATA-NEXT: call void @_AmdRestoreSystemDataAnyHit(ptr [[DATA]]) ; METADATA-NEXT: ret i1 true @@ -198,14 +197,14 @@ define void @MyRayGen() #2 { ; METADATA-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP4]]) #[[ATTR1:[0-9]+]] ; METADATA-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0, i32 0 ; METADATA-NEXT: store <4 x float> zeroinitializer, ptr [[TMP5]], align 4, !tbaa [[TBAA31:![0-9]+]] -; METADATA-NEXT: [[TMP6:%.*]] = call [[DX_TYPES_HANDLE]] @dx.op.createHandleForLib.dx.types.Handle(i32 160, [[DX_TYPES_HANDLE]] [[TMP1]]) -; METADATA-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] @dx.op.annotateHandle(i32 216, [[DX_TYPES_HANDLE]] [[TMP6]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; METADATA-NEXT: [[TMP6:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP1]]) +; METADATA-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP6]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; METADATA-NEXT: call void @dx.op.traceRay.struct.RayPayload(i32 157, [[DX_TYPES_HANDLE]] [[TMP7]], i32 16, i32 -1, i32 0, i32 1, i32 0, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0x3F50624DE0000000, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+04, ptr nonnull [[TMP3]]) ; METADATA-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA31]] ; METADATA-NEXT: [[TMP9:%.*]] = call i32 @dx.op.dispatchRaysIndex.i32(i32 145, i8 0) ; METADATA-NEXT: [[TMP10:%.*]] = call i32 @dx.op.dispatchRaysIndex.i32(i32 145, i8 1) -; METADATA-NEXT: [[TMP11:%.*]] = call [[DX_TYPES_HANDLE]] @dx.op.createHandleForLib.dx.types.Handle(i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) -; METADATA-NEXT: [[TMP12:%.*]] = call [[DX_TYPES_HANDLE]] @dx.op.annotateHandle(i32 216, [[DX_TYPES_HANDLE]] [[TMP11]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) +; METADATA-NEXT: [[TMP11:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) +; METADATA-NEXT: [[TMP12:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP11]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) ; METADATA-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP8]], i64 0 ; METADATA-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP8]], i64 1 ; METADATA-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP8]], i64 2 diff --git 
a/llvmraytracing/test/dx/remove-unused-declarations.ll b/llvmraytracing/test/dx/remove-unused-declarations.ll index 20e12372fc..44a2222201 100644 --- a/llvmraytracing/test/dx/remove-unused-declarations.ll +++ b/llvmraytracing/test/dx/remove-unused-declarations.ll @@ -1,9 +1,7 @@ -; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint' -S %s 2> %t0.stderr | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE-DECL %s -; RUN: count 0 < %t0.stderr -; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint' -S %s 2> %t1.stderr | FileCheck -check-prefix=DXILCONTPOSTPROCESS-DECL %s -; RUN: count 0 < %t1.stderr +; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint' -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE-DECL %s +; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint,dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint' -S %s --lint-abort-on-error | FileCheck -check-prefix=DXILCONTPOSTPROCESS-DECL %s -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %struct.HitData = type { float, i32 } %struct.DispatchSystemData = type { <3 x i32> } diff --git a/llvmraytracing/test/dx/traceray.ll b/llvmraytracing/test/dx/traceray.ll index 9a465cd738..da0acf92a5 100644 --- a/llvmraytracing/test/dx/traceray.ll +++ b/llvmraytracing/test/dx/traceray.ll @@ -1,16 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3 -; RUN: grep -v lgc.cps.module %s | grep -v SKIP_GLOBAL_ADDRSPACE | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S 2> %t0.stderr | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s -; RUN: count 0 < %t0.stderr -; RUN: grep -v lgc.cps.module %s | grep -v SKIP_GLOBAL_ADDRSPACE | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S 2> %t1.stderr | FileCheck -check-prefix=DXILCONTPOSTPROCESS %s -; RUN: count 0 < %t1.stderr -; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S 2> %t2.stderr | FileCheck 
-check-prefix=DXILCONTPOSTPROCESS-GLOBAL %s -; RUN: count 0 < %t2.stderr -; RUN: grep -v SKIP_GLOBAL_ADDRSPACE %s | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S 2> %t3.stderr | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE-CPS %s -; RUN: count 0 < %t3.stderr -; RUN: grep -v SKIP_GLOBAL_ADDRSPACE %s | opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata' -S 2> %t4.stderr | FileCheck -check-prefix=DXILCONTPOSTPROCESS-CPS %s -; RUN: count 0 < %t4.stderr +; RUN: grep -v lgc.cps.module %s | grep -v SKIP_GLOBAL_ADDRSPACE | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s +; RUN: grep -v lgc.cps.module %s | grep -v SKIP_GLOBAL_ADDRSPACE | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck -check-prefix=DXILCONTPOSTPROCESS %s +; RUN: grep -v lgc.cps.module %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,legacy-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck -check-prefix=DXILCONTPOSTPROCESS-GLOBAL %s +; RUN: grep -v SKIP_GLOBAL_ADDRSPACE %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE-CPS %s +; RUN: grep -v SKIP_GLOBAL_ADDRSPACE %s | opt --verify-each -passes="dxil-cont-lgc-rt-op-converter,lint,inline,lint,lower-raytracing-pipeline,lint,sroa,lint,lower-await,lint,coro-early,dxil-coro-split,coro-cleanup,lint,dxil-cleanup-continuations,lint,register-buffer,lint,dxil-cont-post-process,lint,remove-types-metadata" -S --lint-abort-on-error | FileCheck -check-prefix=DXILCONTPOSTPROCESS-CPS %s -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %dx.types.Handle = type { i8* } %struct.DispatchSystemData = type { <3 x i32> } @@ -69,6 +64,10 @@ declare !types !37 void @_AmdAcceptHitAttributes(%struct.TraversalData*) #1 declare i1 @opaqueIsEnd() #0 +define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !types !{!"function", !"void", !{i32 0, %struct.DispatchSystemData poison}} { + ret void +} + define i1 @_cont_IsEndSearch(%struct.TraversalData* %data) #0 !types !39 { %isEnd = call i1 @opaqueIsEnd() ret i1 %isEnd @@ -442,7 +441,7 @@ attributes #6 = { nocallback nofree 
nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define void @MyRayGen( -; LOWERRAYTRACINGPIPELINE-SAME: [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22]] !continuation.registercount [[META22]] !continuation.entry [[META13:![0-9]+]] !continuation [[META36:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22]] !continuation.registercount [[META22]] !continuation.entry [[META13:![0-9]+]] !continuation [[META36:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) @@ -461,82 +460,82 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[ADDR_I:%.*]] = call i64 @_AmdGetResumePointAddr() #[[ATTR2:[0-9]+]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[ADDR_I]], 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP10]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP13]], ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP14]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP16]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP18]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP20]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 -1, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]]), !continuation.registercount [[META33:![0-9]+]], !continuation.wait.await [[META13]], !continuation.returnedRegistercount !33 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] [[AWAIT_STRUCT_DISPATCHSYSTEMDATA:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP21]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP11]], ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = load i32, ptr 
[[TMP12]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP13]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP15]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP17]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = call ptr inttoptr (i64 4 to ptr)(i64 -1, i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]]), !continuation.registercount [[META33:![0-9]+]], !continuation.wait.await [[META13]], !continuation.returnedRegistercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] [[AWAIT:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP18]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_RAYPAYLOAD]] poison, ptr [[TMP4]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP26]], ptr [[TMP23]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP29]], ptr [[TMP27]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP31]], ptr [[TMP30]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP33]], ptr [[TMP32]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP22]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP21]], ptr [[TMP20]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP23]], ptr [[TMP22]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr 
addrspace(20) @PAYLOAD, i32 8), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP25]], ptr [[TMP39]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP27]], ptr [[TMP26]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP19]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; LOWERRAYTRACINGPIPELINE-NEXT: br label [[DOTSPLIT:%.*]] ; LOWERRAYTRACINGPIPELINE: .split: -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP34:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA37]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() -; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP35]], i8 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP36:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() -; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP36]], i8 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP37]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP39:%.*]] = extractelement <4 x float> [[TMP34]], i64 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = extractelement <4 x float> [[TMP34]], i64 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = extractelement <4 x float> [[TMP34]], i64 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = extractelement <4 x float> [[TMP34]], i64 3 -; LOWERRAYTRACINGPIPELINE-NEXT: call void @dx.op.textureStore.f32(i32 67, [[DX_TYPES_HANDLE]] [[TMP38]], i32 [[EXTRACT]], i32 [[EXTRACT1]], i32 undef, float [[TMP39]], float [[TMP40]], float [[TMP41]], float [[TMP42]], i8 15) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA37]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() +; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP29]], i8 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() +; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP30]], i8 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP40]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = extractelement <4 x float> [[TMP28]], i64 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP34:%.*]] = extractelement <4 x float> [[TMP28]], i64 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = extractelement <4 x float> [[TMP28]], i64 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP36:%.*]] = extractelement <4 x float> [[TMP28]], i64 3 +; LOWERRAYTRACINGPIPELINE-NEXT: call void @dx.op.textureStore.f32(i32 67, [[DX_TYPES_HANDLE]] [[TMP32]], i32 [[EXTRACT]], i32 [[EXTRACT1]], i32 undef, float [[TMP33]], float [[TMP34]], float 
[[TMP35]], float [[TMP36]], i8 15) ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[TMP5]]) #[[ATTR1]] -; LOWERRAYTRACINGPIPELINE-NEXT: ret void, !continuation.registercount [[META32:![0-9]+]] +; LOWERRAYTRACINGPIPELINE-NEXT: ret void ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @MyClosestHitShader( -; LOWERRAYTRACINGPIPELINE-SAME: [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META40:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META41:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META40:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META41:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[HITATTRS:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_SYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP8]], ptr [[TMP5]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP11]], ptr [[TMP9]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP13]], ptr [[TMP12]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP6]], ptr [[TMP5]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP8]], ptr [[TMP7]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 +; 
LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP10]], ptr [[TMP9]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[ADDR_I:%.*]] = getelementptr [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[VAL_I:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[ADDR_I]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[VAL_I]], ptr [[TMP2]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP2]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP16]], ptr [[HITATTRS]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[HITATTRS]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP21]], ptr [[TMP19]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP2]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP13]], ptr [[HITATTRS]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[HITATTRS]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP16]], ptr [[TMP14]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[HITATTRS]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = load <2 x float>, ptr [[TMP22]], align 4 @@ -550,7 +549,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float 1.000000e+00, i64 3 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> [[TMP31]], ptr [[TMP32]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP33]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP36]], ptr addrspace(20) @PAYLOAD, align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[TMP33]], i32 1 @@ -564,11 +563,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP43]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP44]], align 4 -; 
LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP45]], !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP45]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.TraversalData @MyAnyHitShader( -; LOWERRAYTRACINGPIPELINE-SAME: [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META43:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META43:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_HITDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = alloca [[STRUCT_HITDATA]], align 8 @@ -581,112 +581,114 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[HITATTRSALLOCA:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_TRAVERSALDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP9]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP14]], ptr [[TMP11]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP21]], ptr [[TMP20]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[ADDR_I:%.*]] = getelementptr [[STRUCT_SYSTEMDATA:%.*]], ptr [[TMP22]], i32 0, i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP9]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: 
[[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP14]], ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP16]], ptr [[TMP15]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP18]], ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[ADDR_I:%.*]] = getelementptr [[STRUCT_SYSTEMDATA:%.*]], ptr [[TMP19]], i32 0, i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[VAL_I:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[ADDR_I]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[VAL_I]], ptr [[TMP8]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP8]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP23]], ptr [[ORIGHITATTRS]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP28]], ptr [[TMP26]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP8]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP20]], ptr [[ORIGHITATTRS]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP23]], ptr [[TMP21]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP1]], ptr [[HITATTRSALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP9]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = load <4 x float>, ptr [[TMP29]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP9]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = load <4 x float>, ptr [[TMP24]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[RESPTR_I3:%.*]] = getelementptr [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1 ; 
LOWERRAYTRACINGPIPELINE-NEXT: [[RES_I4:%.*]] = load [[STRUCT_HITDATA]], ptr [[RESPTR_I3]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_HITDATA]] [[RES_I4]], ptr [[TMP4]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = call <3 x float> @_cont_ObjectRayOrigin3(ptr [[TMP31]], ptr [[TMP4]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x float> [[TMP32]], i8 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = call <3 x float> @_cont_ObjectRayOrigin3(ptr [[TMP26]], ptr [[TMP4]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x float> [[TMP27]], i8 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[RESPTR_I:%.*]] = getelementptr [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[RES_I:%.*]] = load [[STRUCT_HITDATA]], ptr [[RESPTR_I]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_HITDATA]] [[RES_I]], ptr [[TMP5]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP34:%.*]] = call <3 x float> @_cont_ObjectRayDirection3(ptr [[TMP33]], ptr [[TMP5]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT:%.*]] = extractelement <3 x float> [[TMP34]], i8 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = call <3 x float> @_cont_ObjectRayDirection3(ptr [[TMP28]], ptr [[TMP5]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[EXTRACT:%.*]] = extractelement <3 x float> [[TMP29]], i8 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[RESPTR_I5:%.*]] = getelementptr [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[RES_I6:%.*]] = load [[STRUCT_HITDATA]], ptr [[RESPTR_I5]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_HITDATA]] [[RES_I6]], ptr [[TMP3]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[RES_I7:%.*]] = load float, ptr [[TMP3]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP36:%.*]] = fmul fast float [[RES_I7]], [[EXTRACT]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = fadd fast float [[TMP36]], [[EXTRACT1]] -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = fcmp fast ogt float [[TMP37]], 0.000000e+00 -; LOWERRAYTRACINGPIPELINE-NEXT: br i1 [[TMP38]], label [[TMP39:%.*]], label [[TMP61:%.*]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = fmul fast float [[RES_I7]], [[EXTRACT]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP32:%.*]] = fadd fast float [[TMP31]], [[EXTRACT1]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = fcmp fast ogt float [[TMP32]], 0.000000e+00 +; LOWERRAYTRACINGPIPELINE-NEXT: br i1 [[TMP33]], label [[TMP34:%.*]], label [[TMP51:%.*]] ; LOWERRAYTRACINGPIPELINE: 34: -; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> [[TMP30]], ptr [[TMP29]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: call void @_cont_AcceptHitAndEndSearch(ptr [[TMP40]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP9]], i32 0, 
i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP44:%.*]] = load i32, ptr [[TMP41]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP44]], ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[TMP41]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> [[TMP25]], ptr [[TMP24]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: call void @_cont_AcceptHitAndEndSearch(ptr [[TMP35]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP9]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP36]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP37]], ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[TMP36]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = load i32, ptr [[TMP45]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP47]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, ptr [[TMP45]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP48]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[TMP45]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP40]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP49]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, ptr [[TMP45]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP51:%.*]] = load i32, ptr [[TMP50]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP51]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP43:%.*]] = load i32, ptr [[TMP50]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP43]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP46]], ptr [[TMP7]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP57:%.*]] = load i32, ptr [[TMP55]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP57:%.*]] = load i32, ptr [[TMP48]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP57]], ptr [[TMP56]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP58:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP7]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP59:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[ADDR_I1:%.*]] = getelementptr [[STRUCT_SYSTEMDATA]], ptr [[TMP59]], i32 0, i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP58]], ptr [[ADDR_I1]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP60:%.*]] = load [[STRUCT_TRAVERSALDATA]], 
ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_TRAVERSALDATA]] [[TMP60]], !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_TRAVERSALDATA]] [[TMP60]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 51: -; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> [[TMP30]], ptr [[TMP29]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> [[TMP25]], ptr [[TMP24]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP9]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP9]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP65:%.*]] = load i32, ptr [[TMP62]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP65]], ptr addrspace(20) @PAYLOAD, align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, ptr [[TMP62]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP68:%.*]] = load i32, ptr [[TMP66]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP68]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, ptr [[TMP66]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP70:%.*]] = load i32, ptr [[TMP69]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, ptr [[TMP66]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP70:%.*]] = load i32, ptr [[TMP64]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP70]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[TMP66]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP72:%.*]] = load i32, ptr [[TMP71]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP72]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP63:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP63]], ptr [[TMP6]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP77:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP78:%.*]] = load i32, ptr [[TMP76]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP78:%.*]] = load i32, ptr [[TMP61]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP78]], ptr [[TMP77]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP79:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP6]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP80:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[ADDR_I2:%.*]] = getelementptr [[STRUCT_SYSTEMDATA]], ptr [[TMP80]], i32 0, i32 1 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP79]], ptr [[ADDR_I2]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP81:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; 
LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_TRAVERSALDATA]] [[TMP81]], !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_TRAVERSALDATA]] [[TMP81]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.TraversalData @MyIntersectionShader( -; LOWERRAYTRACINGPIPELINE-SAME: [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META44:![0-9]+]] !continuation.registercount [[META32]] !continuation [[META45:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META44:![0-9]+]] !continuation.registercount [[META32:![0-9]+]] !continuation [[META45:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 4 @@ -706,8 +708,8 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE: anyhit.i: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA_I:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP4]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = call ptr inttoptr (i64 3 to ptr)([[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], float [[RES_I1]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP8]]), !continuation.registercount [[META32]], !continuation.returnedRegistercount !32 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = call [[STRUCT_TRAVERSALDATA]] [[AWAIT_STRUCT_TRAVERSALDATA:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP9]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = call ptr inttoptr (i64 3 to ptr)([[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], float [[RES_I1]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP8]]), !continuation.registercount [[META32]], !continuation.returnedRegistercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = call [[STRUCT_TRAVERSALDATA]] [[AWAIT_1:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP9]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_TRAVERSALDATA]] [[TMP10]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) @@ -715,29 +717,31 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE: accepthit.i: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP4]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP12]], ptr [[TMP3]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP17]], ptr [[TMP16]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr 
[[TMP3]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[ADDR_I:%.*]] = getelementptr [[STRUCT_SYSTEMDATA:%.*]], ptr [[TMP19]], i32 0, i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP18]], ptr [[ADDR_I]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP18]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP3]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[ADDR_I:%.*]] = getelementptr [[STRUCT_SYSTEMDATA:%.*]], ptr [[TMP17]], i32 0, i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP16]], ptr [[ADDR_I]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: br label [[_CONT_REPORTHIT_EXIT]] ; LOWERRAYTRACINGPIPELINE: _cont_ReportHit.exit: ; LOWERRAYTRACINGPIPELINE-NEXT: [[ISEND_I:%.*]] = call i1 @opaqueIsEnd() ; LOWERRAYTRACINGPIPELINE-NEXT: br i1 [[ISEND_I]], label [[TMP20:%.*]], label [[TMP22:%.*]] ; LOWERRAYTRACINGPIPELINE: 18: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_TRAVERSALDATA]] [[TMP21]], !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_TRAVERSALDATA]] [[TMP21]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 20: ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP7]]) #[[ATTR1]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_TRAVERSALDATA]] [[TMP23]], !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_TRAVERSALDATA]] [[TMP23]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.TraversalData @MyIntersectionShaderLargeAttrs( -; LOWERRAYTRACINGPIPELINE-SAME: [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META44]] !continuation.registercount [[META32]] !continuation [[META46:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META44]] !continuation.registercount [[META32]] !continuation [[META46:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = alloca [[STRUCT_LARGEINTERSECTIONATTRIBUTES:%.*]], align 4 @@ -771,8 +775,8 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE: anyhit.i: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA_I:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = load [[STRUCT_LARGEINTERSECTIONATTRIBUTES]], ptr [[TMP4]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = call ptr inttoptr (i64 3 to ptr)([[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], float [[RES_I1]], i32 0, [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[TMP8]]), !continuation.registercount [[META32]], !continuation.returnedRegistercount !32 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = call [[STRUCT_TRAVERSALDATA]] [[AWAIT_STRUCT_TRAVERSALDATA:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP9]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = call ptr inttoptr (i64 3 to ptr)([[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], float [[RES_I1]], i32 0, [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[TMP8]]), !continuation.registercount [[META32]], !continuation.returnedRegistercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = call [[STRUCT_TRAVERSALDATA]] [[AWAIT_2:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP9]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_TRAVERSALDATA]] [[TMP10]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) @@ -780,64 +784,66 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE: accepthit.i: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP4]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP12]], ptr [[TMP3]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP17]], ptr [[TMP16]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 +; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP28]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP17]], ptr addrspace(20) getelementptr (i32, ptr addrspace(20) @PAYLOAD, i32 1), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 3 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP19]], ptr addrspace(20) getelementptr (i32, ptr addrspace(20) @PAYLOAD, i32 1), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 3 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP19]], ptr addrspace(20) getelementptr (i32, ptr addrspace(20) @PAYLOAD, i32 2), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP21]], ptr addrspace(20) getelementptr (i32, ptr addrspace(20) @PAYLOAD, i32 2), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP21]], ptr addrspace(20) getelementptr (i32, ptr addrspace(20) @PAYLOAD, i32 3), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 5 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP23]], ptr addrspace(20) getelementptr (i32, ptr addrspace(20) @PAYLOAD, i32 3), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 5 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP23]], ptr addrspace(20) getelementptr (i32, ptr addrspace(20) @PAYLOAD, i32 4), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 6 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP25]], ptr addrspace(20) getelementptr (i32, ptr addrspace(20) @PAYLOAD, i32 4), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 6 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP26]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP27]], ptr addrspace(20) getelementptr (i32, ptr addrspace(20) @PAYLOAD, i32 5), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP3]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[ADDR_I:%.*]] = getelementptr [[STRUCT_SYSTEMDATA:%.*]], ptr [[TMP29]], i32 0, i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP28]], ptr [[ADDR_I]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP25]], ptr addrspace(20) getelementptr (i32, ptr addrspace(20) @PAYLOAD, i32 5), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP3]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: 
[[TMP27:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[ADDR_I:%.*]] = getelementptr [[STRUCT_SYSTEMDATA:%.*]], ptr [[TMP27]], i32 0, i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP26]], ptr [[ADDR_I]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: br label [[_CONT_REPORTHIT_EXIT]] ; LOWERRAYTRACINGPIPELINE: _cont_ReportHit.exit: ; LOWERRAYTRACINGPIPELINE-NEXT: [[ISEND_I:%.*]] = call i1 @opaqueIsEnd() ; LOWERRAYTRACINGPIPELINE-NEXT: br i1 [[ISEND_I]], label [[TMP30:%.*]], label [[TMP32:%.*]] ; LOWERRAYTRACINGPIPELINE: 28: ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP31:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_TRAVERSALDATA]] [[TMP31]], !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_TRAVERSALDATA]] [[TMP31]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE: 30: ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP7]]) #[[ATTR1]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_TRAVERSALDATA]] [[TMP33]], !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_TRAVERSALDATA]] [[TMP33]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %struct.DispatchSystemData @MyMissShader( -; LOWERRAYTRACINGPIPELINE-SAME: [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META47:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META48:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META47:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META48:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_SYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP7]], ptr [[TMP4]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP10]], ptr [[TMP8]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 1 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 4 -; 
LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 2 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP14]], ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP5]], ptr [[TMP4]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 7), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP7]], ptr [[TMP6]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 1 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 8), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP9]], ptr [[TMP8]], align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 2 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP11]], ptr [[TMP10]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> , ptr [[TMP15]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP16]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP19]], ptr addrspace(20) @PAYLOAD, align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 1 @@ -851,7 +857,8 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP26]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP28:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP27]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP28]], !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP28]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; ; ; DXILCONTPOSTPROCESS-LABEL: define i1 @_cont_IsEndSearch( @@ -880,7 +887,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-LABEL: define void @_cont_KernelEntry( -; DXILCONTPOSTPROCESS-SAME: ) #[[ATTR0]] !lgc.rt.shaderstage [[META35:![0-9]+]] !continuation.registercount [[META22:![0-9]+]] { +; DXILCONTPOSTPROCESS-SAME: ) #[[ATTR0]] !lgc.rt.shaderstage [[META35:![0-9]+]] !continuation.registercount [[META22:![0-9]+]] !continuation [[META36:![0-9]+]] { ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: [[CSPINIT:%.*]] = ptrtoint ptr @debug_global to i32 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -904,13 +911,11 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-LABEL: define void @MyRayGen( -; DXILCONTPOSTPROCESS-SAME: ) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22]] !continuation.registercount [[META22]] !continuation.entry [[META13:![0-9]+]] !continuation [[META36:![0-9]+]] !continuation.state [[META22]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22]] !continuation.registercount [[META22]] !continuation.entry [[META13:![0-9]+]] !continuation [[META37:![0-9]+]] !continuation.state [[META22]] { ; DXILCONTPOSTPROCESS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; DXILCONTPOSTPROCESS-NEXT: [[TMP0:%.*]] = call i32 @_cont_GetContinuationStackAddr() -; DXILCONTPOSTPROCESS-NEXT: store i32 [[TMP0]], ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: [[SYSTEM_DATA:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[_CONT_SETUPRAYGEN:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]() -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[SYSTEM_DATA]], 0 +; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 ; DXILCONTPOSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; DXILCONTPOSTPROCESS-NEXT: [[TMP1:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP2:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 @@ -920,7 +925,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 -; DXILCONTPOSTPROCESS-NEXT: [[TMP6:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @MyRayGen.resume.0 to i64)) +; DXILCONTPOSTPROCESS-NEXT: [[TMP6:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyRayGen.resume.0) ; DXILCONTPOSTPROCESS-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] 
[[TRAV_DATA_I]], i64 [[TMP6]], 5 ; DXILCONTPOSTPROCESS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <4 x float> zeroinitializer, i32 0 ; DXILCONTPOSTPROCESS-NEXT: [[TMP7:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT]] to i32 @@ -935,19 +940,19 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-NEXT: [[TMP10:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT]] to i32 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[TMP10]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 9), align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP11:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: call void (i64, i64, ...) @continuation.waitContinue(i64 4, i64 -1, i32 [[TMP11]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]]), !continuation.registercount [[META33:![0-9]+]], !continuation.returnedRegistercount !33 +; DXILCONTPOSTPROCESS-NEXT: call void (i64, i64, ...) @continuation.waitContinue(i64 4, i64 -1, i32 [[TMP11]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]]), !continuation.registercount [[META33:![0-9]+]], !continuation.returnedRegistercount [[META33]] ; DXILCONTPOSTPROCESS-NEXT: unreachable ; ; ; DXILCONTPOSTPROCESS-LABEL: define dso_local void @MyRayGen.resume.0( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META22]] !continuation.registercount [[META33]] !continuation [[META36]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP1:%.*]]) !lgc.rt.shaderstage [[META22]] !continuation.registercount [[META33]] !continuation [[META37]] { ; DXILCONTPOSTPROCESS-NEXT: entryresume.0: -; DXILCONTPOSTPROCESS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 +; DXILCONTPOSTPROCESS-NEXT: [[SYSTEM_DATA_ALLOCA1:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; DXILCONTPOSTPROCESS-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(20) @REGISTERS, align 4 -; DXILCONTPOSTPROCESS-NEXT: [[TMP2:%.*]] = bitcast i32 [[TMP1]] to float +; DXILCONTPOSTPROCESS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], ptr [[SYSTEM_DATA_ALLOCA1]], align 4 +; DXILCONTPOSTPROCESS-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(20) @REGISTERS, align 4 +; DXILCONTPOSTPROCESS-NEXT: [[TMP2:%.*]] = bitcast i32 [[TMP12]] to float ; DXILCONTPOSTPROCESS-NEXT: [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 ; DXILCONTPOSTPROCESS-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 7), align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP4:%.*]] = bitcast i32 [[TMP3]] to float @@ -958,15 +963,15 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 9), align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP8:%.*]] = bitcast i32 [[TMP7]] to float ; DXILCONTPOSTPROCESS-NEXT: [[DOTSROA_0_12_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTSROA_0_8_VEC_INSERT]], float [[TMP8]], i32 3 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT6:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 +; 
DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_EXTRACT6:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], 0 ; DXILCONTPOSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; DXILCONTPOSTPROCESS-NEXT: [[TMP9:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; DXILCONTPOSTPROCESS-NEXT: [[TMP10:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[SYSTEM_DATA_ALLOCA]]) +; DXILCONTPOSTPROCESS-NEXT: [[TMP10:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[SYSTEM_DATA_ALLOCA1]]) ; DXILCONTPOSTPROCESS-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP10]], i8 0 -; DXILCONTPOSTPROCESS-NEXT: [[TMP11:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[SYSTEM_DATA_ALLOCA]]) +; DXILCONTPOSTPROCESS-NEXT: [[TMP11:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[SYSTEM_DATA_ALLOCA1]]) ; DXILCONTPOSTPROCESS-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP11]], i8 1 -; DXILCONTPOSTPROCESS-NEXT: [[TMP12:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP9]]) -; DXILCONTPOSTPROCESS-NEXT: [[TMP13:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP12]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 }) +; DXILCONTPOSTPROCESS-NEXT: [[TMP18:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP9]]) +; DXILCONTPOSTPROCESS-NEXT: [[TMP13:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP18]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 }) ; DXILCONTPOSTPROCESS-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 0 ; DXILCONTPOSTPROCESS-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 1 ; DXILCONTPOSTPROCESS-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 2 @@ -978,7 +983,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-LABEL: define void @MyClosestHitShader( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META37:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META38:![0-9]+]] !continuation.state [[META22]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META38:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META39:![0-9]+]] !continuation.state [[META22]] { ; DXILCONTPOSTPROCESS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -1029,12 +1034,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-NEXT: store i32 [[TMP24]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 9), align 4 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison, <3 x i32> [[DOTFCA_0_0_EXTRACT]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[TMP25:%.*]] = load i32, ptr [[CSP]], align 4 -; 
DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP25]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META33]] +; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP25]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META33]] ; DXILCONTPOSTPROCESS-NEXT: unreachable ; ; ; DXILCONTPOSTPROCESS-LABEL: define void @MyAnyHitShader( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META39:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META40:![0-9]+]] !continuation.state [[META22]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META40:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META41:![0-9]+]] !continuation.state [[META22]] { ; DXILCONTPOSTPROCESS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; DXILCONTPOSTPROCESS-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_HITDATA]], align 8 @@ -1151,12 +1156,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 3 ; DXILCONTPOSTPROCESS-NEXT: [[TMP29:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT]] to i32 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[TMP29]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 9), align 4 -; DXILCONTPOSTPROCESS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 0 -; DXILCONTPOSTPROCESS-NEXT: [[TMP30:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT]] to i32 +; DXILCONTPOSTPROCESS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT9:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 0 +; DXILCONTPOSTPROCESS-NEXT: [[TMP30:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT9]] to i32 ; DXILCONTPOSTPROCESS-NEXT: [[TMP31:%.*]] = bitcast i32 [[TMP30]] to float ; DXILCONTPOSTPROCESS-NEXT: [[DOTSROA_062_0_VEC_INSERT:%.*]] = insertelement <2 x float> undef, float [[TMP31]], i32 0 -; DXILCONTPOSTPROCESS-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 1 -; DXILCONTPOSTPROCESS-NEXT: [[TMP32:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT]] to i32 +; DXILCONTPOSTPROCESS-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT11:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 1 +; DXILCONTPOSTPROCESS-NEXT: [[TMP32:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT11]] to i32 ; DXILCONTPOSTPROCESS-NEXT: [[TMP33:%.*]] = bitcast i32 [[TMP32]] to float ; DXILCONTPOSTPROCESS-NEXT: [[DOTSROA_062_4_VEC_INSERT:%.*]] = insertelement <2 x float> [[DOTSROA_062_0_VEC_INSERT]], float [[TMP33]], i32 1 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] poison, <2 x float> [[DOTSROA_062_4_VEC_INSERT]], 0 @@ -1190,28 +1195,28 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_5_LOAD:%.*]] = load i64, ptr 
[[DOTFCA_5_GEP33]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[DOTFCA_4_INSERT]], i64 [[DOTFCA_5_LOAD]], 5 ; DXILCONTPOSTPROCESS-NEXT: [[TMP35:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP35]], [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT]]), !continuation.registercount [[META33]] +; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP35]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT]]), !continuation.registercount [[META33]] ; DXILCONTPOSTPROCESS-NEXT: unreachable ; DXILCONTPOSTPROCESS: 36: ; DXILCONTPOSTPROCESS-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) -; DXILCONTPOSTPROCESS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT15:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 0 -; DXILCONTPOSTPROCESS-NEXT: [[TMP37:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT15]] to i32 +; DXILCONTPOSTPROCESS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT14:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 0 +; DXILCONTPOSTPROCESS-NEXT: [[TMP37:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT14]] to i32 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[TMP37]], ptr addrspace(20) @REGISTERS, align 4 ; DXILCONTPOSTPROCESS-NEXT: [[DOTSROA_0_4_VEC_EXTRACT18:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 1 ; DXILCONTPOSTPROCESS-NEXT: [[TMP38:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT18]] to i32 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[TMP38]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 7), align 4 -; DXILCONTPOSTPROCESS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT21:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 -; DXILCONTPOSTPROCESS-NEXT: [[TMP39:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT21]] to i32 +; DXILCONTPOSTPROCESS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT20:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 +; DXILCONTPOSTPROCESS-NEXT: [[TMP39:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT20]] to i32 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[TMP39]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 8), align 4 -; DXILCONTPOSTPROCESS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT24:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 3 -; DXILCONTPOSTPROCESS-NEXT: [[TMP40:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT24]] to i32 +; DXILCONTPOSTPROCESS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT23:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 3 +; DXILCONTPOSTPROCESS-NEXT: [[TMP40:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT23]] to i32 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[TMP40]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 9), align 4 -; DXILCONTPOSTPROCESS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT9:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 0 -; DXILCONTPOSTPROCESS-NEXT: [[TMP41:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT9]] to i32 +; DXILCONTPOSTPROCESS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 0 +; DXILCONTPOSTPROCESS-NEXT: [[TMP41:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT]] to i32 ; DXILCONTPOSTPROCESS-NEXT: [[TMP42:%.*]] = bitcast i32 [[TMP41]] to float ; DXILCONTPOSTPROCESS-NEXT: [[DOTSROA_066_0_VEC_INSERT:%.*]] = insertelement <2 x float> undef, float [[TMP42]], i32 0 -; 
DXILCONTPOSTPROCESS-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT11:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 1 -; DXILCONTPOSTPROCESS-NEXT: [[TMP43:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT11]] to i32 +; DXILCONTPOSTPROCESS-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 1 +; DXILCONTPOSTPROCESS-NEXT: [[TMP43:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT]] to i32 ; DXILCONTPOSTPROCESS-NEXT: [[TMP44:%.*]] = bitcast i32 [[TMP43]] to float ; DXILCONTPOSTPROCESS-NEXT: [[DOTSROA_066_4_VEC_INSERT:%.*]] = insertelement <2 x float> [[DOTSROA_066_0_VEC_INSERT]], float [[TMP44]], i32 1 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_INSERT65:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] poison, <2 x float> [[DOTSROA_066_4_VEC_INSERT]], 0 @@ -1245,12 +1250,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_5_LOAD58:%.*]] = load i64, ptr [[DOTFCA_5_GEP57]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_5_INSERT59:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[DOTFCA_4_INSERT56]], i64 [[DOTFCA_5_LOAD58]], 5 ; DXILCONTPOSTPROCESS-NEXT: [[TMP46:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP46]], [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT59]]), !continuation.registercount [[META33]] +; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP46]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT59]]), !continuation.registercount [[META33]] ; DXILCONTPOSTPROCESS-NEXT: unreachable ; ; ; DXILCONTPOSTPROCESS-LABEL: define void @MyIntersectionShader( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META41:![0-9]+]] !continuation.registercount [[META32:![0-9]+]] !continuation [[META42:![0-9]+]] !continuation.stacksize [[META43:![0-9]+]] !continuation.state [[META43]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42:![0-9]+]] !continuation.registercount [[META32:![0-9]+]] !continuation [[META43:![0-9]+]] !continuation.stacksize [[META44:![0-9]+]] !continuation.state [[META44]] { ; DXILCONTPOSTPROCESS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -1285,9 +1290,9 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-NEXT: [[TRAV_DATA_I_FCA_4_INSERT:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_3_INSERT]], float [[DOTFCA_4_EXTRACT]], 4 ; DXILCONTPOSTPROCESS-NEXT: [[TRAV_DATA_I_FCA_5_INSERT:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_4_INSERT]], i64 [[DOTFCA_5_EXTRACT]], 5 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] poison, <2 x float> undef, 0 +; DXILCONTPOSTPROCESS-NEXT: [[TMP6:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyIntersectionShader.resume.0) ; DXILCONTPOSTPROCESS-NEXT: [[TMP5:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: [[TMP6:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @MyIntersectionShader.resume.0 to i64)) -; DXILCONTPOSTPROCESS-NEXT: call void (i64, 
...) @continuation.continue(i64 3, i32 [[TMP5]], i64 [[TMP6]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META32]], !continuation.returnedRegistercount !32 +; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 3, i32 [[TMP5]], i64 [[TMP6]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META32]], !continuation.returnedRegistercount [[META32]] ; DXILCONTPOSTPROCESS-NEXT: unreachable ; DXILCONTPOSTPROCESS: accepthit.i: ; DXILCONTPOSTPROCESS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0 @@ -1315,7 +1320,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], -8 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[TMP13]], ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP14:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP14]], [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT49]]), !continuation.registercount [[META32]] +; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP14]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT49]]), !continuation.registercount [[META32]] ; DXILCONTPOSTPROCESS-NEXT: unreachable ; DXILCONTPOSTPROCESS: 15: ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] poison, <3 x i32> [[DOTFCA_0_0_0_EXTRACT]], 0, 0, 0 @@ -1330,29 +1335,29 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-NEXT: [[TMP17:%.*]] = add i32 [[TMP16]], -8 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[TMP17]], ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP18:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP18]], [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT]]), !continuation.registercount [[META32]] +; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[RETURNADDR]], i32 [[TMP18]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT]]), !continuation.registercount [[META32]] ; DXILCONTPOSTPROCESS-NEXT: unreachable ; ; ; DXILCONTPOSTPROCESS-LABEL: define dso_local void @MyIntersectionShader.resume.0( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META41]] !continuation.registercount [[META32]] !continuation [[META42]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP1:%.*]]) !lgc.rt.shaderstage [[META42]] !continuation.registercount [[META32]] !continuation [[META43]] { ; DXILCONTPOSTPROCESS-NEXT: entryresume.0: ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: [[TMP1:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], -8 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_0_0_EXTRACT10:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 0, 0, 0 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_1_0_EXTRACT12:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 0, 1, 0 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_1_0_EXTRACT14:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 1, 0 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_1_1_EXTRACT16:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 1, 1 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_2_EXTRACT18:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 2 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_3_EXTRACT20:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 3 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_4_EXTRACT22:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 4 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_5_EXTRACT24:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 5 +; DXILCONTPOSTPROCESS-NEXT: [[TMP15:%.*]] = load i32, ptr [[CSP]], align 4 +; DXILCONTPOSTPROCESS-NEXT: [[TMP2:%.*]] = add i32 [[TMP15]], -8 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_0_0_EXTRACT10:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 0, 0, 0 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_1_0_EXTRACT12:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 0, 1, 0 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_1_0_EXTRACT14:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 1, 0 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_1_1_EXTRACT16:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 1, 1 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_2_EXTRACT18:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 2 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_3_EXTRACT20:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 3 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_4_EXTRACT22:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 4 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_5_EXTRACT24:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 5 ; DXILCONTPOSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; DXILCONTPOSTPROCESS-NEXT: [[ISEND_I:%.*]] = call i1 @opaqueIsEnd() ; DXILCONTPOSTPROCESS-NEXT: br i1 [[ISEND_I]], label [[TMP3:%.*]], label [[TMP9:%.*]] -; DXILCONTPOSTPROCESS: 3: +; DXILCONTPOSTPROCESS: 4: ; DXILCONTPOSTPROCESS-NEXT: [[TMP4:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(21) ; DXILCONTPOSTPROCESS-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP4]], i32 0 ; DXILCONTPOSTPROCESS-NEXT: [[RETURNADDR_RELOAD2:%.*]] = load i64, ptr addrspace(21) [[TMP5]], align 4 @@ -1368,9 +1373,9 @@ attributes #6 = { nocallback nofree 
nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], -8 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[TMP7]], ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP8:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD2]], i32 [[TMP8]], [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT49]]), !continuation.registercount [[META32]] +; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD2]], i32 [[TMP8]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT49]]), !continuation.registercount [[META32]] ; DXILCONTPOSTPROCESS-NEXT: unreachable -; DXILCONTPOSTPROCESS: 9: +; DXILCONTPOSTPROCESS: 10: ; DXILCONTPOSTPROCESS-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(21) ; DXILCONTPOSTPROCESS-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP10]], i32 0 ; DXILCONTPOSTPROCESS-NEXT: [[RETURNADDR_RELOAD:%.*]] = load i64, ptr addrspace(21) [[TMP11]], align 4 @@ -1386,12 +1391,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], -8 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[TMP13]], ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP14:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP14]], [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT]]), !continuation.registercount [[META32]] +; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP14]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT]]), !continuation.registercount [[META32]] ; DXILCONTPOSTPROCESS-NEXT: unreachable ; ; ; DXILCONTPOSTPROCESS-LABEL: define void @MyIntersectionShaderLargeAttrs( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META41]] !continuation.registercount [[META32]] !continuation [[META44:![0-9]+]] !continuation.stacksize [[META43]] !continuation.state [[META43]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42]] !continuation.registercount [[META32]] !continuation [[META45:![0-9]+]] !continuation.stacksize [[META44]] !continuation.state [[META44]] { ; DXILCONTPOSTPROCESS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -1432,9 +1437,9 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_4_INSERT:%.*]] = insertvalue [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_3_INSERT]], i32 104, 0, 4 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_5_INSERT:%.*]] = insertvalue [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_4_INSERT]], i32 105, 0, 5 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_6_INSERT:%.*]] = insertvalue [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_5_INSERT]], i32 106, 0, 6 +; DXILCONTPOSTPROCESS-NEXT: [[TMP6:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyIntersectionShaderLargeAttrs.resume.0) ; DXILCONTPOSTPROCESS-NEXT: [[TMP5:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: [[TMP6:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @MyIntersectionShaderLargeAttrs.resume.0 to 
i64)) -; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 3, i32 [[TMP5]], i64 [[TMP6]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_6_INSERT]]), !continuation.registercount [[META32]], !continuation.returnedRegistercount !32 +; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 3, i32 [[TMP5]], i64 [[TMP6]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_6_INSERT]]), !continuation.registercount [[META32]], !continuation.returnedRegistercount [[META32]] ; DXILCONTPOSTPROCESS-NEXT: unreachable ; DXILCONTPOSTPROCESS: accepthit.i: ; DXILCONTPOSTPROCESS-NEXT: [[TMP7:%.*]] = bitcast i32 100 to float @@ -1463,7 +1468,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], -8 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[TMP11]], ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP12:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP12]], [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT49]]), !continuation.registercount [[META32]] +; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP12]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT49]]), !continuation.registercount [[META32]] ; DXILCONTPOSTPROCESS-NEXT: unreachable ; DXILCONTPOSTPROCESS: 13: ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] poison, <3 x i32> [[DOTFCA_0_0_0_EXTRACT]], 0, 0, 0 @@ -1478,29 +1483,29 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], -8 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[TMP15]], ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP16:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP16]], [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT]]), !continuation.registercount [[META32]] +; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[RETURNADDR]], i32 [[TMP16]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT]]), !continuation.registercount [[META32]] ; DXILCONTPOSTPROCESS-NEXT: unreachable ; ; ; DXILCONTPOSTPROCESS-LABEL: define dso_local void @MyIntersectionShaderLargeAttrs.resume.0( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META41]] !continuation.registercount [[META32]] !continuation [[META44]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP1:%.*]]) !lgc.rt.shaderstage [[META42]] !continuation.registercount [[META32]] !continuation [[META45]] { ; DXILCONTPOSTPROCESS-NEXT: entryresume.0: ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: [[TMP1:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], -8 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_0_0_EXTRACT10:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 0, 0, 0 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_1_0_EXTRACT12:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 0, 1, 0 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_1_0_EXTRACT14:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 1, 0 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_1_1_EXTRACT16:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 1, 1 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_2_EXTRACT18:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 2 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_3_EXTRACT20:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 3 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_4_EXTRACT22:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 4 -; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_5_EXTRACT24:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 5 +; DXILCONTPOSTPROCESS-NEXT: [[TMP15:%.*]] = load i32, ptr [[CSP]], align 4 +; DXILCONTPOSTPROCESS-NEXT: [[TMP2:%.*]] = add i32 [[TMP15]], -8 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_0_0_EXTRACT10:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 0, 0, 0 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_1_0_EXTRACT12:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 0, 1, 0 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_1_0_EXTRACT14:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 1, 0 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_1_1_EXTRACT16:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 1, 1 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_2_EXTRACT18:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 2 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_3_EXTRACT20:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 3 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_4_EXTRACT22:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 4 +; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_5_EXTRACT24:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 5 ; DXILCONTPOSTPROCESS-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; DXILCONTPOSTPROCESS-NEXT: [[ISEND_I:%.*]] = call i1 @opaqueIsEnd() ; DXILCONTPOSTPROCESS-NEXT: br i1 [[ISEND_I]], label [[TMP3:%.*]], label [[TMP9:%.*]] -; DXILCONTPOSTPROCESS: 3: +; DXILCONTPOSTPROCESS: 4: ; DXILCONTPOSTPROCESS-NEXT: [[TMP4:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(21) ; DXILCONTPOSTPROCESS-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP4]], i32 0 ; DXILCONTPOSTPROCESS-NEXT: [[RETURNADDR_RELOAD2:%.*]] = load i64, ptr addrspace(21) [[TMP5]], align 4 @@ -1516,9 +1521,9 @@ attributes #6 = { nocallback 
nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], -8 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[TMP7]], ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP8:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD2]], i32 [[TMP8]], [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT49]]), !continuation.registercount [[META32]] +; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD2]], i32 [[TMP8]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT49]]), !continuation.registercount [[META32]] ; DXILCONTPOSTPROCESS-NEXT: unreachable -; DXILCONTPOSTPROCESS: 9: +; DXILCONTPOSTPROCESS: 10: ; DXILCONTPOSTPROCESS-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(21) ; DXILCONTPOSTPROCESS-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP10]], i32 0 ; DXILCONTPOSTPROCESS-NEXT: [[RETURNADDR_RELOAD:%.*]] = load i64, ptr addrspace(21) [[TMP11]], align 4 @@ -1534,12 +1539,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], -8 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[TMP13]], ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-NEXT: [[TMP14:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP14]], [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT]]), !continuation.registercount [[META32]] +; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP14]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT]]), !continuation.registercount [[META32]] ; DXILCONTPOSTPROCESS-NEXT: unreachable ; ; ; DXILCONTPOSTPROCESS-LABEL: define void @MyMissShader( -; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META45:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META46:![0-9]+]] !continuation.state [[META22]] { +; DXILCONTPOSTPROCESS-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META46:![0-9]+]] !continuation.registercount [[META33]] !continuation [[META47:![0-9]+]] !continuation.state [[META22]] { ; DXILCONTPOSTPROCESS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -1572,7 +1577,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-NEXT: store i32 [[TMP12]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 9), align 4 ; DXILCONTPOSTPROCESS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison, <3 x i32> [[DOTFCA_0_0_EXTRACT]], 0 ; DXILCONTPOSTPROCESS-NEXT: [[TMP13:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP13]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META33]] +; DXILCONTPOSTPROCESS-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[RETURNADDR]], i32 [[TMP13]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META33]] ; DXILCONTPOSTPROCESS-NEXT: unreachable ; ; @@ -1602,7 +1607,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define void @_cont_KernelEntry( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: ) #[[ATTR0]] !lgc.rt.shaderstage [[META35:![0-9]+]] !continuation.registercount [[META22:![0-9]+]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: ) #[[ATTR0]] !lgc.rt.shaderstage [[META35:![0-9]+]] !continuation.registercount [[META22:![0-9]+]] !continuation [[META36:![0-9]+]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP1:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(22) @@ -1628,15 +1633,13 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define void @MyRayGen( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: ) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22]] !continuation.registercount [[META22]] !continuation.entry [[META13:![0-9]+]] !continuation [[META36:![0-9]+]] !continuation.state [[META22]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22]] !continuation.registercount [[META22]] !continuation.entry [[META13:![0-9]+]] !continuation [[META37:![0-9]+]] !continuation.state [[META22]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[CSP:%.*]] = alloca i32, align 4 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP0:%.*]] = call i32 @_cont_GetContinuationStackAddr() -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[TMP0]], ptr [[CSP]], align 4 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP1:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(22) -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[SYSTEM_DATA:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[_CONT_SETUPRAYGEN:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]() -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[SYSTEM_DATA]], 0 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP3:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP4:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 @@ -1646,7 +1649,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT]], 0 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[SYS_DATA_I:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, 
[[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP8:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @MyRayGen.resume.0 to i64)) +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP8:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyRayGen.resume.0) ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[TMP8]], 5 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <4 x float> zeroinitializer, i32 0 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP9:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT]] to i32 @@ -1661,19 +1664,19 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP12:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT]] to i32 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[TMP12]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 9), align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP13:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (i64, i64, ...) @continuation.waitContinue(i64 4, i64 -1, i32 [[TMP13]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]]), !continuation.registercount [[META34:![0-9]+]], !continuation.returnedRegistercount !34 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (i64, i64, ...) @continuation.waitContinue(i64 4, i64 -1, i32 [[TMP13]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]]), !continuation.registercount [[META34:![0-9]+]], !continuation.returnedRegistercount [[META34]] ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: unreachable ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define dso_local void @MyRayGen.resume.0( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META22]] !continuation.registercount [[META34]] !continuation [[META36]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP1:%.*]]) !lgc.rt.shaderstage [[META22]] !continuation.registercount [[META34]] !continuation [[META37]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: entryresume.0: -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[SYSTEM_DATA_ALLOCA1:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP1:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(22) -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP2:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP2]] to ptr addrspace(22) +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], ptr [[SYSTEM_DATA_ALLOCA1]], align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(20) @REGISTERS, align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP4:%.*]] = bitcast i32 [[TMP3]] to float ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTSROA_0_0_VEC_INSERT:%.*]] = insertelement <4 x float> poison, float [[TMP4]], i32 0 @@ -1686,15 +1689,15 @@ attributes #6 = { nocallback nofree 
nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 9), align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP9]] to float ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTSROA_0_12_VEC_INSERT:%.*]] = insertelement <4 x float> [[DOTSROA_0_8_VEC_INSERT]], float [[TMP10]], i32 3 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_0_EXTRACT6:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_0_EXTRACT6:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]], 0 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP11:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP12:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[SYSTEM_DATA_ALLOCA]]) +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP12:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[SYSTEM_DATA_ALLOCA1]]) ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP12]], i8 0 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP13:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[SYSTEM_DATA_ALLOCA]]) +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP13:%.*]] = call <3 x i32> @_cont_DispatchRaysIndex3(ptr [[SYSTEM_DATA_ALLOCA1]]) ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP13]], i8 1 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP14:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP11]]) -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP15:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP14]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 }) +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP20:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP11]]) +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP15:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP20]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 4098, i32 1033 }) ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 0 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 1 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i64 2 @@ -1706,7 +1709,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define void @MyClosestHitShader( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META37:![0-9]+]] !continuation.registercount [[META34]] !continuation [[META38:![0-9]+]] !continuation.state [[META22]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META38:![0-9]+]] !continuation.registercount [[META34]] !continuation 
[[META39:![0-9]+]] !continuation.state [[META22]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -1759,12 +1762,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[TMP26]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 9), align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison, <3 x i32> [[DOTFCA_0_0_EXTRACT]], 0 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP27:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP27]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META34]] +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP27]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META34]] ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: unreachable ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define void @MyAnyHitShader( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META39:![0-9]+]] !continuation.registercount [[META34]] !continuation [[META40:![0-9]+]] !continuation.state [[META22]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[TMP1:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META40:![0-9]+]] !continuation.registercount [[META34]] !continuation [[META41:![0-9]+]] !continuation.state [[META22]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_HITDATA]], align 8 @@ -1883,12 +1886,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTSROA_0_12_VEC_EXTRACT:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 3 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP31:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT]] to i32 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[TMP31]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 9), align 4 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 0 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP32:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT]] to i32 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT9:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 0 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP32:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT9]] to i32 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP33:%.*]] = bitcast i32 [[TMP32]] to float ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTSROA_062_0_VEC_INSERT:%.*]] = insertelement <2 x float> undef, float [[TMP33]], i32 0 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 1 -; 
DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP34:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT]] to i32 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT11:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 1 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP34:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT11]] to i32 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP35:%.*]] = bitcast i32 [[TMP34]] to float ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTSROA_062_4_VEC_INSERT:%.*]] = insertelement <2 x float> [[DOTSROA_062_0_VEC_INSERT]], float [[TMP35]], i32 1 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] poison, <2 x float> [[DOTSROA_062_4_VEC_INSERT]], 0 @@ -1922,28 +1925,28 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_5_LOAD:%.*]] = load i64, ptr [[DOTFCA_5_GEP33]], align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[DOTFCA_4_INSERT]], i64 [[DOTFCA_5_LOAD]], 5 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP37:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP37]], [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT]]), !continuation.registercount [[META34]] +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP37]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT]]), !continuation.registercount [[META34]] ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: unreachable ; DXILCONTPOSTPROCESS-GLOBAL: 38: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTSROA_0_0_VEC_EXTRACT15:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 0 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP39:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT15]] to i32 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTSROA_0_0_VEC_EXTRACT14:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 0 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP39:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT14]] to i32 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[TMP39]], ptr addrspace(20) @REGISTERS, align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTSROA_0_4_VEC_EXTRACT18:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 1 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP40:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT18]] to i32 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[TMP40]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 7), align 4 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTSROA_0_8_VEC_EXTRACT21:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP41:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT21]] to i32 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTSROA_0_8_VEC_EXTRACT20:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP41:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT20]] to i32 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[TMP41]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 8), align 4 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTSROA_0_12_VEC_EXTRACT24:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 3 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP42:%.*]] = 
bitcast float [[DOTSROA_0_12_VEC_EXTRACT24]] to i32 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTSROA_0_12_VEC_EXTRACT23:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 3 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP42:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT23]] to i32 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[TMP42]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 9), align 4 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT9:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 0 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP43:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT9]] to i32 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 0 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP43:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT]] to i32 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP44:%.*]] = bitcast i32 [[TMP43]] to float ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTSROA_066_0_VEC_INSERT:%.*]] = insertelement <2 x float> undef, float [[TMP44]], i32 0 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT11:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 1 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP45:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT11]] to i32 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[DOTFCA_0_EXTRACT]], i32 1 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP45:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT]] to i32 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP46:%.*]] = bitcast i32 [[TMP45]] to float ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTSROA_066_4_VEC_INSERT:%.*]] = insertelement <2 x float> [[DOTSROA_066_0_VEC_INSERT]], float [[TMP46]], i32 1 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_0_INSERT65:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] poison, <2 x float> [[DOTSROA_066_4_VEC_INSERT]], 0 @@ -1977,12 +1980,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_5_LOAD58:%.*]] = load i64, ptr [[DOTFCA_5_GEP57]], align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_5_INSERT59:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[DOTFCA_4_INSERT56]], i64 [[DOTFCA_5_LOAD58]], 5 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP48:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP48]], [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT59]]), !continuation.registercount [[META34]] +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[RETURNADDR]], i32 [[TMP48]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT59]]), !continuation.registercount [[META34]] ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: unreachable ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define void @MyIntersectionShader( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META41:![0-9]+]] !continuation.registercount [[META33:![0-9]+]] !continuation [[META42:![0-9]+]] !continuation.stacksize [[META43:![0-9]+]] !continuation.state [[META43]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42:![0-9]+]] !continuation.registercount [[META33:![0-9]+]] !continuation [[META43:![0-9]+]] !continuation.stacksize [[META44:![0-9]+]] !continuation.state [[META44]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -2018,9 +2021,9 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TRAV_DATA_I_FCA_4_INSERT:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_3_INSERT]], float [[DOTFCA_4_EXTRACT]], 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TRAV_DATA_I_FCA_5_INSERT:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_4_INSERT]], i64 [[DOTFCA_5_EXTRACT]], 5 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] poison, <2 x float> undef, 0 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP7:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyIntersectionShader.resume.0) ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP6:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP7:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @MyIntersectionShader.resume.0 to i64)) -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (i64, ...) @continuation.continue(i64 3, i32 [[TMP6]], i64 [[TMP7]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META33]], !continuation.returnedRegistercount !33 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (i64, ...) @continuation.continue(i64 3, i32 [[TMP6]], i64 [[TMP7]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META33]], !continuation.returnedRegistercount [[META33]] ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: unreachable ; DXILCONTPOSTPROCESS-GLOBAL: accepthit.i: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0 @@ -2048,7 +2051,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], -8 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[TMP14]], ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP15:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[RETURNADDR]], i32 [[TMP15]], [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT49]]), !continuation.registercount [[META33]] +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP15]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT49]]), !continuation.registercount [[META33]] ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: unreachable ; DXILCONTPOSTPROCESS-GLOBAL: 16: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] poison, <3 x i32> [[DOTFCA_0_0_0_EXTRACT]], 0, 0, 0 @@ -2063,31 +2066,31 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP18:%.*]] = add i32 [[TMP17]], -8 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[TMP18]], ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP19:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP19]], [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT]]), !continuation.registercount [[META33]] +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP19]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT]]), !continuation.registercount [[META33]] ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: unreachable ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define dso_local void @MyIntersectionShader.resume.0( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META41]] !continuation.registercount [[META33]] !continuation [[META42]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP1:%.*]]) !lgc.rt.shaderstage [[META42]] !continuation.registercount [[META33]] !continuation [[META43]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: entryresume.0: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP1:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(22) +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP15:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP15]] to ptr addrspace(22) ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], -8 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_0_0_0_EXTRACT10:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 0, 0, 0 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_0_1_0_EXTRACT12:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 0, 1, 0 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_1_0_EXTRACT14:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 1, 0 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_1_1_EXTRACT16:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 1, 1 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_2_EXTRACT18:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 2 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_3_EXTRACT20:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 3 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_4_EXTRACT22:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 4 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_5_EXTRACT24:%.*]] = 
extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 5 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_0_0_0_EXTRACT10:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 0, 0, 0 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_0_1_0_EXTRACT12:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 0, 1, 0 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_1_0_EXTRACT14:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 1, 0 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_1_1_EXTRACT16:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 1, 1 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_2_EXTRACT18:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 2 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_3_EXTRACT20:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 3 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_4_EXTRACT22:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 4 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_5_EXTRACT24:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 5 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[ISEND_I:%.*]] = call i1 @opaqueIsEnd() ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: br i1 [[ISEND_I]], label [[TMP5:%.*]], label [[TMP10:%.*]] -; DXILCONTPOSTPROCESS-GLOBAL: 5: +; DXILCONTPOSTPROCESS-GLOBAL: 6: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP4]] ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[RETURNADDR_RELOAD2:%.*]] = load i64, ptr addrspace(22) [[TMP6]], align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_0_0_0_INSERT28:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] poison, <3 x i32> [[DOTFCA_0_0_0_EXTRACT10]], 0, 0, 0 @@ -2102,9 +2105,9 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], -8 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[TMP8]], ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP9:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD2]], i32 [[TMP9]], [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT49]]), !continuation.registercount [[META33]] +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD2]], i32 [[TMP9]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT49]]), !continuation.registercount [[META33]] ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: unreachable -; DXILCONTPOSTPROCESS-GLOBAL: 10: +; DXILCONTPOSTPROCESS-GLOBAL: 11: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP4]] ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[RETURNADDR_RELOAD:%.*]] = load i64, ptr addrspace(22) [[TMP11]], align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] poison, <3 x i32> [[DOTFCA_0_0_0_EXTRACT10]], 0, 0, 0 @@ -2119,12 +2122,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], -8 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[TMP13]], ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP14:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP14]], [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT]]), !continuation.registercount [[META33]] +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP14]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT]]), !continuation.registercount [[META33]] ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: unreachable ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define void @MyIntersectionShaderLargeAttrs( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META41]] !continuation.registercount [[META33]] !continuation [[META44:![0-9]+]] !continuation.stacksize [[META43]] !continuation.state [[META43]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42]] !continuation.registercount [[META33]] !continuation [[META45:![0-9]+]] !continuation.stacksize [[META44]] !continuation.state [[META44]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -2166,9 +2169,9 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_0_4_INSERT:%.*]] = insertvalue [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_3_INSERT]], i32 104, 0, 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_0_5_INSERT:%.*]] = insertvalue [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_4_INSERT]], i32 105, 0, 5 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_0_6_INSERT:%.*]] = insertvalue [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_5_INSERT]], i32 106, 0, 6 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP7:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyIntersectionShaderLargeAttrs.resume.0) ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP6:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP7:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @MyIntersectionShaderLargeAttrs.resume.0 to i64)) -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (i64, ...) @continuation.continue(i64 3, i32 [[TMP6]], i64 [[TMP7]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_6_INSERT]]), !continuation.registercount [[META33]], !continuation.returnedRegistercount !33 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (i64, ...) @continuation.continue(i64 3, i32 [[TMP6]], i64 [[TMP7]], [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I_FCA_5_INSERT]], float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_6_INSERT]]), !continuation.registercount [[META33]], !continuation.returnedRegistercount [[META33]] ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: unreachable ; DXILCONTPOSTPROCESS-GLOBAL: accepthit.i: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP8:%.*]] = bitcast i32 100 to float @@ -2197,7 +2200,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], -8 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[TMP12]], ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP13:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP13]], [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT49]]), !continuation.registercount [[META33]] +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[RETURNADDR]], i32 [[TMP13]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT49]]), !continuation.registercount [[META33]] ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: unreachable ; DXILCONTPOSTPROCESS-GLOBAL: 14: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] poison, <3 x i32> [[DOTFCA_0_0_0_EXTRACT]], 0, 0, 0 @@ -2212,31 +2215,31 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], -8 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[TMP16]], ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP17:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP17]], [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT]]), !continuation.registercount [[META33]] +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP17]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT]]), !continuation.registercount [[META33]] ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: unreachable ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define dso_local void @MyIntersectionShaderLargeAttrs.resume.0( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META41]] !continuation.registercount [[META33]] !continuation [[META44]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[TMP0:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[TMP1:%.*]]) !lgc.rt.shaderstage [[META42]] !continuation.registercount [[META33]] !continuation [[META45]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: entryresume.0: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP1:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(22) +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP15:%.*]] = call i64 @_cont_GetContinuationStackGlobalMemBase() +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP15]] to ptr addrspace(22) ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], -8 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_0_0_0_EXTRACT10:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 0, 0, 0 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_0_1_0_EXTRACT12:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 0, 1, 0 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_1_0_EXTRACT14:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 1, 0 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_1_1_EXTRACT16:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 1, 1 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_2_EXTRACT18:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 2 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_3_EXTRACT20:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 3 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_4_EXTRACT22:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 4 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_5_EXTRACT24:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP0]], 5 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_0_0_0_EXTRACT10:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 0, 0, 0 +; 
DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_0_1_0_EXTRACT12:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 0, 1, 0 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_1_0_EXTRACT14:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 1, 0 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_1_1_EXTRACT16:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 1, 1 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_2_EXTRACT18:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 2 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_3_EXTRACT20:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 3 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_4_EXTRACT22:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 4 +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_5_EXTRACT24:%.*]] = extractvalue [[STRUCT_TRAVERSALDATA]] [[TMP1]], 5 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void @amd.dx.setLocalRootIndex(i32 5) ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[ISEND_I:%.*]] = call i1 @opaqueIsEnd() ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: br i1 [[ISEND_I]], label [[TMP5:%.*]], label [[TMP10:%.*]] -; DXILCONTPOSTPROCESS-GLOBAL: 5: +; DXILCONTPOSTPROCESS-GLOBAL: 6: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP4]] ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[RETURNADDR_RELOAD2:%.*]] = load i64, ptr addrspace(22) [[TMP6]], align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_0_0_0_INSERT28:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] poison, <3 x i32> [[DOTFCA_0_0_0_EXTRACT10]], 0, 0, 0 @@ -2251,9 +2254,9 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], -8 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[TMP8]], ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP9:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD2]], i32 [[TMP9]], [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT49]]), !continuation.registercount [[META33]] +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD2]], i32 [[TMP9]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT49]]), !continuation.registercount [[META33]] ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: unreachable -; DXILCONTPOSTPROCESS-GLOBAL: 10: +; DXILCONTPOSTPROCESS-GLOBAL: 11: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(22) [[TMP2]], i32 [[TMP4]] ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[RETURNADDR_RELOAD:%.*]] = load i64, ptr addrspace(22) [[TMP11]], align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_0_0_0_INSERT:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] poison, <3 x i32> [[DOTFCA_0_0_0_EXTRACT10]], 0, 0, 0 @@ -2268,12 +2271,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], -8 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[TMP13]], ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP14:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP14]], [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT]]), !continuation.registercount [[META33]] +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[RETURNADDR_RELOAD]], i32 [[TMP14]], i64 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT]]), !continuation.registercount [[META33]] ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: unreachable ; ; ; DXILCONTPOSTPROCESS-GLOBAL-LABEL: define void @MyMissShader( -; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META45:![0-9]+]] !continuation.registercount [[META34]] !continuation [[META46:![0-9]+]] !continuation.state [[META22]] { +; DXILCONTPOSTPROCESS-GLOBAL-SAME: i32 [[CSPINIT:%.*]], i64 [[RETURNADDR:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META46:![0-9]+]] !continuation.registercount [[META34]] !continuation [[META47:![0-9]+]] !continuation.state [[META22]] { ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -2308,7 +2311,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: store i32 [[TMP14]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @REGISTERS, i32 9), align 4 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA:%.*]] poison, <3 x i32> [[DOTFCA_0_0_EXTRACT]], 0 ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: [[TMP15:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP15]], [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META34]] +; DXILCONTPOSTPROCESS-GLOBAL-NEXT: call void (i64, ...) @continuation.continue(i64 [[RETURNADDR]], i32 [[TMP15]], i64 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT]]), !continuation.registercount [[META34]] ; DXILCONTPOSTPROCESS-GLOBAL-NEXT: unreachable ; ; @@ -2360,7 +2363,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyRayGen( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22]] !lgc.cps [[META22]] !continuation [[META36:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22]] !lgc.cps [[META36:![0-9]+]] !continuation [[META37:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_ALLOCA:%.*]] = alloca [30 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 @@ -2371,7 +2374,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP5:%.*]] = bitcast ptr [[TMP4]] to ptr ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP5]]) #[[ATTR1:[0-9]+]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> zeroinitializer, ptr [[TMP6]], align 4, !tbaa [[TBAA37:![0-9]+]] +; 
LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> zeroinitializer, ptr [[TMP6]], align 4, !tbaa [[TBAA38:![0-9]+]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP7]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP8]]) @@ -2380,64 +2383,64 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[ADDR_I:%.*]] = call i64 @_AmdGetResumePointAddr() #[[ATTR2:[0-9]+]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[ADDR_I]], 5 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP11]], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP17]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP14]], ptr [[TMP16]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP23]], ptr [[TMP21]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP26]], ptr [[TMP24]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP27:%.*]] = load [10 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP28:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } (...) 
@lgc.cps.await__sl_s_struct.DispatchSystemDatasa21i32a10i32s(i32 4, i32 4, i64 -1, i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [6 x i32] poison, [10 x i32] [[TMP27]]), !continuation.registercount [[META33:![0-9]+]], !continuation.wait.await [[META13:![0-9]+]], !continuation.returnedRegistercount !33 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP29:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } [[TMP28]], 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [10 x i32] [[TMP29]], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP30:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } [[TMP28]], 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP14]], ptr [[TMP12]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP20]], ptr [[TMP18]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = load [10 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } (...) 
@lgc.cps.await__sl_s_struct.DispatchSystemDatasa21i32a10i32s(i32 4, i32 8, i64 -1, i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [6 x i32] poison, [10 x i32] [[TMP21]]), !continuation.registercount [[META33:![0-9]+]], !continuation.wait.await [[META13:![0-9]+]], !continuation.returnedRegistercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } [[TMP22]], 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [10 x i32] [[TMP23]], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] } [[TMP22]], 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_RAYPAYLOAD]] poison, ptr [[TMP4]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP32:%.*]] = load i32, ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP32]], ptr [[TMP31]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP37]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP33]], ptr [[TMP38]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[TMP38]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[TMP37]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP44:%.*]] = load i32, ptr [[TMP43]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP44]], ptr [[TMP42]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[TMP38]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr [[TMP37]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP47:%.*]] = load i32, ptr [[TMP46]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP47]], ptr [[TMP45]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP30]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = load i32, ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP26]], ptr [[TMP25]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP27]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP29]], ptr [[TMP28]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP32]], ptr [[TMP30]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, 
ptr [[TMP27]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP34]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP35]], ptr [[TMP33]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP24]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br label [[DOTSPLIT:%.*]] ; LOWERRAYTRACINGPIPELINE-CPS: .split: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP48:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA37]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP49:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP49]], i8 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP50:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP50]], i8 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP51:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP52:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP51]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP53:%.*]] = extractelement <4 x float> [[TMP48]], i64 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP54:%.*]] = extractelement <4 x float> [[TMP48]], i64 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP55:%.*]] = extractelement <4 x float> [[TMP48]], i64 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP56:%.*]] = extractelement <4 x float> [[TMP48]], i64 3 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @dx.op.textureStore.f32(i32 67, [[DX_TYPES_HANDLE]] [[TMP52]], i32 [[EXTRACT]], i32 [[EXTRACT1]], i32 undef, float [[TMP53]], float [[TMP54]], float [[TMP55]], float [[TMP56]], i8 15) +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP36:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA38]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP37:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[EXTRACT:%.*]] = extractelement <3 x i32> [[TMP37]], i8 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = call <3 x i32> @lgc.rt.dispatch.rays.index() +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x i32> [[TMP38]], i8 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP39:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP40:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE]](i32 216, [[DX_TYPES_HANDLE]] [[TMP39]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP41:%.*]] = extractelement <4 x float> [[TMP36]], i64 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP42:%.*]] = extractelement <4 x float> [[TMP36]], i64 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP43:%.*]] = extractelement <4 x float> [[TMP36]], i64 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP44:%.*]] = extractelement <4 x float> [[TMP36]], i64 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @dx.op.textureStore.f32(i32 67, [[DX_TYPES_HANDLE]] [[TMP40]], i32 [[EXTRACT]], i32 [[EXTRACT1]], i32 undef, float [[TMP41]], float [[TMP42]], float [[TMP43]], float [[TMP44]], i8 15) ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[TMP5]]) #[[ATTR1]] ; 
LOWERRAYTRACINGPIPELINE-CPS-NEXT: ret void ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyClosestHitShader( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META40:![0-9]+]] !lgc.cps [[META41:![0-9]+]] !continuation [[META42:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META41:![0-9]+]] !lgc.cps [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_ALLOCA:%.*]] = alloca [30 x i32], align 4 @@ -2445,67 +2448,67 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[HITATTRS:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [10 x i32] [[PAYLOAD]], ptr [[PAYLOAD_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_SYSTEMDATA]] [[SYSTEM_DATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP4:%.*]] = load i32, ptr [[PAYLOAD_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP4]], ptr [[TMP3]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP9]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP7]], ptr [[TMP10]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP16]], ptr [[TMP14]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP19]], ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP7]], ptr [[TMP6]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr 
[[TMP6]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP10]], ptr [[TMP8]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP13]], ptr [[TMP11]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[ADDR_I:%.*]] = getelementptr [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[VAL_I:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[ADDR_I]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[VAL_I]], ptr [[TMP1]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP1]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP20]], ptr [[HITATTRS]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[HITATTRS]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP25]], ptr [[TMP23]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP1]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP14]], ptr [[HITATTRS]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[HITATTRS]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[SHADER_INDEX]]) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[HITATTRS]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP26]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP28:%.*]] = extractelement <2 x float> [[TMP27]], i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP29:%.*]] = fsub fast float 1.000000e+00, [[TMP28]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP27]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP31:%.*]] = fsub fast float [[TMP29]], [[TMP30]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP32:%.*]] = insertelement <4 x float> undef, float [[TMP31]], i64 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP33:%.*]] = insertelement <4 x float> [[TMP32]], float [[TMP28]], i64 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[TMP30]], i64 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float 1.000000e+00, i64 3 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> [[TMP35]], ptr [[TMP36]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: 
[[TMP37:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP37]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP38]], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TMP37]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP39:%.*]] = load i32, ptr [[TMP44]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP39]], ptr [[TMP43]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, ptr [[TMP43]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP49:%.*]] = getelementptr inbounds i32, ptr [[TMP44]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP50:%.*]] = load i32, ptr [[TMP49]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP50]], ptr [[TMP48]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, ptr [[TMP43]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, ptr [[TMP44]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP53:%.*]] = load i32, ptr [[TMP52]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP53]], ptr [[TMP51]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP54:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP55:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP54]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP56:%.*]] = load [10 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURN_ADDR]], i32 3, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP55]], [21 x i32] poison, [10 x i32] [[TMP56]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[HITATTRS]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = load <2 x float>, ptr [[TMP18]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP19]], i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = fsub fast float 1.000000e+00, [[TMP20]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP19]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = fsub fast float [[TMP21]], [[TMP22]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = insertelement <4 x float> undef, float [[TMP23]], i64 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP20]], i64 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP22]], i64 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[TMP26]], float 1.000000e+00, i64 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> [[TMP27]], ptr [[TMP28]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP2]], i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP29]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP30]], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[TMP29]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP33]], ptr [[TMP31]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP36]], ptr [[TMP34]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[TMP32]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP39:%.*]] = load i32, ptr [[TMP38]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP39]], ptr [[TMP37]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP41:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP40]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP42:%.*]] = load [10 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP41]], [21 x i32] poison, [10 x i32] [[TMP42]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyAnyHitShader( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META43:![0-9]+]] !lgc.cps [[META40]] !continuation [[META44:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42]] !lgc.cps [[META44:![0-9]+]] !continuation [[META45:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_HITDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_HITDATA]], align 8 @@ -2519,126 +2522,126 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[HITATTRSALLOCA:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [10 x i32] [[PAYLOAD]], ptr [[PAYLOAD_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_TRAVERSALDATA]] [[SYSTEM_DATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP7]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP7]], i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = load i32, ptr [[PAYLOAD_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP9]], ptr [[TMP8]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP14]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP12]], ptr [[TMP15]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP20]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP16]], ptr [[TMP19]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP23]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP18]], ptr [[TMP22]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[ADDR_I:%.*]] = 
getelementptr [[STRUCT_SYSTEMDATA:%.*]], ptr [[TMP25]], i32 0, i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP12]], ptr [[TMP11]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP15]], ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP18]], ptr [[TMP16]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[ADDR_I:%.*]] = getelementptr [[STRUCT_SYSTEMDATA:%.*]], ptr [[TMP19]], i32 0, i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[VAL_I:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[ADDR_I]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[VAL_I]], ptr [[TMP6]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN:%.*]], ptr [[PAYLOAD_ALLOCA]], i32 0, i32 0, i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD_ATTR_MAX_8_I32S_LAYOUT_1_ANYHIT_IN:%.*]], ptr [[PAYLOAD_ALLOCA]], i32 0, i32 0, i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP6]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP21]], ptr [[ORIGHITATTRS]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP31]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP24]], ptr [[TMP30]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[ORIGHITATTRS]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP24]], ptr [[TMP22]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[HIT_ATTRS]], ptr [[HITATTRSALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[SHADER_INDEX]]) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP7]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP34:%.*]] = load <4 x float>, ptr [[TMP33]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], 
ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP7]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = load <4 x float>, ptr [[TMP25]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[RESPTR_I3:%.*]] = getelementptr [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[RES_I4:%.*]] = load [[STRUCT_HITDATA]], ptr [[RESPTR_I3]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_HITDATA]] [[RES_I4]], ptr [[TMP2]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP36:%.*]] = call <3 x float> @_cont_ObjectRayOrigin3(ptr [[TMP35]], ptr [[TMP2]]) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x float> [[TMP36]], i8 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP28:%.*]] = call <3 x float> @_cont_ObjectRayOrigin3(ptr [[TMP27]], ptr [[TMP2]]) +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[EXTRACT1:%.*]] = extractelement <3 x float> [[TMP28]], i8 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[RESPTR_I:%.*]] = getelementptr [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[RES_I:%.*]] = load [[STRUCT_HITDATA]], ptr [[RESPTR_I]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_HITDATA]] [[RES_I]], ptr [[TMP3]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = call <3 x float> @_cont_ObjectRayDirection3(ptr [[TMP37]], ptr [[TMP3]]) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[EXTRACT:%.*]] = extractelement <3 x float> [[TMP38]], i8 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP30:%.*]] = call <3 x float> @_cont_ObjectRayDirection3(ptr [[TMP29]], ptr [[TMP3]]) +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[EXTRACT:%.*]] = extractelement <3 x float> [[TMP30]], i8 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[RESPTR_I5:%.*]] = getelementptr [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[RES_I6:%.*]] = load [[STRUCT_HITDATA]], ptr [[RESPTR_I5]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_HITDATA]] [[RES_I6]], ptr [[TMP1]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[RES_I7:%.*]] = load float, ptr [[TMP1]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP40:%.*]] = fmul fast float [[RES_I7]], [[EXTRACT]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP41:%.*]] = fadd fast float [[TMP40]], [[EXTRACT1]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP42:%.*]] = fcmp fast ogt float [[TMP41]], 0.000000e+00 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[TMP42]], label [[TMP43:%.*]], label [[TMP72:%.*]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP32:%.*]] = fmul fast float [[RES_I7]], [[EXTRACT]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP33:%.*]] = fadd fast float [[TMP32]], 
[[EXTRACT1]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP34:%.*]] = fcmp fast ogt float [[TMP33]], 0.000000e+00 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[TMP34]], label [[TMP35:%.*]], label [[TMP56:%.*]] ; LOWERRAYTRACINGPIPELINE-CPS: 35: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> [[TMP34]], ptr [[TMP33]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @_cont_AcceptHitAndEndSearch(ptr [[TMP44]]) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP7]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP46:%.*]] = load i32, ptr [[TMP45]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP46]], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, ptr [[TMP45]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP52]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP49]], ptr [[TMP51]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, ptr [[TMP51]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP57:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP50:%.*]] = load i32, ptr [[TMP57]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP50]], ptr [[TMP56]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[TMP51]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP47:%.*]] = load i32, ptr [[TMP60]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP47]], ptr [[TMP59]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> [[TMP26]], ptr [[TMP25]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @_cont_AcceptHitAndEndSearch(ptr [[TMP36]]) +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP7]], i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP37]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP38]], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[TMP37]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP41:%.*]] = load i32, ptr [[TMP40]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP41]], ptr [[TMP39]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[TMP40]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP44:%.*]] = load i32, ptr [[TMP43]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP44]], ptr [[TMP42]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, 
ptr [[TMP40]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP47:%.*]] = load i32, ptr [[TMP46]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP47]], ptr [[TMP45]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP48:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP48]], ptr [[TMP5]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP53:%.*]] = load i32, ptr [[TMP65]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP53]], ptr [[TMP66]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP68:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP5]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP69:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[ADDR_I1:%.*]] = getelementptr [[STRUCT_SYSTEMDATA]], ptr [[TMP69]], i32 0, i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP68]], ptr [[ADDR_I1]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP70:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP71:%.*]] = load [10 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR]], i32 20, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP70]], [8 x i32] poison, [10 x i32] [[TMP71]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP49:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP51:%.*]] = load i32, ptr [[TMP49]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP51]], ptr [[TMP50]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP52:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP5]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP53:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[ADDR_I1:%.*]] = getelementptr [[STRUCT_SYSTEMDATA]], ptr [[TMP53]], i32 0, i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP52]], ptr [[ADDR_I1]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP54:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP55:%.*]] = load [10 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP54]], [8 x i32] poison, [10 x i32] [[TMP55]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE-CPS: 56: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> [[TMP34]], ptr [[TMP33]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> [[TMP26]], ptr [[TMP25]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP73:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP7]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP58:%.*]] = load i32, ptr [[TMP73]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP7]], i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP58:%.*]] = load i32, ptr [[TMP57]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP58]], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP79:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP80:%.*]] = getelementptr inbounds i32, ptr [[TMP73]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP61:%.*]] = load i32, ptr [[TMP80]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP61]], ptr [[TMP79]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP84:%.*]] = getelementptr inbounds i32, ptr [[TMP79]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP85:%.*]] = getelementptr inbounds i32, ptr [[TMP80]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP64:%.*]] = load i32, ptr [[TMP85]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP64]], ptr [[TMP84]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP87:%.*]] = getelementptr inbounds i32, ptr [[TMP79]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP88:%.*]] = getelementptr inbounds i32, ptr [[TMP80]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP67:%.*]] = load i32, ptr [[TMP88]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP67]], ptr [[TMP87]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP74:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP74]], ptr [[TMP4]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP93:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP94:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP75:%.*]] = load i32, ptr [[TMP93]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP75]], ptr [[TMP94]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP96:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP4]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP97:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[ADDR_I2:%.*]] = getelementptr [[STRUCT_SYSTEMDATA]], ptr [[TMP97]], i32 0, i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP96]], ptr [[ADDR_I2]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP98:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP99:%.*]] = load [10 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURN_ADDR]], i32 20, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP98]], [8 x i32] poison, [10 x i32] [[TMP99]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, ptr [[TMP57]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP61:%.*]] = load i32, ptr [[TMP60]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP61]], ptr [[TMP59]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, ptr [[TMP59]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, ptr [[TMP60]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP64:%.*]] = load i32, ptr [[TMP63]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP64]], ptr [[TMP62]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[TMP59]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, ptr [[TMP60]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP67:%.*]] = load i32, ptr [[TMP66]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP67]], ptr [[TMP65]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP68:%.*]] = load i32, ptr [[HITATTRSALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP68]], ptr [[TMP4]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, ptr [[HITATTRSALLOCA]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP71:%.*]] = load i32, ptr [[TMP69]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP71]], ptr [[TMP70]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP72:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP4]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP73:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[ADDR_I2:%.*]] = getelementptr [[STRUCT_SYSTEMDATA]], ptr [[TMP73]], i32 0, i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP72]], ptr [[ADDR_I2]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP74:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP75:%.*]] = load [10 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 40, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP74]], [8 x i32] poison, [10 x i32] [[TMP75]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyIntersectionShader( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META41]] !lgc.cps [[META45:![0-9]+]] !continuation [[META46:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META46:![0-9]+]] !continuation [[META47:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], align 4 @@ -2661,7 +2664,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP3]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = load [30 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } (...) @lgc.cps.await__sl_s_struct.TraversalDatasa8i32a30i32s(i32 3, i32 8, i32 5, float [[RES_I1]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP7]], [20 x i32] poison, [30 x i32] [[TMP8]]), !continuation.registercount [[META32:![0-9]+]], !continuation.returnedRegistercount !32 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } (...) 
@lgc.cps.await__sl_s_struct.TraversalDatasa8i32a30i32s(i32 3, i32 16, i32 5, float [[RES_I1]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP7]], [20 x i32] poison, [30 x i32] [[TMP8]]), !continuation.registercount [[META32:![0-9]+]], !continuation.returnedRegistercount [[META32]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP9]], 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [30 x i32] [[TMP10]], ptr [[PAYLOAD_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP9]], 0 @@ -2673,33 +2676,33 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[PAYLOAD_ALLOCA]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP3]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP14]], ptr [[TMP2]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP2]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[ADDR_I:%.*]] = getelementptr [[STRUCT_SYSTEMDATA:%.*]], ptr [[TMP21]], i32 0, i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP20]], ptr [[ADDR_I]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP17]], ptr [[TMP16]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP2]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[ADDR_I:%.*]] = getelementptr [[STRUCT_SYSTEMDATA:%.*]], ptr [[TMP19]], i32 0, i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP18]], ptr [[ADDR_I]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br label [[_CONT_REPORTHIT_EXIT]] ; LOWERRAYTRACINGPIPELINE-CPS: _cont_ReportHit.exit: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[ISEND_I:%.*]] = call i1 @opaqueIsEnd() -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[ISEND_I]], label [[TMP22:%.*]], label [[TMP25:%.*]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[ISEND_I]], label [[TMP20:%.*]], label [[TMP23:%.*]] ; LOWERRAYTRACINGPIPELINE-CPS: 20: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = load [30 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURN_ADDR]], i32 4, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP23]], [8 x i32] poison, [30 x i32] [[TMP24]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = load [30 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP21]], [8 x i32] poison, [30 x i32] [[TMP22]]), !continuation.registercount [[META32]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE-CPS: 23: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP5]]) #[[ATTR1]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP27:%.*]] = load [30 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR]], i32 4, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP26]], [8 x i32] poison, [30 x i32] [[TMP27]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = load [30 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP24]], [8 x i32] poison, [30 x i32] [[TMP25]]), !continuation.registercount [[META32]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyIntersectionShaderLargeAttrs( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META41]] !lgc.cps [[META45]] !continuation [[META47:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META46]] !continuation [[META48:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_LARGEINTERSECTIONATTRIBUTES:%.*]], align 4 @@ -2736,7 +2739,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = load [[STRUCT_LARGEINTERSECTIONATTRIBUTES]], ptr [[TMP3]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = load [30 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } (...) 
@lgc.cps.await__sl_s_struct.TraversalDatasa8i32a30i32s(i32 3, i32 8, i32 5, float [[RES_I1]], i32 0, [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[TMP7]], [15 x i32] poison, [30 x i32] [[TMP8]]), !continuation.registercount [[META32]], !continuation.returnedRegistercount !32 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = call { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } (...) @lgc.cps.await__sl_s_struct.TraversalDatasa8i32a30i32s(i32 3, i32 16, i32 5, float [[RES_I1]], i32 0, [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[TMP7]], [15 x i32] poison, [30 x i32] [[TMP8]]), !continuation.registercount [[META32]], !continuation.returnedRegistercount [[META32]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP9]], 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [30 x i32] [[TMP10]], ptr [[PAYLOAD_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = extractvalue { [[STRUCT_TRAVERSALDATA]], [8 x i32], [30 x i32] } [[TMP9]], 0 @@ -2748,94 +2751,94 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[PAYLOAD_ALLOCA]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP3]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP14]], ptr [[TMP2]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP17]], ptr [[TMP16]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP19]], ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP20]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP22]], ptr [[TMP13]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 3 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP22]], ptr [[TMP21]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 2 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP23]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP25]], ptr [[TMP24]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: 
[[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 5 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 3 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP26]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP28]], ptr [[TMP27]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 5 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 3 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 6 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP29]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP31]], ptr [[TMP30]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 6 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP32]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP34]], ptr [[TMP33]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP35:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP2]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[ADDR_I:%.*]] = getelementptr [[STRUCT_SYSTEMDATA:%.*]], ptr [[TMP36]], i32 0, i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP35]], ptr [[ADDR_I]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP32:%.*]] = load [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]], ptr [[TMP2]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[ADDR_I:%.*]] = getelementptr [[STRUCT_SYSTEMDATA:%.*]], ptr [[TMP33]], i32 0, i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[TMP32]], ptr [[ADDR_I]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br label [[_CONT_REPORTHIT_EXIT]] ; LOWERRAYTRACINGPIPELINE-CPS: _cont_ReportHit.exit: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[ISEND_I:%.*]] = call i1 @opaqueIsEnd() -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[ISEND_I]], label [[TMP37:%.*]], label [[TMP40:%.*]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[ISEND_I]], label [[TMP34:%.*]], label [[TMP37:%.*]] ; LOWERRAYTRACINGPIPELINE-CPS: 34: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP39:%.*]] = load [30 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURN_ADDR]], i32 4, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP38]], [8 x i32] poison, [30 x i32] [[TMP39]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP35:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP36:%.*]] = load [30 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP35]], [8 x i32] poison, [30 x i32] [[TMP36]]), !continuation.registercount [[META32]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; LOWERRAYTRACINGPIPELINE-CPS: 37: ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[TMP5]]) #[[ATTR1]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP41:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP42:%.*]] = load [30 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR]], i32 4, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP41]], [8 x i32] poison, [30 x i32] [[TMP42]]), !continuation.registercount [[META32]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = load [[STRUCT_TRAVERSALDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP39:%.*]] = load [30 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURNADDR]], i32 8, {} poison, i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[TMP38]], [8 x i32] poison, [30 x i32] [[TMP39]]), !continuation.registercount [[META32]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; ; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define void @MyMissShader( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META45]] !lgc.cps [[META41]] !continuation [[META48:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META44]] !lgc.cps [[META42]] !continuation [[META49:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_SYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[PAYLOAD_ALLOCA:%.*]] = alloca [30 x i32], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [10 x i32] [[PAYLOAD]], ptr [[PAYLOAD_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store [[STRUCT_SYSTEMDATA]] [[SYSTEM_DATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP1]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP1]], i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = load i32, ptr [[PAYLOAD_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP3]], ptr [[TMP2]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, 
ptr [[PAYLOAD_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP8]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP6]], ptr [[TMP9]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP6]], ptr [[TMP5]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP12]], ptr [[TMP10]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[SHADER_INDEX]]) +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP1]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> , ptr [[TMP13]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP1]], i32 0 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP15]], ptr [[TMP13]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP15]], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP18]], ptr [[TMP16]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 [[SHADER_INDEX]]) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP1]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store <4 x float> , ptr [[TMP19]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP1]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 1 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 1 ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = load i32, ptr 
[[TMP20]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP21]], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP27]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP22]], ptr [[TMP26]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP33]], ptr [[TMP31]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP36]], ptr [[TMP34]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP37]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP39:%.*]] = load [10 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[RETURN_ADDR]], i32 3, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP38]], [21 x i32] poison, [10 x i32] [[TMP39]]), !continuation.registercount [[META33]] +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP21]], ptr [[TMP19]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 2 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store i32 [[TMP24]], ptr [[TMP22]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_SYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[TMP25]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP27:%.*]] = load [10 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP26]], [21 x i32] poison, [10 x i32] [[TMP27]]), !continuation.registercount [[META33]] ; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable ; ; @@ -2889,49 +2892,47 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define void @MyRayGen( -; DXILCONTPOSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22]] !lgc.cps [[META22]] !continuation [[META36:![0-9]+]] { +; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] !lgc.rt.shaderstage [[META22]] !lgc.cps [[META36:![0-9]+]] !continuation [[META37:![0-9]+]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[SYSTEM_DATA:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[_CONT_SETUPRAYGEN:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]() -; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTFCA_0_EXTRACT20:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[SYSTEM_DATA]], 0 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTFCA_0_EXTRACT20:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 ; DXILCONTPOSTPROCESS-CPS-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP3:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP5:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP3]]) -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP6:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP5]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP7:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP6]]) +; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP1:%.*]] = load [[DX_TYPES_HANDLE:%.*]], ptr @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP2:%.*]] = load [[DX_TYPES_HANDLE]], ptr @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP3:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_CREATEHANDLEFORLIB_DX_TYPES_HANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 160, [[DX_TYPES_HANDLE]] [[TMP1]]) +; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = call [[DX_TYPES_HANDLE]] [[DX_OP_ANNOTATEHANDLE:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 216, [[DX_TYPES_HANDLE]] [[TMP3]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) +; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP5:%.*]] = call i64 @amd.dx.getAccelStructAddr([[DX_TYPES_HANDLE]] [[TMP4]]) ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, <3 x i32> [[DOTFCA_0_EXTRACT20]], 0 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[SYS_DATA_I:%.*]] = insertvalue 
[[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA_I_FCA_0_INSERT]], 0 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA_I]], 0 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP8:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @MyRayGen.resume.0 to i64)) +; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP8:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyRayGen.resume.0) ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA_I]], i64 [[TMP8]], 5 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <4 x float> zeroinitializer, i32 0 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP9:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT]] to i32 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP7:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT]] to i32 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTSROA_0_4_VEC_EXTRACT:%.*]] = extractelement <4 x float> zeroinitializer, i32 1 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP10:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT]] to i32 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP11:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT]] to i32 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT:%.*]] = extractelement <4 x float> zeroinitializer, i32 2 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP11:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT]] to i32 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP9:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT]] to i32 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT:%.*]] = extractelement <4 x float> zeroinitializer, i32 3 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP12:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT]] to i32 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP9]], 0 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP10:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT]] to i32 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [10 x i32] poison, i32 [[TMP7]], 0 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_0_INSERT]], i32 undef, 1 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_1_INSERT]], i32 undef, 2 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_2_INSERT]], i32 undef, 3 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTFCA_4_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_3_INSERT]], i32 undef, 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTFCA_5_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_4_INSERT]], i32 undef, 5 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTFCA_6_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_5_INSERT]], i32 undef, 6 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP10]], 7 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP11]], 8 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP12]], 9 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTFCA_7_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_6_INSERT]], i32 [[TMP11]], 7 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTFCA_8_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_7_INSERT]], i32 [[TMP9]], 8 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTFCA_9_INSERT:%.*]] = insertvalue [10 x i32] [[DOTFCA_8_INSERT]], i32 [[TMP10]], 9 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP13:%.*]] = load i32, ptr [[CSP]], align 4 -; 
DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP14:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @MyRayGen.resume.0 to i64)) -; DXILCONTPOSTPROCESS-CPS-NEXT: call void (i64, i64, ...) @continuation.waitContinue(i64 4, i64 -1, i32 [[TMP13]], i64 [[TMP14]], i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [6 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META33:![0-9]+]], !continuation.returnedRegistercount !33 +; DXILCONTPOSTPROCESS-CPS-NEXT: call void (i64, i64, ...) @continuation.waitContinue(i64 4, i64 -1, i32 [[TMP13]], i64 [[TMP8]], i32 5, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2_I]], [6 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META33:![0-9]+]], !continuation.returnedRegistercount [[META33]] ; DXILCONTPOSTPROCESS-CPS-NEXT: unreachable ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define dso_local void @MyRayGen.resume.0( -; DXILCONTPOSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [21 x i32], [10 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META22]] !lgc.cps [[META22]] !continuation [[META36]] { +; DXILCONTPOSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_DISPATCHSYSTEMDATA:%.*]], [21 x i32], [10 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META22]] !lgc.cps [[META36]] !continuation [[META37]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: entryresume.0: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = alloca { [[STRUCT_DISPATCHSYSTEMDATA]], [21 x i32], [10 x i32] }, align 8 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 @@ -2977,7 +2978,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define void @MyClosestHitShader( -; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META37:![0-9]+]] !lgc.cps [[META38:![0-9]+]] !continuation [[META39:![0-9]+]] { +; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META38:![0-9]+]] !lgc.cps [[META39:![0-9]+]] !continuation [[META40:![0-9]+]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -3042,14 +3043,14 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP20:%.*]] = load i32, ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP21:%.*]] = add i32 [[TMP20]], 0 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[TMP21]], ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP22:%.*]] = zext i32 [[RETURN_ADDR]] to i64 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP22:%.*]] = zext i32 [[RETURNADDR]] to i64 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP23:%.*]] = load i32, ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[TMP22]], i32 [[TMP23]], i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT10]], [21 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META33]] ; DXILCONTPOSTPROCESS-CPS-NEXT: unreachable ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define void @MyAnyHitShader( -; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META40:![0-9]+]] !lgc.cps [[META37]] !continuation [[META41:![0-9]+]] { +; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]] [[HIT_ATTRS:%.*]], [6 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META39]] !lgc.cps [[META41:![0-9]+]] !continuation [[META42:![0-9]+]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP0:%.*]] = alloca [[STRUCT_HITDATA:%.*]], align 8 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_HITDATA]], align 8 @@ -3167,12 +3168,12 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP21:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT]] to i32 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 3 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP22:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT]] to i32 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 0 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP23:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT]] to i32 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT9:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 0 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP23:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT9]] to i32 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP23]] to float ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTSROA_0102_0_VEC_INSERT:%.*]] = insertelement <2 x float> undef, float [[TMP24]], i32 0 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 1 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP25:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT]] to i32 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT11:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 1 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP25:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT11]] to i32 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP26:%.*]] = bitcast i32 [[TMP25]] to float ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTSROA_0102_4_VEC_INSERT:%.*]] = insertelement <2 x float> [[DOTSROA_0102_0_VEC_INSERT]], float [[TMP26]], i32 1 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT101:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] poison, <2 x float> [[DOTSROA_0102_4_VEC_INSERT]], 0 @@ -3218,26 +3219,26 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-CPS-NEXT: 
[[TMP28:%.*]] = load i32, ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP29:%.*]] = add i32 [[TMP28]], 0 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[TMP29]], ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP30:%.*]] = zext i32 [[RETURN_ADDR]] to i64 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP30:%.*]] = zext i32 [[RETURNADDR]] to i64 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP31:%.*]] = load i32, ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: call void (i64, ...) @continuation.continue(i64 [[TMP30]], i32 [[TMP31]], i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT72]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META33]] ; DXILCONTPOSTPROCESS-CPS-NEXT: unreachable ; DXILCONTPOSTPROCESS-CPS: 32: ; DXILCONTPOSTPROCESS-CPS-NEXT: call void @_cont_AcceptHit(ptr [[SYSTEM_DATA_ALLOCA]]) -; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT15:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 0 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP33:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT15]] to i32 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT14:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 0 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP33:%.*]] = bitcast float [[DOTSROA_0_0_VEC_EXTRACT14]] to i32 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTSROA_0_4_VEC_EXTRACT18:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 1 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP34:%.*]] = bitcast float [[DOTSROA_0_4_VEC_EXTRACT18]] to i32 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT21:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP35:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT21]] to i32 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT24:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 3 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP36:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT24]] to i32 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT9:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 0 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP37:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT9]] to i32 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTSROA_0_8_VEC_EXTRACT20:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 2 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP35:%.*]] = bitcast float [[DOTSROA_0_8_VEC_EXTRACT20]] to i32 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTSROA_0_12_VEC_EXTRACT23:%.*]] = extractelement <4 x float> [[DOTSROA_0_12_VEC_INSERT]], i32 3 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP36:%.*]] = bitcast float [[DOTSROA_0_12_VEC_EXTRACT23]] to i32 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 0 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP37:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_0_VEC_EXTRACT]] to i32 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP38:%.*]] = bitcast i32 [[TMP37]] to float ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTSROA_0106_0_VEC_INSERT:%.*]] = insertelement <2 x float> undef, float [[TMP38]], i32 0 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT11:%.*]] = extractelement <2 x float> [[HIT_ATTRS_FCA_0_EXTRACT]], i32 1 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP39:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT11]] to i32 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <2 x float> 
[[HIT_ATTRS_FCA_0_EXTRACT]], i32 1 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP39:%.*]] = bitcast float [[HITATTRSALLOCA_SROA_0_4_VEC_EXTRACT]] to i32 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP40:%.*]] = bitcast i32 [[TMP39]] to float ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTSROA_0106_4_VEC_INSERT:%.*]] = insertelement <2 x float> [[DOTSROA_0106_0_VEC_INSERT]], float [[TMP40]], i32 1 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTFCA_0_INSERT105:%.*]] = insertvalue [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] poison, <2 x float> [[DOTSROA_0106_4_VEC_INSERT]], 0 @@ -3283,14 +3284,14 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP42:%.*]] = load i32, ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP43:%.*]] = add i32 [[TMP42]], 0 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[TMP43]], ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP44:%.*]] = zext i32 [[RETURN_ADDR]] to i64 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP44:%.*]] = zext i32 [[RETURNADDR]] to i64 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP45:%.*]] = load i32, ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: call void (i64, ...) @continuation.continue(i64 [[TMP44]], i32 [[TMP45]], i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT98]], [8 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT54]]), !continuation.registercount [[META33]] ; DXILCONTPOSTPROCESS-CPS-NEXT: unreachable ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define void @MyIntersectionShader( -; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META38]] !lgc.cps [[META42:![0-9]+]] !continuation [[META43:![0-9]+]] { +; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META43:![0-9]+]] !continuation [[META44:![0-9]+]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -3299,7 +3300,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[TMP1]], ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP2:%.*]] = inttoptr i32 [[TMP0]] to ptr addrspace(21) ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP2]], i32 0 -; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[RETURN_ADDR]], ptr addrspace(21) [[TMP3]], align 4 +; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[RETURNADDR]], ptr addrspace(21) [[TMP3]], align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 0 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 1 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 2 @@ -3385,9 +3386,9 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTFCA_27_INSERT85:%.*]] = insertvalue [30 x i32] [[DOTFCA_26_INSERT82]], i32 [[PAYLOAD_FCA_27_EXTRACT]], 27 ; DXILCONTPOSTPROCESS-CPS-NEXT: 
[[DOTFCA_28_INSERT88:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT85]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTFCA_29_INSERT91:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT88]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP5:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyIntersectionShader.resume.0) ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP5:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @MyIntersectionShader.resume.0 to i64)) -; DXILCONTPOSTPROCESS-CPS-NEXT: call void (i64, ...) @continuation.continue(i64 3, i32 [[TMP4]], i64 [[TMP5]], i32 5, float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT326]], [20 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT91]]), !continuation.registercount [[META32:![0-9]+]], !continuation.returnedRegistercount !32 +; DXILCONTPOSTPROCESS-CPS-NEXT: call void (i64, ...) @continuation.continue(i64 3, i32 [[TMP4]], i64 [[TMP5]], i32 5, float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_INSERT326]], [20 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT91]]), !continuation.registercount [[META32:![0-9]+]], !continuation.returnedRegistercount [[META32]] ; DXILCONTPOSTPROCESS-CPS-NEXT: unreachable ; DXILCONTPOSTPROCESS-CPS: accepthit.i: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTSROA_0_0_VEC_EXTRACT:%.*]] = extractelement <2 x float> undef, i32 0 @@ -3444,7 +3445,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP11:%.*]] = load i32, ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], -8 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[TMP12]], ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP13:%.*]] = zext i32 [[RETURN_ADDR]] to i64 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP13:%.*]] = zext i32 [[RETURNADDR]] to i64 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP14:%.*]] = load i32, ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: call void (i64, ...) @continuation.continue(i64 [[TMP13]], i32 [[TMP14]], i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT312]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT211]]), !continuation.registercount [[META32]] ; DXILCONTPOSTPROCESS-CPS-NEXT: unreachable @@ -3490,14 +3491,14 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP16:%.*]] = load i32, ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP17:%.*]] = add i32 [[TMP16]], -8 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[TMP17]], ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP18:%.*]] = zext i32 [[RETURN_ADDR]] to i64 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP18:%.*]] = zext i32 [[RETURNADDR]] to i64 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP19:%.*]] = load i32, ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[TMP18]], i32 [[TMP19]], i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT275]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META32]] ; DXILCONTPOSTPROCESS-CPS-NEXT: unreachable ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define dso_local void @MyIntersectionShader.resume.0( -; DXILCONTPOSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META38]] !lgc.cps [[META42]] !continuation [[META43]] { +; DXILCONTPOSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META36]] !lgc.cps [[META43]] !continuation [[META44]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: entryresume.0: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -3647,7 +3648,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define void @MyIntersectionShaderLargeAttrs( -; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META38]] !lgc.cps [[META42]] !continuation [[META44:![0-9]+]] { +; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_TRAVERSALDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [8 x i32] [[PADDING:%.*]], [30 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META36]] !lgc.cps [[META43]] !continuation [[META45:![0-9]+]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -3656,7 +3657,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[TMP1]], ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP2:%.*]] = inttoptr i32 [[TMP0]] to ptr addrspace(21) ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(21) [[TMP2]], i32 0 -; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[RETURN_ADDR]], ptr addrspace(21) [[TMP3]], align 4 +; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[RETURNADDR]], ptr addrspace(21) [[TMP3]], align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 0 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 1 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[PAYLOAD_FCA_2_EXTRACT:%.*]] = extractvalue [30 x i32] [[PAYLOAD]], 2 @@ -3748,9 +3749,9 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTFCA_27_INSERT85:%.*]] = insertvalue [30 x i32] [[DOTFCA_26_INSERT82]], i32 [[PAYLOAD_FCA_27_EXTRACT]], 27 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTFCA_28_INSERT88:%.*]] = insertvalue [30 x i32] [[DOTFCA_27_INSERT85]], i32 [[PAYLOAD_FCA_28_EXTRACT]], 28 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[DOTFCA_29_INSERT91:%.*]] = insertvalue [30 x i32] [[DOTFCA_28_INSERT88]], i32 [[PAYLOAD_FCA_29_EXTRACT]], 29 +; 
DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP5:%.*]] = call i64 @continuation.getAddrAndMD(ptr @MyIntersectionShaderLargeAttrs.resume.0) ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP4:%.*]] = load i32, ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP5:%.*]] = call i64 @continuation.getAddrAndMD(i64 ptrtoint (ptr @MyIntersectionShaderLargeAttrs.resume.0 to i64)) -; DXILCONTPOSTPROCESS-CPS-NEXT: call void (i64, ...) @continuation.continue(i64 3, i32 [[TMP4]], i64 [[TMP5]], i32 5, float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_6_INSERT]], [15 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT91]]), !continuation.registercount [[META32]], !continuation.returnedRegistercount !32 +; DXILCONTPOSTPROCESS-CPS-NEXT: call void (i64, ...) @continuation.continue(i64 3, i32 [[TMP4]], i64 [[TMP5]], i32 5, float [[RES_I_FCA_1_INSERT_FCA_0_EXTRACT]], i32 0, [[STRUCT_LARGEINTERSECTIONATTRIBUTES]] [[DOTFCA_0_6_INSERT]], [15 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT91]]), !continuation.registercount [[META32]], !continuation.returnedRegistercount [[META32]] ; DXILCONTPOSTPROCESS-CPS-NEXT: unreachable ; DXILCONTPOSTPROCESS-CPS: accepthit.i: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP6:%.*]] = bitcast i32 100 to float @@ -3803,7 +3804,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP9:%.*]] = load i32, ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], -8 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[TMP10]], ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP11:%.*]] = zext i32 [[RETURN_ADDR]] to i64 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP11:%.*]] = zext i32 [[RETURNADDR]] to i64 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP12:%.*]] = load i32, ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: call void (i64, ...) @continuation.continue(i64 [[TMP11]], i32 [[TMP12]], i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT312]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT211]]), !continuation.registercount [[META32]] ; DXILCONTPOSTPROCESS-CPS-NEXT: unreachable @@ -3849,14 +3850,14 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP14:%.*]] = load i32, ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], -8 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[TMP15]], ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP16:%.*]] = zext i32 [[RETURN_ADDR]] to i64 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP16:%.*]] = zext i32 [[RETURNADDR]] to i64 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP17:%.*]] = load i32, ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[TMP16]], i32 [[TMP17]], i32 poison, i32 poison, [[STRUCT_TRAVERSALDATA]] [[DOTFCA_5_INSERT275]], [8 x i32] poison, [30 x i32] [[DOTFCA_29_INSERT]]), !continuation.registercount [[META32]] ; DXILCONTPOSTPROCESS-CPS-NEXT: unreachable ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define dso_local void @MyIntersectionShaderLargeAttrs.resume.0( -; DXILCONTPOSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META38]] !lgc.cps [[META42]] !continuation [[META44]] { +; DXILCONTPOSTPROCESS-CPS-SAME: {} [[TMP0:%.*]], i32 [[CSPINIT:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], { [[STRUCT_TRAVERSALDATA:%.*]], [8 x i32], [30 x i32] } [[TMP3:%.*]]) !lgc.rt.shaderstage [[META36]] !lgc.cps [[META43]] !continuation [[META45]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: entryresume.0: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -4006,7 +4007,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; DXILCONTPOSTPROCESS-CPS-LABEL: define void @MyMissShader( -; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META42]] !lgc.cps [[META38]] !continuation [[META45:![0-9]+]] { +; DXILCONTPOSTPROCESS-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[CSPINIT:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_SYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [19 x i32] [[PADDING:%.*]], [10 x i32] [[PAYLOAD:%.*]]) #[[ATTR3]] !lgc.rt.shaderstage [[META41]] !lgc.cps [[META39]] !continuation [[META46:![0-9]+]] { ; DXILCONTPOSTPROCESS-CPS-NEXT: AllocaSpillBB: ; DXILCONTPOSTPROCESS-CPS-NEXT: [[CSP:%.*]] = alloca i32, align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 @@ -4053,7 +4054,7 @@ attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP8:%.*]] = load i32, ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], 0 ; DXILCONTPOSTPROCESS-CPS-NEXT: store i32 [[TMP9]], ptr [[CSP]], align 4 -; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP10:%.*]] = zext i32 [[RETURN_ADDR]] to i64 +; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP10:%.*]] = zext i32 [[RETURNADDR]] to i64 ; DXILCONTPOSTPROCESS-CPS-NEXT: [[TMP11:%.*]] = load i32, ptr [[CSP]], align 4 ; DXILCONTPOSTPROCESS-CPS-NEXT: call void (i64, ...) 
@continuation.continue(i64 [[TMP10]], i32 [[TMP11]], i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT9]], [21 x i32] poison, [10 x i32] [[DOTFCA_9_INSERT]]), !continuation.registercount [[META33]] ; DXILCONTPOSTPROCESS-CPS-NEXT: unreachable diff --git a/llvmraytracing/test/dx/unnamed-type-intrinsics.ll b/llvmraytracing/test/dx/unnamed-type-intrinsics.ll index ecea4a3fb7..39271713a9 100644 --- a/llvmraytracing/test/dx/unnamed-type-intrinsics.ll +++ b/llvmraytracing/test/dx/unnamed-type-intrinsics.ll @@ -1,10 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3 -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S %s 2> %t.stderr | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s -; RUN: count 0 < %t.stderr +; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint,remove-types-metadata' -S %s --lint-abort-on-error | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE %s ; Check that using unnamed types works well with generating intrinsic names -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" ; struct.DispatchSystemData %0 = type { <3 x i32> } @@ -70,6 +69,10 @@ declare !types !28 void @_cont_IgnoreHit(%0* nocapture readnone) #1 ; Function Attrs: nounwind declare !types !29 void @_AmdAcceptHitAttributes(%3* nocapture readnone) #1 +define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !types !{!"function", !"void", !{i32 0, %0 poison}} { + ret void +} + define void @_cont_TraceRay(%0* %data, i64 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, float %6, float %7, float %8, float %9, float %10, float %11, float %12, float %13) #0 !types !30 { %dis_data = load %0, %0* %data, align 4 %sys_data = insertvalue %2 undef, %0 %dis_data, 0 @@ -346,7 +349,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define void @MyRayGen( -; LOWERRAYTRACINGPIPELINE-SAME: [[TMP0:%.*]] [[TMP0]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META14:![0-9]+]] !continuation.entry [[META20:![0-9]+]] !continuation.registercount [[META14]] !continuation [[META21:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[TMP0:%.*]] [[TMP0]]) #[[ATTR2:[0-9]+]] !lgc.rt.shaderstage [[META14:![0-9]+]] !continuation.entry [[META20:![0-9]+]] !continuation.registercount [[META14]] !continuation [[META21:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[TMP0]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[TMP0]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) @@ -365,7 +368,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA_I:%.*]] = insertvalue [[TMP1:%.*]] undef, [[TMP2]] [[SYS_DATA_I]], 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[ADDR_I:%.*]] = call i64 @_AmdGetResumePointAddr() 
#[[ATTR3:[0-9]+]] ; LOWERRAYTRACINGPIPELINE-NEXT: [[TRAV_DATA2_I:%.*]] = insertvalue [[TMP1]] [[TRAV_DATA_I]], i64 [[ADDR_I]], 5 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP10]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP13]], ptr addrspace(20) @PAYLOAD, align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 1 @@ -377,10 +380,10 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 2 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP20]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = call ptr inttoptr (i64 4 to ptr)([[TMP1]] [[TRAV_DATA2_I]]), !continuation.registercount [[META18:![0-9]+]], !continuation.returnedRegistercount !18 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = call [[TMP0]] [[AWAIT_:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP21]]) +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP21:%.*]] = call ptr inttoptr (i64 4 to ptr)([[TMP1]] [[TRAV_DATA2_I]], i64 poison), !continuation.registercount [[META18:![0-9]+]], !continuation.returnedRegistercount [[META18]] +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP22:%.*]] = call [[TMP0]] [[AWAIT:@[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr [[TMP21]]) ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_RAYPAYLOAD]] poison, ptr [[TMP4]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP4]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP26]], ptr [[TMP23]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 1 @@ -409,11 +412,11 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP42:%.*]] = extractelement <4 x float> [[TMP34]], i64 3 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @dx.op.textureStore.f32(i32 67, [[DX_TYPES_HANDLE]] [[TMP38]], i32 [[EXTRACT]], i32 [[EXTRACT1]], i32 undef, float [[TMP39]], float [[TMP40]], float [[TMP41]], float [[TMP42]], i8 15) ; LOWERRAYTRACINGPIPELINE-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[TMP5]]) #[[ATTR1]] -; LOWERRAYTRACINGPIPELINE-NEXT: ret void, !continuation.registercount [[META17:![0-9]+]] +; LOWERRAYTRACINGPIPELINE-NEXT: ret void ; ; ; LOWERRAYTRACINGPIPELINE-LABEL: define %0 @MyClosestHit( -; LOWERRAYTRACINGPIPELINE-SAME: [[TMP2:%.*]] [[TMP0:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META25:![0-9]+]] !continuation.registercount [[META18]] !continuation [[META26:![0-9]+]] { +; LOWERRAYTRACINGPIPELINE-SAME: i64 [[RETURNADDR:%.*]], [[TMP2:%.*]] [[TMP0:%.*]]) #[[ATTR2]] !lgc.rt.shaderstage [[META25:![0-9]+]] !continuation.registercount [[META18]] !continuation [[META26:![0-9]+]] { ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP2]] = alloca 
[[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[TMP2]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 8 @@ -421,7 +424,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: store [[TMP2]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[TMP2]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP5:%.*]] = call i32 @_cont_GetLocalRootIndex(ptr [[TMP4]]) -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(20) @PAYLOAD, align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP9]], ptr [[TMP6]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 1 @@ -454,7 +457,7 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP33:%.*]] = insertelement <4 x float> [[TMP32]], float 1.000000e+00, i64 3 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: store <4 x float> [[TMP33]], ptr [[TMP34]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0, i32 0 +; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], ptr [[TMP3]], i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP35]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP38]], ptr addrspace(20) @PAYLOAD, align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP35]], i32 1 @@ -468,5 +471,6 @@ attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: re ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[TMP45]], ptr addrspace(20) getelementptr inbounds (i32, ptr addrspace(20) @PAYLOAD, i32 9), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[TMP2]], ptr [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0 ; LOWERRAYTRACINGPIPELINE-NEXT: [[TMP47:%.*]] = load [[TMP0]], ptr [[TMP46]], align 4 -; LOWERRAYTRACINGPIPELINE-NEXT: ret [[TMP0]] [[TMP47]], !continuation.registercount [[META18]] +; LOWERRAYTRACINGPIPELINE-NEXT: call void (...) 
@lgc.ilcps.return(i64 [[RETURNADDR]], [[TMP0]] [[TMP47]]), !continuation.registercount [[META18]] +; LOWERRAYTRACINGPIPELINE-NEXT: unreachable ; diff --git a/llvmraytracing/test/dx/wrong-system-data.ll b/llvmraytracing/test/dx/wrong-system-data.ll index 44f2f6c66b..a9b04e6555 100644 --- a/llvmraytracing/test/dx/wrong-system-data.ll +++ b/llvmraytracing/test/dx/wrong-system-data.ll @@ -1,8 +1,8 @@ -; RUN: not --crash opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,remove-types-metadata' -S %s 2>&1 | FileCheck %s +; RUN: not --crash opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,remove-types-metadata' -S %s --lint-abort-on-error 2>&1 | FileCheck %s ; CHECK: Invalid system data struct: Did not contain the needed struct type -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" +target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-p32:32:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" %dx.types.Handle = type { i8* } %struct.TraversalData = type { %struct.SystemData } @@ -43,6 +43,10 @@ declare !types !38 i1 @_cont_IsEndSearch(%struct.TraversalData*) #0 declare !types !39 i32 @_cont_HitKind(%struct.SystemData*) #0 +define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !types !{!"function", !"void", !{i32 0, %struct.DispatchSystemData poison}} { + ret void +} + define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, float %6, float %7, float %8, float %9, float %10, float %11, float %12, float %13) #0 !types !40 { %sys_data = insertvalue %struct.SystemData undef, i32 1, 0 %trav_data = insertvalue %struct.TraversalData undef, %struct.SystemData %sys_data, 0 diff --git a/llvmraytracing/test/intrinsics/discard-values.ll b/llvmraytracing/test/intrinsics/discard-values.ll index 83ec328c4a..eee9c4c7d9 100644 --- a/llvmraytracing/test/intrinsics/discard-values.ll +++ b/llvmraytracing/test/intrinsics/discard-values.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint' -S %s 2> %t.stderr | FileCheck %s -; RUN: count 0 < %t.stderr +; RUN: opt --verify-each -passes='dxil-cont-intrinsic-prepare,lint' -S %s --lint-abort-on-error | FileCheck %s %struct.AnyHitData = type { float, i32 } %struct.DispatchSystemData = type { i32 } diff --git a/llvmraytracing/test/intrinsics/get-func-addr-not-found.ll b/llvmraytracing/test/intrinsics/get-func-addr-not-found.ll index 37ac17d405..d9dc80eff3 100644 --- a/llvmraytracing/test/intrinsics/get-func-addr-not-found.ll +++ b/llvmraytracing/test/intrinsics/get-func-addr-not-found.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint' -S %s 2>&1 | FileCheck %s +; RUN: not --crash opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint' -S %s --lint-abort-on-error 2>&1 | FileCheck %s ; CHECK: ERROR: Did not find function '' requested by _AmdGetFuncAddr @@ -10,6 +10,10 @@ declare %struct.DispatchSystemData @_cont_SetupRayGen() declare !types 
!8 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) +define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !types !{!"function", !"void", !{i32 0, %struct.DispatchSystemData poison}} { + ret void +} + define i64 @main() { entry: %val = call i64 @_AmdGetFuncAddr() diff --git a/llvmraytracing/test/intrinsics/get-func-addr.ll b/llvmraytracing/test/intrinsics/get-func-addr.ll index b8a0cc54a5..1a4d83707a 100644 --- a/llvmraytracing/test/intrinsics/get-func-addr.ll +++ b/llvmraytracing/test/intrinsics/get-func-addr.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint' -S %s 2> %t.stderr | FileCheck %s -; RUN: count 0 < %t.stderr +; RUN: opt --verify-each -passes='dxil-cont-lgc-rt-op-converter,lint,lower-raytracing-pipeline,lint' -S %s --lint-abort-on-error | FileCheck %s %struct.DispatchSystemData = type { i32 } @@ -11,16 +10,22 @@ declare %struct.DispatchSystemData @_cont_SetupRayGen() declare !types !8 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) +define void @_cont_ExitRayGen(ptr nocapture readonly %data) alwaysinline nounwind !types !{!"function", !"void", !{i32 0, %struct.DispatchSystemData poison}} { + ret void +} + define { i64, i32 } @main() !lgc.rt.shaderstage !10 { ; CHECK-LABEL: define void @main -; CHECK-SAME: ([[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META6:![0-9]+]] !continuation.entry [[META12:![0-9]+]] !continuation.registercount [[META6]] !continuation [[META13:![0-9]+]] { +; CHECK-SAME: (i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META5:![0-9]+]] !continuation.entry [[META11:![0-9]+]] !continuation.registercount [[META5]] !continuation [[META12:![0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; CHECK-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; CHECK-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) -; CHECK-NEXT: [[V0:%.*]] = insertvalue { i64, i32 } undef, i64 ptrtoint (ptr @MyFunc to i64), 0 -; CHECK-NEXT: [[V1:%.*]] = insertvalue { i64, i32 } undef, i32 ptrtoint (ptr @MyFunc2 to i32), 1 -; CHECK-NEXT: ret void, !continuation.registercount [[META9:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @MyFunc) +; CHECK-NEXT: [[TMP2:%.*]] = call i32 (...) 
@lgc.cps.as.continuation.reference__i32(ptr @MyFunc2) +; CHECK-NEXT: [[V0:%.*]] = insertvalue { i64, i32 } undef, i64 [[TMP1]], 0 +; CHECK-NEXT: [[V1:%.*]] = insertvalue { i64, i32 } undef, i32 [[TMP2]], 1 +; CHECK-NEXT: ret void ; entry: %val = call i64 @_AmdGetFuncAddrMyFunc() diff --git a/llvmraytracing/test/intrinsics/shader-start.ll b/llvmraytracing/test/intrinsics/shader-start.ll new file mode 100644 index 0000000000..cb0aef05a0 --- /dev/null +++ b/llvmraytracing/test/intrinsics/shader-start.ll @@ -0,0 +1,59 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt --verify-each -passes='lower-raytracing-pipeline,lint' -S %s --lint-abort-on-error | FileCheck %s + +%struct.DispatchSystemData = type { i32 } +%struct.HitData = type { float, i32 } + +declare !types !8 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) +declare !types !13 i1 @_cont_ReportHit(%struct.DispatchSystemData* %data, float %t, i32 %hitKind) + +define void @main() !lgc.rt.shaderstage !10 { +; CHECK-LABEL: define %struct.DispatchSystemData @main( +; CHECK-SAME: i64 [[RETURNADDR:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[TMP0:%.*]]) !lgc.rt.shaderstage [[META5:![0-9]+]] !continuation.registercount [[META0:![0-9]+]] !continuation [[META6:![0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 +; CHECK-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; CHECK-NEXT: store i32 123, ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; CHECK-NEXT: call void (...) @lgc.ilcps.return(i64 [[RETURNADDR]], [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP1]]), !continuation.registercount [[META0]] +; CHECK-NEXT: unreachable +; +entry: + ret void +} + +define void @_cont_ShaderStart(%struct.DispatchSystemData* %data) !types !11 { +; CHECK-LABEL: define void @_cont_ShaderStart( +; CHECK-SAME: ptr [[DATA:%.*]]) !types [[META7:![0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT_DISPATCHSYSTEMDATA:%.*]], ptr [[DATA]], i32 0, i32 0 +; CHECK-NEXT: store i32 123, ptr [[TMP0]], align 4 +; CHECK-NEXT: ret void +; +entry: + %0 = getelementptr %struct.DispatchSystemData, ptr %data, i32 0, i32 0 + store i32 123, ptr %0, align 4 + ret void +} + +!0 = !{null, !"", null, !1, !6} +!1 = !{!2, null, null, null} +!2 = !{!3} +!3 = !{i1 ()* @main, !"main", null, null, !4} +!4 = !{i32 8, i32 7, i32 6, i32 16, i32 7, i32 8, i32 5, !5} +!5 = !{i32 0} +!6 = !{i32 0, i64 65536} +!7 = !{i32 21} +!8 = !{!"function", i32 poison, !9} +!9 = !{i32 0, %struct.DispatchSystemData poison} +!10 = !{i32 1} +!11 = !{!"function", !"void", !9} +!12 = !{i32 0, %struct.DispatchSystemData poison} +!13 = !{!"function", <3 x i32> poison, !12} +;. +; CHECK: [[META0]] = !{i32 30} +; CHECK: [[META4:![0-9]+]] = !{i32 0, %struct.DispatchSystemData poison} +; CHECK: [[META5]] = !{i32 1} +; CHECK: [[META6]] = !{ptr @main} +; CHECK: [[META7]] = !{!"function", !"void", [[META4]]} +;. 
diff --git a/llvmraytracing/test/lgccps/call-shader-i1-payload.ll b/llvmraytracing/test/lgccps/call-shader-i1-payload.ll new file mode 100644 index 0000000000..e21043d994 --- /dev/null +++ b/llvmraytracing/test/lgccps/call-shader-i1-payload.ll @@ -0,0 +1,163 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3 +; RUN: opt --verify-each -S -o - -passes='lower-raytracing-pipeline' %s | FileCheck --check-prefixes=LOWER-RAYTRACING-PIPELINE %s +; RUN: opt --verify-each -S -o - -passes='lower-raytracing-pipeline,sroa' %s | FileCheck --check-prefixes=SROA %s + +; The test checks that the payload alloca is fully written and can be promoted to a register successfully. + +%struct.DispatchSystemData = type { i32 } +%struct.TraversalData = type { %struct.SystemData } +%struct.SystemData = type { %struct.DispatchSystemData } +%struct.MyParams = type { i32, i1 } + +%struct.AnyHitTraversalData = type { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } + +; Need _cont_ReportHit to get anyhit traversal system data type +declare !types !8 i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) + +; Function Attrs: alwaysinline +declare %struct.DispatchSystemData @_AmdAwaitTraversal(i64, %struct.TraversalData) #0 + +; Function Attrs: alwaysinline +declare %struct.DispatchSystemData @_AmdAwaitShader(i64, %struct.DispatchSystemData) #0 + +; Function Attrs: alwaysinline +define i32 @_cont_GetLocalRootIndex(ptr %data) #0 !types !1 { + ret i32 5 +} + +; Function Attrs: alwaysinline +define void @_cont_CallShader(ptr %data, i32 %0) #0 !types !2 { + %dis_data = load %struct.DispatchSystemData, ptr %data, align 4 + %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, %struct.DispatchSystemData %dis_data) + store %struct.DispatchSystemData %newdata, ptr %data, align 4 + ret void +} + +define void @called(ptr %params) !types !3 !cont.payload.type !4 !lgc.rt.shaderstage !5 { + call void (...) @lgc.rt.call.callable.shader(i32 2, ptr %params, i32 4), !cont.payload.type !4 + ret void +} + +; Function Attrs: nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) +declare void @lgc.rt.call.callable.shader(...)
#1 + +attributes #0 = { alwaysinline } +attributes #1 = { nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) } + +!lgc.cps.module = !{} + +!0 = !{i32 0, %struct.DispatchSystemData poison} +!1 = !{!"function", i32 poison, !0} +!2 = !{!"function", !"void", !0, i32 poison} +!3 = !{!"function", !"void", !6} +!4 = !{%struct.MyParams poison} +!5 = !{i32 5} +!6 = !{i32 0, %struct.MyParams poison} +!7 = !{i32 0, %struct.AnyHitTraversalData poison} +!8 = !{!"function", i1 poison, !7, float poison, i32 poison} + +; LOWER-RAYTRACING-PIPELINE-LABEL: define i32 @_cont_GetLocalRootIndex( +; LOWER-RAYTRACING-PIPELINE-SAME: ptr [[DATA:%.*]]) #[[ATTR0:[0-9]+]] !types [[META3:![0-9]+]] { +; LOWER-RAYTRACING-PIPELINE-NEXT: ret i32 5 +; +; +; LOWER-RAYTRACING-PIPELINE-LABEL: define void @called( +; LOWER-RAYTRACING-PIPELINE-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [41 x i32] [[PADDING:%.*]], [2 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META5:![0-9]+]] !lgc.cps [[META1:![0-9]+]] !continuation [[META6:![0-9]+]] { +; LOWER-RAYTRACING-PIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[PAYLOAD_ALLOCA:%.*]] = alloca [30 x i32], align 4 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_MYPARAMS:%.*]], align 8 +; LOWER-RAYTRACING-PIPELINE-NEXT: store [2 x i32] [[PAYLOAD]], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWER-RAYTRACING-PIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[SYSTEM_DATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP1]], i32 0 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP3:%.*]] = load i32, ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWER-RAYTRACING-PIPELINE-NEXT: store i32 [[TMP3]], ptr [[TMP2]], align 4 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 1 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 1 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4 +; LOWER-RAYTRACING-PIPELINE-NEXT: store i32 [[TMP6]], ptr [[TMP4]], align 4 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[DIS_DATA_I:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP1]], i32 0 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4 +; LOWER-RAYTRACING-PIPELINE-NEXT: store i32 [[TMP8]], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 1 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 1 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP23]], align 4 +; LOWER-RAYTRACING-PIPELINE-NEXT: store i32 [[TMP11]], ptr [[TMP10]], align 4 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP12:%.*]] = load [2 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP13:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [41 x i32], [2 x i32] } (...) 
@lgc.cps.await__sl_s_struct.DispatchSystemDatasa41i32a2i32s(i32 2, i32 4, i32 5, [42 x i32] poison, [2 x i32] [[TMP12]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP14:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [41 x i32], [2 x i32] } [[TMP13]], 2 +; LOWER-RAYTRACING-PIPELINE-NEXT: store [2 x i32] [[TMP14]], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP15:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [41 x i32], [2 x i32] } [[TMP13]], 0 +; LOWER-RAYTRACING-PIPELINE-NEXT: store [[STRUCT_MYPARAMS]] poison, ptr [[TMP1]], align 4 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP1]], i32 0 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP17:%.*]] = load i32, ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWER-RAYTRACING-PIPELINE-NEXT: store i32 [[TMP17]], ptr [[TMP16]], align 4 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 1 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 1 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 +; LOWER-RAYTRACING-PIPELINE-NEXT: store i32 [[TMP20]], ptr [[TMP18]], align 4 +; LOWER-RAYTRACING-PIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP15]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_MYPARAMS]], ptr [[TMP1]], i32 0 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 +; LOWER-RAYTRACING-PIPELINE-NEXT: store i32 [[TMP22]], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[PAYLOAD_ALLOCA]], i32 1 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 1 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP28]], align 4 +; LOWER-RAYTRACING-PIPELINE-NEXT: store i32 [[TMP25]], ptr [[TMP24]], align 4 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP26:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 +; LOWER-RAYTRACING-PIPELINE-NEXT: [[TMP27:%.*]] = load [2 x i32], ptr [[PAYLOAD_ALLOCA]], align 4 +; LOWER-RAYTRACING-PIPELINE-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP26]], [41 x i32] poison, [2 x i32] [[TMP27]]), !continuation.registercount [[META1]] +; LOWER-RAYTRACING-PIPELINE-NEXT: unreachable +; +; +; SROA-LABEL: define i32 @_cont_GetLocalRootIndex( +; SROA-SAME: ptr [[DATA:%.*]]) #[[ATTR0:[0-9]+]] !types [[META3:![0-9]+]] { +; SROA-NEXT: ret i32 5 +; +; +; SROA-LABEL: define void @called( +; SROA-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], [[STRUCT_DISPATCHSYSTEMDATA:%.*]] [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [41 x i32] [[PADDING:%.*]], [2 x i32] [[PAYLOAD:%.*]]) !lgc.rt.shaderstage [[META5:![0-9]+]] !lgc.cps [[META1:![0-9]+]] !continuation [[META6:![0-9]+]] { +; SROA-NEXT: [[DOTSROA_5:%.*]] = alloca i8, align 4 +; SROA-NEXT: [[PAYLOAD_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i32] [[PAYLOAD]], 0 +; SROA-NEXT: [[PAYLOAD_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i32] [[PAYLOAD]], 1 +; SROA-NEXT: [[PAYLOAD_ALLOCA_SROA_8_4_EXTRACT_TRUNC:%.*]] = trunc i32 [[PAYLOAD_FCA_1_EXTRACT]] to i8 +; SROA-NEXT: [[PAYLOAD_ALLOCA_SROA_16_4_EXTRACT_SHIFT:%.*]] = lshr i32 [[PAYLOAD_FCA_1_EXTRACT]], 8 +; SROA-NEXT: [[PAYLOAD_ALLOCA_SROA_16_4_EXTRACT_TRUNC:%.*]] = trunc i32 [[PAYLOAD_ALLOCA_SROA_16_4_EXTRACT_SHIFT]] to i24 +; SROA-NEXT: [[SYSTEM_DATA_FCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[SYSTEM_DATA]], 0 +; SROA-NEXT: store i8 [[PAYLOAD_ALLOCA_SROA_8_4_EXTRACT_TRUNC]], ptr [[DOTSROA_5]], align 4 +; SROA-NEXT: [[DIS_DATA_I_FCA_0_INSERT:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[SYSTEM_DATA_FCA_0_EXTRACT]], 0 +; SROA-NEXT: [[DOTSROA_5_0__SROA_5_4_2:%.*]] = load i8, ptr [[DOTSROA_5]], align 4 +; SROA-NEXT: [[DOTFCA_0_INSERT5:%.*]] = insertvalue [2 x i32] poison, i32 [[PAYLOAD_FCA_0_EXTRACT]], 0 +; SROA-NEXT: [[PAYLOAD_ALLOCA_SROA_16_4_INSERT_EXT19:%.*]] = zext i24 [[PAYLOAD_ALLOCA_SROA_16_4_EXTRACT_TRUNC]] to i32 +; SROA-NEXT: [[PAYLOAD_ALLOCA_SROA_16_4_INSERT_SHIFT20:%.*]] = shl i32 [[PAYLOAD_ALLOCA_SROA_16_4_INSERT_EXT19]], 8 +; SROA-NEXT: [[PAYLOAD_ALLOCA_SROA_16_4_INSERT_MASK21:%.*]] = and i32 undef, 255 +; SROA-NEXT: [[PAYLOAD_ALLOCA_SROA_16_4_INSERT_INSERT22:%.*]] = or i32 [[PAYLOAD_ALLOCA_SROA_16_4_INSERT_MASK21]], [[PAYLOAD_ALLOCA_SROA_16_4_INSERT_SHIFT20]] +; SROA-NEXT: [[PAYLOAD_ALLOCA_SROA_8_4_INSERT_EXT15:%.*]] = zext i8 [[DOTSROA_5_0__SROA_5_4_2]] to i32 +; SROA-NEXT: [[PAYLOAD_ALLOCA_SROA_8_4_INSERT_MASK16:%.*]] = and i32 [[PAYLOAD_ALLOCA_SROA_16_4_INSERT_INSERT22]], -256 +; SROA-NEXT: [[PAYLOAD_ALLOCA_SROA_8_4_INSERT_INSERT17:%.*]] = or i32 [[PAYLOAD_ALLOCA_SROA_8_4_INSERT_MASK16]], [[PAYLOAD_ALLOCA_SROA_8_4_INSERT_EXT15]] +; SROA-NEXT: [[DOTFCA_1_INSERT8:%.*]] = insertvalue [2 x i32] [[DOTFCA_0_INSERT5]], i32 [[PAYLOAD_ALLOCA_SROA_8_4_INSERT_INSERT17]], 1 +; SROA-NEXT: [[TMP1:%.*]] = call { [[STRUCT_DISPATCHSYSTEMDATA]], [41 x i32], [2 x i32] } (...) 
@lgc.cps.await__sl_s_struct.DispatchSystemDatasa41i32a2i32s(i32 2, i32 4, i32 5, [42 x i32] poison, [2 x i32] [[DOTFCA_1_INSERT8]]), !continuation.registercount [[META1]], !continuation.returnedRegistercount [[META1]] +; SROA-NEXT: [[TMP2:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [41 x i32], [2 x i32] } [[TMP1]], 2 +; SROA-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x i32] [[TMP2]], 0 +; SROA-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x i32] [[TMP2]], 1 +; SROA-NEXT: [[PAYLOAD_ALLOCA_SROA_8_4_EXTRACT_TRUNC18:%.*]] = trunc i32 [[DOTFCA_1_EXTRACT]] to i8 +; SROA-NEXT: [[PAYLOAD_ALLOCA_SROA_16_4_EXTRACT_SHIFT23:%.*]] = lshr i32 [[DOTFCA_1_EXTRACT]], 8 +; SROA-NEXT: [[PAYLOAD_ALLOCA_SROA_16_4_EXTRACT_TRUNC24:%.*]] = trunc i32 [[PAYLOAD_ALLOCA_SROA_16_4_EXTRACT_SHIFT23]] to i24 +; SROA-NEXT: [[TMP3:%.*]] = extractvalue { [[STRUCT_DISPATCHSYSTEMDATA]], [41 x i32], [2 x i32] } [[TMP1]], 0 +; SROA-NEXT: store i1 poison, ptr [[DOTSROA_5]], align 4 +; SROA-NEXT: store i8 [[PAYLOAD_ALLOCA_SROA_8_4_EXTRACT_TRUNC18]], ptr [[DOTSROA_5]], align 4 +; SROA-NEXT: [[DOTFCA_0_EXTRACT27:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP3]], 0 +; SROA-NEXT: [[DOTSROA_5_0__SROA_5_4_:%.*]] = load i8, ptr [[DOTSROA_5]], align 4 +; SROA-NEXT: [[DOTFCA_0_INSERT26:%.*]] = insertvalue [[STRUCT_DISPATCHSYSTEMDATA]] poison, i32 [[DOTFCA_0_EXTRACT27]], 0 +; SROA-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x i32] poison, i32 [[DOTFCA_0_EXTRACT]], 0 +; SROA-NEXT: [[PAYLOAD_ALLOCA_SROA_16_4_INSERT_EXT:%.*]] = zext i24 [[PAYLOAD_ALLOCA_SROA_16_4_EXTRACT_TRUNC24]] to i32 +; SROA-NEXT: [[PAYLOAD_ALLOCA_SROA_16_4_INSERT_SHIFT:%.*]] = shl i32 [[PAYLOAD_ALLOCA_SROA_16_4_INSERT_EXT]], 8 +; SROA-NEXT: [[PAYLOAD_ALLOCA_SROA_16_4_INSERT_MASK:%.*]] = and i32 undef, 255 +; SROA-NEXT: [[PAYLOAD_ALLOCA_SROA_16_4_INSERT_INSERT:%.*]] = or i32 [[PAYLOAD_ALLOCA_SROA_16_4_INSERT_MASK]], [[PAYLOAD_ALLOCA_SROA_16_4_INSERT_SHIFT]] +; SROA-NEXT: [[PAYLOAD_ALLOCA_SROA_8_4_INSERT_EXT:%.*]] = zext i8 [[DOTSROA_5_0__SROA_5_4_]] to i32 +; SROA-NEXT: [[PAYLOAD_ALLOCA_SROA_8_4_INSERT_MASK:%.*]] = and i32 [[PAYLOAD_ALLOCA_SROA_16_4_INSERT_INSERT]], -256 +; SROA-NEXT: [[PAYLOAD_ALLOCA_SROA_8_4_INSERT_INSERT:%.*]] = or i32 [[PAYLOAD_ALLOCA_SROA_8_4_INSERT_MASK]], [[PAYLOAD_ALLOCA_SROA_8_4_INSERT_EXT]] +; SROA-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x i32] [[DOTFCA_0_INSERT]], i32 [[PAYLOAD_ALLOCA_SROA_8_4_INSERT_INSERT]], 1 +; SROA-NEXT: call void (...) 
@lgc.cps.jump(i32 [[RETURNADDR]], i32 6, {} poison, i32 poison, i32 poison, [[STRUCT_DISPATCHSYSTEMDATA]] [[DOTFCA_0_INSERT26]], [41 x i32] poison, [2 x i32] [[DOTFCA_1_INSERT]]), !continuation.registercount [[META1]] +; SROA-NEXT: unreachable +; diff --git a/llvmraytracing/test/lgccps/cleanup-store-loads.ll b/llvmraytracing/test/lgccps/cleanup-store-loads.ll index cef6961ff9..684c24a284 100644 --- a/llvmraytracing/test/lgccps/cleanup-store-loads.ll +++ b/llvmraytracing/test/lgccps/cleanup-store-loads.ll @@ -164,7 +164,7 @@ bb2: ; preds = %entry define internal { ptr, ptr } @test.resume.0(ptr noalias noundef nonnull align 4 dereferenceable(8) %0, i1 %1) !lgc.cps !0 !continuation !1 { entryresume.0: %2 = load ptr, ptr %0, align 8 - %3 = call float @continuations.getReturnValue__f32() + %3 = call float @lgc.ilcps.getReturnValue__f32() %arg.reload.addr = getelementptr inbounds %test.Frame, ptr %2, i32 0, i32 1 %arg.reload = load float, ptr %arg.reload.addr, align 4 %rcr.reload.addr = getelementptr inbounds %test.Frame, ptr %2, i32 0, i32 0 @@ -197,7 +197,7 @@ declare ptr @llvm.coro.begin(token, ptr writeonly) #1 declare i1 @llvm.coro.suspend.retcon.i1(...) #1 ; Function Attrs: nounwind willreturn -declare float @continuations.getReturnValue__f32() #2 +declare float @lgc.ilcps.getReturnValue__f32() #2 ; Function Attrs: noreturn declare void @continuation.return(...) #3 diff --git a/llvmraytracing/test/lgccps/entry-point-with-cps.ll b/llvmraytracing/test/lgccps/entry-point-with-cps.ll index 7b181539d8..bbdfce955c 100644 --- a/llvmraytracing/test/lgccps/entry-point-with-cps.ll +++ b/llvmraytracing/test/lgccps/entry-point-with-cps.ll @@ -150,9 +150,9 @@ declare void @lgc.cps.jump(...) ; LOWER-AWAIT-NEXT: [[TMP4:%.*]] = inttoptr i32 [[CR_1]] to ptr ; LOWER-AWAIT-NEXT: [[TMP5:%.*]] = call ptr [[TMP4]](i32 [[CR_1]], i32 4, i32 [[X]], ptr addrspace(1) [[DST]]) ; LOWER-AWAIT-NEXT: [[TMP6:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TMP5]]) -; LOWER-AWAIT-NEXT: [[TMP7:%.*]] = call [2 x i32] @continuations.getReturnValue__a2i32() +; LOWER-AWAIT-NEXT: [[TMP7:%.*]] = call [2 x i32] @lgc.ilcps.getReturnValue__a2i32() ; LOWER-AWAIT-NEXT: store [2 x i32] [[TMP7]], ptr addrspace(1) [[DST]], align 4 -; LOWER-AWAIT-NEXT: call void (...) @continuation.return() +; LOWER-AWAIT-NEXT: call void (...) @lgc.ilcps.return(i32 poison) ; LOWER-AWAIT-NEXT: unreachable ; ; @@ -167,7 +167,7 @@ declare void @lgc.cps.jump(...) ; LOWER-AWAIT-NEXT: [[TMP4:%.*]] = inttoptr i32 [[CR_1]] to ptr ; LOWER-AWAIT-NEXT: [[TMP5:%.*]] = call ptr [[TMP4]](i32 [[CR_1]], i32 2, i32 [[X]]) ; LOWER-AWAIT-NEXT: [[TMP6:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TMP5]]) -; LOWER-AWAIT-NEXT: [[TMP7:%.*]] = call i32 @continuations.getReturnValue__i32() +; LOWER-AWAIT-NEXT: [[TMP7:%.*]] = call i32 @lgc.ilcps.getReturnValue__i32() ; LOWER-AWAIT-NEXT: call void (...) @lgc.cps.jump(i32 [[RCR]], i32 5, i32 [[TMP7]]) ; LOWER-AWAIT-NEXT: unreachable ; @@ -190,6 +190,6 @@ declare void @lgc.cps.jump(...) ; LOWER-AWAIT-NEXT: [[TMP5:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TMP4]]) ; LOWER-AWAIT-NEXT: br label [[EXIT]] ; LOWER-AWAIT: exit: -; LOWER-AWAIT-NEXT: call void (...) @continuation.return() +; LOWER-AWAIT-NEXT: call void (...) 
@lgc.ilcps.return(i32 poison) ; LOWER-AWAIT-NEXT: unreachable ; diff --git a/llvmraytracing/test/lgccps/lower-traversal.ll b/llvmraytracing/test/lgccps/lower-traversal.ll index 689d6beb69..395bf3bf1c 100644 --- a/llvmraytracing/test/lgccps/lower-traversal.ll +++ b/llvmraytracing/test/lgccps/lower-traversal.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt --verify-each -passes='lower-raytracing-pipeline,lint' -S %s 2> %t0.stderr | FileCheck -check-prefix=LOWERRAYTRACINGPIPELINE-CPS %s -; RUN: count 0 < %t0.stderr +; We run this test file twice with different max hit attribute sizes to test that e.g. padding depends correctly on the max hit attribute size. +; RUN: grep -v HITATTR_SIZE_8 %s | opt --verify-each -passes='lower-raytracing-pipeline,lint' -S --lint-abort-on-error | FileCheck -check-prefix=CHECK-ATTRSIZE-16 %s +; RUN: grep -v HITATTR_SIZE_16 %s | opt --verify-each -passes='lower-raytracing-pipeline,lint' -S --lint-abort-on-error | FileCheck -check-prefix=CHECK-ATTRSIZE-8 %s %struct.AnyHitTraversalData = type { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } %struct.DispatchSystemData = type { i32 } @@ -12,197 +13,388 @@ declare !types !10 i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) declare i64 @_AmdGetCurrentFuncAddr() -define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @_cont_Traversal(ptr addrspace(5) %0) local_unnamed_addr !lgc.shaderstage !0 !types !1 !lgc.rt.shaderstage !3 !lgc.rt.attribute.size !4 { -; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define dso_local spir_func void @_cont_Traversal( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [7 x i32] [[PADDING:%.*]], [8 x i32] [[PAYLOAD:%.*]]) local_unnamed_addr !lgc.shaderstage [[META4:![0-9]+]] !lgc.rt.shaderstage [[META5:![0-9]+]] !lgc.rt.attribute.size [[META6:![0-9]+]] !lgc.cps [[META7:![0-9]+]] !continuation [[META8:![0-9]+]] { -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: .entry: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, align 16, addrspace(5) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[SYSTEM_DATA]], ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], align 16 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 93, i32 17, i32 0, i32 0) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = call ptr @llvm.invariant.start.p7(i64 -1, ptr addrspace(7) [[TMP0]]) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = load <3 x i32>, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], align 16 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) 
[[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[TMP3]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP5:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = load i64, ptr addrspace(5) [[TMP5]], align 8 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[TMP9]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFR539:%.*]] = freeze i32 [[TMP10]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP12:%.*]] = load <3 x float>, ptr addrspace(5) [[TMP11]], align 16 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP13:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = load <3 x float>, ptr addrspace(5) [[TMP13]], align 16 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 5 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = load float, ptr addrspace(5) [[TMP15]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 6 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[TMP17]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 0, i32 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = load float, ptr addrspace(5) [[TMP19]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, 
i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 0, i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(5) [[TMP21]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFR:%.*]] = freeze i32 [[TMP22]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 0, i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(5) [[TMP23]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 0, i32 3 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[TMP25]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP27:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 0, i32 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(5) [[TMP27]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP29:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP30:%.*]] = load <2 x float>, ptr addrspace(5) [[TMP29]], align 8 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP31:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP32:%.*]] = load i32, ptr addrspace(5) [[TMP31]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP33:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 3 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP34:%.*]] = load i32, ptr addrspace(5) [[TMP33]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP35:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(5) [[TMP35]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP37:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 5 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[TMP37]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP39:%.*]] = getelementptr { { <3 x 
i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 6 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP40:%.*]] = load i32, ptr addrspace(5) [[TMP39]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP41:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP42:%.*]] = load i32, ptr addrspace(5) [[TMP41]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP43:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 8 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP44:%.*]] = load i64, ptr addrspace(5) [[TMP43]], align 8 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP45:%.*]] = icmp ugt i32 [[DOTFR]], -3 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[TMP45]], label [[TMP46:%.*]], label [[TMP67:%.*]] -; LOWERRAYTRACINGPIPELINE-CPS: 46: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP47:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 5 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP48:%.*]] = load i32, ptr addrspace(7) [[TMP47]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP49:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 6 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP50:%.*]] = load i32, ptr addrspace(7) [[TMP49]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP51:%.*]] = zext i32 [[TMP50]] to i64 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP52:%.*]] = shl nuw i64 [[TMP51]], 32 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP53:%.*]] = zext i32 [[TMP48]] to i64 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP54:%.*]] = or i64 [[TMP52]], [[TMP53]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFR541:%.*]] = freeze i64 [[TMP54]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP55:%.*]] = icmp eq i64 [[DOTFR541]], 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[TMP55]], label [[DOTEXIT2:%.*]], label [[TMP56:%.*]] -; LOWERRAYTRACINGPIPELINE-CPS: 56: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP57:%.*]] = lshr i32 [[TMP8]], 16 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP58:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP59:%.*]] = load i32, ptr addrspace(7) [[TMP58]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP60:%.*]] = mul i32 [[TMP59]], [[TMP57]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP61:%.*]] = zext i32 [[TMP60]] to i64 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP62:%.*]] = add i64 [[DOTFR541]], [[TMP61]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP63:%.*]] = inttoptr i64 [[TMP62]] to ptr addrspace(4) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP64:%.*]] = load i32, ptr addrspace(4) [[TMP63]], align 4 
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP65:%.*]] = freeze i32 [[TMP64]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br label [[DOTEXIT2]] -; LOWERRAYTRACINGPIPELINE-CPS: .exit2: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOT0:%.*]] = phi i32 [ [[TMP57]], [[TMP56]] ], [ undef, [[TMP46]] ] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTSROA_0128_0_EXTRACT_TRUNC:%.*]] = phi i32 [ [[TMP65]], [[TMP56]] ], [ 0, [[TMP46]] ] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTNOT542:%.*]] = icmp eq i32 [[DOTSROA_0128_0_EXTRACT_TRUNC]], 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[DOTNOT542]], label [[TMP106:%.*]], label [[TMP66:%.*]] -; LOWERRAYTRACINGPIPELINE-CPS: 66: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTSROA_0130_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[TMP44]] to i32 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } poison, <3 x i32> [[TMP2]], 0, 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_0_0_INSERT]], i32 [[DOT0]], 0, 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_1_0_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_0_1_INSERT]], i64 [[TMP6]], 1, 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_1_1_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_0_INSERT]], i32 [[TMP8]], 1, 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_1_2_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_1_INSERT]], i32 [[DOTFR539]], 1, 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_1_3_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_2_INSERT]], <3 x float> [[TMP12]], 1, 3 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_1_4_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_3_INSERT]], <3 x float> [[TMP14]], 1, 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_1_5_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_4_INSERT]], float [[TMP16]], 1, 5 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_1_6_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_5_INSERT]], float [[TMP18]], 1, 6 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_2_0_0_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, 
i32, i32, i64 } } [[DOTFCA_1_6_INSERT]], float [[TMP20]], 2, 0, 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_2_0_1_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_0_INSERT]], i32 [[DOTFR]], 2, 0, 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_2_0_2_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_1_INSERT]], i32 [[TMP24]], 2, 0, 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_2_0_3_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_2_INSERT]], i32 [[TMP26]], 2, 0, 3 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_2_0_4_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_3_INSERT]], i32 [[TMP28]], 2, 0, 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_2_1_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_4_INSERT]], <2 x float> [[TMP30]], 2, 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_2_2_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_1_INSERT]], i32 [[TMP32]], 2, 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_2_3_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_2_INSERT]], i32 [[TMP34]], 2, 3 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_2_4_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_3_INSERT]], i32 [[TMP36]], 2, 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_2_5_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_4_INSERT]], i32 [[TMP38]], 2, 5 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_2_6_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_5_INSERT]], i32 [[TMP40]], 2, 6 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_2_7_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_6_INSERT]], i32 [[TMP42]], 2, 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_2_8_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_7_INSERT]], i64 [[TMP44]], 2, 8 -; 
LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[DOTSROA_0128_0_EXTRACT_TRUNC]], i32 -1, {} poison, i32 [[DOTSROA_0130_0_EXTRACT_TRUNC]], i32 [[DOT0]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_8_INSERT]], [7 x i32] poison, [8 x i32] [[PAYLOAD]]) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable -; LOWERRAYTRACINGPIPELINE-CPS: 67: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP68:%.*]] = shl i32 [[DOTFR]], 3 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP69:%.*]] = and i32 [[TMP68]], -64 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP70:%.*]] = zext i32 [[TMP69]] to i64 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP71:%.*]] = add i64 [[TMP6]], [[TMP70]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP72:%.*]] = add i64 [[TMP71]], 48 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP73:%.*]] = inttoptr i64 [[TMP72]] to ptr addrspace(1) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP74:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP73]], align 16 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOT4_VEC_EXTRACT452:%.*]] = extractelement <4 x i32> [[TMP74]], i64 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP75:%.*]] = and i32 [[TMP26]], 16777215 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP76:%.*]] = and i32 [[DOT4_VEC_EXTRACT452]], 16777215 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP77:%.*]] = lshr i32 [[TMP8]], 8 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP78:%.*]] = and i32 [[TMP77]], 15 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP79:%.*]] = lshr i32 [[TMP8]], 12 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP80:%.*]] = and i32 [[TMP79]], 15 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP81:%.*]] = mul nuw nsw i32 [[TMP80]], [[TMP75]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP82:%.*]] = add nuw nsw i32 [[TMP78]], [[TMP81]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP83:%.*]] = add nuw nsw i32 [[TMP82]], [[TMP76]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP84:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 9 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP85:%.*]] = load i32, ptr addrspace(7) [[TMP84]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP86:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 10 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP87:%.*]] = load i32, ptr addrspace(7) [[TMP86]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP88:%.*]] = zext i32 [[TMP87]] to i64 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP89:%.*]] = shl nuw i64 [[TMP88]], 32 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP90:%.*]] = zext i32 [[TMP85]] to i64 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP91:%.*]] = or i64 [[TMP89]], [[TMP90]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFR537:%.*]] = freeze i64 [[TMP91]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DOTFR537]], 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[DOTNOT]], label [[DOTEXIT5:%.*]], label [[TMP92:%.*]] -; LOWERRAYTRACINGPIPELINE-CPS: 92: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP93:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 
11 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP94:%.*]] = load i32, ptr addrspace(7) [[TMP93]], align 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP95:%.*]] = mul i32 [[TMP94]], [[TMP83]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP96:%.*]] = zext i32 [[TMP95]] to i64 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP97:%.*]] = add i64 [[DOTFR537]], [[TMP96]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP98:%.*]] = inttoptr i64 [[TMP97]] to ptr addrspace(1) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP99:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP98]], align 16 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP100:%.*]] = shufflevector <4 x i32> [[TMP99]], <4 x i32> poison, <2 x i32> -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP101:%.*]] = freeze <2 x i32> [[TMP100]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br label [[DOTEXIT5]] -; LOWERRAYTRACINGPIPELINE-CPS: .exit5: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTSROA_0501_0:%.*]] = phi <2 x i32> [ [[TMP101]], [[TMP92]] ], [ zeroinitializer, [[TMP67]] ] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP102:%.*]] = and i32 [[DOTFR539]], 8 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP103:%.*]] = icmp ne i32 [[TMP102]], 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTSROA_0150_0_VEC_EXTRACT:%.*]] = extractelement <2 x i32> [[DOTSROA_0501_0]], i64 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTNOT540:%.*]] = icmp eq i32 [[DOTSROA_0150_0_VEC_EXTRACT]], 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[OR_COND:%.*]] = or i1 [[TMP103]], [[DOTNOT540]] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[OR_COND]], label [[TMP106]], label [[TMP104:%.*]] -; LOWERRAYTRACINGPIPELINE-CPS: 104: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP105:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @_cont_Traversal) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTSROA_0320_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[TMP105]] to i32 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_0_0_INSERT322:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } poison, <3 x i32> [[TMP2]], 0, 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_0_1_INSERT323:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_0_0_INSERT322]], i32 [[TMP83]], 0, 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_1_0_INSERT324:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_0_1_INSERT323]], i64 [[TMP6]], 1, 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_1_1_INSERT325:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_0_INSERT324]], i32 [[TMP8]], 1, 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_1_2_INSERT326:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_1_INSERT325]], i32 [[DOTFR539]], 1, 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_1_3_INSERT327:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_2_INSERT326]], <3 x float> 
[[TMP12]], 1, 3 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_1_4_INSERT328:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_3_INSERT327]], <3 x float> [[TMP14]], 1, 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_1_5_INSERT329:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_4_INSERT328]], float [[TMP16]], 1, 5 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_1_6_INSERT330:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_5_INSERT329]], float [[TMP18]], 1, 6 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_2_0_0_INSERT331:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_6_INSERT330]], float [[TMP20]], 2, 0, 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_2_0_1_INSERT332:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_0_INSERT331]], i32 [[DOTFR]], 2, 0, 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_2_0_2_INSERT333:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_1_INSERT332]], i32 [[TMP24]], 2, 0, 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_2_0_3_INSERT334:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_2_INSERT333]], i32 [[TMP26]], 2, 0, 3 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_2_0_4_INSERT335:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_3_INSERT334]], i32 [[TMP28]], 2, 0, 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_2_1_INSERT336:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_4_INSERT335]], <2 x float> [[TMP30]], 2, 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_2_2_INSERT337:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_1_INSERT336]], i32 [[TMP32]], 2, 2 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_2_3_INSERT338:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_2_INSERT337]], i32 [[TMP34]], 2, 3 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_2_4_INSERT339:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_3_INSERT338]], i32 
[[TMP36]], 2, 4 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_2_5_INSERT340:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_4_INSERT339]], i32 [[TMP38]], 2, 5 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_2_6_INSERT341:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_5_INSERT340]], i32 [[TMP40]], 2, 6 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_2_7_INSERT342:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_6_INSERT341]], i32 [[TMP42]], 2, 7 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_2_8_INSERT343:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_7_INSERT342]], i64 [[TMP44]], 2, 8 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[DOTSROA_0150_0_VEC_EXTRACT]], i32 -1, {} poison, i32 [[DOTSROA_0320_0_EXTRACT_TRUNC]], i32 [[TMP83]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_8_INSERT343]], [7 x i32] poison, [8 x i32] [[PAYLOAD]]) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable -; LOWERRAYTRACINGPIPELINE-CPS: 106: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTSROA_7_0:%.*]] = phi i32 [ [[TMP4]], [[DOTEXIT2]] ], [ [[TMP83]], [[DOTEXIT5]] ] -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTSROA_0373_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[TMP44]] to i32 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { <3 x i32>, i32 } poison, <3 x i32> [[TMP2]], 0 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { <3 x i32>, i32 } [[DOTFCA_0_INSERT]], i32 [[DOTSROA_7_0]], 1 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) 
@lgc.cps.jump(i32 [[DOTSROA_0373_0_EXTRACT_TRUNC]], i32 -1, {} poison, i32 poison, i32 [[DOTSROA_7_0]], { <3 x i32>, i32 } [[DOTFCA_1_INSERT]], [34 x i32] poison, [8 x i32] [[PAYLOAD]]) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable +define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @_cont_Traversal(ptr addrspace(5) %0) local_unnamed_addr !lgc.shaderstage !0 !types !1 !lgc.rt.shaderstage !3 { +; CHECK-ATTRSIZE-16-LABEL: define dso_local spir_func void @_cont_Traversal( +; CHECK-ATTRSIZE-16-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [7 x i32] [[PADDING:%.*]], [8 x i32] [[PAYLOAD:%.*]]) local_unnamed_addr !lgc.shaderstage [[META6:![0-9]+]] !lgc.rt.shaderstage [[META7:![0-9]+]] !lgc.cps [[META8:![0-9]+]] !continuation [[META9:![0-9]+]] { +; CHECK-ATTRSIZE-16-NEXT: .entry: +; CHECK-ATTRSIZE-16-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, align 16, addrspace(5) +; CHECK-ATTRSIZE-16-NEXT: store { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[SYSTEM_DATA]], ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], align 16 +; CHECK-ATTRSIZE-16-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 93, i32 17, i32 0, i32 0) +; CHECK-ATTRSIZE-16-NEXT: [[TMP1:%.*]] = call ptr @llvm.invariant.start.p7(i64 -1, ptr addrspace(7) [[TMP0]]) +; CHECK-ATTRSIZE-16-NEXT: [[TMP2:%.*]] = load <3 x i32>, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], align 16 +; CHECK-ATTRSIZE-16-NEXT: [[TMP3:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 1 +; CHECK-ATTRSIZE-16-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[TMP3]], align 4 +; CHECK-ATTRSIZE-16-NEXT: [[TMP5:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 0 +; CHECK-ATTRSIZE-16-NEXT: [[TMP6:%.*]] = load i64, ptr addrspace(5) [[TMP5]], align 8 +; CHECK-ATTRSIZE-16-NEXT: [[TMP7:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 1 +; CHECK-ATTRSIZE-16-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +; CHECK-ATTRSIZE-16-NEXT: [[TMP9:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 2 +; CHECK-ATTRSIZE-16-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[TMP9]], align 4 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFR539:%.*]] = freeze i32 [[TMP10]] +; CHECK-ATTRSIZE-16-NEXT: [[TMP11:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 
x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 +; CHECK-ATTRSIZE-16-NEXT: [[TMP12:%.*]] = load <3 x float>, ptr addrspace(5) [[TMP11]], align 16 +; CHECK-ATTRSIZE-16-NEXT: [[TMP13:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 4 +; CHECK-ATTRSIZE-16-NEXT: [[TMP14:%.*]] = load <3 x float>, ptr addrspace(5) [[TMP13]], align 16 +; CHECK-ATTRSIZE-16-NEXT: [[TMP15:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 5 +; CHECK-ATTRSIZE-16-NEXT: [[TMP16:%.*]] = load float, ptr addrspace(5) [[TMP15]], align 4 +; CHECK-ATTRSIZE-16-NEXT: [[TMP17:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 6 +; CHECK-ATTRSIZE-16-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[TMP17]], align 4 +; CHECK-ATTRSIZE-16-NEXT: [[TMP19:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 0, i32 0 +; CHECK-ATTRSIZE-16-NEXT: [[TMP20:%.*]] = load float, ptr addrspace(5) [[TMP19]], align 4 +; CHECK-ATTRSIZE-16-NEXT: [[TMP21:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 0, i32 1 +; CHECK-ATTRSIZE-16-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(5) [[TMP21]], align 4 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFR:%.*]] = freeze i32 [[TMP22]] +; CHECK-ATTRSIZE-16-NEXT: [[TMP23:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 0, i32 2 +; CHECK-ATTRSIZE-16-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(5) [[TMP23]], align 4 +; CHECK-ATTRSIZE-16-NEXT: [[TMP25:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 0, i32 3 +; CHECK-ATTRSIZE-16-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[TMP25]], align 4 +; CHECK-ATTRSIZE-16-NEXT: [[TMP27:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 0, i32 4 +; CHECK-ATTRSIZE-16-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(5) [[TMP27]], align 4 +; CHECK-ATTRSIZE-16-NEXT: [[TMP29:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 
x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 1 +; CHECK-ATTRSIZE-16-NEXT: [[TMP30:%.*]] = load <2 x float>, ptr addrspace(5) [[TMP29]], align 8 +; CHECK-ATTRSIZE-16-NEXT: [[TMP31:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 2 +; CHECK-ATTRSIZE-16-NEXT: [[TMP32:%.*]] = load i32, ptr addrspace(5) [[TMP31]], align 4 +; CHECK-ATTRSIZE-16-NEXT: [[TMP33:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 3 +; CHECK-ATTRSIZE-16-NEXT: [[TMP34:%.*]] = load i32, ptr addrspace(5) [[TMP33]], align 4 +; CHECK-ATTRSIZE-16-NEXT: [[TMP35:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 4 +; CHECK-ATTRSIZE-16-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(5) [[TMP35]], align 4 +; CHECK-ATTRSIZE-16-NEXT: [[TMP37:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 5 +; CHECK-ATTRSIZE-16-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[TMP37]], align 4 +; CHECK-ATTRSIZE-16-NEXT: [[TMP39:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 6 +; CHECK-ATTRSIZE-16-NEXT: [[TMP40:%.*]] = load i32, ptr addrspace(5) [[TMP39]], align 4 +; CHECK-ATTRSIZE-16-NEXT: [[TMP41:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 7 +; CHECK-ATTRSIZE-16-NEXT: [[TMP42:%.*]] = load i32, ptr addrspace(5) [[TMP41]], align 4 +; CHECK-ATTRSIZE-16-NEXT: [[TMP43:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 8 +; CHECK-ATTRSIZE-16-NEXT: [[TMP44:%.*]] = load i64, ptr addrspace(5) [[TMP43]], align 8 +; CHECK-ATTRSIZE-16-NEXT: [[TMP45:%.*]] = icmp ugt i32 [[DOTFR]], -3 +; CHECK-ATTRSIZE-16-NEXT: br i1 [[TMP45]], label [[TMP46:%.*]], label [[TMP67:%.*]] +; CHECK-ATTRSIZE-16: 46: +; CHECK-ATTRSIZE-16-NEXT: [[TMP47:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 5 +; CHECK-ATTRSIZE-16-NEXT: [[TMP48:%.*]] = load i32, ptr addrspace(7) [[TMP47]], align 4 +; CHECK-ATTRSIZE-16-NEXT: [[TMP49:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, 
i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 6 +; CHECK-ATTRSIZE-16-NEXT: [[TMP50:%.*]] = load i32, ptr addrspace(7) [[TMP49]], align 4 +; CHECK-ATTRSIZE-16-NEXT: [[TMP51:%.*]] = zext i32 [[TMP50]] to i64 +; CHECK-ATTRSIZE-16-NEXT: [[TMP52:%.*]] = shl nuw i64 [[TMP51]], 32 +; CHECK-ATTRSIZE-16-NEXT: [[TMP53:%.*]] = zext i32 [[TMP48]] to i64 +; CHECK-ATTRSIZE-16-NEXT: [[TMP54:%.*]] = or i64 [[TMP52]], [[TMP53]] +; CHECK-ATTRSIZE-16-NEXT: [[DOTFR541:%.*]] = freeze i64 [[TMP54]] +; CHECK-ATTRSIZE-16-NEXT: [[TMP55:%.*]] = icmp eq i64 [[DOTFR541]], 0 +; CHECK-ATTRSIZE-16-NEXT: br i1 [[TMP55]], label [[DOTEXIT2:%.*]], label [[TMP56:%.*]] +; CHECK-ATTRSIZE-16: 56: +; CHECK-ATTRSIZE-16-NEXT: [[TMP57:%.*]] = lshr i32 [[TMP8]], 16 +; CHECK-ATTRSIZE-16-NEXT: [[TMP58:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 7 +; CHECK-ATTRSIZE-16-NEXT: [[TMP59:%.*]] = load i32, ptr addrspace(7) [[TMP58]], align 4 +; CHECK-ATTRSIZE-16-NEXT: [[TMP60:%.*]] = mul i32 [[TMP59]], [[TMP57]] +; CHECK-ATTRSIZE-16-NEXT: [[TMP61:%.*]] = zext i32 [[TMP60]] to i64 +; CHECK-ATTRSIZE-16-NEXT: [[TMP62:%.*]] = add i64 [[DOTFR541]], [[TMP61]] +; CHECK-ATTRSIZE-16-NEXT: [[TMP63:%.*]] = inttoptr i64 [[TMP62]] to ptr addrspace(4) +; CHECK-ATTRSIZE-16-NEXT: [[TMP64:%.*]] = load i32, ptr addrspace(4) [[TMP63]], align 4 +; CHECK-ATTRSIZE-16-NEXT: [[TMP65:%.*]] = freeze i32 [[TMP64]] +; CHECK-ATTRSIZE-16-NEXT: br label [[DOTEXIT2]] +; CHECK-ATTRSIZE-16: .exit2: +; CHECK-ATTRSIZE-16-NEXT: [[DOT0:%.*]] = phi i32 [ [[TMP57]], [[TMP56]] ], [ undef, [[TMP46]] ] +; CHECK-ATTRSIZE-16-NEXT: [[DOTSROA_0128_0_EXTRACT_TRUNC:%.*]] = phi i32 [ [[TMP65]], [[TMP56]] ], [ 0, [[TMP46]] ] +; CHECK-ATTRSIZE-16-NEXT: [[DOTNOT542:%.*]] = icmp eq i32 [[DOTSROA_0128_0_EXTRACT_TRUNC]], 0 +; CHECK-ATTRSIZE-16-NEXT: br i1 [[DOTNOT542]], label [[TMP106:%.*]], label [[TMP66:%.*]] +; CHECK-ATTRSIZE-16: 66: +; CHECK-ATTRSIZE-16-NEXT: [[DOTSROA_0130_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[TMP44]] to i32 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } poison, <3 x i32> [[TMP2]], 0, 0 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_0_0_INSERT]], i32 [[DOT0]], 0, 1 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_1_0_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_0_1_INSERT]], i64 [[TMP6]], 1, 0 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_1_1_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_0_INSERT]], i32 [[TMP8]], 1, 1 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_1_2_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_1_INSERT]], i32 [[DOTFR539]], 1, 2 +; CHECK-ATTRSIZE-16-NEXT: 
[[DOTFCA_1_3_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_2_INSERT]], <3 x float> [[TMP12]], 1, 3 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_1_4_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_3_INSERT]], <3 x float> [[TMP14]], 1, 4 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_1_5_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_4_INSERT]], float [[TMP16]], 1, 5 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_1_6_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_5_INSERT]], float [[TMP18]], 1, 6 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_0_0_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_6_INSERT]], float [[TMP20]], 2, 0, 0 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_0_1_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_0_INSERT]], i32 [[DOTFR]], 2, 0, 1 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_0_2_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_1_INSERT]], i32 [[TMP24]], 2, 0, 2 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_0_3_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_2_INSERT]], i32 [[TMP26]], 2, 0, 3 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_0_4_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_3_INSERT]], i32 [[TMP28]], 2, 0, 4 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_1_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_4_INSERT]], <2 x float> [[TMP30]], 2, 1 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_2_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_1_INSERT]], i32 [[TMP32]], 2, 2 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_3_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_2_INSERT]], i32 [[TMP34]], 2, 3 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_4_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, 
i32, i32, i64 } } [[DOTFCA_2_3_INSERT]], i32 [[TMP36]], 2, 4 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_5_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_4_INSERT]], i32 [[TMP38]], 2, 5 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_6_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_5_INSERT]], i32 [[TMP40]], 2, 6 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_7_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_6_INSERT]], i32 [[TMP42]], 2, 7 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_8_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_7_INSERT]], i64 [[TMP44]], 2, 8 +; CHECK-ATTRSIZE-16-NEXT: call void (...) @lgc.cps.jump(i32 [[DOTSROA_0128_0_EXTRACT_TRUNC]], i32 -1, {} poison, i32 [[DOTSROA_0130_0_EXTRACT_TRUNC]], i32 [[DOT0]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_8_INSERT]], [7 x i32] poison, [8 x i32] [[PAYLOAD]]) +; CHECK-ATTRSIZE-16-NEXT: unreachable +; CHECK-ATTRSIZE-16: 67: +; CHECK-ATTRSIZE-16-NEXT: [[TMP68:%.*]] = shl i32 [[DOTFR]], 3 +; CHECK-ATTRSIZE-16-NEXT: [[TMP69:%.*]] = and i32 [[TMP68]], -64 +; CHECK-ATTRSIZE-16-NEXT: [[TMP70:%.*]] = zext i32 [[TMP69]] to i64 +; CHECK-ATTRSIZE-16-NEXT: [[TMP71:%.*]] = add i64 [[TMP6]], [[TMP70]] +; CHECK-ATTRSIZE-16-NEXT: [[TMP72:%.*]] = add i64 [[TMP71]], 48 +; CHECK-ATTRSIZE-16-NEXT: [[TMP73:%.*]] = inttoptr i64 [[TMP72]] to ptr addrspace(1) +; CHECK-ATTRSIZE-16-NEXT: [[TMP74:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP73]], align 16 +; CHECK-ATTRSIZE-16-NEXT: [[DOT4_VEC_EXTRACT452:%.*]] = extractelement <4 x i32> [[TMP74]], i64 1 +; CHECK-ATTRSIZE-16-NEXT: [[TMP75:%.*]] = and i32 [[TMP26]], 16777215 +; CHECK-ATTRSIZE-16-NEXT: [[TMP76:%.*]] = and i32 [[DOT4_VEC_EXTRACT452]], 16777215 +; CHECK-ATTRSIZE-16-NEXT: [[TMP77:%.*]] = lshr i32 [[TMP8]], 8 +; CHECK-ATTRSIZE-16-NEXT: [[TMP78:%.*]] = and i32 [[TMP77]], 15 +; CHECK-ATTRSIZE-16-NEXT: [[TMP79:%.*]] = lshr i32 [[TMP8]], 12 +; CHECK-ATTRSIZE-16-NEXT: [[TMP80:%.*]] = and i32 [[TMP79]], 15 +; CHECK-ATTRSIZE-16-NEXT: [[TMP81:%.*]] = mul nuw nsw i32 [[TMP80]], [[TMP75]] +; CHECK-ATTRSIZE-16-NEXT: [[TMP82:%.*]] = add nuw nsw i32 [[TMP78]], [[TMP81]] +; CHECK-ATTRSIZE-16-NEXT: [[TMP83:%.*]] = add nuw nsw i32 [[TMP82]], [[TMP76]] +; CHECK-ATTRSIZE-16-NEXT: [[TMP84:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 9 +; CHECK-ATTRSIZE-16-NEXT: [[TMP85:%.*]] = load i32, ptr addrspace(7) [[TMP84]], align 4 +; CHECK-ATTRSIZE-16-NEXT: [[TMP86:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 10 +; CHECK-ATTRSIZE-16-NEXT: [[TMP87:%.*]] = load 
i32, ptr addrspace(7) [[TMP86]], align 4 +; CHECK-ATTRSIZE-16-NEXT: [[TMP88:%.*]] = zext i32 [[TMP87]] to i64 +; CHECK-ATTRSIZE-16-NEXT: [[TMP89:%.*]] = shl nuw i64 [[TMP88]], 32 +; CHECK-ATTRSIZE-16-NEXT: [[TMP90:%.*]] = zext i32 [[TMP85]] to i64 +; CHECK-ATTRSIZE-16-NEXT: [[TMP91:%.*]] = or i64 [[TMP89]], [[TMP90]] +; CHECK-ATTRSIZE-16-NEXT: [[DOTFR537:%.*]] = freeze i64 [[TMP91]] +; CHECK-ATTRSIZE-16-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DOTFR537]], 0 +; CHECK-ATTRSIZE-16-NEXT: br i1 [[DOTNOT]], label [[DOTEXIT5:%.*]], label [[TMP92:%.*]] +; CHECK-ATTRSIZE-16: 92: +; CHECK-ATTRSIZE-16-NEXT: [[TMP93:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 11 +; CHECK-ATTRSIZE-16-NEXT: [[TMP94:%.*]] = load i32, ptr addrspace(7) [[TMP93]], align 4 +; CHECK-ATTRSIZE-16-NEXT: [[TMP95:%.*]] = mul i32 [[TMP94]], [[TMP83]] +; CHECK-ATTRSIZE-16-NEXT: [[TMP96:%.*]] = zext i32 [[TMP95]] to i64 +; CHECK-ATTRSIZE-16-NEXT: [[TMP97:%.*]] = add i64 [[DOTFR537]], [[TMP96]] +; CHECK-ATTRSIZE-16-NEXT: [[TMP98:%.*]] = inttoptr i64 [[TMP97]] to ptr addrspace(1) +; CHECK-ATTRSIZE-16-NEXT: [[TMP99:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP98]], align 16 +; CHECK-ATTRSIZE-16-NEXT: [[TMP100:%.*]] = shufflevector <4 x i32> [[TMP99]], <4 x i32> poison, <2 x i32> +; CHECK-ATTRSIZE-16-NEXT: [[TMP101:%.*]] = freeze <2 x i32> [[TMP100]] +; CHECK-ATTRSIZE-16-NEXT: br label [[DOTEXIT5]] +; CHECK-ATTRSIZE-16: .exit5: +; CHECK-ATTRSIZE-16-NEXT: [[DOTSROA_0501_0:%.*]] = phi <2 x i32> [ [[TMP101]], [[TMP92]] ], [ zeroinitializer, [[TMP67]] ] +; CHECK-ATTRSIZE-16-NEXT: [[TMP102:%.*]] = and i32 [[DOTFR539]], 8 +; CHECK-ATTRSIZE-16-NEXT: [[TMP103:%.*]] = icmp ne i32 [[TMP102]], 0 +; CHECK-ATTRSIZE-16-NEXT: [[DOTSROA_0150_0_VEC_EXTRACT:%.*]] = extractelement <2 x i32> [[DOTSROA_0501_0]], i64 0 +; CHECK-ATTRSIZE-16-NEXT: [[DOTNOT540:%.*]] = icmp eq i32 [[DOTSROA_0150_0_VEC_EXTRACT]], 0 +; CHECK-ATTRSIZE-16-NEXT: [[OR_COND:%.*]] = or i1 [[TMP103]], [[DOTNOT540]] +; CHECK-ATTRSIZE-16-NEXT: br i1 [[OR_COND]], label [[TMP106]], label [[TMP104:%.*]] +; CHECK-ATTRSIZE-16: 104: +; CHECK-ATTRSIZE-16-NEXT: [[TMP105:%.*]] = call i64 (...) 
@lgc.cps.as.continuation.reference__i64(ptr @_cont_Traversal) +; CHECK-ATTRSIZE-16-NEXT: [[DOTSROA_0320_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[TMP105]] to i32 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_0_0_INSERT322:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } poison, <3 x i32> [[TMP2]], 0, 0 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_0_1_INSERT323:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_0_0_INSERT322]], i32 [[TMP83]], 0, 1 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_1_0_INSERT324:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_0_1_INSERT323]], i64 [[TMP6]], 1, 0 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_1_1_INSERT325:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_0_INSERT324]], i32 [[TMP8]], 1, 1 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_1_2_INSERT326:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_1_INSERT325]], i32 [[DOTFR539]], 1, 2 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_1_3_INSERT327:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_2_INSERT326]], <3 x float> [[TMP12]], 1, 3 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_1_4_INSERT328:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_3_INSERT327]], <3 x float> [[TMP14]], 1, 4 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_1_5_INSERT329:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_4_INSERT328]], float [[TMP16]], 1, 5 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_1_6_INSERT330:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_5_INSERT329]], float [[TMP18]], 1, 6 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_0_0_INSERT331:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_6_INSERT330]], float [[TMP20]], 2, 0, 0 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_0_1_INSERT332:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_0_INSERT331]], i32 [[DOTFR]], 2, 0, 1 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_0_2_INSERT333:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_1_INSERT332]], i32 [[TMP24]], 2, 0, 
2 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_0_3_INSERT334:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_2_INSERT333]], i32 [[TMP26]], 2, 0, 3 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_0_4_INSERT335:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_3_INSERT334]], i32 [[TMP28]], 2, 0, 4 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_1_INSERT336:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_4_INSERT335]], <2 x float> [[TMP30]], 2, 1 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_2_INSERT337:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_1_INSERT336]], i32 [[TMP32]], 2, 2 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_3_INSERT338:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_2_INSERT337]], i32 [[TMP34]], 2, 3 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_4_INSERT339:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_3_INSERT338]], i32 [[TMP36]], 2, 4 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_5_INSERT340:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_4_INSERT339]], i32 [[TMP38]], 2, 5 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_6_INSERT341:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_5_INSERT340]], i32 [[TMP40]], 2, 6 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_7_INSERT342:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_6_INSERT341]], i32 [[TMP42]], 2, 7 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_2_8_INSERT343:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_7_INSERT342]], i64 [[TMP44]], 2, 8 +; CHECK-ATTRSIZE-16-NEXT: call void (...) 
@lgc.cps.jump(i32 [[DOTSROA_0150_0_VEC_EXTRACT]], i32 -1, {} poison, i32 [[DOTSROA_0320_0_EXTRACT_TRUNC]], i32 [[TMP83]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_8_INSERT343]], [7 x i32] poison, [8 x i32] [[PAYLOAD]]) +; CHECK-ATTRSIZE-16-NEXT: unreachable +; CHECK-ATTRSIZE-16: 106: +; CHECK-ATTRSIZE-16-NEXT: [[DOTSROA_7_0:%.*]] = phi i32 [ [[TMP4]], [[DOTEXIT2]] ], [ [[TMP83]], [[DOTEXIT5]] ] +; CHECK-ATTRSIZE-16-NEXT: [[DOTSROA_0373_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[TMP44]] to i32 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { <3 x i32>, i32 } poison, <3 x i32> [[TMP2]], 0 +; CHECK-ATTRSIZE-16-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { <3 x i32>, i32 } [[DOTFCA_0_INSERT]], i32 [[DOTSROA_7_0]], 1 +; CHECK-ATTRSIZE-16-NEXT: call void (...) @lgc.cps.jump(i32 [[DOTSROA_0373_0_EXTRACT_TRUNC]], i32 -1, {} poison, i32 poison, i32 [[DOTSROA_7_0]], { <3 x i32>, i32 } [[DOTFCA_1_INSERT]], [34 x i32] poison, [8 x i32] [[PAYLOAD]]) +; CHECK-ATTRSIZE-16-NEXT: unreachable +; +; CHECK-ATTRSIZE-8-LABEL: define dso_local spir_func void @_cont_Traversal( +; CHECK-ATTRSIZE-8-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURNADDR:%.*]], i32 [[SHADER_INDEX:%.*]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [5 x i32] [[PADDING:%.*]], [8 x i32] [[PAYLOAD:%.*]]) local_unnamed_addr !lgc.shaderstage [[META5:![0-9]+]] !lgc.rt.shaderstage [[META6:![0-9]+]] !lgc.cps [[META7:![0-9]+]] !continuation [[META8:![0-9]+]] { +; CHECK-ATTRSIZE-8-NEXT: .entry: +; CHECK-ATTRSIZE-8-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, align 16, addrspace(5) +; CHECK-ATTRSIZE-8-NEXT: store { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[SYSTEM_DATA]], ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], align 16 +; CHECK-ATTRSIZE-8-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 93, i32 17, i32 0, i32 0) +; CHECK-ATTRSIZE-8-NEXT: [[TMP1:%.*]] = call ptr @llvm.invariant.start.p7(i64 -1, ptr addrspace(7) [[TMP0]]) +; CHECK-ATTRSIZE-8-NEXT: [[TMP2:%.*]] = load <3 x i32>, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], align 16 +; CHECK-ATTRSIZE-8-NEXT: [[TMP3:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 0, i32 1 +; CHECK-ATTRSIZE-8-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[TMP3]], align 4 +; CHECK-ATTRSIZE-8-NEXT: [[TMP5:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 0 +; CHECK-ATTRSIZE-8-NEXT: [[TMP6:%.*]] = load i64, ptr addrspace(5) [[TMP5]], align 8 +; CHECK-ATTRSIZE-8-NEXT: [[TMP7:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, 
i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 1 +; CHECK-ATTRSIZE-8-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +; CHECK-ATTRSIZE-8-NEXT: [[TMP9:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 2 +; CHECK-ATTRSIZE-8-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[TMP9]], align 4 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFR539:%.*]] = freeze i32 [[TMP10]] +; CHECK-ATTRSIZE-8-NEXT: [[TMP11:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 3 +; CHECK-ATTRSIZE-8-NEXT: [[TMP12:%.*]] = load <3 x float>, ptr addrspace(5) [[TMP11]], align 16 +; CHECK-ATTRSIZE-8-NEXT: [[TMP13:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 4 +; CHECK-ATTRSIZE-8-NEXT: [[TMP14:%.*]] = load <3 x float>, ptr addrspace(5) [[TMP13]], align 16 +; CHECK-ATTRSIZE-8-NEXT: [[TMP15:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 5 +; CHECK-ATTRSIZE-8-NEXT: [[TMP16:%.*]] = load float, ptr addrspace(5) [[TMP15]], align 4 +; CHECK-ATTRSIZE-8-NEXT: [[TMP17:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 1, i32 6 +; CHECK-ATTRSIZE-8-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[TMP17]], align 4 +; CHECK-ATTRSIZE-8-NEXT: [[TMP19:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 0, i32 0 +; CHECK-ATTRSIZE-8-NEXT: [[TMP20:%.*]] = load float, ptr addrspace(5) [[TMP19]], align 4 +; CHECK-ATTRSIZE-8-NEXT: [[TMP21:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 0, i32 1 +; CHECK-ATTRSIZE-8-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(5) [[TMP21]], align 4 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFR:%.*]] = freeze i32 [[TMP22]] +; CHECK-ATTRSIZE-8-NEXT: [[TMP23:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 0, i32 2 +; CHECK-ATTRSIZE-8-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(5) [[TMP23]], align 4 +; CHECK-ATTRSIZE-8-NEXT: [[TMP25:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) 
[[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 0, i32 3 +; CHECK-ATTRSIZE-8-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[TMP25]], align 4 +; CHECK-ATTRSIZE-8-NEXT: [[TMP27:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 0, i32 4 +; CHECK-ATTRSIZE-8-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(5) [[TMP27]], align 4 +; CHECK-ATTRSIZE-8-NEXT: [[TMP29:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 1 +; CHECK-ATTRSIZE-8-NEXT: [[TMP30:%.*]] = load <2 x float>, ptr addrspace(5) [[TMP29]], align 8 +; CHECK-ATTRSIZE-8-NEXT: [[TMP31:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 2 +; CHECK-ATTRSIZE-8-NEXT: [[TMP32:%.*]] = load i32, ptr addrspace(5) [[TMP31]], align 4 +; CHECK-ATTRSIZE-8-NEXT: [[TMP33:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 3 +; CHECK-ATTRSIZE-8-NEXT: [[TMP34:%.*]] = load i32, ptr addrspace(5) [[TMP33]], align 4 +; CHECK-ATTRSIZE-8-NEXT: [[TMP35:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 4 +; CHECK-ATTRSIZE-8-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(5) [[TMP35]], align 4 +; CHECK-ATTRSIZE-8-NEXT: [[TMP37:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 5 +; CHECK-ATTRSIZE-8-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[TMP37]], align 4 +; CHECK-ATTRSIZE-8-NEXT: [[TMP39:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 6 +; CHECK-ATTRSIZE-8-NEXT: [[TMP40:%.*]] = load i32, ptr addrspace(5) [[TMP39]], align 4 +; CHECK-ATTRSIZE-8-NEXT: [[TMP41:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 7 +; CHECK-ATTRSIZE-8-NEXT: [[TMP42:%.*]] = load i32, ptr addrspace(5) [[TMP41]], align 4 +; CHECK-ATTRSIZE-8-NEXT: [[TMP43:%.*]] = getelementptr { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 0, i32 2, i32 8 +; CHECK-ATTRSIZE-8-NEXT: [[TMP44:%.*]] = load i64, ptr addrspace(5) [[TMP43]], align 8 +; CHECK-ATTRSIZE-8-NEXT: [[TMP45:%.*]] = icmp ugt i32 
[[DOTFR]], -3 +; CHECK-ATTRSIZE-8-NEXT: br i1 [[TMP45]], label [[TMP46:%.*]], label [[TMP67:%.*]] +; CHECK-ATTRSIZE-8: 46: +; CHECK-ATTRSIZE-8-NEXT: [[TMP47:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 5 +; CHECK-ATTRSIZE-8-NEXT: [[TMP48:%.*]] = load i32, ptr addrspace(7) [[TMP47]], align 4 +; CHECK-ATTRSIZE-8-NEXT: [[TMP49:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 6 +; CHECK-ATTRSIZE-8-NEXT: [[TMP50:%.*]] = load i32, ptr addrspace(7) [[TMP49]], align 4 +; CHECK-ATTRSIZE-8-NEXT: [[TMP51:%.*]] = zext i32 [[TMP50]] to i64 +; CHECK-ATTRSIZE-8-NEXT: [[TMP52:%.*]] = shl nuw i64 [[TMP51]], 32 +; CHECK-ATTRSIZE-8-NEXT: [[TMP53:%.*]] = zext i32 [[TMP48]] to i64 +; CHECK-ATTRSIZE-8-NEXT: [[TMP54:%.*]] = or i64 [[TMP52]], [[TMP53]] +; CHECK-ATTRSIZE-8-NEXT: [[DOTFR541:%.*]] = freeze i64 [[TMP54]] +; CHECK-ATTRSIZE-8-NEXT: [[TMP55:%.*]] = icmp eq i64 [[DOTFR541]], 0 +; CHECK-ATTRSIZE-8-NEXT: br i1 [[TMP55]], label [[DOTEXIT2:%.*]], label [[TMP56:%.*]] +; CHECK-ATTRSIZE-8: 56: +; CHECK-ATTRSIZE-8-NEXT: [[TMP57:%.*]] = lshr i32 [[TMP8]], 16 +; CHECK-ATTRSIZE-8-NEXT: [[TMP58:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 7 +; CHECK-ATTRSIZE-8-NEXT: [[TMP59:%.*]] = load i32, ptr addrspace(7) [[TMP58]], align 4 +; CHECK-ATTRSIZE-8-NEXT: [[TMP60:%.*]] = mul i32 [[TMP59]], [[TMP57]] +; CHECK-ATTRSIZE-8-NEXT: [[TMP61:%.*]] = zext i32 [[TMP60]] to i64 +; CHECK-ATTRSIZE-8-NEXT: [[TMP62:%.*]] = add i64 [[DOTFR541]], [[TMP61]] +; CHECK-ATTRSIZE-8-NEXT: [[TMP63:%.*]] = inttoptr i64 [[TMP62]] to ptr addrspace(4) +; CHECK-ATTRSIZE-8-NEXT: [[TMP64:%.*]] = load i32, ptr addrspace(4) [[TMP63]], align 4 +; CHECK-ATTRSIZE-8-NEXT: [[TMP65:%.*]] = freeze i32 [[TMP64]] +; CHECK-ATTRSIZE-8-NEXT: br label [[DOTEXIT2]] +; CHECK-ATTRSIZE-8: .exit2: +; CHECK-ATTRSIZE-8-NEXT: [[DOT0:%.*]] = phi i32 [ [[TMP57]], [[TMP56]] ], [ undef, [[TMP46]] ] +; CHECK-ATTRSIZE-8-NEXT: [[DOTSROA_0128_0_EXTRACT_TRUNC:%.*]] = phi i32 [ [[TMP65]], [[TMP56]] ], [ 0, [[TMP46]] ] +; CHECK-ATTRSIZE-8-NEXT: [[DOTNOT542:%.*]] = icmp eq i32 [[DOTSROA_0128_0_EXTRACT_TRUNC]], 0 +; CHECK-ATTRSIZE-8-NEXT: br i1 [[DOTNOT542]], label [[TMP106:%.*]], label [[TMP66:%.*]] +; CHECK-ATTRSIZE-8: 66: +; CHECK-ATTRSIZE-8-NEXT: [[DOTSROA_0130_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[TMP44]] to i32 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } poison, <3 x i32> [[TMP2]], 0, 0 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_0_0_INSERT]], i32 [[DOT0]], 0, 1 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_0_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, 
i64 } } [[DOTFCA_0_1_INSERT]], i64 [[TMP6]], 1, 0 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_1_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_0_INSERT]], i32 [[TMP8]], 1, 1 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_2_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_1_INSERT]], i32 [[DOTFR539]], 1, 2 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_3_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_2_INSERT]], <3 x float> [[TMP12]], 1, 3 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_4_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_3_INSERT]], <3 x float> [[TMP14]], 1, 4 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_5_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_4_INSERT]], float [[TMP16]], 1, 5 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_6_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_5_INSERT]], float [[TMP18]], 1, 6 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_0_0_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_6_INSERT]], float [[TMP20]], 2, 0, 0 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_0_1_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_0_INSERT]], i32 [[DOTFR]], 2, 0, 1 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_0_2_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_1_INSERT]], i32 [[TMP24]], 2, 0, 2 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_0_3_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_2_INSERT]], i32 [[TMP26]], 2, 0, 3 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_0_4_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_3_INSERT]], i32 [[TMP28]], 2, 0, 4 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_1_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_4_INSERT]], <2 x float> [[TMP30]], 2, 1 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_2_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { 
{ float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_1_INSERT]], i32 [[TMP32]], 2, 2 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_3_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_2_INSERT]], i32 [[TMP34]], 2, 3 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_4_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_3_INSERT]], i32 [[TMP36]], 2, 4 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_5_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_4_INSERT]], i32 [[TMP38]], 2, 5 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_6_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_5_INSERT]], i32 [[TMP40]], 2, 6 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_7_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_6_INSERT]], i32 [[TMP42]], 2, 7 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_8_INSERT:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_7_INSERT]], i64 [[TMP44]], 2, 8 +; CHECK-ATTRSIZE-8-NEXT: call void (...) 
@lgc.cps.jump(i32 [[DOTSROA_0128_0_EXTRACT_TRUNC]], i32 -1, {} poison, i32 [[DOTSROA_0130_0_EXTRACT_TRUNC]], i32 [[DOT0]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_8_INSERT]], [5 x i32] poison, [8 x i32] [[PAYLOAD]]) +; CHECK-ATTRSIZE-8-NEXT: unreachable +; CHECK-ATTRSIZE-8: 67: +; CHECK-ATTRSIZE-8-NEXT: [[TMP68:%.*]] = shl i32 [[DOTFR]], 3 +; CHECK-ATTRSIZE-8-NEXT: [[TMP69:%.*]] = and i32 [[TMP68]], -64 +; CHECK-ATTRSIZE-8-NEXT: [[TMP70:%.*]] = zext i32 [[TMP69]] to i64 +; CHECK-ATTRSIZE-8-NEXT: [[TMP71:%.*]] = add i64 [[TMP6]], [[TMP70]] +; CHECK-ATTRSIZE-8-NEXT: [[TMP72:%.*]] = add i64 [[TMP71]], 48 +; CHECK-ATTRSIZE-8-NEXT: [[TMP73:%.*]] = inttoptr i64 [[TMP72]] to ptr addrspace(1) +; CHECK-ATTRSIZE-8-NEXT: [[TMP74:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP73]], align 16 +; CHECK-ATTRSIZE-8-NEXT: [[DOT4_VEC_EXTRACT452:%.*]] = extractelement <4 x i32> [[TMP74]], i64 1 +; CHECK-ATTRSIZE-8-NEXT: [[TMP75:%.*]] = and i32 [[TMP26]], 16777215 +; CHECK-ATTRSIZE-8-NEXT: [[TMP76:%.*]] = and i32 [[DOT4_VEC_EXTRACT452]], 16777215 +; CHECK-ATTRSIZE-8-NEXT: [[TMP77:%.*]] = lshr i32 [[TMP8]], 8 +; CHECK-ATTRSIZE-8-NEXT: [[TMP78:%.*]] = and i32 [[TMP77]], 15 +; CHECK-ATTRSIZE-8-NEXT: [[TMP79:%.*]] = lshr i32 [[TMP8]], 12 +; CHECK-ATTRSIZE-8-NEXT: [[TMP80:%.*]] = and i32 [[TMP79]], 15 +; CHECK-ATTRSIZE-8-NEXT: [[TMP81:%.*]] = mul nuw nsw i32 [[TMP80]], [[TMP75]] +; CHECK-ATTRSIZE-8-NEXT: [[TMP82:%.*]] = add nuw nsw i32 [[TMP78]], [[TMP81]] +; CHECK-ATTRSIZE-8-NEXT: [[TMP83:%.*]] = add nuw nsw i32 [[TMP82]], [[TMP76]] +; CHECK-ATTRSIZE-8-NEXT: [[TMP84:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 9 +; CHECK-ATTRSIZE-8-NEXT: [[TMP85:%.*]] = load i32, ptr addrspace(7) [[TMP84]], align 4 +; CHECK-ATTRSIZE-8-NEXT: [[TMP86:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 10 +; CHECK-ATTRSIZE-8-NEXT: [[TMP87:%.*]] = load i32, ptr addrspace(7) [[TMP86]], align 4 +; CHECK-ATTRSIZE-8-NEXT: [[TMP88:%.*]] = zext i32 [[TMP87]] to i64 +; CHECK-ATTRSIZE-8-NEXT: [[TMP89:%.*]] = shl nuw i64 [[TMP88]], 32 +; CHECK-ATTRSIZE-8-NEXT: [[TMP90:%.*]] = zext i32 [[TMP85]] to i64 +; CHECK-ATTRSIZE-8-NEXT: [[TMP91:%.*]] = or i64 [[TMP89]], [[TMP90]] +; CHECK-ATTRSIZE-8-NEXT: [[DOTFR537:%.*]] = freeze i64 [[TMP91]] +; CHECK-ATTRSIZE-8-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DOTFR537]], 0 +; CHECK-ATTRSIZE-8-NEXT: br i1 [[DOTNOT]], label [[DOTEXIT5:%.*]], label [[TMP92:%.*]] +; CHECK-ATTRSIZE-8: 92: +; CHECK-ATTRSIZE-8-NEXT: [[TMP93:%.*]] = getelementptr inbounds <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }>, ptr addrspace(7) [[TMP0]], i32 0, i32 11 +; CHECK-ATTRSIZE-8-NEXT: [[TMP94:%.*]] = load i32, ptr addrspace(7) [[TMP93]], align 4 +; CHECK-ATTRSIZE-8-NEXT: [[TMP95:%.*]] = mul i32 [[TMP94]], [[TMP83]] +; CHECK-ATTRSIZE-8-NEXT: [[TMP96:%.*]] = zext i32 [[TMP95]] to i64 +; CHECK-ATTRSIZE-8-NEXT: [[TMP97:%.*]] = add i64 [[DOTFR537]], [[TMP96]] +; CHECK-ATTRSIZE-8-NEXT: [[TMP98:%.*]] = inttoptr i64 [[TMP97]] to ptr addrspace(1) +; 
CHECK-ATTRSIZE-8-NEXT: [[TMP99:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP98]], align 16 +; CHECK-ATTRSIZE-8-NEXT: [[TMP100:%.*]] = shufflevector <4 x i32> [[TMP99]], <4 x i32> poison, <2 x i32> +; CHECK-ATTRSIZE-8-NEXT: [[TMP101:%.*]] = freeze <2 x i32> [[TMP100]] +; CHECK-ATTRSIZE-8-NEXT: br label [[DOTEXIT5]] +; CHECK-ATTRSIZE-8: .exit5: +; CHECK-ATTRSIZE-8-NEXT: [[DOTSROA_0501_0:%.*]] = phi <2 x i32> [ [[TMP101]], [[TMP92]] ], [ zeroinitializer, [[TMP67]] ] +; CHECK-ATTRSIZE-8-NEXT: [[TMP102:%.*]] = and i32 [[DOTFR539]], 8 +; CHECK-ATTRSIZE-8-NEXT: [[TMP103:%.*]] = icmp ne i32 [[TMP102]], 0 +; CHECK-ATTRSIZE-8-NEXT: [[DOTSROA_0150_0_VEC_EXTRACT:%.*]] = extractelement <2 x i32> [[DOTSROA_0501_0]], i64 0 +; CHECK-ATTRSIZE-8-NEXT: [[DOTNOT540:%.*]] = icmp eq i32 [[DOTSROA_0150_0_VEC_EXTRACT]], 0 +; CHECK-ATTRSIZE-8-NEXT: [[OR_COND:%.*]] = or i1 [[TMP103]], [[DOTNOT540]] +; CHECK-ATTRSIZE-8-NEXT: br i1 [[OR_COND]], label [[TMP106]], label [[TMP104:%.*]] +; CHECK-ATTRSIZE-8: 104: +; CHECK-ATTRSIZE-8-NEXT: [[TMP105:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @_cont_Traversal) +; CHECK-ATTRSIZE-8-NEXT: [[DOTSROA_0320_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[TMP105]] to i32 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_0_0_INSERT322:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } poison, <3 x i32> [[TMP2]], 0, 0 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_0_1_INSERT323:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_0_0_INSERT322]], i32 [[TMP83]], 0, 1 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_0_INSERT324:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_0_1_INSERT323]], i64 [[TMP6]], 1, 0 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_1_INSERT325:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_0_INSERT324]], i32 [[TMP8]], 1, 1 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_2_INSERT326:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_1_INSERT325]], i32 [[DOTFR539]], 1, 2 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_3_INSERT327:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_2_INSERT326]], <3 x float> [[TMP12]], 1, 3 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_4_INSERT328:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_3_INSERT327]], <3 x float> [[TMP14]], 1, 4 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_5_INSERT329:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_4_INSERT328]], float [[TMP16]], 1, 5 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_6_INSERT330:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, 
i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_5_INSERT329]], float [[TMP18]], 1, 6 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_0_0_INSERT331:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_1_6_INSERT330]], float [[TMP20]], 2, 0, 0 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_0_1_INSERT332:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_0_INSERT331]], i32 [[DOTFR]], 2, 0, 1 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_0_2_INSERT333:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_1_INSERT332]], i32 [[TMP24]], 2, 0, 2 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_0_3_INSERT334:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_2_INSERT333]], i32 [[TMP26]], 2, 0, 3 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_0_4_INSERT335:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_3_INSERT334]], i32 [[TMP28]], 2, 0, 4 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_1_INSERT336:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_0_4_INSERT335]], <2 x float> [[TMP30]], 2, 1 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_2_INSERT337:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_1_INSERT336]], i32 [[TMP32]], 2, 2 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_3_INSERT338:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_2_INSERT337]], i32 [[TMP34]], 2, 3 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_4_INSERT339:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_3_INSERT338]], i32 [[TMP36]], 2, 4 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_5_INSERT340:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_4_INSERT339]], i32 [[TMP38]], 2, 5 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_6_INSERT341:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_5_INSERT340]], i32 [[TMP40]], 2, 6 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_7_INSERT342:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } 
[[DOTFCA_2_6_INSERT341]], i32 [[TMP42]], 2, 7 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_2_8_INSERT343:%.*]] = insertvalue { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_7_INSERT342]], i64 [[TMP44]], 2, 8 +; CHECK-ATTRSIZE-8-NEXT: call void (...) @lgc.cps.jump(i32 [[DOTSROA_0150_0_VEC_EXTRACT]], i32 -1, {} poison, i32 [[DOTSROA_0320_0_EXTRACT_TRUNC]], i32 [[TMP83]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[DOTFCA_2_8_INSERT343]], [5 x i32] poison, [8 x i32] [[PAYLOAD]]) +; CHECK-ATTRSIZE-8-NEXT: unreachable +; CHECK-ATTRSIZE-8: 106: +; CHECK-ATTRSIZE-8-NEXT: [[DOTSROA_7_0:%.*]] = phi i32 [ [[TMP4]], [[DOTEXIT2]] ], [ [[TMP83]], [[DOTEXIT5]] ] +; CHECK-ATTRSIZE-8-NEXT: [[DOTSROA_0373_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[TMP44]] to i32 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { <3 x i32>, i32 } poison, <3 x i32> [[TMP2]], 0 +; CHECK-ATTRSIZE-8-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { <3 x i32>, i32 } [[DOTFCA_0_INSERT]], i32 [[DOTSROA_7_0]], 1 +; CHECK-ATTRSIZE-8-NEXT: call void (...) @lgc.cps.jump(i32 [[DOTSROA_0373_0_EXTRACT_TRUNC]], i32 -1, {} poison, i32 poison, i32 [[DOTSROA_7_0]], { <3 x i32>, i32 } [[DOTFCA_1_INSERT]], [32 x i32] poison, [8 x i32] [[PAYLOAD]]) +; CHECK-ATTRSIZE-8-NEXT: unreachable ; .entry: %1 = call ptr addrspace(7) @lgc.load.buffer.desc(i64 93, i32 17, i32 0, i32 0) @@ -401,248 +593,22 @@ define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @ unreachable } -; Set !lgc.rt.attribute.size to 0 to test padding is added correctly for _AmdEnqueueAnyHit (should be poison {}) -define dso_local spir_func { { float, i32, i32, i32, i32 }, <2 x float>, i32 } @_cont_Traversal_2(ptr addrspace(5) %0) local_unnamed_addr !lgc.shaderstage !0 !types !1 !lgc.rt.shaderstage !3 !lgc.rt.attribute.size !8 { -; LOWERRAYTRACINGPIPELINE-CPS-LABEL: define dso_local spir_func void @_cont_Traversal_2( -; LOWERRAYTRACINGPIPELINE-CPS-SAME: {} [[CONT_STATE:%.*]], i32 [[RETURN_ADDR:%.*]], i32 [[SHADER_INDEX:%.*]], { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[SYSTEM_DATA:%.*]], {} [[HIT_ATTRS:%.*]], [5 x i32] [[PADDING:%.*]], [8 x i32] [[PAYLOAD:%.*]]) local_unnamed_addr !lgc.shaderstage [[META4]] !lgc.rt.shaderstage [[META5]] !lgc.rt.attribute.size [[META9:![0-9]+]] !lgc.cps [[META7]] !continuation [[META10:![0-9]+]] { -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: .entry: -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } }, align 16, addrspace(5) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: store { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } [[SYSTEM_DATA]], ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], align 16 -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 93, i32 17, i32 0, i32 0) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP1:%.*]] = call ptr @llvm.invariant.start.p7(i64 -1, ptr addrspace(7) [[TMP0]]) -; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP2:%.*]] = 
load i32, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 16
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP4:%.*]] = load i64, ptr addrspace(5) [[TMP3]], align 8
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 24
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[TMP5]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 32
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP8:%.*]] = load <3 x float>, ptr addrspace(5) [[TMP7]], align 16
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 48
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP10:%.*]] = load <3 x float>, ptr addrspace(5) [[TMP9]], align 16
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 64
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP12:%.*]] = load float, ptr addrspace(5) [[TMP11]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 68
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP14:%.*]] = load float, ptr addrspace(5) [[TMP13]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 80
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP16:%.*]] = load float, ptr addrspace(5) [[TMP15]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 84
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP18:%.*]] = load i32, ptr addrspace(5) [[TMP17]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 88
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(5) [[TMP19]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 92
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(5) [[TMP21]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 96
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(5) [[TMP23]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 104
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr addrspace(5) [[TMP25]], align 8
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 112
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(5) [[TMP27]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 116
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP30:%.*]] = load i32, ptr addrspace(5) [[TMP29]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 120
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP32:%.*]] = load i32, ptr addrspace(5) [[TMP31]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 124
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP34:%.*]] = load i32, ptr addrspace(5) [[TMP33]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 128
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP36:%.*]] = load i32, ptr addrspace(5) [[TMP35]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 132
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[TMP37]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP39:%.*]] = getelementptr i8, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 136
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP40:%.*]] = load i32, ptr addrspace(5) [[TMP39]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr addrspace(5) [[SYSTEM_DATA_ALLOCA]], i32 144
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP42:%.*]] = load i64, ptr addrspace(5) [[TMP41]], align 8
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP43:%.*]] = and i64 [[TMP4]], 281474976710655
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP44:%.*]] = add nuw nsw i64 [[TMP43]], 48
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP45:%.*]] = inttoptr i64 [[TMP44]] to ptr addrspace(1)
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP46:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP45]], align 16
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOT4_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP46]], i64 1
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP47:%.*]] = and i32 [[DOT4_VEC_EXTRACT]], 16777215
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP48:%.*]] = lshr i32 [[TMP6]], 8
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP49:%.*]] = and i32 [[TMP48]], 15
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP50:%.*]] = add nuw nsw i32 [[TMP49]], [[TMP47]]
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP51:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP0]], i32 36
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP52:%.*]] = load i32, ptr addrspace(7) [[TMP51]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP0]], i32 40
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP54:%.*]] = load i32, ptr addrspace(7) [[TMP53]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP55:%.*]] = zext i32 [[TMP54]] to i64
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP56:%.*]] = shl nuw i64 [[TMP55]], 32
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP57:%.*]] = zext i32 [[TMP52]] to i64
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP58:%.*]] = or i64 [[TMP56]], [[TMP57]]
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFR:%.*]] = freeze i64 [[TMP58]]
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DOTFR]], 0
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br i1 [[DOTNOT]], label [[DOTEXIT4:%.*]], label [[TMP59:%.*]]
-; LOWERRAYTRACINGPIPELINE-CPS: 59:
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP60:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP0]], i32 44
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP61:%.*]] = load i32, ptr addrspace(7) [[TMP60]], align 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP50]]
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP63:%.*]] = zext i32 [[TMP62]] to i64
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP64:%.*]] = add i64 [[DOTFR]], [[TMP63]]
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP65:%.*]] = inttoptr i64 [[TMP64]] to ptr addrspace(1)
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP66:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP65]], align 16
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP67:%.*]] = shufflevector <4 x i32> [[TMP66]], <4 x i32> poison, <2 x i32>
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: br label [[DOTEXIT4]]
-; LOWERRAYTRACINGPIPELINE-CPS: .exit4:
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTSROA_5334_0:%.*]] = phi <2 x i32> [ [[TMP67]], [[TMP59]] ], [ zeroinitializer, [[DOTENTRY:%.*]] ]
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTSROA_370_8_VEC_EXTRACT:%.*]] = extractelement <2 x i32> [[DOTSROA_5334_0]], i64 0
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[TMP68:%.*]] = call i64 (...) @lgc.cps.as.continuation.reference__i64(ptr @_cont_Traversal_2)
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTSROA_0112_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[TMP68]] to i32
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_0_0_0_INSERT:%.*]] = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } poison, i32 [[TMP2]], 0, 0, 0
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_0_0_1_INSERT:%.*]] = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } [[DOTFCA_0_0_0_INSERT]], i32 [[TMP50]], 0, 0, 1
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_0_1_0_INSERT:%.*]] = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } [[DOTFCA_0_0_1_INSERT]], i64 [[TMP4]], 0, 1, 0
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_0_1_1_INSERT:%.*]] = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } [[DOTFCA_0_1_0_INSERT]], i32 [[TMP6]], 0, 1, 1
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_0_1_2_INSERT:%.*]] = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } [[DOTFCA_0_1_1_INSERT]], <3 x float> [[TMP8]], 0, 1, 2
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_0_1_3_INSERT:%.*]] = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } [[DOTFCA_0_1_2_INSERT]], <3 x float> [[TMP10]], 0, 1, 3
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_0_1_4_INSERT:%.*]] = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } [[DOTFCA_0_1_3_INSERT]], float [[TMP12]], 0, 1, 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_0_1_5_INSERT:%.*]] = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } [[DOTFCA_0_1_4_INSERT]], float [[TMP14]], 0, 1, 5
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_0_2_0_0_INSERT:%.*]] = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } [[DOTFCA_0_1_5_INSERT]], float [[TMP16]], 0, 2, 0, 0
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_0_2_0_1_INSERT:%.*]] = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } [[DOTFCA_0_2_0_0_INSERT]], i32 [[TMP18]], 0, 2, 0, 1
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_0_2_0_2_INSERT:%.*]] = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } [[DOTFCA_0_2_0_1_INSERT]], i32 [[TMP20]], 0, 2, 0, 2
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_0_2_0_3_INSERT:%.*]] = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } [[DOTFCA_0_2_0_2_INSERT]], i32 [[TMP22]], 0, 2, 0, 3
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_0_2_0_4_INSERT:%.*]] = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } [[DOTFCA_0_2_0_3_INSERT]], i32 [[TMP24]], 0, 2, 0, 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_0_2_1_INSERT:%.*]] = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } [[DOTFCA_0_2_0_4_INSERT]], <2 x float> [[TMP26]], 0, 2, 1
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_0_2_2_INSERT:%.*]] = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } [[DOTFCA_0_2_1_INSERT]], i32 [[TMP28]], 0, 2, 2
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_0_2_3_INSERT:%.*]] = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } [[DOTFCA_0_2_2_INSERT]], i32 [[TMP30]], 0, 2, 3
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_0_2_4_INSERT:%.*]] = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } [[DOTFCA_0_2_3_INSERT]], i32 [[TMP32]], 0, 2, 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_0_2_5_INSERT:%.*]] = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } [[DOTFCA_0_2_4_INSERT]], i32 [[TMP34]], 0, 2, 5
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_0_2_6_INSERT:%.*]] = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } [[DOTFCA_0_2_5_INSERT]], i32 [[TMP36]], 0, 2, 6
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_0_2_7_INSERT:%.*]] = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } [[DOTFCA_0_2_6_INSERT]], i32 [[TMP38]], 0, 2, 7
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_0_2_8_INSERT:%.*]] = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } [[DOTFCA_0_2_7_INSERT]], i32 [[TMP40]], 0, 2, 8
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_0_2_9_INSERT:%.*]] = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } [[DOTFCA_0_2_8_INSERT]], i64 [[TMP42]], 0, 2, 9
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_1_0_INSERT:%.*]] = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } [[DOTFCA_0_2_9_INSERT]], float 0.000000e+00, 1, 0
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_1_1_INSERT:%.*]] = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } [[DOTFCA_1_0_INSERT]], i32 0, 1, 1
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_1_2_INSERT:%.*]] = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } [[DOTFCA_1_1_INSERT]], i32 0, 1, 2
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_1_3_INSERT:%.*]] = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } [[DOTFCA_1_2_INSERT]], i32 0, 1, 3
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: [[DOTFCA_1_4_INSERT:%.*]] = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } [[DOTFCA_1_3_INSERT]], i32 0, 1, 4
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: call void (...) @lgc.cps.jump(i32 [[DOTSROA_370_8_VEC_EXTRACT]], i32 -1, {} poison, i32 [[DOTSROA_0112_0_EXTRACT_TRUNC]], i32 [[TMP50]], { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } [[DOTFCA_1_4_INSERT]], <2 x float> zeroinitializer, {} poison, [8 x i32] [[PAYLOAD]])
-; LOWERRAYTRACINGPIPELINE-CPS-NEXT: unreachable
-;
-.entry:
- %1 = call ptr addrspace(7) @lgc.load.buffer.desc(i64 93, i32 17, i32 0, i32 0)
- %2 = call ptr @llvm.invariant.start.p7(i64 -1, ptr addrspace(7) %1)
- %3 = load i32, ptr addrspace(5) %0, align 4
- %4 = getelementptr i8, ptr addrspace(5) %0, i32 16
- %5 = load i64, ptr addrspace(5) %4, align 8
- %6 = getelementptr i8, ptr addrspace(5) %0, i32 24
- %7 = load i32, ptr addrspace(5) %6, align 4
- %8 = getelementptr i8, ptr addrspace(5) %0, i32 32
- %9 = load <3 x float>, ptr addrspace(5) %8, align 16
- %10 = getelementptr i8, ptr addrspace(5) %0, i32 48
- %11 = load <3 x float>, ptr addrspace(5) %10, align 16
- %12 = getelementptr i8, ptr addrspace(5) %0, i32 64
- %13 = load float, ptr addrspace(5) %12, align 4
- %14 = getelementptr i8, ptr addrspace(5) %0, i32 68
- %15 = load float, ptr addrspace(5) %14, align 4
- %16 = getelementptr i8, ptr addrspace(5) %0, i32 80
- %17 = load float, ptr addrspace(5) %16, align 4
- %18 = getelementptr i8, ptr addrspace(5) %0, i32 84
- %19 = load i32, ptr addrspace(5) %18, align 4
- %20 = getelementptr i8, ptr addrspace(5) %0, i32 88
- %21 = load i32, ptr addrspace(5) %20, align 4
- %22 = getelementptr i8, ptr addrspace(5) %0, i32 92
- %23 = load i32, ptr addrspace(5) %22, align 4
- %24 = getelementptr i8, ptr addrspace(5) %0, i32 96
- %25 = load i32, ptr addrspace(5) %24, align 4
- %26 = getelementptr i8, ptr addrspace(5) %0, i32 104
- %27 = load <2 x float>, ptr addrspace(5) %26, align 8
- %28 = getelementptr i8, ptr addrspace(5) %0, i32 112
- %29 = load i32, ptr addrspace(5) %28, align 4
- %30 = getelementptr i8, ptr addrspace(5) %0, i32 116
- %31 = load i32, ptr addrspace(5) %30, align 4
- %32 = getelementptr i8, ptr addrspace(5) %0, i32 120
- %33 = load i32, ptr addrspace(5) %32, align 4
- %34 = getelementptr i8, ptr addrspace(5) %0, i32 124
- %35 = load i32, ptr addrspace(5) %34, align 4
- %36 = getelementptr i8, ptr addrspace(5) %0, i32 128
- %37 = load i32, ptr addrspace(5) %36, align 4
- %38 = getelementptr i8, ptr addrspace(5) %0, i32 132
- %39 = load i32, ptr addrspace(5) %38, align 4
- %40 = getelementptr i8, ptr addrspace(5) %0, i32 136
- %41 = load i32, ptr addrspace(5) %40, align 4
- %42 = getelementptr i8, ptr addrspace(5) %0, i32 144
- %43 = load i64, ptr addrspace(5) %42, align 8
- %44 = and i64 %5, 281474976710655
- %45 = add nuw nsw i64 %44, 48
- %46 = inttoptr i64 %45 to ptr addrspace(1)
- %47 = load <4 x i32>, ptr addrspace(1) %46, align 16
- %.4.vec.extract = extractelement <4 x i32> %47, i64 1
- %48 = and i32 %.4.vec.extract, 16777215
- %49 = lshr i32 %7, 8
- %50 = and i32 %49, 15
- %51 = add nuw nsw i32 %50, %48
- %52 = getelementptr inbounds i8, ptr addrspace(7) %1, i32 36
- %53 = load i32, ptr addrspace(7) %52, align 4
- %54 = getelementptr inbounds i8, ptr addrspace(7) %1, i32 40
- %55 = load i32, ptr addrspace(7) %54, align 4
- %56 = zext i32 %55 to i64
- %57 = shl nuw i64 %56, 32
- %58 = zext i32 %53 to i64
- %59 = or i64 %57, %58
- %.fr = freeze i64 %59
- %.not = icmp eq i64 %.fr, 0
- br i1 %.not, label %.exit4, label %60
-
-60: ; preds = %.entry
- %61 = getelementptr inbounds i8, ptr addrspace(7) %1, i32 44
- %62 = load i32, ptr addrspace(7) %61, align 4
- %63 = mul i32 %62, %51
- %64 = zext i32 %63 to i64
- %65 = add i64 %.fr, %64
- %66 = inttoptr i64 %65 to ptr addrspace(1)
- %67 = load <4 x i32>, ptr addrspace(1) %66, align 16
- %68 = shufflevector <4 x i32> %67, <4 x i32> poison, <2 x i32>
- br label %.exit4
-
-.exit4: ; preds = %.entry, %60
- %.sroa.5334.0 = phi <2 x i32> [ %68, %60 ], [ zeroinitializer, %.entry ]
- %.sroa.370.8.vec.extract = extractelement <2 x i32> %.sroa.5334.0, i64 0
- %69 = call spir_func i64 @_AmdGetCurrentFuncAddr()
- %.sroa.0112.0.extract.trunc = trunc i64 %69 to i32
- %.fca.0.0.0.insert = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } poison, i32 %3, 0, 0, 0
- %.fca.0.0.1.insert = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } %.fca.0.0.0.insert, i32 %51, 0, 0, 1
- %.fca.0.1.0.insert = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } %.fca.0.0.1.insert, i64 %5, 0, 1, 0
- %.fca.0.1.1.insert = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } %.fca.0.1.0.insert, i32 %7, 0, 1, 1
- %.fca.0.1.2.insert = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } %.fca.0.1.1.insert, <3 x float> %9, 0, 1, 2
- %.fca.0.1.3.insert = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } %.fca.0.1.2.insert, <3 x float> %11, 0, 1, 3
- %.fca.0.1.4.insert = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } %.fca.0.1.3.insert, float %13, 0, 1, 4
- %.fca.0.1.5.insert = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } %.fca.0.1.4.insert, float %15, 0, 1, 5
- %.fca.0.2.0.0.insert = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } %.fca.0.1.5.insert, float %17, 0, 2, 0, 0
- %.fca.0.2.0.1.insert = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } %.fca.0.2.0.0.insert, i32 %19, 0, 2, 0, 1
- %.fca.0.2.0.2.insert = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } %.fca.0.2.0.1.insert, i32 %21, 0, 2, 0, 2
- %.fca.0.2.0.3.insert = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } %.fca.0.2.0.2.insert, i32 %23, 0, 2, 0, 3
- %.fca.0.2.0.4.insert = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } %.fca.0.2.0.3.insert, i32 %25, 0, 2, 0, 4
- %.fca.0.2.1.insert = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } %.fca.0.2.0.4.insert, <2 x float> %27, 0, 2, 1
- %.fca.0.2.2.insert = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } %.fca.0.2.1.insert, i32 %29, 0, 2, 2
- %.fca.0.2.3.insert = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } %.fca.0.2.2.insert, i32 %31, 0, 2, 3
- %.fca.0.2.4.insert = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } %.fca.0.2.3.insert, i32 %33, 0, 2, 4
- %.fca.0.2.5.insert = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } %.fca.0.2.4.insert, i32 %35, 0, 2, 5
- %.fca.0.2.6.insert = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } %.fca.0.2.5.insert, i32 %37, 0, 2, 6
- %.fca.0.2.7.insert = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } %.fca.0.2.6.insert, i32 %39, 0, 2, 7
- %.fca.0.2.8.insert = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } %.fca.0.2.7.insert, i32 %41, 0, 2, 8
- %.fca.0.2.9.insert = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } %.fca.0.2.8.insert, i64 %43, 0, 2, 9
- %.fca.1.0.insert = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } %.fca.0.2.9.insert, float 0.000000e+00, 1, 0
- %.fca.1.1.insert = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } %.fca.1.0.insert, i32 0, 1, 1
- %.fca.1.2.insert = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } %.fca.1.1.insert, i32 0, 1, 2
- %.fca.1.3.insert = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } %.fca.1.2.insert, i32 0, 1, 3
- %.fca.1.4.insert = insertvalue { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } %.fca.1.3.insert, i32 0, 1, 4
- call void (...) @lgc.cps.jump(i32 %.sroa.370.8.vec.extract, i32 -1, {} poison, i32 %.sroa.0112.0.extract.trunc, i32 %51, { { { i32, i32 }, { i64, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i32, i64 } }, { float, i32, i32, i32, i32 } } %.fca.1.4.insert, <2 x float> zeroinitializer)
- unreachable
-}
-
 declare void @lgc.cps.jump(...) local_unnamed_addr
 declare ptr addrspace(7) @lgc.load.buffer.desc(i64 %0, i32 %1, i32 %2, i32 %3) local_unnamed_addr
 declare ptr @llvm.invariant.start.p7(i64 immarg %0, ptr addrspace(7) nocapture %1)
 !continuation.preservedPayloadRegisterCount = !{!7}
 !lgc.cps.module = !{}
+!lgc.rt.max.attribute.size = !{!4}
 !0 = !{i32 7}
 !1 = !{!"function", { { float, i32, i32, i32, i32 }, <2 x float>, i32 } poison, !2}
 !2 = !{i32 5, { { <3 x i32>, i32 }, { i64, i32, i32, <3 x float>, <3 x float>, float, float }, { { float, i32, i32, i32, i32 }, <2 x float>, i32, i32, i32, i32, i32, i32, i64 } } poison}
 !3 = !{i32 6}
-!4 = !{i32 16}
+!4 = !{i32 16} ; HITATTR_SIZE_16
+!4 = !{i32 8} ; HITATTR_SIZE_8
 !5 = !{i32 0, %struct.AnyHitTraversalData poison}
 !6 = !{!"function", i1 poison, !5, float poison, i32 poison}
 !7 = !{i32 8}
-!8 = !{i32 0}
 !9 = !{i32 0, %struct.DispatchSystemData poison}
 !10 = !{!"function", i32 poison, !9}
diff --git a/llvmraytracing/test/lgccps/simple-await.ll b/llvmraytracing/test/lgccps/simple-await.ll
index d4e3f5da9c..613ee1a72b 100644
--- a/llvmraytracing/test/lgccps/simple-await.ll
+++ b/llvmraytracing/test/lgccps/simple-await.ll
@@ -56,7 +56,7 @@ declare void @lgc.cps.jump(...)
 ; LOWER-AWAIT-NEXT: [[TMP4:%.*]] = inttoptr i32 [[CR]] to ptr
 ; LOWER-AWAIT-NEXT: [[TMP5:%.*]] = call ptr [[TMP4]](i32 [[CR]], i32 2, float [[T0]])
 ; LOWER-AWAIT-NEXT: [[TMP6:%.*]] = call i1 (...) @llvm.coro.suspend.retcon.i1(ptr [[TMP5]])
-; LOWER-AWAIT-NEXT: [[TMP7:%.*]] = call float @continuations.getReturnValue__f32()
+; LOWER-AWAIT-NEXT: [[TMP7:%.*]] = call float @lgc.ilcps.getReturnValue__f32()
 ; LOWER-AWAIT-NEXT: [[RETURNVALUE:%.*]] = fmul float [[TMP7]], [[ARG]]
 ; LOWER-AWAIT-NEXT: call void (...) @lgc.cps.jump(i32 [[RCR]], i32 2, {} poison, i32 poison, float [[RETURNVALUE]])
 ; LOWER-AWAIT-NEXT: unreachable
diff --git a/tool/dumper/vkgcPipelineDumper.cpp b/tool/dumper/vkgcPipelineDumper.cpp
index 872d96444e..7ca0eb0c2f 100644
--- a/tool/dumper/vkgcPipelineDumper.cpp
+++ b/tool/dumper/vkgcPipelineDumper.cpp
@@ -695,7 +695,8 @@ void PipelineDumper::dumpPipelineShaderInfo(const PipelineShaderInfo *shaderInfo
 dumpFile << "options.fastMathFlags = " << shaderInfo->options.fastMathFlags << "\n";
 dumpFile << "options.disableFastMathFlags = " << shaderInfo->options.disableFastMathFlags << "\n";
 dumpFile << "options.ldsSpillLimitDwords = " << shaderInfo->options.ldsSpillLimitDwords << "\n";
- dumpFile << "options.scalarizeWaterfallLoads = " << shaderInfo->options.scalarizeWaterfallLoads << "\n";
+ if (shaderInfo->options.scalarizeWaterfallLoads.has_value())
+ dumpFile << "options.scalarizeWaterfallLoads = " << *shaderInfo->options.scalarizeWaterfallLoads << "\n";
 dumpFile << "options.overrideForceThreadIdSwizzling = " << shaderInfo->options.overrideForceThreadIdSwizzling << "\n";
 dumpFile << "options.overrideShaderThreadGroupSizeX = " << shaderInfo->options.overrideShaderThreadGroupSizeX << "\n";
 dumpFile << "options.overrideShaderThreadGroupSizeY = " << shaderInfo->options.overrideShaderThreadGroupSizeY << "\n";
@@ -1039,6 +1040,9 @@ void PipelineDumper::dumpGraphicsStateInfo(const GraphicsPipelineBuildInfo *pipe
 dumpFile << "enableEarlyCompile = " << pipelineInfo->enableEarlyCompile << "\n";
 dumpFile << "enableColorExportShader = " << pipelineInfo->enableColorExportShader << "\n";
 dumpFile << "useSoftwareVertexBufferDescriptors = " << pipelineInfo->useSoftwareVertexBufferDescriptors << "\n";
+ dumpFile << "dynamicTopology = " << pipelineInfo->dynamicTopology << "\n";
+ dumpFile << "enableColorClampVs = " << pipelineInfo->glState.enableColorClampVs << "\n";
+ dumpFile << "enableColorClampFs = " << pipelineInfo->glState.enableColorClampFs << "\n";
 dumpFile << "originUpperLeft = " << pipelineInfo->getGlState().originUpperLeft << "\n";
 if (pipelineInfo->clientMetadataSize > 0) {
@@ -1245,11 +1249,20 @@ void PipelineDumper::dumpRayTracingStateInfo(const RayTracingPipelineBuildInfo *
 dumpFile << "indirectStageMask = " << pipelineInfo->indirectStageMask << "\n";
 dumpFile << "libraryMode = " << static_cast(pipelineInfo->libraryMode) << "\n";
 dumpFile << "mode = " << static_cast(pipelineInfo->mode) << "\n";
+ dumpFile << "cpsFlags = " << pipelineInfo->cpsFlags << "\n";
 dumpRayTracingRtState(&pipelineInfo->rtState, dumpDir, dumpFile);
 dumpFile << "payloadSizeMaxInLib = " << pipelineInfo->payloadSizeMaxInLib << "\n";
 dumpFile << "attributeSizeMaxInLib = " << pipelineInfo->attributeSizeMaxInLib << "\n";
 dumpFile << "hasPipelineLibrary = " << pipelineInfo->hasPipelineLibrary << "\n";
 dumpFile << "pipelineLibStageMask = " << pipelineInfo->pipelineLibStageMask << "\n";
+
+ for (unsigned i = 0; i < pipelineInfo->gpurtOptionCount; ++i) {
+ auto gpurtOption = &pipelineInfo->pGpurtOptions[i];
+ dumpFile << "gpurtOptions[" << i << "].nameHash = "
+ << "0x" << std::hex << gpurtOption->nameHash << "\n";
+ dumpFile << "gpurtOptions[" << i << "].value = "
+ << "0x" << std::hex << gpurtOption->value << "\n";
+ }
 }
 // =====================================================================================================================
@@ -1522,6 +1535,7 @@ MetroHash::Hash PipelineDumper::generateHashForGraphicsPipeline(const GraphicsPi
 // Relocatable shaders force an unlinked compilation.
 hasher.Update(pipeline->unlinked);
 hasher.Update(pipeline->enableEarlyCompile);
+ hasher.Update(pipeline->dynamicTopology);
 if (unlinkedShaderType == UnlinkedStageFragment && isCacheHash)
 hasher.Update(pipeline->enableColorExportShader);
 updateHashForPipelineOptions(&pipeline->options, &hasher, isCacheHash, unlinkedShaderType);
@@ -1554,6 +1568,9 @@ MetroHash::Hash PipelineDumper::generateHashForGraphicsPipeline(const GraphicsPi
 hasher.Update(pipeline->advancedBlendInfo.enableAdvancedBlend);
 hasher.Update(pipeline->advancedBlendInfo.binding);
+
+ hasher.Update(pipeline->glState.enableColorClampVs);
+ hasher.Update(pipeline->glState.enableColorClampFs);
+
 MetroHash::Hash hash = {};
 hasher.Finalize(hash.bytes);
@@ -1629,6 +1646,7 @@ MetroHash::Hash PipelineDumper::generateHashForRayTracingPipeline(const RayTraci
 hasher.Update(pipeline->maxRecursionDepth);
 hasher.Update(pipeline->indirectStageMask);
 hasher.Update(pipeline->mode);
+ hasher.Update(pipeline->cpsFlags);
 updateHashForRtState(&pipeline->rtState, &hasher, isCacheHash);
 hasher.Update(pipeline->libraryMode);
@@ -1655,6 +1673,14 @@ MetroHash::Hash PipelineDumper::generateHashForRayTracingPipeline(const RayTraci
 if (pipeline->clientMetadataSize > 0) {
 hasher.Update(reinterpret_cast(pipeline->pClientMetadata), pipeline->clientMetadataSize);
 }
+
+ hasher.Update(pipeline->gpurtOptionCount);
+ for (unsigned i = 0; i < pipeline->gpurtOptionCount; ++i) {
+ auto gpurtOption = &pipeline->pGpurtOptions[i];
+ hasher.Update(gpurtOption->nameHash);
+ hasher.Update(gpurtOption->value);
+ }
+
 MetroHash::Hash hash = {};
 hasher.Finalize(hash.bytes);
diff --git a/tool/vfx/vfxPipelineDoc.cpp b/tool/vfx/vfxPipelineDoc.cpp
index 1039d7782f..f3e9ed993a 100644
--- a/tool/vfx/vfxPipelineDoc.cpp
+++ b/tool/vfx/vfxPipelineDoc.cpp
@@ -407,6 +407,7 @@ bool PipelineDocument::getPtrOfSubSection(Section *section, unsigned lineNum, co
 CASE_SUBSECTION(MemberTypeRtState, SectionRtState)
 CASE_SUBSECTION(MemberTypeRayTracingShaderExportConfig, SectionRayTracingShaderExportConfig)
 CASE_SUBSECTION(MemberTypeIndirectCalleeSavedRegs, SectionIndirectCalleeSavedRegs)
+ CASE_SUBSECTION(MemberTypeGpurtOption, SectionGpurtOption)
 #if GPURT_CLIENT_INTERFACE_MAJOR_VERSION >= 15
 CASE_SUBSECTION(MemberTypeGpurtFuncTable, SectionGpurtFuncTable)
 #endif
diff --git a/tool/vfx/vfxSection.h b/tool/vfx/vfxSection.h
index 62c09d6e8b..5a3f348fa6 100644
--- a/tool/vfx/vfxSection.h
+++ b/tool/vfx/vfxSection.h
@@ -129,6 +129,7 @@ enum MemberType : unsigned {
 MemberTypeRayTracingShaderExportConfig, // VFX member type: SectionRayTracingShaderExportConfig
 MemberTypeIndirectCalleeSavedRegs, // VFX member type: SectionIndirectCalleeSavedRegs
 MemberTypeGpurtFuncTable, // VFX member type: SectionGpurtFuncTable
+ MemberTypeGpurtOption, // VFX member type: SectionGpurtOption
 MemberTypeExtendedRobustness, // VFX member type: SectionExtendedRobustness
 MemberTypeAdvancedBlendInfo, // VFX member type: SectionAdvancedBlendInfo
 MemberTypeGlAttribLocation, // GL vertex attribute location
@@ -693,6 +694,33 @@ class SectionShaderGroup : public Section {
 SubState m_state;
 };
+// =====================================================================================================================
+// Represents the sub section GPURT option
+class SectionGpurtOption : public Section {
+public:
+ typedef Vkgc::GpurtOption SubState;
+
+ SectionGpurtOption() : Section(getAddrTable(), SectionTypeUnset, "gpurtOption") {
+ memset(&m_state, 0, sizeof(m_state));
+ }
+
+ void getSubState(SubState &state) { state = m_state; };
+ SubState &getSubStateRef() { return m_state; };
+
+private:
+ static StrToMemberAddrArrayRef getAddrTable() {
+ static std::vector addrTable = []() {
+ std::vector addrTableInitializer;
+ INIT_STATE_MEMBER_NAME_TO_ADDR(SectionGpurtOption, nameHash, MemberTypeInt, false);
+ INIT_STATE_MEMBER_NAME_TO_ADDR(SectionGpurtOption, value, MemberTypeInt, false);
+ return addrTableInitializer;
+ }();
+ return {addrTable.data(), addrTable.size()};
+ }
+
+ SubState m_state;
+};
+
 // =====================================================================================================================
 // Represents the sub section vertex input binding
 class SectionVertexInputBinding : public Section {
diff --git a/tool/vfx/vfxVkSection.h b/tool/vfx/vfxVkSection.h
index e06eded576..313c79789b 100644
--- a/tool/vfx/vfxVkSection.h
+++ b/tool/vfx/vfxVkSection.h
@@ -880,6 +880,8 @@ class SectionGraphicsState : public Section {
 INIT_STATE_SUB_MEMBER_NAME_TO_ADDR(SectionGraphicsState, cbState, alphaToCoverageEnable, MemberTypeBool, false);
 INIT_STATE_SUB_MEMBER_NAME_TO_ADDR(SectionGraphicsState, cbState, dualSourceBlendEnable, MemberTypeBool, false);
 INIT_STATE_SUB_MEMBER_NAME_TO_ADDR(SectionGraphicsState, cbState, dualSourceBlendDynamic, MemberTypeBool, false);
+ INIT_STATE_SUB_MEMBER_NAME_TO_ADDR(SectionGraphicsState, glState, enableColorClampVs, MemberTypeBool, false);
+ INIT_STATE_SUB_MEMBER_NAME_TO_ADDR(SectionGraphicsState, glState, enableColorClampFs, MemberTypeBool, false);
 INIT_MEMBER_ARRAY_NAME_TO_ADDR(SectionGraphicsState, m_colorBuffer, MemberTypeColorBufferItem, Vkgc::MaxColorTargets, true);
@@ -890,6 +892,7 @@ class SectionGraphicsState : public Section {
 INIT_STATE_MEMBER_NAME_TO_ADDR(SectionGraphicsState, enableUberFetchShader, MemberTypeBool, false);
 INIT_STATE_MEMBER_NAME_TO_ADDR(SectionGraphicsState, enableColorExportShader, MemberTypeBool, false);
 INIT_STATE_MEMBER_NAME_TO_ADDR(SectionGraphicsState, enableEarlyCompile, MemberTypeBool, false);
+ INIT_STATE_MEMBER_NAME_TO_ADDR(SectionGraphicsState, dynamicTopology, MemberTypeBool, false);
 INIT_STATE_MEMBER_NAME_TO_ADDR(SectionGraphicsState, useSoftwareVertexBufferDescriptors, MemberTypeBool, false);
 INIT_MEMBER_NAME_TO_ADDR(SectionGraphicsState, m_shaderLibrary, MemberTypeString, false);
 INIT_MEMBER_NAME_TO_ADDR(SectionGraphicsState, m_rtState, MemberTypeRtState, true);
@@ -1104,6 +1107,8 @@ class SectionRayTracingState : public Section {
 INIT_STATE_MEMBER_NAME_TO_ADDR(SectionRayTracingState, attributeSizeMaxInLib, MemberTypeInt, false);
 INIT_STATE_MEMBER_NAME_TO_ADDR(SectionRayTracingState, isReplay, MemberTypeBool, false);
 INIT_MEMBER_NAME_TO_ADDR(SectionRayTracingState, m_clientMetadata, MemberTypeU8Array, false);
+ INIT_STATE_MEMBER_NAME_TO_ADDR(SectionRayTracingState, cpsFlags, MemberTypeInt, false);
+ INIT_MEMBER_DYNARRAY_NAME_TO_ADDR(SectionRayTracingState, m_gpurtOptions, MemberTypeGpurtOption, true);
 return addrTableInitializer;
 }();
 return {addrTable.data(), addrTable.size()};
@@ -1132,6 +1137,13 @@ class SectionRayTracingState : public Section {
 m_state.clientMetadataSize = m_clientMetadataBufMem.size();
 m_state.pClientMetadata = m_clientMetadataBufMem.data();
 }
+
+ m_state.gpurtOptionCount = static_cast(m_gpurtOptions.size());
+ m_vkgcGpurtOptions.resize(m_state.gpurtOptionCount);
+ for (unsigned i = 0; i < m_state.gpurtOptionCount; ++i)
+ m_gpurtOptions[i].getSubState(m_vkgcGpurtOptions[i]);
+ m_state.pGpurtOptions = (m_state.gpurtOptionCount) > 0 ? m_vkgcGpurtOptions.data() : nullptr;
+
 state = m_state;
 };
 SubState &getSubStateRef() { return m_state; };
@@ -1148,6 +1160,8 @@ class SectionRayTracingState : public Section {
 std::vector m_traceRayBinary;
 std::vector *m_clientMetadata;
 std::vector m_clientMetadataBufMem;
+ std::vector m_gpurtOptions;
+ std::vector m_vkgcGpurtOptions;
 };
 } // namespace Vfx
diff --git a/util/extensions.txt b/util/extensions.txt
index aeb04d5f8c..36a5c07005 100644
--- a/util/extensions.txt
+++ b/util/extensions.txt
@@ -41,10 +41,7 @@ SPV_KHR_ray_tracing
 SPV_KHR_ray_query
 SPV_KHR_fragment_shader_barycentric
 SPV_KHR_workgroup_memory_explicit_layout
-#if VKI_COOPERATIVE_MATRIX
-SPV_NV_cooperative_matrix
 SPV_KHR_cooperative_matrix
-#endif
 SPV_NV_shader_atomic_float
 SPV_NV_compute_shader_derivatives
 SPV_KHR_maximal_reconvergence
diff --git a/version/include/llpc/GpurtIntrinsics.h b/version/include/llpc/GpurtIntrinsics.h
index 90072cc393..be1ebbfdc5 100644
--- a/version/include/llpc/GpurtIntrinsics.h
+++ b/version/include/llpc/GpurtIntrinsics.h
@@ -63,6 +63,7 @@
 #endif
 #define CONTINUATIONS_LGC_STACK_LOWERING 1
+#define CONTINUATIONS_USE_DUMMY_RET_ADDR 1
 //=====================================================================================================================
 // Continuation intrinsics
diff --git a/version/include/llpcVersion.h.in b/version/include/llpcVersion.h.in
index 8a8db1f01b..d947653255 100644
--- a/version/include/llpcVersion.h.in
+++ b/version/include/llpcVersion.h.in
@@ -37,6 +37,9 @@
 // %Version History
 // | %Version | Change Description |
 // | -------- | ----------------------------------------------------------------------------------------------------- |
+// | 72.3 | Add enableColorClampVs and enableColorClampFs to GraphicsPipelineBuildInfo. |
+// | 72.2 | Add pGpurtOptions and gpurtOptionCount to RayTracingPipelineBuildInfo |
+// | 72.1 | Add dynamicTopology to GraphicsPipelineBuildInfo |
 // | 72.0 | Enable std430 layout rule 9 to the OpenGL default uniform block |
 // | 71.4 | Add PixelOpInternalBinding to InternalBinding. Add GlCompatibilityDrawPixelsType. Add enableBitmap to |
 // | | glState. Add enableBitmapLsb to glState. Add enableTwoSideLighting to glState. Add drawPixelsType to |
@@ -181,7 +184,7 @@
 #define LLPC_INTERFACE_MAJOR_VERSION 72
 /// LLPC minor interface version.
-#define LLPC_INTERFACE_MINOR_VERSION 2
+#define LLPC_INTERFACE_MINOR_VERSION 3
 /// The client's LLPC major interface version
 #ifndef LLPC_CLIENT_INTERFACE_MAJOR_VERSION