GPUOpen-Drivers · LLJJDD · Nov 30, 2023 · Nov 29, 2023 · Nov 29, 2023 · tsymalla-AMD
@@ -239,6 +239,7 @@ target_sources(LLVMlgc PRIVATE
     patch/RegisterMetadataBuilder.cpp
     patch/CombineCooperativeMatrix.cpp
     patch/LowerCooperativeMatrix.cpp
+    patch/LowerGpuRt.cpp
 )
 
 # lgc/state

@@ -25,16 +25,18 @@
 /**
  ***********************************************************************************************************************
  * @file  LowerGpuRt.h
- * @brief LLPC header file: contains declaration of Llpc::LowerGpuRt
+ * @brief LGC header file: contains declaration of lgc::LowerGpuRt
  ***********************************************************************************************************************
  */
 #pragma once
 
-#include "llpcSpirvLower.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/IR/PassManager.h"
 
 namespace lgc {
+class Builder;
+class PipelineState;
+
 class GpurtGetStackSizeOp;
 class GpurtGetStackBaseOp;
 class GpurtGetStackStrideOp;
@@ -46,24 +48,17 @@ class GpurtGetBoxSortHeuristicModeOp;
 class GpurtGetStaticFlagsOp;
 class GpurtGetTriangleCompressionModeOp;
 class GpurtGetFlattenedGroupThreadIdOp;
-} // namespace lgc
 
-namespace llvm {
-class AllocaInst;
-}
-
-namespace Llpc {
-class LowerGpuRt : public SpirvLower, public llvm::PassInfoMixin<LowerGpuRt> {
+class LowerGpuRt : public llvm::PassInfoMixin<LowerGpuRt> {
 public:
-  LowerGpuRt();
   llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager);
 
 private:
   typedef void (LowerGpuRt::*LibraryFuncPtr)(llvm::Function *, unsigned);
   const static unsigned MaxLdsStackEntries = 16;
   uint32_t getWorkgroupSize() const;
   llvm::Value *getThreadIdInGroup() const;
-  void createGlobalStack();
+  void createGlobalStack(llvm::Module &module);
   void createRayStaticIdValue();
   void visitGetStackSize(lgc::GpurtGetStackSizeOp &inst);
   void visitGetStackBase(lgc::GpurtGetStackBaseOp &inst);
@@ -76,10 +71,11 @@ class LowerGpuRt : public SpirvLower, public llvm::PassInfoMixin<LowerGpuRt> {
   void visitGetStaticFlags(lgc::GpurtGetStaticFlagsOp &inst);
   void visitGetTriangleCompressionMode(lgc::GpurtGetTriangleCompressionModeOp &inst);
   void visitGetFlattenedGroupThreadId(lgc::GpurtGetFlattenedGroupThreadIdOp &inst);
-  llvm::Value *m_stack;                                  // Stack array to hold stack value
-  llvm::Type *m_stackTy;                                 // Stack type
-  bool m_lowerStack;                                     // If it is lowerStack
+  llvm::Value *m_stack = nullptr;                        // Stack array to hold stack value
+  llvm::Type *m_stackTy = nullptr;                       // Stack type
+  PipelineState *m_pipelineState = nullptr;              // Pipeline state
   llvm::SmallVector<llvm::Instruction *> m_callsToLower; // Call instruction to lower
   llvm::SmallSet<llvm::Function *, 4> m_funcsToLower;    // Functions to lower
+  Builder *m_builder = nullptr;
 };
-} // namespace Llpc
+} // namespace lgc
@@ -127,7 +127,7 @@ static const char SampleShadingMetaName[] = "lgc.sample.shading";
 // The front-end should zero-initialize a struct with "= {}" in case future changes add new fields.
 // Note: new fields must be added to the end of this structure to maintain test compatibility.
 union Options {
-  unsigned u32All[36];
+  unsigned u32All[40];
   struct {
     uint64_t hash[2];                 // Pipeline hash to set in ELF PAL metadata
     unsigned includeDisassembly;      // If set, the disassembly for all compiled shaders will be included
@@ -183,6 +183,10 @@ union Options {
     bool enableFragColor;                    // If enabled, do frag color broadcast
     bool useSoftwareVertexBufferDescriptors; // Use software vertex buffer descriptors to structure SRD.
     unsigned cpsFlags;                       // CPS feature flags
+    unsigned rtBoxSortHeuristicMode;         // Ray tracing box sort heuristic mode
+    unsigned rtStaticPipelineFlags;          // Ray tracing static pipeline flags
+    unsigned rtTriCompressMode;              // Ray tracing triangle compression mode
+    bool useGpurt;                           // Whether GPURT is used
   };
 };
 static_assert(sizeof(Options) == sizeof(Options::u32All));

@@ -25,43 +25,42 @@
 /**
  ***********************************************************************************************************************
  * @file  LowerGpuRt.cpp
- * @brief LLPC source file: contains implementation of class Llpc::LowerGpuRt.
+ * @brief LGC source file: contains implementation of class lgc::LowerGpuRt.
  ***********************************************************************************************************************
  */
-#include "LowerGpuRt.h"
-#include "llpcContext.h"
-#include "llpcRayTracingContext.h"
+#include "lgc/patch/LowerGpuRt.h"
 #include "lgc/Builder.h"
 #include "lgc/GpurtDialect.h"
+#include "lgc/LgcContext.h"
+#include "lgc/builder/BuilderImpl.h"
+#include "lgc/state/TargetInfo.h"
 #include "llvm-dialects/Dialect/Visitor.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 
-#define DEBUG_TYPE "llpc-gpurt"
+#define DEBUG_TYPE "lgc-lower-gpurt"
 using namespace lgc;
 using namespace llvm;
-using namespace Llpc;
 
 namespace RtName {
 static const char *LdsStack = "LdsStack";
 } // namespace RtName
 
-namespace Llpc {
-// =====================================================================================================================
-LowerGpuRt::LowerGpuRt() : m_stack(nullptr), m_stackTy(nullptr), m_lowerStack(false) {
-}
+namespace lgc {
 // =====================================================================================================================
 // Executes this SPIR-V lowering pass on the specified LLVM module.
 //
 // @param [in/out] module : LLVM module to be run on
 // @param [in/out] analysisManager : Analysis manager to use for this transformation
 PreservedAnalyses LowerGpuRt::run(Module &module, ModuleAnalysisManager &analysisManager) {
   LLVM_DEBUG(dbgs() << "Run the pass Lower-gpurt\n");
-  SpirvLower::init(&module);
-  auto gfxip = m_context->getPipelineContext()->getGfxIpVersion();
-  // NOTE: rayquery of sect and ahit can reuse lds.
-  m_lowerStack = (m_entryPoint->getName().startswith("_ahit") || m_entryPoint->getName().startswith("_sect")) &&
-                 (gfxip.major < 11);
-  createGlobalStack();
+
+  PipelineState *pipelineState = analysisManager.getResult<PipelineStateWrapper>(module).getPipelineState();
+  m_pipelineState = pipelineState;
+
+  Builder builderImpl(pipelineState->getContext());
+  m_builder = &builderImpl;
+
+  createGlobalStack(module);
 
   static auto visitor = llvm_dialects::VisitorBuilder<LowerGpuRt>()
                             .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration)
@@ -78,7 +77,7 @@ PreservedAnalyses LowerGpuRt::run(Module &module, ModuleAnalysisManager &analysi
                             .add(&LowerGpuRt::visitGetFlattenedGroupThreadId)
                             .build();
 
-  visitor.visit(*this, *m_module);
+  visitor.visit(*this, module);
 
   for (Instruction *call : m_callsToLower) {
     call->dropAllReferences();
@@ -99,14 +98,15 @@ PreservedAnalyses LowerGpuRt::run(Module &module, ModuleAnalysisManager &analysi
 // Get pipeline workgroup size for stack size calculation
 unsigned LowerGpuRt::getWorkgroupSize() const {
   unsigned workgroupSize = 0;
-  if (m_context->getPipelineType() == PipelineType::Graphics) {
-    workgroupSize = m_context->getPipelineContext()->getRayTracingWaveSize();
+  if (m_pipelineState->isGraphics()) {
+    // Force 64 for graphics stages
+    workgroupSize = 64;
   } else {
-    ComputeShaderMode mode = lgc::Pipeline::getComputeShaderMode(*m_module);
+    ComputeShaderMode mode = m_pipelineState->getShaderModes()->getComputeShaderMode();
     workgroupSize = mode.workgroupSizeX * mode.workgroupSizeY * mode.workgroupSizeZ;
   }
   assert(workgroupSize != 0);
-  if (m_context->getPipelineContext()->getGfxIpVersion().major >= 11) {
+  if (m_pipelineState->getTargetInfo().getGfxIpVersion().major >= 11) {
     // Round up to multiple of 32, as the ds_bvh_stack swizzle as 32 threads
     workgroupSize = alignTo(workgroupSize, 32);
   }
@@ -117,26 +117,47 @@ unsigned LowerGpuRt::getWorkgroupSize() const {
 // Get flat thread id in work group/wave
 Value *LowerGpuRt::getThreadIdInGroup() const {
   // Todo: for graphics shader, subgroupId * waveSize + subgroupLocalInvocationId()
-  unsigned builtIn = m_context->getPipelineType() == PipelineType::Graphics ? lgc::BuiltInSubgroupLocalInvocationId
-                                                                            : lgc::BuiltInLocalInvocationIndex;
-  lgc::InOutInfo inputInfo = {};
-  return m_builder->CreateReadBuiltInInput(static_cast<lgc::BuiltInKind>(builtIn), inputInfo, nullptr, nullptr, "");
+  unsigned builtIn = m_pipelineState->isGraphics() ? BuiltInSubgroupLocalInvocationId : BuiltInLocalInvocationIndex;
+  InOutInfo inputInfo = {};
+  return m_builder->CreateReadBuiltInInput(static_cast<BuiltInKind>(builtIn), inputInfo, nullptr, nullptr, "");
 }
 
 // =====================================================================================================================
 // Create global variable for the stack
-void LowerGpuRt::createGlobalStack() {
-  auto ldsStackSize = getWorkgroupSize() * MaxLdsStackEntries;
-  // Double anyhit and intersection shader lds size, these shader use lower part of stack to read/write value
-  if (m_lowerStack)
-    ldsStackSize = ldsStackSize << 1;
-
-  m_stackTy = ArrayType::get(m_builder->getInt32Ty(), ldsStackSize);
-  auto ldsStack = new GlobalVariable(*m_module, m_stackTy, false, GlobalValue::ExternalLinkage, nullptr,
-                                     RtName::LdsStack, nullptr, GlobalValue::NotThreadLocal, 3);
-
-  ldsStack->setAlignment(MaybeAlign(4));
-  m_stack = ldsStack;
+// @param [in/out] module : LLVM module that is scanned for stack ops and receives the stack global variable
+void LowerGpuRt::createGlobalStack(Module &module) {
+  struct Payload {
+    bool needGlobalStack; // Set when a stack write, stack read or LDS stack init op is visited
+    bool needExtraStack;  // Set when any visited stack write/read op returns true from getUseExtraStack()
+  };
+  Payload payload = {false, false};
+  static auto visitor = llvm_dialects::VisitorBuilder<Payload>()
+                            .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration)
+                            .add<GpurtStackWriteOp>([](auto &payload, auto &op) {
+                              payload.needGlobalStack = true;
+                              payload.needExtraStack |= op.getUseExtraStack();
+                            })
+                            .add<GpurtStackReadOp>([](auto &payload, auto &op) {
+                              payload.needGlobalStack = true;
+                              payload.needExtraStack |= op.getUseExtraStack();
+                            })
+                            .add<GpurtLdsStackInitOp>([](auto &payload, auto &op) { payload.needGlobalStack = true; })
+                            .build();
+  visitor.visit(payload, module);
+
+  if (payload.needGlobalStack) {
+    auto ldsStackSize = getWorkgroupSize() * MaxLdsStackEntries; // Number of i32 elements in the stack array
+    // Double the LDS stack size when any operation needs to use the extra stack.
+    if (payload.needExtraStack)
+      ldsStackSize = ldsStackSize << 1;
+
+    m_stackTy = ArrayType::get(m_builder->getInt32Ty(), ldsStackSize);
+    auto ldsStack = new GlobalVariable(module, m_stackTy, false, GlobalValue::ExternalLinkage, nullptr,
+                                       RtName::LdsStack, nullptr, GlobalValue::NotThreadLocal, 3);
+
+    ldsStack->setAlignment(MaybeAlign(4));
+    m_stack = ldsStack;
+  }
 }
 
 // =====================================================================================================================
@@ -184,7 +205,7 @@ void LowerGpuRt::visitStackRead(GpurtStackReadOp &inst) {
   m_builder->SetInsertPoint(&inst);
   Value *stackIndex = inst.getIndex();
   Type *stackTy = PointerType::get(m_builder->getInt32Ty(), 3);
-  if (m_lowerStack) {
+  if (inst.getUseExtraStack()) {
     auto ldsStackSize = m_builder->getInt32(getWorkgroupSize() * MaxLdsStackEntries);
     stackIndex = m_builder->CreateAdd(stackIndex, ldsStackSize);
   }
@@ -206,7 +227,7 @@ void LowerGpuRt::visitStackWrite(GpurtStackWriteOp &inst) {
   Value *stackIndex = inst.getIndex();
   Value *stackData = inst.getValue();
   Type *stackTy = PointerType::get(m_builder->getInt32Ty(), 3);
-  if (m_lowerStack) {
+  if (inst.getUseExtraStack()) {
     auto ldsStackSize = m_builder->getInt32(getWorkgroupSize() * MaxLdsStackEntries);
     stackIndex = m_builder->CreateAdd(stackIndex, ldsStackSize);
   }
@@ -297,8 +318,7 @@ void LowerGpuRt::visitLdsStackStore(GpurtLdsStackStoreOp &inst) {
 // @param inst : The dialect instruction to process
 void LowerGpuRt::visitGetBoxSortHeuristicMode(GpurtGetBoxSortHeuristicModeOp &inst) {
   m_builder->SetInsertPoint(&inst);
-  auto rtState = m_context->getPipelineContext()->getRayTracingState();
-  Value *boxSortHeuristicMode = m_builder->getInt32(rtState->boxSortHeuristicMode);
+  Value *boxSortHeuristicMode = m_builder->getInt32(m_pipelineState->getOptions().rtBoxSortHeuristicMode);
   inst.replaceAllUsesWith(boxSortHeuristicMode);
   m_callsToLower.push_back(&inst);
   m_funcsToLower.insert(inst.getCalledFunction());
@@ -310,8 +330,7 @@ void LowerGpuRt::visitGetBoxSortHeuristicMode(GpurtGetBoxSortHeuristicModeOp &in
 // @param inst : The dialect instruction to process
 void LowerGpuRt::visitGetStaticFlags(GpurtGetStaticFlagsOp &inst) {
   m_builder->SetInsertPoint(&inst);
-  auto rtState = m_context->getPipelineContext()->getRayTracingState();
-  Value *staticPipelineFlags = m_builder->getInt32(rtState->staticPipelineFlags);
+  Value *staticPipelineFlags = m_builder->getInt32(m_pipelineState->getOptions().rtStaticPipelineFlags);
   inst.replaceAllUsesWith(staticPipelineFlags);
   m_callsToLower.push_back(&inst);
   m_funcsToLower.insert(inst.getCalledFunction());
@@ -323,8 +342,7 @@ void LowerGpuRt::visitGetStaticFlags(GpurtGetStaticFlagsOp &inst) {
 // @param inst : The dialect instruction to process
 void LowerGpuRt::visitGetTriangleCompressionMode(GpurtGetTriangleCompressionModeOp &inst) {
   m_builder->SetInsertPoint(&inst);
-  auto rtState = m_context->getPipelineContext()->getRayTracingState();
-  Value *triCompressMode = m_builder->getInt32(rtState->triCompressMode);
+  Value *triCompressMode = m_builder->getInt32(m_pipelineState->getOptions().rtTriCompressMode);
   inst.replaceAllUsesWith(triCompressMode);
   m_callsToLower.push_back(&inst);
   m_funcsToLower.insert(inst.getCalledFunction());
@@ -341,4 +359,4 @@ void LowerGpuRt::visitGetFlattenedGroupThreadId(GpurtGetFlattenedGroupThreadIdOp
   m_funcsToLower.insert(inst.getCalledFunction());
 }
 
-} // namespace Llpc
+} // namespace lgc
@@ -81,6 +81,7 @@ LLPC_MODULE_PASS("lgc-lower-debug-printf", LowerDebugPrintf)
 
 LLPC_FUNCTION_PASS("lgc-combine-cooperative-matrix", CombineCooperativeMatrix)
 LLPC_MODULE_PASS("lgc-lower-cooperative-matrix", LowerCooperativeMatrix)
+LLPC_MODULE_PASS("lgc-lower-gpurt", LowerGpuRt)
 
 #undef LLPC_PASS
 #undef LLPC_LOOP_PASS

@@ -40,6 +40,7 @@
 #include "lgc/patch/FragColorExport.h"
 #include "lgc/patch/LowerCooperativeMatrix.h"
 #include "lgc/patch/LowerDebugPrintf.h"
+#include "lgc/patch/LowerGpuRt.h"
 #include "lgc/patch/PatchBufferOp.h"
 #include "lgc/patch/PatchCheckShaderCache.h"
 #include "lgc/patch/PatchCopyShader.h"
@@ -132,6 +133,14 @@ void Patch::addPasses(PipelineState *pipelineState, lgc::PassManager &passMgr, T
   if (patchTimer)
     LgcContext::createAndAddStartStopTimer(passMgr, patchTimer, true);
 
+  if (pipelineState->getOptions().useGpurt) {
+    // NOTE: Lower GPURT operations and run InstCombinePass before the builder replayer, because some ops are turned
+    // into constant values; this lets us eliminate unused `@lgc.create.load.buffer.desc` calls before getting into the
+    // replayer. Otherwise, unnecessary `writes_uavs` and `uses_uav` may be set.
+    passMgr.addPass(LowerGpuRt());
+    passMgr.addPass(createModuleToFunctionPassAdaptor(InstCombinePass()));
+  }
+
   // We're using BuilderRecorder; replay the Builder calls now
   passMgr.addPass(BuilderReplayer());
 

@@ -222,7 +222,6 @@ if(ICD_BUILD_LLPC)
         lower/llpcSpirvLowerTranslator.cpp
         lower/llpcSpirvLowerUtil.cpp
         lower/llpcSpirvProcessGpuRtLibrary.cpp
-        lower/LowerGpuRt.cpp
         lower/llpcSpirvLowerInternalLibraryIntrinsicUtil.cpp
         lower/LowerGLCompatibility.cpp
         lower/llpcSpirvLowerCooperativeMatrix.cpp

@@ -1663,6 +1663,7 @@ Result Compiler::buildPipelineInternal(Context *context, ArrayRef<const Pipeline
 
     if (numStagesWithRayQuery) {
       std::unique_ptr<Module> gpurtShaderLibrary = createGpurtShaderLibrary(context);
+      setUseGpurt(&*pipeline);
       if (!gpurtShaderLibrary)
         return Result::ErrorInvalidShader;
 
@@ -2677,6 +2678,16 @@ Result Compiler::generatePipeline(Context *context, unsigned moduleIndex, std::u
   return Result::Success;
 }
 
+// =====================================================================================================================
+// Mark the pipeline as using the GPURT library by setting the `useGpurt` pipeline option.
+//
+// @param pipeline : The pipeline object whose options are updated
+void Compiler::setUseGpurt(lgc::Pipeline *pipeline) {
+  auto options = pipeline->getOptions();
+  options.useGpurt = true;
+  pipeline->setOptions(options);
+}
+
 // =====================================================================================================================
 // Build single ray tracing pipeline ELF package.
 //
@@ -2888,6 +2899,7 @@ Result Compiler::buildRayTracingPipelineInternal(RayTracingContext &rtContext,
   std::unique_ptr<Module> gpurtShaderLibrary;
   if (needGpurtShaderLibrary) {
     gpurtShaderLibrary = createGpurtShaderLibrary(mainContext);
+    setUseGpurt(&*pipeline);
     if (!gpurtShaderLibrary)
       return Result::ErrorInvalidShader;
   }

@@ -199,6 +199,8 @@ class Compiler : public ICompiler {
   Result generatePipeline(Context *context, unsigned moduleIndex, std::unique_ptr<llvm::Module> module,
                           ElfPackage &pipelineElf, lgc::Pipeline *pipeline, TimerProfiler &timerProfiler);
 
+  void setUseGpurt(lgc::Pipeline *pipeline);
+
   std::vector<std::string> m_options;           // Compilation options
   MetroHash::Hash m_optionHash;                 // Hash code of compilation options
   GfxIpVersion m_gfxIp;                         // Graphics IP version info

@@ -342,6 +342,10 @@ Options PipelineContext::computePipelineOptions() const {
   options.enablePrimGeneratedQuery = getPipelineOptions()->enablePrimGeneratedQuery;
   options.enableFragColor = getPipelineOptions()->enableFragColor;
 
+  options.rtBoxSortHeuristicMode = m_rtState.boxSortHeuristicMode;
+  options.rtStaticPipelineFlags = m_rtState.staticPipelineFlags;
+  options.rtTriCompressMode = m_rtState.triCompressMode;
+
   return options;
 }