lgc: kill unused outputs based on channelWriteMask

This is an optimization of output calculations based on the channelWriteMask of the pipeline state, from two aspects: - Remove export instructions for color targets that have a write mask of 0 - Replace components that aren't used according to the write mask with 'poison' (cherry picked from commit fa12be36e0bca4a1ac9c0f4a1be7c6e3e0d63d33)
GPUOpen-Drivers · Oct 24, 2023 · 03c7675 · 03c7675
1 parent b10cb3d
commit 03c7675
Show file tree

Hide file tree

Showing 10 changed files with 114 additions and 118 deletions.
diff --git a/lgc/include/lgc/patch/FragColorExport.h b/lgc/include/lgc/patch/FragColorExport.h
@@ -77,7 +77,7 @@ class FragColorExport {
   FragColorExport &operator=(const FragColorExport &) = delete;
 
   llvm::Value *handleColorExportInstructions(llvm::Value *output, unsigned int hwColorExport, BuilderBase &builder,
-                                             ExportFormat expFmt, const bool signedness);
+                                             ExportFormat expFmt, const bool signedness, unsigned channelWriteMask);
 
   llvm::Value *convertToHalf(llvm::Value *value, bool signedness, BuilderBase &builder) const;
   llvm::Value *convertToFloat(llvm::Value *value, bool signedness, BuilderBase &builder) const;

diff --git a/lgc/interface/lgc/Pipeline.h b/lgc/interface/lgc/Pipeline.h
@@ -454,6 +454,7 @@ struct ColorExportFormat {
   unsigned blendEnable;          // Blend will be enabled for this target at draw time
   unsigned blendSrcAlphaToColor; // Whether source alpha is blended to color channels for this target
                                  //  at draw time
+  unsigned channelWriteMask;     // Write mask to specify destination channels
 };
 
 // Struct to pass to SetColorExportState

diff --git a/lgc/patch/FragColorExport.cpp b/lgc/patch/FragColorExport.cpp
@@ -70,7 +70,7 @@ LowerFragColorExport::LowerFragColorExport() : m_exportValues(MaxColorTargets +
 // @param input : The value we want to extract elements from
 // @param builder : The IR builder for inserting instructions
 // @param [out] results : The returned elements
-static void extractElements(Value *input, BuilderBase &builder, SmallVectorImpl<Value *> &results) {
+static void extractElements(Value *input, BuilderBase &builder, std::array<Value *, 4> &results) {
   Type *valueTy = input->getType();
   unsigned compCount = valueTy->isVectorTy() ? cast<FixedVectorType>(valueTy)->getNumElements() : 1;
   assert(compCount <= 4 && "At-most four elements allowed\n");
@@ -94,8 +94,10 @@ static void extractElements(Value *input, BuilderBase &builder, SmallVectorImpl<
 // @param builder : The IR builder for inserting instructions
 // @param expFmt: The format for the given render target
 // @param signedness: If output should be interpreted as a signed integer
+// @param channelWriteMask: Write mask to specify destination channels
 Value *FragColorExport::handleColorExportInstructions(Value *output, unsigned hwColorExport, BuilderBase &builder,
-                                                      ExportFormat expFmt, const bool signedness) {
+                                                      ExportFormat expFmt, const bool signedness,
+                                                      unsigned channelWriteMask) {
   assert(expFmt != EXP_FORMAT_ZERO);
 
   Type *outputTy = output->getType();
@@ -116,10 +118,18 @@ Value *FragColorExport::handleColorExportInstructions(Value *output, unsigned hw
       floatTy, //  EXP_FORMAT_32_ABGR = 9,
   };
 
-  SmallVector<Value *, 4> comps(4);
+  const auto undefFloat = PoisonValue::get(builder.getFloatTy());
+  const auto undefFloat16x2 = PoisonValue::get(FixedVectorType::get(builder.getHalfTy(), 2));
+
+  std::array<Value *, 4> comps;
+  std::array<Value *, 4> exports{undefFloat, undefFloat, undefFloat, undefFloat};
+  unsigned exportMask = 0;
 
   Type *exportTy = exportTypeMapping[expFmt];
 
+  const bool dualSourceBlendedEnable = m_pipelineState->getColorExportState().dualSourceBlendEnable ||
+                                       m_pipelineState->getColorExportState().dynamicDualSourceBlendEnable;
+
   // For 32bit output, we always to scalarize, but for 16bit output we may just operate on vector.
   if (exportTy->isFloatTy()) {
     if (compCount == 1) {
@@ -130,56 +140,54 @@ Value *FragColorExport::handleColorExportInstructions(Value *output, unsigned hw
     }
   }
 
-  const auto undefFloat = PoisonValue::get(builder.getFloatTy());
-  const auto undefFloat16x2 = PoisonValue::get(FixedVectorType::get(builder.getHalfTy(), 2));
-
   switch (expFmt) {
-  case EXP_FORMAT_32_R: {
-    compCount = 1;
-    comps[0] = convertToFloat(comps[0], signedness, builder);
-    break;
-  }
-  case EXP_FORMAT_32_GR: {
-    if (compCount >= 2) {
+  case EXP_FORMAT_32_R:
+  case EXP_FORMAT_32_GR:
+  case EXP_FORMAT_32_ABGR: {
+    if (expFmt == EXP_FORMAT_32_GR && compCount >= 2)
       compCount = 2;
-      comps[0] = convertToFloat(comps[0], signedness, builder);
-      comps[1] = convertToFloat(comps[1], signedness, builder);
-    } else {
+    else if (expFmt != EXP_FORMAT_32_ABGR)
       compCount = 1;
-      comps[0] = convertToFloat(comps[0], signedness, builder);
+
+    for (unsigned idx = 0; idx < compCount; ++idx) {
+      unsigned compMask = 1 << idx;
+      if (compMask & channelWriteMask) {
+        exports[idx] = convertToFloat(comps[idx], signedness, builder);
+        exportMask |= compMask;
+      }
     }
     break;
   }
   case EXP_FORMAT_32_AR: {
-    if (compCount == 4) {
+    if (compCount == 4)
       compCount = 2;
-      comps[0] = convertToFloat(comps[0], signedness, builder);
-      comps[1] = convertToFloat(comps[3], signedness, builder);
-    } else {
+    else
       compCount = 1;
-      comps[0] = convertToFloat(comps[0], signedness, builder);
+    for (unsigned idx = 0; idx < compCount; ++idx) {
+      unsigned j = (idx == 1) ? 3 : idx; // source channel: export slot 1 carries alpha (channel 3)
+      unsigned compMask = 1 << j;        // write-mask bit is tested on the source channel
+      if (compMask & channelWriteMask) {
+        exports[idx] = convertToFloat(comps[j], signedness, builder);
+        exportMask |= 1u << idx; // enable the export slot that was filled, not the source-channel bit
+      }
     }
     break;
   }
-  case EXP_FORMAT_32_ABGR: {
-    for (unsigned i = 0; i < compCount; ++i)
-      comps[i] = convertToFloat(comps[i], signedness, builder);
-
-    for (unsigned i = compCount; i < 4; ++i)
-      comps[i] = undefFloat;
-    break;
-  }
   case EXP_FORMAT_FP16_ABGR: {
+    const unsigned compactCompCount = (compCount + 1) / 2;
     // convert to half type
     if (bitWidth <= 16) {
       output = convertToHalf(output, signedness, builder);
       extractElements(output, builder, comps);
       // re-pack
-      comps[0] = builder.CreateInsertElement(undefFloat16x2, comps[0], builder.getInt32(0));
-      comps[0] = builder.CreateInsertElement(comps[0], comps[1], builder.getInt32(1));
-      if (compCount > 2) {
-        comps[1] = builder.CreateInsertElement(undefFloat16x2, comps[2], builder.getInt32(0));
-        comps[1] = builder.CreateInsertElement(comps[1], comps[3], builder.getInt32(1));
+      for (unsigned idx = 0; idx < compactCompCount; ++idx) {
+        unsigned origIdx = 2 * idx;
+        unsigned compMask = (1 << origIdx) | (1 << (origIdx + 1));
+        if (compMask & channelWriteMask) {
+          exports[idx] = builder.CreateInsertElement(undefFloat16x2, comps[origIdx], builder.getInt32(0));
+          exports[idx] = builder.CreateInsertElement(exports[idx], comps[origIdx + 1], builder.getInt32(1));
+          exportMask |= compMask;
+        }
       }
     } else {
       if (outputTy->isIntOrIntVectorTy())
@@ -188,45 +196,61 @@ Value *FragColorExport::handleColorExportInstructions(Value *output, unsigned hw
                                                                  : builder.getFloatTy());
       extractElements(output, builder, comps);
 
+      exports[0] = exports[1] = undefFloat16x2;
       Attribute::AttrKind attribs[] = {Attribute::ReadNone};
-      comps[0] = builder.CreateNamedCall("llvm.amdgcn.cvt.pkrtz", FixedVectorType::get(builder.getHalfTy(), 2),
-                                         {comps[0], comps[1]}, attribs);
-      if (compCount > 2)
-        comps[1] = builder.CreateNamedCall("llvm.amdgcn.cvt.pkrtz", FixedVectorType::get(builder.getHalfTy(), 2),
-                                           {comps[2], comps[3]}, attribs);
+      for (unsigned idx = 0; idx < compactCompCount; ++idx) {
+        unsigned origIdx = 2 * idx;
+        unsigned compMask = (1 << origIdx) | (1 << (origIdx + 1));
+        if (compMask & channelWriteMask) {
+          exports[idx] = builder.CreateNamedCall("llvm.amdgcn.cvt.pkrtz", FixedVectorType::get(builder.getHalfTy(), 2),
+                                                 {comps[origIdx], comps[origIdx + 1]}, attribs);
+          exportMask |= compMask;
+        }
+      }
     }
-    break;
-  }
-  case EXP_FORMAT_UNORM16_ABGR:
-  case EXP_FORMAT_SNORM16_ABGR: {
-    output = convertToFloat(output, signedness, builder);
-    extractElements(output, builder, comps);
-
-    StringRef funcName =
-        expFmt == EXP_FORMAT_SNORM16_ABGR ? "llvm.amdgcn.cvt.pknorm.i16" : "llvm.amdgcn.cvt.pknorm.u16";
-
-    for (unsigned idx = 0; idx < (compCount + 1) / 2; idx++) {
-      Value *packedComps = builder.CreateNamedCall(funcName, FixedVectorType::get(builder.getInt16Ty(), 2),
-                                                   {comps[2 * idx], comps[2 * idx + 1]}, {});
-
-      comps[idx] = builder.CreateBitCast(packedComps, FixedVectorType::get(builder.getHalfTy(), 2));
+    // GFX11 removes compressed export, simply use 32bit-data export.
+    if (m_pipelineState->getTargetInfo().getGfxIpVersion().major >= 11 && !dualSourceBlendedEnable) {
+      exportMask = (1 << compactCompCount) - 1;
+      for (unsigned idx = 0; idx < compactCompCount; ++idx)
+        exports[idx] = builder.CreateBitCast(exports[idx], builder.getFloatTy());
     }
-
     break;
   }
+  case EXP_FORMAT_UNORM16_ABGR:
+  case EXP_FORMAT_SNORM16_ABGR:
   case EXP_FORMAT_UINT16_ABGR:
   case EXP_FORMAT_SINT16_ABGR: {
     assert(compCount <= 4);
-    output = convertToInt(output, signedness, builder);
-    extractElements(output, builder, comps);
 
-    StringRef funcName = expFmt == EXP_FORMAT_SINT16_ABGR ? "llvm.amdgcn.cvt.pk.i16" : "llvm.amdgcn.cvt.pk.u16";
+    StringRef funcName; // packing intrinsic, selected from expFmt below
+    if (expFmt == EXP_FORMAT_SNORM16_ABGR || expFmt == EXP_FORMAT_UNORM16_ABGR) {
+      output = convertToFloat(output, signedness, builder);
+      funcName = expFmt == EXP_FORMAT_SNORM16_ABGR ? "llvm.amdgcn.cvt.pknorm.i16" : "llvm.amdgcn.cvt.pknorm.u16";
+    } else {
+      output = convertToInt(output, signedness, builder);
+      funcName = expFmt == EXP_FORMAT_SINT16_ABGR ? "llvm.amdgcn.cvt.pk.i16" : "llvm.amdgcn.cvt.pk.u16";
+    }
+    extractElements(output, builder, comps);
 
-    for (unsigned idx = 0; idx < (compCount + 1) / 2; idx++) {
-      Value *packedComps = builder.CreateNamedCall(funcName, FixedVectorType::get(builder.getInt16Ty(), 2),
-                                                   {comps[2 * idx], comps[2 * idx + 1]}, {});
+    const unsigned compactCompCount = (compCount + 1) / 2;
+    exports[0] = exports[1] = undefFloat16x2;
+    for (unsigned idx = 0; idx < compactCompCount; idx++) {
+      unsigned origIdx = 2 * idx;
+      unsigned compMask = (1 << origIdx) | (1 << (origIdx + 1));
+      if (compMask & channelWriteMask) {
+        Value *packedComps = builder.CreateNamedCall(funcName, FixedVectorType::get(builder.getInt16Ty(), 2),
+                                                     {comps[2 * idx], comps[2 * idx + 1]}, {});
+
+        exports[idx] = builder.CreateBitCast(packedComps, FixedVectorType::get(builder.getHalfTy(), 2));
+        exportMask |= compMask;
+      }
+    }
 
-      comps[idx] = builder.CreateBitCast(packedComps, FixedVectorType::get(builder.getHalfTy(), 2));
+    // GFX11 removes compressed export, simply use 32bit-data export.
+    if (m_pipelineState->getTargetInfo().getGfxIpVersion().major >= 11 && !dualSourceBlendedEnable) {
+      exportMask = (1 << compactCompCount) - 1;
+      for (unsigned idx = 0; idx < compactCompCount; ++idx)
+        exports[idx] = builder.CreateBitCast(exports[idx], builder.getFloatTy());
     }
 
     break;
@@ -237,67 +261,36 @@ Value *FragColorExport::handleColorExportInstructions(Value *output, unsigned hw
   }
   }
 
-  if (m_pipelineState->getTargetInfo().getGfxIpVersion().major >= 11 &&
-      (m_pipelineState->getColorExportState().dualSourceBlendEnable ||
-       m_pipelineState->getColorExportState().dynamicDualSourceBlendEnable)) {
+  if (m_pipelineState->getTargetInfo().getGfxIpVersion().major >= 11 && dualSourceBlendedEnable) {
     // Save them for later dual-source-swizzle
     m_blendSourceChannels = exportTy->isHalfTy() ? (compCount + 1) / 2 : compCount;
     assert(hwColorExport <= 1);
-    m_blendSources[hwColorExport].append(comps.begin(), comps.end());
+    m_blendSources[hwColorExport].append(exports.begin(), exports.end());
     return nullptr;
   }
 
   Value *exportCall = nullptr;
 
-  if (exportTy->isHalfTy()) {
-    // GFX11 removes compressed export, simply use 32bit-data export.
-    if (m_pipelineState->getTargetInfo().getGfxIpVersion().major >= 11) {
-      // Translate compCount into the number of 32bit data.
-      compCount = (compCount + 1) / 2;
-      for (unsigned i = 0; i < compCount; i++)
-        comps[i] = builder.CreateBitCast(comps[i], builder.getFloatTy());
-      for (unsigned i = compCount; i < 4; i++)
-        comps[i] = undefFloat;
-
-      Value *args[] = {
-          builder.getInt32(EXP_TARGET_MRT_0 + hwColorExport), // tgt
-          builder.getInt32((1 << compCount) - 1),             // en
-          comps[0],                                           // src0
-          comps[1],                                           // src1
-          comps[2],                                           // src2
-          comps[3],                                           // src3
-          builder.getFalse(),                                 // done
-          builder.getTrue()                                   // vm
-      };
-
-      return builder.CreateNamedCall("llvm.amdgcn.exp.f32", Type::getVoidTy(*m_context), args, {});
-    }
-
+  if (exportTy->isHalfTy() && m_pipelineState->getTargetInfo().getGfxIpVersion().major < 11) {
     // 16-bit export (compressed)
-    if (compCount <= 2)
-      comps[1] = undefFloat16x2;
     Value *args[] = {
         builder.getInt32(EXP_TARGET_MRT_0 + hwColorExport), // tgt
-        builder.getInt32(compCount > 2 ? 0xF : 0x3),        // en
-        comps[0],                                           // src0
-        comps[1],                                           // src1
+        builder.getInt32(exportMask),                       // en
+        exports[0],                                         // src0
+        exports[1],                                         // src1
         builder.getFalse(),                                 // done
         builder.getTrue()                                   // vm
     };
 
     exportCall = builder.CreateNamedCall("llvm.amdgcn.exp.compr.v2f16", Type::getVoidTy(*m_context), args, {});
   } else {
-    // 32-bit export
-    for (unsigned i = compCount; i < 4; i++)
-      comps[i] = undefFloat;
-
     Value *args[] = {
         builder.getInt32(EXP_TARGET_MRT_0 + hwColorExport), // tgt
-        builder.getInt32((1 << compCount) - 1),             // en
-        comps[0],                                           // src0
-        comps[1],                                           // src1
-        comps[2],                                           // src2
-        comps[3],                                           // src3
+        builder.getInt32(exportMask),                       // en
+        exports[0],                                         // src0
+        exports[1],                                         // src1
+        exports[2],                                         // src2
+        exports[3],                                         // src3
         builder.getFalse(),                                 // done
         builder.getTrue()                                   // vm
     };
@@ -942,9 +935,10 @@ void FragColorExport::generateExportInstructions(ArrayRef<ColorExportInfo> info,
     assert(infoIt->hwColorTarget < MaxColorTargets);
 
     auto expFmt = static_cast<ExportFormat>(m_pipelineState->computeExportFormat(infoIt->ty, location));
-    if (expFmt != EXP_FORMAT_ZERO) {
+    const unsigned channelWriteMask = m_pipelineState->getColorExportFormat(location).channelWriteMask;
+    if (expFmt != EXP_FORMAT_ZERO && channelWriteMask != 0) {
       lastExport = handleColorExportInstructions(values[infoIt->hwColorTarget], hwColorExport, builder, expFmt,
-                                                 infoIt->isSigned);
+                                                 infoIt->isSigned, channelWriteMask);
       finalExportFormats.push_back(expFmt);
       ++hwColorExport;
     }

diff --git a/lgc/state/PalMetadata.cpp b/lgc/state/PalMetadata.cpp
@@ -1065,9 +1065,9 @@ void PalMetadata::updateCbShaderMask(llvm::ArrayRef<ColorExportInfo> exps) {
   for (auto &exp : exps) {
     if (exp.hwColorTarget == MaxColorTargets)
       continue;
-
-    if (m_pipelineState->computeExportFormat(exp.ty, exp.location) != 0) {
-      cbShaderMask |= (0xF << (4 * exp.location));
+    const unsigned channelWriteMask = m_pipelineState->getColorExportFormat(exp.location).channelWriteMask;
+    if (m_pipelineState->computeExportFormat(exp.ty, exp.location) != 0 && channelWriteMask != 0) {
+      cbShaderMask |= (channelWriteMask << (4 * exp.location));
     }
   }
 

diff --git a/lgc/state/PipelineState.cpp b/lgc/state/PipelineState.cpp
@@ -1212,7 +1212,7 @@ void PipelineState::recordColorExportState(Module *module) {
 
     // The color export formats named metadata node's operands are:
     // - N metadata nodes for N color targets, each one containing
-    // { dfmt, nfmt, blendEnable, blendSrcAlphaToColor }
+    // { dfmt, nfmt, blendEnable, blendSrcAlphaToColor, channelWriteMask }
     for (const ColorExportFormat &target : m_colorExportFormats)
       exportFormatsMetaNode->addOperand(getArrayOfInt32MetaNode(getContext(), target, /*atLeastOneValue=*/true));
   }

diff --git a/lgc/test/ElfRelocationSize.lgc b/lgc/test/ElfRelocationSize.lgc
@@ -35,7 +35,7 @@ target triple = "amdgcn--amdpal"
 !1 = !{i32 -1843601953, i32 337452067, i32 -1234379640, i32 1173800166}
 !2 = !{i32 2}
 !4 = !{i32 -225903757, i32 -647980161, i32 1491774676, i32 -114025882}
-!18 = !{i32 10}
+!18 = !{i32 10, i32 0, i32 0, i32 0, i32 15}
 !19 = !{i32 0, i32 0, i32 0, i32 0, i32 13, i32 7, i32 -1}
 !20 = !{i32 1, i32 0, i32 24, i32 0, i32 11, i32 7, i32 -1}
 !21 = !{i32 0, i32 0, i32 0, i32 1}
@@ -131,7 +131,7 @@ attributes #3 = { nounwind readonly willreturn }
 !10 = !{!"DescriptorTableVaPtr", i32 0, i32 0, i32 11, i32 1, i32 1}
 !11 = !{!"DescriptorResource", i32 1, i32 0, i32 0, i32 80, i32 0, i32 1, i32 8}
 !14 = !{!"DescriptorSampler", i32 2, i32 0, i32 -1, i32 4, i32 0, i32 2, i32 4, i32 12288, i32 117436416, i32 1750073344, i32 -2147483648}
-!19 = !{i32 10}
+!19 = !{i32 10, i32 0, i32 0, i32 0, i32 15}
 !20 = !{i32 6}
 
 ; ----------------------------------------------------------------------

diff --git a/lgc/test/PartPipeline.lgc b/lgc/test/PartPipeline.lgc
@@ -44,7 +44,7 @@ target triple = "amdgcn--amdpal"
 !1 = !{i32 -1843601953, i32 337452067, i32 -1234379640, i32 1173800166}
 !2 = !{i32 2}
 !4 = !{i32 -225903757, i32 -647980161, i32 1491774676, i32 -114025882}
-!18 = !{i32 10}
+!18 = !{i32 10, i32 0, i32 0, i32 0, i32 15}
 !19 = !{i32 0, i32 0, i32 0, i32 0, i32 13, i32 7, i32 -1}
 !20 = !{i32 1, i32 0, i32 24, i32 0, i32 11, i32 7, i32 -1}
 !21 = !{i32 0, i32 0, i32 0, i32 1}
@@ -140,7 +140,7 @@ attributes #3 = { nounwind readonly willreturn }
 !10 = !{!"DescriptorTableVaPtr", i32 0, i32 0, i32 11, i32 1, i32 1}
 !11 = !{!"DescriptorResource", i32 1, i32 0, i32 0, i32 80, i32 0, i32 1, i32 8}
 !14 = !{!"DescriptorSampler", i32 2, i32 0, i32 -1, i32 4, i32 0, i32 2, i32 4, i32 12288, i32 117436416, i32 1750073344, i32 -2147483648}
-!19 = !{i32 10}
+!19 = !{i32 10, i32 0, i32 0, i32 0, i32 15}
 !20 = !{i32 6}
 
 ; ----------------------------------------------------------------------

diff --git a/lgc/test/TextureRange.lgc b/lgc/test/TextureRange.lgc
@@ -147,7 +147,7 @@ attributes #3 = { nounwind readnone }
 !10 = !{!"InlineBuffer", i32 14, i32 0, i32 3, i32 1, i32 -1610612736, i32 4, i32 4}
 !11 = !{!"DescriptorBufferCompact", i32 10, i32 0, i32 4, i32 4, i32 -536870912, i32 0, i32 2}
 !12 = !{!"IndirectUserDataVaPtr", i32 0, i32 0, i32 7, i32 1, i32 0}
-!13 = !{i32 16}
+!13 = !{i32 16, i32 0, i32 0, i32 0, i32 15}
 !14 = !{i32 3, i32 3}
 !15 = !{!"\82\B0amdpal.pipelines\91\84\AA.registers\80\B0.spill_threshold\CE\FF\FF\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\9B\97\D5d\9F\E5\B7\11\CF\E9#\B4W\05\EA\C6\A7\AD.llpc_version\A453.5\AEamdpal.version\92\02\03"}
 !16 = !{i32 0}