diff --git a/lgc/include/lgc/patch/FragColorExport.h b/lgc/include/lgc/patch/FragColorExport.h index 863ce8899d..b4632c94a0 100644 --- a/lgc/include/lgc/patch/FragColorExport.h +++ b/lgc/include/lgc/patch/FragColorExport.h @@ -77,7 +77,7 @@ class FragColorExport { FragColorExport &operator=(const FragColorExport &) = delete; llvm::Value *handleColorExportInstructions(llvm::Value *output, unsigned int hwColorExport, BuilderBase &builder, - ExportFormat expFmt, const bool signedness); + ExportFormat expFmt, const bool signedness, unsigned channelWriteMask); llvm::Value *convertToHalf(llvm::Value *value, bool signedness, BuilderBase &builder) const; llvm::Value *convertToFloat(llvm::Value *value, bool signedness, BuilderBase &builder) const; diff --git a/lgc/interface/lgc/Pipeline.h b/lgc/interface/lgc/Pipeline.h index b9fa744329..b0bd4883ad 100644 --- a/lgc/interface/lgc/Pipeline.h +++ b/lgc/interface/lgc/Pipeline.h @@ -454,6 +454,7 @@ struct ColorExportFormat { unsigned blendEnable; // Blend will be enabled for this target at draw time unsigned blendSrcAlphaToColor; // Whether source alpha is blended to color channels for this target // at draw time + unsigned channelWriteMask; // Write mask to specify destination channels }; // Struct to pass to SetColorExportState diff --git a/lgc/patch/FragColorExport.cpp b/lgc/patch/FragColorExport.cpp index 041d1aadfc..ebde70e229 100644 --- a/lgc/patch/FragColorExport.cpp +++ b/lgc/patch/FragColorExport.cpp @@ -70,7 +70,7 @@ LowerFragColorExport::LowerFragColorExport() : m_exportValues(MaxColorTargets + // @param input : The value we want to extract elements from // @param builder : The IR builder for inserting instructions // @param [out] results : The returned elements -static void extractElements(Value *input, BuilderBase &builder, SmallVectorImpl &results) { +static void extractElements(Value *input, BuilderBase &builder, std::array &results) { Type *valueTy = input->getType(); unsigned compCount = valueTy->isVectorTy() ? 
cast(valueTy)->getNumElements() : 1; assert(compCount <= 4 && "At-most four elements allowed\n"); @@ -94,8 +94,10 @@ static void extractElements(Value *input, BuilderBase &builder, SmallVectorImpl< // @param builder : The IR builder for inserting instructions // @param expFmt: The format for the given render target // @param signedness: If output should be interpreted as a signed integer +// @param channelWriteMask: Write mask to specify destination channels Value *FragColorExport::handleColorExportInstructions(Value *output, unsigned hwColorExport, BuilderBase &builder, - ExportFormat expFmt, const bool signedness) { + ExportFormat expFmt, const bool signedness, + unsigned channelWriteMask) { assert(expFmt != EXP_FORMAT_ZERO); Type *outputTy = output->getType(); @@ -116,10 +118,18 @@ Value *FragColorExport::handleColorExportInstructions(Value *output, unsigned hw floatTy, // EXP_FORMAT_32_ABGR = 9, }; - SmallVector comps(4); + const auto undefFloat = PoisonValue::get(builder.getFloatTy()); + const auto undefFloat16x2 = PoisonValue::get(FixedVectorType::get(builder.getHalfTy(), 2)); + + std::array comps; + std::array exports{undefFloat, undefFloat, undefFloat, undefFloat}; + unsigned exportMask = 0; Type *exportTy = exportTypeMapping[expFmt]; + const bool dualSourceBlendedEnable = m_pipelineState->getColorExportState().dualSourceBlendEnable || + m_pipelineState->getColorExportState().dynamicDualSourceBlendEnable; + // For 32bit output, we always to scalarize, but for 16bit output we may just operate on vector. 
if (exportTy->isFloatTy()) { if (compCount == 1) { @@ -130,56 +140,54 @@ Value *FragColorExport::handleColorExportInstructions(Value *output, unsigned hw } } - const auto undefFloat = PoisonValue::get(builder.getFloatTy()); - const auto undefFloat16x2 = PoisonValue::get(FixedVectorType::get(builder.getHalfTy(), 2)); - switch (expFmt) { - case EXP_FORMAT_32_R: { - compCount = 1; - comps[0] = convertToFloat(comps[0], signedness, builder); - break; - } - case EXP_FORMAT_32_GR: { - if (compCount >= 2) { + case EXP_FORMAT_32_R: + case EXP_FORMAT_32_GR: + case EXP_FORMAT_32_ABGR: { + if (expFmt == EXP_FORMAT_32_GR && compCount >= 2) compCount = 2; - comps[0] = convertToFloat(comps[0], signedness, builder); - comps[1] = convertToFloat(comps[1], signedness, builder); - } else { + else if (expFmt != EXP_FORMAT_32_ABGR) compCount = 1; - comps[0] = convertToFloat(comps[0], signedness, builder); + + for (unsigned idx = 0; idx < compCount; ++idx) { + unsigned compMask = 1 << idx; + if (compMask & channelWriteMask) { + exports[idx] = convertToFloat(comps[idx], signedness, builder); + exportMask |= compMask; + } } break; } case EXP_FORMAT_32_AR: { - if (compCount == 4) { + if (compCount == 4) compCount = 2; - comps[0] = convertToFloat(comps[0], signedness, builder); - comps[1] = convertToFloat(comps[3], signedness, builder); - } else { + else compCount = 1; - comps[0] = convertToFloat(comps[0], signedness, builder); + for (unsigned idx = 0; idx < compCount; ++idx) { + unsigned j = (idx == 1) ? 
3 : idx; + unsigned compMask = 1 << j; + if (compMask & channelWriteMask) { + exports[idx] = convertToFloat(comps[j], signedness, builder); + exportMask |= 1 << idx; // enable bit follows the export slot (alpha sits in slot 1), not the source channel + } } break; } - case EXP_FORMAT_32_ABGR: { - for (unsigned i = 0; i < compCount; ++i) - comps[i] = convertToFloat(comps[i], signedness, builder); - - for (unsigned i = compCount; i < 4; ++i) - comps[i] = undefFloat; - break; - } case EXP_FORMAT_FP16_ABGR: { + const unsigned compactCompCount = (compCount + 1) / 2; // convert to half type if (bitWidth <= 16) { output = convertToHalf(output, signedness, builder); extractElements(output, builder, comps); // re-pack - comps[0] = builder.CreateInsertElement(undefFloat16x2, comps[0], builder.getInt32(0)); - comps[0] = builder.CreateInsertElement(comps[0], comps[1], builder.getInt32(1)); - if (compCount > 2) { - comps[1] = builder.CreateInsertElement(undefFloat16x2, comps[2], builder.getInt32(0)); - comps[1] = builder.CreateInsertElement(comps[1], comps[3], builder.getInt32(1)); + for (unsigned idx = 0; idx < compactCompCount; ++idx) { + unsigned origIdx = 2 * idx; + unsigned compMask = (1 << origIdx) | (1 << (origIdx + 1)); + if (compMask & channelWriteMask) { + exports[idx] = builder.CreateInsertElement(undefFloat16x2, comps[origIdx], builder.getInt32(0)); + exports[idx] = builder.CreateInsertElement(exports[idx], comps[origIdx + 1], builder.getInt32(1)); + exportMask |= compMask; + } } } else { if (outputTy->isIntOrIntVectorTy()) @@ -188,45 +196,61 @@ Value *FragColorExport::handleColorExportInstructions(Value *output, unsigned hw : builder.getFloatTy()); extractElements(output, builder, comps); + exports[0] = exports[1] = undefFloat16x2; Attribute::AttrKind attribs[] = {Attribute::ReadNone}; - comps[0] = builder.CreateNamedCall("llvm.amdgcn.cvt.pkrtz", FixedVectorType::get(builder.getHalfTy(), 2), - {comps[0], comps[1]}, attribs); - if (compCount > 2) - comps[1] = builder.CreateNamedCall("llvm.amdgcn.cvt.pkrtz", FixedVectorType::get(builder.getHalfTy(), 
2), - {comps[2], comps[3]}, attribs); + for (unsigned idx = 0; idx < compactCompCount; ++idx) { + unsigned origIdx = 2 * idx; + unsigned compMask = (1 << origIdx) | (1 << (origIdx + 1)); + if (compMask & channelWriteMask) { + exports[idx] = builder.CreateNamedCall("llvm.amdgcn.cvt.pkrtz", FixedVectorType::get(builder.getHalfTy(), 2), + {comps[origIdx], comps[origIdx + 1]}, attribs); + exportMask |= compMask; + } + } } - break; - } - case EXP_FORMAT_UNORM16_ABGR: - case EXP_FORMAT_SNORM16_ABGR: { - output = convertToFloat(output, signedness, builder); - extractElements(output, builder, comps); - - StringRef funcName = - expFmt == EXP_FORMAT_SNORM16_ABGR ? "llvm.amdgcn.cvt.pknorm.i16" : "llvm.amdgcn.cvt.pknorm.u16"; - - for (unsigned idx = 0; idx < (compCount + 1) / 2; idx++) { - Value *packedComps = builder.CreateNamedCall(funcName, FixedVectorType::get(builder.getInt16Ty(), 2), - {comps[2 * idx], comps[2 * idx + 1]}, {}); - - comps[idx] = builder.CreateBitCast(packedComps, FixedVectorType::get(builder.getHalfTy(), 2)); + // GFX11 removes compressed export, simply use 32bit-data export. + if (m_pipelineState->getTargetInfo().getGfxIpVersion().major >= 11 && !dualSourceBlendedEnable) { + exportMask = (1 << compactCompCount) - 1; + for (unsigned idx = 0; idx < compactCompCount; ++idx) + exports[idx] = builder.CreateBitCast(exports[idx], builder.getFloatTy()); } - break; } + case EXP_FORMAT_UNORM16_ABGR: + case EXP_FORMAT_SNORM16_ABGR: case EXP_FORMAT_UINT16_ABGR: case EXP_FORMAT_SINT16_ABGR: { assert(compCount <= 4); - output = convertToInt(output, signedness, builder); - extractElements(output, builder, comps); - StringRef funcName = expFmt == EXP_FORMAT_SINT16_ABGR ? "llvm.amdgcn.cvt.pk.i16" : "llvm.amdgcn.cvt.pk.u16"; + StringRef funcName; + if (expFmt == EXP_FORMAT_SNORM16_ABGR || expFmt == EXP_FORMAT_UNORM16_ABGR) { + output = convertToFloat(output, signedness, builder); + funcName = expFmt == EXP_FORMAT_SNORM16_ABGR ? 
"llvm.amdgcn.cvt.pknorm.i16" : "llvm.amdgcn.cvt.pknorm.u16"; + } else { + output = convertToInt(output, signedness, builder); + funcName = expFmt == EXP_FORMAT_SINT16_ABGR ? "llvm.amdgcn.cvt.pk.i16" : "llvm.amdgcn.cvt.pk.u16"; + } + extractElements(output, builder, comps); - for (unsigned idx = 0; idx < (compCount + 1) / 2; idx++) { - Value *packedComps = builder.CreateNamedCall(funcName, FixedVectorType::get(builder.getInt16Ty(), 2), - {comps[2 * idx], comps[2 * idx + 1]}, {}); + const unsigned compactCompCount = (compCount + 1) / 2; + exports[0] = exports[1] = undefFloat16x2; + for (unsigned idx = 0; idx < compactCompCount; idx++) { + unsigned origIdx = 2 * idx; + unsigned compMask = (1 << origIdx) | (1 << (origIdx + 1)); + if (compMask & channelWriteMask) { + Value *packedComps = builder.CreateNamedCall(funcName, FixedVectorType::get(builder.getInt16Ty(), 2), + {comps[2 * idx], comps[2 * idx + 1]}, {}); + + exports[idx] = builder.CreateBitCast(packedComps, FixedVectorType::get(builder.getHalfTy(), 2)); + exportMask |= compMask; + } + } - comps[idx] = builder.CreateBitCast(packedComps, FixedVectorType::get(builder.getHalfTy(), 2)); + // GFX11 removes compressed export, simply use 32bit-data export. 
+ if (m_pipelineState->getTargetInfo().getGfxIpVersion().major >= 11 && !dualSourceBlendedEnable) { + exportMask = (1 << compactCompCount) - 1; + for (unsigned idx = 0; idx < compactCompCount; ++idx) + exports[idx] = builder.CreateBitCast(exports[idx], builder.getFloatTy()); } break; @@ -237,67 +261,36 @@ Value *FragColorExport::handleColorExportInstructions(Value *output, unsigned hw } } - if (m_pipelineState->getTargetInfo().getGfxIpVersion().major >= 11 && - (m_pipelineState->getColorExportState().dualSourceBlendEnable || - m_pipelineState->getColorExportState().dynamicDualSourceBlendEnable)) { + if (m_pipelineState->getTargetInfo().getGfxIpVersion().major >= 11 && dualSourceBlendedEnable) { // Save them for later dual-source-swizzle m_blendSourceChannels = exportTy->isHalfTy() ? (compCount + 1) / 2 : compCount; assert(hwColorExport <= 1); - m_blendSources[hwColorExport].append(comps.begin(), comps.end()); + m_blendSources[hwColorExport].append(exports.begin(), exports.end()); return nullptr; } Value *exportCall = nullptr; - if (exportTy->isHalfTy()) { - // GFX11 removes compressed export, simply use 32bit-data export. - if (m_pipelineState->getTargetInfo().getGfxIpVersion().major >= 11) { - // Translate compCount into the number of 32bit data. 
- compCount = (compCount + 1) / 2; - for (unsigned i = 0; i < compCount; i++) - comps[i] = builder.CreateBitCast(comps[i], builder.getFloatTy()); - for (unsigned i = compCount; i < 4; i++) - comps[i] = undefFloat; - - Value *args[] = { - builder.getInt32(EXP_TARGET_MRT_0 + hwColorExport), // tgt - builder.getInt32((1 << compCount) - 1), // en - comps[0], // src0 - comps[1], // src1 - comps[2], // src2 - comps[3], // src3 - builder.getFalse(), // done - builder.getTrue() // vm - }; - - return builder.CreateNamedCall("llvm.amdgcn.exp.f32", Type::getVoidTy(*m_context), args, {}); - } - + if (exportTy->isHalfTy() && m_pipelineState->getTargetInfo().getGfxIpVersion().major < 11) { // 16-bit export (compressed) - if (compCount <= 2) - comps[1] = undefFloat16x2; Value *args[] = { builder.getInt32(EXP_TARGET_MRT_0 + hwColorExport), // tgt - builder.getInt32(compCount > 2 ? 0xF : 0x3), // en - comps[0], // src0 - comps[1], // src1 + builder.getInt32(exportMask), // en + exports[0], // src0 + exports[1], // src1 builder.getFalse(), // done builder.getTrue() // vm }; exportCall = builder.CreateNamedCall("llvm.amdgcn.exp.compr.v2f16", Type::getVoidTy(*m_context), args, {}); } else { - // 32-bit export - for (unsigned i = compCount; i < 4; i++) - comps[i] = undefFloat; - Value *args[] = { builder.getInt32(EXP_TARGET_MRT_0 + hwColorExport), // tgt - builder.getInt32((1 << compCount) - 1), // en - comps[0], // src0 - comps[1], // src1 - comps[2], // src2 - comps[3], // src3 + builder.getInt32(exportMask), // en + exports[0], // src0 + exports[1], // src1 + exports[2], // src2 + exports[3], // src3 builder.getFalse(), // done builder.getTrue() // vm }; @@ -942,9 +935,10 @@ void FragColorExport::generateExportInstructions(ArrayRef info, assert(infoIt->hwColorTarget < MaxColorTargets); auto expFmt = static_cast(m_pipelineState->computeExportFormat(infoIt->ty, location)); - if (expFmt != EXP_FORMAT_ZERO) { + const unsigned channelWriteMask = 
m_pipelineState->getColorExportFormat(location).channelWriteMask; + if (expFmt != EXP_FORMAT_ZERO && channelWriteMask != 0) { lastExport = handleColorExportInstructions(values[infoIt->hwColorTarget], hwColorExport, builder, expFmt, - infoIt->isSigned); + infoIt->isSigned, channelWriteMask); finalExportFormats.push_back(expFmt); ++hwColorExport; } diff --git a/lgc/state/PalMetadata.cpp b/lgc/state/PalMetadata.cpp index 7401438d13..4d54bc9f1a 100644 --- a/lgc/state/PalMetadata.cpp +++ b/lgc/state/PalMetadata.cpp @@ -1065,9 +1065,9 @@ void PalMetadata::updateCbShaderMask(llvm::ArrayRef exps) { for (auto &exp : exps) { if (exp.hwColorTarget == MaxColorTargets) continue; - - if (m_pipelineState->computeExportFormat(exp.ty, exp.location) != 0) { - cbShaderMask |= (0xF << (4 * exp.location)); + const unsigned channelWriteMask = m_pipelineState->getColorExportFormat(exp.location).channelWriteMask; + if (m_pipelineState->computeExportFormat(exp.ty, exp.location) != 0 && channelWriteMask != 0) { + cbShaderMask |= (channelWriteMask << (4 * exp.location)); } } diff --git a/lgc/state/PipelineState.cpp b/lgc/state/PipelineState.cpp index b4b085409f..d31c33f5ad 100644 --- a/lgc/state/PipelineState.cpp +++ b/lgc/state/PipelineState.cpp @@ -1212,7 +1212,7 @@ void PipelineState::recordColorExportState(Module *module) { // The color export formats named metadata node's operands are: // - N metadata nodes for N color targets, each one containing - // { dfmt, nfmt, blendEnable, blendSrcAlphaToColor } + // { dfmt, nfmt, blendEnable, blendSrcAlphaToColor, channelWriteMask } for (const ColorExportFormat &target : m_colorExportFormats) exportFormatsMetaNode->addOperand(getArrayOfInt32MetaNode(getContext(), target, /*atLeastOneValue=*/true)); } diff --git a/lgc/test/ElfRelocationSize.lgc b/lgc/test/ElfRelocationSize.lgc index d57db31579..e77214a9c1 100644 --- a/lgc/test/ElfRelocationSize.lgc +++ b/lgc/test/ElfRelocationSize.lgc @@ -35,7 +35,7 @@ target triple = "amdgcn--amdpal" !1 = !{i32 
-1843601953, i32 337452067, i32 -1234379640, i32 1173800166} !2 = !{i32 2} !4 = !{i32 -225903757, i32 -647980161, i32 1491774676, i32 -114025882} -!18 = !{i32 10} +!18 = !{i32 10, i32 0, i32 0, i32 0, i32 15} !19 = !{i32 0, i32 0, i32 0, i32 0, i32 13, i32 7, i32 -1} !20 = !{i32 1, i32 0, i32 24, i32 0, i32 11, i32 7, i32 -1} !21 = !{i32 0, i32 0, i32 0, i32 1} @@ -131,7 +131,7 @@ attributes #3 = { nounwind readonly willreturn } !10 = !{!"DescriptorTableVaPtr", i32 0, i32 0, i32 11, i32 1, i32 1} !11 = !{!"DescriptorResource", i32 1, i32 0, i32 0, i32 80, i32 0, i32 1, i32 8} !14 = !{!"DescriptorSampler", i32 2, i32 0, i32 -1, i32 4, i32 0, i32 2, i32 4, i32 12288, i32 117436416, i32 1750073344, i32 -2147483648} -!19 = !{i32 10} +!19 = !{i32 10, i32 0, i32 0, i32 0, i32 15} !20 = !{i32 6} ; ---------------------------------------------------------------------- diff --git a/lgc/test/PartPipeline.lgc b/lgc/test/PartPipeline.lgc index 0dc17c2dd3..223460ea87 100644 --- a/lgc/test/PartPipeline.lgc +++ b/lgc/test/PartPipeline.lgc @@ -44,7 +44,7 @@ target triple = "amdgcn--amdpal" !1 = !{i32 -1843601953, i32 337452067, i32 -1234379640, i32 1173800166} !2 = !{i32 2} !4 = !{i32 -225903757, i32 -647980161, i32 1491774676, i32 -114025882} -!18 = !{i32 10} +!18 = !{i32 10, i32 0, i32 0, i32 0, i32 15} !19 = !{i32 0, i32 0, i32 0, i32 0, i32 13, i32 7, i32 -1} !20 = !{i32 1, i32 0, i32 24, i32 0, i32 11, i32 7, i32 -1} !21 = !{i32 0, i32 0, i32 0, i32 1} @@ -140,7 +140,7 @@ attributes #3 = { nounwind readonly willreturn } !10 = !{!"DescriptorTableVaPtr", i32 0, i32 0, i32 11, i32 1, i32 1} !11 = !{!"DescriptorResource", i32 1, i32 0, i32 0, i32 80, i32 0, i32 1, i32 8} !14 = !{!"DescriptorSampler", i32 2, i32 0, i32 -1, i32 4, i32 0, i32 2, i32 4, i32 12288, i32 117436416, i32 1750073344, i32 -2147483648} -!19 = !{i32 10} +!19 = !{i32 10, i32 0, i32 0, i32 0, i32 15} !20 = !{i32 6} ; ---------------------------------------------------------------------- diff --git 
a/lgc/test/TextureRange.lgc b/lgc/test/TextureRange.lgc index 41b8bb1b3b..f45129a8f1 100644 --- a/lgc/test/TextureRange.lgc +++ b/lgc/test/TextureRange.lgc @@ -147,7 +147,7 @@ attributes #3 = { nounwind readnone } !10 = !{!"InlineBuffer", i32 14, i32 0, i32 3, i32 1, i32 -1610612736, i32 4, i32 4} !11 = !{!"DescriptorBufferCompact", i32 10, i32 0, i32 4, i32 4, i32 -536870912, i32 0, i32 2} !12 = !{!"IndirectUserDataVaPtr", i32 0, i32 0, i32 7, i32 1, i32 0} -!13 = !{i32 16} +!13 = !{i32 16, i32 0, i32 0, i32 0, i32 15} !14 = !{i32 3, i32 3} !15 = !{!"\82\B0amdpal.pipelines\91\84\AA.registers\80\B0.spill_threshold\CE\FF\FF\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\9B\97\D5d\9F\E5\B7\11\CF\E9#\B4W\05\EA\C6\A7\AD.llpc_version\A453.5\AEamdpal.version\92\02\03"} !16 = !{i32 0} diff --git a/lgc/test/lgcdis.lgc b/lgc/test/lgcdis.lgc index a2e3811168..7b970b75ac 100644 --- a/lgc/test/lgcdis.lgc +++ b/lgc/test/lgcdis.lgc @@ -133,7 +133,7 @@ attributes #2 = { nounwind readnone } !14 = !{!"DescriptorSampler", i32 2, i32 0, i32 16, i32 4, i32 0, i32 2, i32 4, <4 x i32> } !19 = !{i32 0, i32 0, i32 0, i32 0, i32 13, i32 7, i32 -1} !20 = !{i32 1, i32 0, i32 24, i32 0, i32 11, i32 7, i32 -1} -!21 = !{i32 10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0} +!21 = !{i32 10, i32 0, i32 0, i32 0, i32 15} !22 = !{i32 2} !23 = !{i32 0, i32 0, i32 0, i32 1} !24 = !{i32 1} diff --git a/llpc/context/llpcGraphicsContext.cpp b/llpc/context/llpcGraphicsContext.cpp index cdc9c46383..8b9f7e380e 100644 --- a/llpc/context/llpcGraphicsContext.cpp +++ b/llpc/context/llpcGraphicsContext.cpp @@ -321,6 +321,7 @@ void GraphicsContext::setColorExportState(Pipeline *pipeline, Util::MetroHash64 formats[targetIndex].nfmt = nfmt; formats[targetIndex].blendEnable = cbState.target[targetIndex].blendEnable; formats[targetIndex].blendSrcAlphaToColor = cbState.target[targetIndex].blendSrcAlphaToColor; + formats[targetIndex].channelWriteMask = 
cbState.target[targetIndex].channelWriteMask; } }