diff --git a/.editorconfig b/.editorconfig index 9930fef8fc..a975c57605 100644 --- a/.editorconfig +++ b/.editorconfig @@ -1,6 +1,6 @@ [*] end_of_line = lf -indent_size = 4 +indent_size = 2 indent_style = space insert_final_newline = true max_line_length = 120 diff --git a/CMakeLists.txt b/CMakeLists.txt index 365c09bba9..e4032fd4b4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,17 +43,10 @@ add_library(vkgc_headers INTERFACE) option(LLPC_BUILD_TOOLS "LLPC build all tools" OFF) ### Options that affect the headers #################################################################################### -option(LLPC_ENABLE_SHADER_CACHE "Enable experimental shader cache" OFF) - -if(LLPC_ENABLE_SHADER_CACHE) - target_compile_definitions(vkgc_headers INTERFACE LLPC_ENABLE_SHADER_CACHE) -endif() - if (LLPC_CLIENT_INTERFACE_MAJOR_VERSION) target_compile_definitions(vkgc_headers INTERFACE LLPC_CLIENT_INTERFACE_MAJOR_VERSION=${LLPC_CLIENT_INTERFACE_MAJOR_VERSION}) else() - # LLPC is not compiled, so fall back to the latest version - target_compile_definitions(vkgc_headers INTERFACE LLPC_CLIENT_INTERFACE_MAJOR_VERSION=LLPC_INTERFACE_MAJOR_VERSION) + message(FATAL_ERROR "Client of LLPC must set LLPC_CLIENT_INTERFACE_MAJOR_VERSION") endif() #if VKI_BUILD_GFX11 @@ -144,6 +137,7 @@ if(ICD_BUILD_LLPC) add_subdirectory(llpc ${PROJECT_BINARY_DIR}/llpc) if(LLPC_BUILD_TESTS) + set(LLVM_INCLUDE_TESTS ON CACHE BOOL "Force enable LLVM_INCLUDE_TESTS to include gmock" FORCE) add_subdirectory(test) endif() @@ -158,3 +152,62 @@ if(ICD_BUILD_LLPC) target_link_libraries(vkgc INTERFACE llpc) endif() + +# Set sub library properties +if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + + if (TARGET dumper) + set_property(TARGET dumper_base PROPERTY FOLDER Compiler) + set_property(TARGET dumper PROPERTY FOLDER Compiler) + endif() + + if (TARGET vfx) + set_property(TARGET vfx PROPERTY FOLDER Compiler) + endif() + + if(ICD_BUILD_LLPC) + set_property(TARGET llpc PROPERTY FOLDER Compiler) + 
set_property(TARGET llpcinternal PROPERTY FOLDER Compiler) + if(VKI_RAY_TRACING AND NOT LLPC_IS_STANDALONE) + set_property(TARGET vkgc_gpurtshim PROPERTY FOLDER Compiler) + endif() + set_property(TARGET vkgc_util PROPERTY FOLDER Compiler) + if (LLPC_BUILD_TOOLS) + set_property(TARGET amdllpc PROPERTY FOLDER Compiler) + endif() + + set_property(TARGET all-targets PROPERTY FOLDER Misc) + set_property(TARGET AMDGPU PROPERTY FOLDER Misc) + set_property(TARGET benchmark PROPERTY FOLDER Misc) + set_property(TARGET benchmark_main PROPERTY FOLDER Misc) + set_property(TARGET distribution PROPERTY FOLDER Misc) + set_property(TARGET Engine PROPERTY FOLDER Misc) + set_property(TARGET install-distribution PROPERTY FOLDER Misc) + set_property(TARGET install-distribution-stripped PROPERTY FOLDER Misc) + set_property(TARGET LLVMSupportBlake3 PROPERTY FOLDER Misc) + set_property(TARGET Native PROPERTY FOLDER Misc) + set_property(TARGET NativeCodeGen PROPERTY FOLDER Misc) + set_property(TARGET opt-viewer PROPERTY FOLDER Misc) + if (TARGET llvm-dialects-example) + set_property(TARGET llvm-dialects-example PROPERTY FOLDER Misc) + endif() + if (LLVM_OPTIMIZED_TABLEGEN) + set_property(TARGET llvm_nm_target PROPERTY FOLDER Misc) + set_property(TARGET llvm_readobj_target PROPERTY FOLDER Misc) + set_property(TARGET llvm-min-tblgen-host PROPERTY FOLDER Misc) + set_property(TARGET llvm-tblgen-host PROPERTY FOLDER Misc) + set_property(TARGET CONFIGURE_LLVM_NATIVE PROPERTY FOLDER Misc) + set_property(TARGET CREATE_LLVM_NATIVE PROPERTY FOLDER Misc) + endif() + if (LLPC_BUILD_TESTS) + set_property(TARGET check-all PROPERTY FOLDER Tests) + if(NOT LLPC_IS_STANDALONE) + set_property(TARGET check-amber PROPERTY FOLDER "LLPC Tests") + endif() + set_property(TARGET check-amdllpc PROPERTY FOLDER "LLPC Tests") + set_property(TARGET check-amdllpc-units PROPERTY FOLDER "LLPC Tests") + set_property(TARGET check-lgccps-units PROPERTY FOLDER "LgcCps Tests") + set_property(TARGET check-lgc-units PROPERTY 
FOLDER "LGC Tests") + endif() + endif() +endif() diff --git a/cmake/CompilerFlags.cmake b/cmake/CompilerFlags.cmake index dac5c56630..f586f4bdbe 100644 --- a/cmake/CompilerFlags.cmake +++ b/cmake/CompilerFlags.cmake @@ -1,7 +1,10 @@ function(set_compiler_options PROJECT_NAME ENABLE_WERROR) - target_compile_features("${PROJECT_NAME}" PUBLIC cxx_std_17) - set_target_properties("${PROJECT_NAME}" PROPERTIES CXX_EXTENSIONS OFF) - set_target_properties("${PROJECT_NAME}" PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_compile_features(${PROJECT_NAME} PUBLIC cxx_std_20) + set_target_properties(${PROJECT_NAME} PROPERTIES + CXX_STANDARD 20 + CXX_STANDARD_REQUIRED ON + CXX_EXTENSIONS OFF + POSITION_INDEPENDENT_CODE ON) if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") if(ENABLE_WERROR) diff --git a/cmake/continuations.cmake b/cmake/continuations.cmake new file mode 100644 index 0000000000..6e02acff59 --- /dev/null +++ b/cmake/continuations.cmake @@ -0,0 +1,43 @@ +## + ####################################################################################################################### + # + # Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved. + # + # Permission is hereby granted, free of charge, to any person obtaining a copy + # of this software and associated documentation files (the "Software"), to deal + # in the Software without restriction, including without limitation the rights + # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + # copies of the Software, and to permit persons to whom the Software is + # furnished to do so, subject to the following conditions: + # + # The above copyright notice and this permission notice shall be included in all + # copies or substantial portions of the Software. + # + # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + # SOFTWARE. + # + ####################################################################################################################### + +set(LLPC_SOURCE_DIR "${CMAKE_CURRENT_LIST_DIR}/..") + +# Function to add continuations and its dependencies as LLVM external projects. +# This appends the project names to LLVM_EXTERNAL_PROJECTS and sets each LLVM_EXTERNAL_*_SOURCE_DIR, +# all in the caller's scope. +function(add_continuations_projects) + if (NOT "${LLVM_EXTERNAL_CONTINUATIONS_SOURCE_DIR}") + if (NOT "${LLVM_EXTERNAL_LLVM_DIALECTS_SOURCE_DIR}") + list(APPEND LLVM_EXTERNAL_PROJECTS llvm_dialects) + set(LLVM_EXTERNAL_LLVM_DIALECTS_SOURCE_DIR "${LLPC_SOURCE_DIR}/imported/llvm-dialects" PARENT_SCOPE) + endif() + list(APPEND LLVM_EXTERNAL_PROJECTS lgccps lgcrt continuations) + set(LLVM_EXTERNAL_CONTINUATIONS_SOURCE_DIR "${LLPC_SOURCE_DIR}/shared/continuations" PARENT_SCOPE) + set(LLVM_EXTERNAL_LGCRT_SOURCE_DIR "${LLPC_SOURCE_DIR}/shared/lgcrt" PARENT_SCOPE) + set(LLVM_EXTERNAL_LGCCPS_SOURCE_DIR "${LLPC_SOURCE_DIR}/shared/lgccps" PARENT_SCOPE) + set(LLVM_EXTERNAL_PROJECTS "${LLVM_EXTERNAL_PROJECTS}" PARENT_SCOPE) + endif() +endfunction() diff --git a/cmake/lgc.cmake b/cmake/lgc.cmake index c94c1fc984..45ba0a2111 100644 --- a/cmake/lgc.cmake +++ b/cmake/lgc.cmake @@ -34,9 +34,11 @@ function(add_lgc_projects) list(APPEND LLVM_EXTERNAL_PROJECTS llvm_dialects) set(LLVM_EXTERNAL_LLVM_DIALECTS_SOURCE_DIR "${LLPC_SOURCE_DIR}/imported/llvm-dialects" PARENT_SCOPE) endif() - list(APPEND LLVM_EXTERNAL_PROJECTS LgcCps lgc) + list(APPEND LLVM_EXTERNAL_PROJECTS LgcCps LgcRt Continuations lgc) set(LLVM_EXTERNAL_PROJECTS "${LLVM_EXTERNAL_PROJECTS}" PARENT_SCOPE) set(LLVM_EXTERNAL_LGCCPS_SOURCE_DIR 
"${LLPC_SOURCE_DIR}/shared/lgccps" PARENT_SCOPE) + set(LLVM_EXTERNAL_LGCRT_SOURCE_DIR "${LLPC_SOURCE_DIR}/shared/lgcrt" PARENT_SCOPE) + set(LLVM_EXTERNAL_CONTINUATIONS_SOURCE_DIR "${LLPC_SOURCE_DIR}/shared/continuations" PARENT_SCOPE) set(LLVM_EXTERNAL_LGC_SOURCE_DIR "${LLPC_SOURCE_DIR}/lgc" PARENT_SCOPE) endif() endfunction() diff --git a/imported/llvm-dialects b/imported/llvm-dialects index b5c7de36ef..8c54ca076f 160000 --- a/imported/llvm-dialects +++ b/imported/llvm-dialects @@ -1 +1 @@ -Subproject commit b5c7de36efa50805e497f8358b2e60a6e94a64d3 +Subproject commit 8c54ca076fbf841dc5d22da8b6a1d434a01b153c diff --git a/include/gpurt-compiler.h b/include/gpurt-compiler.h index c2d8c81b25..ad6d3259a5 100644 --- a/include/gpurt-compiler.h +++ b/include/gpurt-compiler.h @@ -92,6 +92,15 @@ struct DispatchRaysConstantData { unsigned profileMaxIterations; ///< Maximum traversal iterations for profiling unsigned traceRayGpuVaLo; ///< Traversal shader (shader table) base address low 32-bits unsigned traceRayGpuVaHi; ///< Traversal shader (shader table) base address high 32-bits + unsigned counterMode; ///< Counter capture mode. see TraceRayCounterMode + unsigned counterRayIdRangeBegin; ///< Counter capture ray ID range begin + unsigned counterRayIdRangeEnd; ///< Counter capture ray ID range end + unsigned cpsBackendStackSize; ///< The scratch memory used as stacks are divided into two parts: + ///< (a) Used by a compiler backend, start at offset 0. + unsigned cpsFrontendStackSize; ///< (b) Used by IR (Intermediate Representation), for a continuation passing shader. 
+ unsigned cpsGlobalMemoryAddressLo; ///< Separate CPS stack memory base address low 32-bits + unsigned cpsGlobalMemoryAddressHi; ///< Separate CPS stack memory base address high 32-bits + unsigned counterMask; ///< Mask for filtering ray history token }; #pragma pack(pop) diff --git a/include/vkgcDefs.h b/include/vkgcDefs.h index aafccacedc..6541a3d7b8 100644 --- a/include/vkgcDefs.h +++ b/include/vkgcDefs.h @@ -45,7 +45,7 @@ #endif /// LLPC major interface version. -#define LLPC_INTERFACE_MAJOR_VERSION 65 +#define LLPC_INTERFACE_MAJOR_VERSION 68 /// LLPC minor interface version. #define LLPC_INTERFACE_MINOR_VERSION 0 @@ -58,10 +58,6 @@ #error LLPC client version is too old #endif -#ifndef LLPC_ENABLE_SHADER_CACHE -#define LLPC_ENABLE_SHADER_CACHE 0 -#endif - /// LLPC_NODISCARD - Warns when function return value is discarded. // // We cannot use the 'nodiscard' attribute until we upgrade to C++17 or newer mode. @@ -83,23 +79,32 @@ // %Version History // | %Version | Change Description | // | -------- | ----------------------------------------------------------------------------------------------------- | +// | 68.0 | Remove ICache *cache in all PipelineBuildInfo | +// | 67.0 | Modify the uber fetch shader. Adds locationMask(64bit) at the beginning of uber fetch shader internal | +// | | buffer which flags whether the related attribute data is valid. | +// | 66.0 | Remove shader cache in LLPC | +// | 65.6 | Add Result::RequireFullPipeline, returned if unlink shader fails. | +// | 65.5 | Rename noContract in PipelineShaderOptions to noContractOpDot | +// | 65.4 | Add disableSampleMask to PipelineOptions | +// | 65.3 | Add originUpperLeft to GraphicsPipelineBuildInfo | +// | 65.2 | Support SPIRV extended vertex attribute formats during vertex fetch module. | // | 65.0 | Remove updateDescInElf | // | 64.2 | Add dynamicSampleInfo to GraphicsPipelineBuildInfo::rsState | // | 64.1 | Add disableTruncCoordForGather to PipelineOptions. 
| // | 64.0 | Add enableColorExportShader to GraphicsPipelineBuildInfo. | -// | 63.3 | Add TesssellationLevel to iaState | +// | 63.3 | Add TessellationLevel to iaState | // | 63.2 | Add vertex64BitsAttribSingleLoc to PipelineOptions | // | 63.1 | Add forceDisableStreamOut and forceEnablePrimStats to ApiXfbOutData | -// | 63.0 | Add Atomic Counter, its default descriptor and map its concertType to Buffer. | +// | 63.0 | Add Atomic Counter, its default descriptor and map its concreteType to Buffer. | // | 62.1 | Add ApiXfbOutData GraphicsPipelineBuildInfo | // | 62.0 | Default to the compiler getting the GPURT library directly, and move shader library info into RtState | // | 61.16| Add replaceSetWithResourceType to PipelineOptions | // | 61.15| Add disableReadFirstLaneWorkaround to PipelineShaderOptions | -// | 61.14| Add rasterStream to rsState | +// | 61.14| Add rasterStream to rsState // | 61.13| Add dualSourceBlendDynamic to cbState | // | 61.12| Add mode to RayTracingPipelineBuildInfo | // | 61.11| Add UniformConstantMap and related structures | -// | 61.10| Add useShadingRate and useSampleInfoto ShaderModuleUsage | +// | 61.10| Add useShadingRate and useSampleInfo to ShaderModuleUsage | // | 61.8 | Add enableImplicitInvariantExports to PipelineOptions | // | 61.7 | Add disableFMA to PipelineShaderOptions | // | 61.6 | Add workaroundInitializeOutputsToZero to PipelineShaderOptions | @@ -141,7 +146,7 @@ // | 52.1 | Add pageMigrationEnabled to PipelineOptions | // | 52.0 | Add the member word4 and word5 to SamplerYCbCrConversionMetaData | // | 51.2 | Added new pipeline shader info to support mesh shader | -// | 51.0 | Added new shader stage enumerants to support mesh shader | +// | 51.0 | Added new shader stage enumerates to support mesh shader | // | 50.2 | Add the member dsState to GraphicsPipelineBuildInfo | // | 50.1 | Disclose ResourceMappingNodeType::InlineBuffer | // | 50.0 | Removed the member 'enableOpt' of ShaderModuleOptions | @@ -163,7 +168,7 @@ // | 
45.0 | Remove the member 'enableFastLaunch' of NGG state | // | 44.0 | Rename the member 'forceNonPassthrough' of NGG state to 'forceCullingMode' | // | 43.1 | Add disableImageResourceCheck in PipelineOptions | -// | 43.0 | Removed the enumerant WaveBreakSize::DrawTime | +// | 43.0 | Removed the enumerate WaveBreakSize::DrawTime | // | 42.0 | Removed tileOptimal flag from SamplerYcbcrConversion metadata struct | // | 41.0 | Moved resource mapping from ShaderPipeline-level to Pipeline-level | // | 40.4 | Added fp32DenormalMode in PipelineShaderOptions to allow overriding SPIR-V denormal settings | @@ -171,7 +176,7 @@ // | 40.2 | Added extendedRobustness in PipelineOptions to support VK_EXT_robustness2 | // | 40.1 | Added disableLoopUnroll to PipelineShaderOptions | // | 40.0 | Added DescriptorReserved12, which moves DescriptorYCbCrSampler down to 13 | -// | 39.0 | Non-LLPC-specific XGL code should #include vkcgDefs.h instead of llpc.h | +// | 39.0 | Non-LLPC-specific XGL code should #include vkgcDefs.h instead of llpc.h | // | 38.3 | Added shadowDescriptorTableUsage and shadowDescriptorTablePtrHigh to PipelineOptions | // | 38.2 | Added scalarThreshold to PipelineShaderOptions | // | 38.1 | Added unrollThreshold to PipelineShaderOptions | @@ -234,6 +239,8 @@ enum class Result : int { NotReady = 0x00000003, // A required resource (e.g. cache entry) was not found. NotFound = 0x00000004, + /// Required full pipeline compilation + RequireFullPipeline = 0x00000005, /// The requested operation is unavailable at this time ErrorUnavailable = -(0x00000001), /// The operation could not complete due to insufficient system memory @@ -355,7 +362,7 @@ enum GlCompatibilityAttributeLocation : unsigned { SecondaryColor, ///< Internal vertex attribute of gl_SecondaryColor. FogCoord, ///< Internal vertex attribute of gl_FogCoord. ColorIndex, ///< Internal vertex attribute to pass on index of gl_MultiTexCoord. - EdgeFlag, ///< Internal vertex attribute use to pass on the edeg flag. 
+ EdgeFlag, ///< Internal vertex attribute use to pass on the edge flag. Texcoord0, ///< Internal vertex attribute of gl_MultiTexCoord0. BaseinstanceOffset = Texcoord0 + GlCompatibilityLimits::MaxTextureCoords, ///< Internal vertex attribute: BaseInstanceOffset, @@ -573,11 +580,13 @@ struct PipelineOptions { bool internalRtShaders; ///< Whether this pipeline has internal raytracing shaders unsigned forceNonUniformResourceIndexStageMask; ///< Mask of the stage to force using non-uniform resource index. bool reserved16; - bool replaceSetWithResourceType; ///< For OGL only, replace 'set' with resource type during spirv translate - bool disableTruncCoordForGather; //< If set, trunc_coord of sampler srd is disabled for gather4 - bool enableCombinedTexture; ///< For OGL only, use the 'set' for DescriptorCombinedTexture - ///< for sampled images and samplers - bool vertex64BitsAttribSingleLoc; ///< For OGL only, dvec3/dvec4 vertex attrib only consumes 1 location. + bool replaceSetWithResourceType; ///< For OGL only, replace 'set' with resource type during spirv translate + bool disableSampleMask; ///< For OGL only, disabled if framebuffer doesn't attach multisample texture + bool disableTruncCoordForGather; ///< If set, trunc_coord of sampler srd is disabled for gather4 + bool enableCombinedTexture; ///< For OGL only, use the 'set' for DescriptorCombinedTexture + ///< for sampled images and samplers + bool vertex64BitsAttribSingleLoc; ///< For OGL only, dvec3/dvec4 vertex attrib only consumes 1 location. + unsigned reserved20; }; /// Prototype of allocator for output data buffer, used in shader-specific operations. 
@@ -628,6 +637,11 @@ struct ShaderModuleUsage { bool useShadingRate; ///< Whether shading rate is used bool useSampleInfo; ///< Whether gl_SamplePosition or InterpolateAtSample are used bool useClipVertex; ///< Whether gl_useClipVertex is used + bool useFragCoord; ///< Whether gl_FragCoord is used + bool originUpperLeft; ///< Whether pixel origin is upper-left + bool pixelCenterInteger; ///< Whether pixel coord is Integer + bool useGenericBuiltIn; ///< Whether to use builtIn inputs that include gl_PointCoord, gl_PrimitiveId, + /// gl_Layer, gl_ClipDistance or gl_CullDistance. }; /// Represents common part of shader module data @@ -798,14 +812,14 @@ struct PipelineShaderOptions { /// Threshold number of blocks in a loop for LICM pass to be disabled. unsigned disableLicmThreshold; - /// Threshold to use for loops with "Unroll" hint (0 = use llvm.llop.unroll.full). + /// Threshold to use for loops with "Unroll" hint (0 = use llvm.loop.unroll.full). unsigned unrollHintThreshold; - /// Threshold to use for loops with "DontUnroll" hint (0 = use llvm.llop.unroll.disable). + /// Threshold to use for loops with "DontUnroll" hint (0 = use llvm.loop.unroll.disable). unsigned dontUnrollHintThreshold; - /// Whether fastmath contract could be disabled - bool noContract; + /// Whether fast math contract could be disabled on Dot operations. + bool noContractOpDot; /// The enabled fast math flags (0 = depends on input language). 
unsigned fastMathFlags; @@ -969,6 +983,35 @@ constexpr uint32_t UberFetchShaderAttribMaskComponent2 = 0x0040000u; constexpr uint32_t UberFetchShaderAttribMaskComponent3 = 0x0080000u; constexpr uint32_t UberFetchShaderAttribMaskIsBgra = 0x0100000u; +/// Represents the bit field info of struct BilUberFetchShaderAttribInfo + +// OpenGL extended vertex attribute format +typedef enum VKInternalExtFormat { + VK_FORMAT_EXT_R32_UNORM = 0x00020000, + VK_FORMAT_EXT_R32_SNORM = 0x00020001, + VK_FORMAT_EXT_R32G32_UNORM = 0x00020002, + VK_FORMAT_EXT_R32G32_SNORM = 0x00020003, + VK_FORMAT_EXT_R32G32B32_UNORM = 0x00020004, + VK_FORMAT_EXT_R32G32B32_SNORM = 0x00020005, + VK_FORMAT_EXT_R32G32B32A32_UNORM = 0x00020006, + VK_FORMAT_EXT_R32G32B32A32_SNORM = 0x00020007, + VK_FORMAT_EXT_R32_FIXED = 0x00020008, + VK_FORMAT_EXT_R32G32_FIXED = 0x00020009, + VK_FORMAT_EXT_R32G32B32_FIXED = 0x0002000A, + VK_FORMAT_EXT_R32G32B32A32_FIXED = 0x0002000B, + VK_FORMAT_EXT_R32_USCALED = 0x0002000C, + VK_FORMAT_EXT_R32_SSCALED = 0x0002000D, + VK_FORMAT_EXT_R32G32_USCALED = 0x0002000E, + VK_FORMAT_EXT_R32G32_SSCALED = 0x0002000F, + VK_FORMAT_EXT_R32G32B32_USCALED = 0x00020010, + VK_FORMAT_EXT_R32G32B32_SSCALED = 0x00020011, + VK_FORMAT_EXT_R32G32B32A32_USCALED = 0x00020012, + VK_FORMAT_EXT_R32G32B32A32_SSCALED = 0x00020013, + VK_FORMAT_EXT_BEGIN_RANGE = VK_FORMAT_EXT_R32_UNORM, + VK_FORMAT_EXT_END_RANGE = VK_FORMAT_EXT_R32G32B32A32_SSCALED, + VK_FORMAT_EXT_RANGE_SIZE = VK_FORMAT_EXT_END_RANGE - VK_FORMAT_EXT_BEGIN_RANGE + 1 +} VKInternalExtFormat; + /// Represents info of a shader attached to a to-be-built pipeline. 
struct PipelineShaderInfo { const void *pModuleData; ///< Shader module data used for pipeline building (opaque) @@ -1147,21 +1190,21 @@ struct UniformConstantMap { UniformConstantMapEntry *pUniforms; ///< Mapping of for uniform constant }; -/// Represents transform feedback info for the caputred output +/// Represents transform feedback info for the captured output struct XfbOutInfo { bool isBuiltIn; ///< Determine if it is a built-in output unsigned location; ///< If isBuiltIn is true, it is the buildIn Id unsigned component; ///< The component offset within a location unsigned xfbBuffer; ///< The transform feedback buffer captures the output unsigned xfbOffset; ///< The byte offset in the transform feedback buffer - unsigned xfbStride; ///< The bytes consumed by a caputred vertex in the transform feedback buffer + unsigned xfbStride; ///< The bytes consumed by a captured vertex in the transform feedback buffer unsigned streamId; ///< The stream index }; /// Represents the transform feedback data filled by API interface struct ApiXfbOutData { - XfbOutInfo *pXfbOutInfos; ///< An array of XfbOutInfo iterms - unsigned numXfbOutInfo; ///< Count of XfbOutInfo iterms + XfbOutInfo *pXfbOutInfos; ///< An array of XfbOutInfo items + unsigned numXfbOutInfo; ///< Count of XfbOutInfo items bool forceDisableStreamOut; ///< Force to disable stream out XFB outputs bool forceEnablePrimStats; ///< Force to enable counting generated primitives }; @@ -1177,9 +1220,8 @@ struct GraphicsPipelineBuildInfo { void *pInstance; ///< Vulkan instance object void *pUserData; ///< User data OutputAllocFunc pfnOutputAlloc; ///< Output buffer allocator - ICache *cache; ///< ICache, used to search for the compiled shader data -#if LLPC_ENABLE_SHADER_CACHE - IShaderCache *pShaderCache; ///< Shader cache, used to search for the compiled shader data +#if LLPC_CLIENT_INTERFACE_MAJOR_VERSION < 68 + ICache *cache; ///< ICache, used to search for the compiled shader data #endif PipelineShaderInfo task; 
///< Task shader PipelineShaderInfo vs; ///< Vertex shader @@ -1254,11 +1296,12 @@ struct GraphicsPipelineBuildInfo { BinaryData shaderLibrary; ///< SPIR-V library binary data #endif RtState rtState; ///< Ray tracing state + bool originUpperLeft; ///< Whether origin coordinate of framebuffer is upper-left. const void *pClientMetadata; ///< Pointer to (optional) client-defined data to be stored inside the ELF size_t clientMetadataSize; ///< Size (in bytes) of the client-defined data unsigned numUniformConstantMaps; ///< Number of uniform constant maps UniformConstantMap **ppUniformMaps; ///< Pointers to array of pointers for the uniform constant map. - ApiXfbOutData apiXfbOutData; ///< Transform feedback data specified by API interface. + ApiXfbOutData apiXfbOutData; ///< Transform feedback data specified by API interface. }; /// Represents info to build a compute pipeline. @@ -1266,9 +1309,8 @@ struct ComputePipelineBuildInfo { void *pInstance; ///< Vulkan instance object void *pUserData; ///< User data OutputAllocFunc pfnOutputAlloc; ///< Output buffer allocator - ICache *cache; ///< ICache, used to search for the compiled shader data -#if LLPC_ENABLE_SHADER_CACHE - IShaderCache *pShaderCache; ///< Shader cache, used to search for the compiled shader data +#if LLPC_CLIENT_INTERFACE_MAJOR_VERSION < 68 + ICache *cache; ///< ICache, used to search for the compiled shader data #endif unsigned deviceIndex; ///< Device index for device group PipelineShaderInfo cs; ///< Compute shader @@ -1287,10 +1329,12 @@ struct ComputePipelineBuildInfo { /// Represents output of building a ray tracing pipeline. 
struct RayTracingPipelineBuildInfo { - void *pInstance; ///< Vulkan instance object - void *pUserData; ///< User data - OutputAllocFunc pfnOutputAlloc; ///< Output buffer allocator - ICache *cache; ///< ICache, used to search for the compiled shader data + void *pInstance; ///< Vulkan instance object + void *pUserData; ///< User data + OutputAllocFunc pfnOutputAlloc; ///< Output buffer allocator +#if LLPC_CLIENT_INTERFACE_MAJOR_VERSION < 68 + ICache *cache; ///< ICache, used to search for the compiled shader data +#endif unsigned deviceIndex; ///< Device index for device group unsigned deviceCount; ///< Device count for device group unsigned shaderCount; ///< Count of shader info @@ -1328,6 +1372,8 @@ struct RayTracingShaderProperty { uint64_t shaderId; ///< Ray tracing compiled shader ID char name[RayTracingMaxShaderNameLength]; ///< Ray tracing compiled shader name bool hasTraceRay; ///< Whether TraceRay() is used + bool onlyGpuVaLo; ///< Whether shader identifier LSB metadata is applied + uint64_t shaderIdExtraBits; ///< Raytracing shader identifier extra bits }; /// Represents ray-tracing shader identifier. @@ -1338,21 +1384,6 @@ struct RayTracingShaderIdentifier { uint64_t padding; ///< Padding to meet 32-byte api requirement and 8-byte alignment for descriptor table offset }; -/// The mapping method that the driver should apply to the shader IDs returned by the compiler in -/// RayTracingShaderIdentifier structures. -enum class RayTracingShaderIdentifierMapping { - /// No mapping, take IDs verbatim. - None, - - /// Interpret the ID as an index into the ELF module array and produce the full GPU VA of its unique contained - /// function. - ElfModuleGpuVa, - - /// Interpret the ID as an index into the ELF module array and produce the low 32 bits of the GPU VA of its unique - /// contained function. The high 32 bits remain 0 (but extraBits are still applied if present). 
- ElfModuleGpuVaLo, -}; - /// Values to be bitwise OR'd into the result of the mapping procedure applied to shader IDs. /// /// The compiler may use this to encode additional metadata into bits that are otherwise unused, e.g. LSBs that are @@ -1371,13 +1402,8 @@ struct RayTracingCaptureReplayVaMappingEntry { /// Represents the handles of shader group for ray-tracing pipeline struct RayTracingShaderGroupHandle { - unsigned shaderHandleCount; ///< Count of shader group handle array - RayTracingShaderIdentifier *shaderHandles; ///< Shader group handle array - RayTracingShaderIdentifierExtraBits *extraBits; ///< Extra bits array, one for each shader handle (may be null) - - RayTracingShaderIdentifierMapping shaderMapping; ///< Mapping applied to shaderIds - RayTracingShaderIdentifierMapping anyHitMapping; ///< Mapping applied to anyHitIds - RayTracingShaderIdentifierMapping intersectionMapping; ///< Mapping applied to intersectionIds + unsigned shaderHandleCount; ///< Count of shader group handle array + RayTracingShaderIdentifier *shaderHandles; ///< Shader group handle array }; /// Represents a set of ray-tracing shaders referenced by a ray-tracing pipeline diff --git a/lgc/CMakeLists.txt b/lgc/CMakeLists.txt index d68ac08842..814b9ba088 100644 --- a/lgc/CMakeLists.txt +++ b/lgc/CMakeLists.txt @@ -57,6 +57,9 @@ target_link_libraries(LLVMlgc PUBLIC llvm_dialects) ### Cached Project Options ############################################################################################# option(LLPC_BUILD_NAVI12 "LLPC support for NAVI12?" ON) +#if VKI_BUILD_NAVI32 +option(LLPC_BUILD_NAVI32 "LLPC support for NAVI32?" ON) +#endif option(LLPC_BUILD_REMBRANDT "LLPC support for REMBRANDT?" ON) option(LLPC_BUILD_RAPHAEL "LLPC support for RAPHAEL?" ON) option(LLPC_BUILD_MENDOCINO "LLPC support for MENDOCINO?" 
ON) @@ -71,7 +74,7 @@ set_compiler_options(LLVMlgc ${LLPC_ENABLE_WERROR}) ### TableGen for LGC dialect ########################################################################################### -set(LGC_TABLEGEN_EXE ${LLVM_TOOLS_BINARY_DIR}/llvm-dialects-tblgen) +set(LGC_TABLEGEN_EXE $<TARGET_FILE:llvm-dialects-tblgen>) set(LGC_TABLEGEN_TARGET llvm-dialects-tblgen) set(LLVM_TARGET_DEFINITIONS interface/lgc/LgcDialect.td) @@ -124,6 +127,14 @@ target_compile_definitions(LLVMlgc PRIVATE CHIP_HDR_RENOIR) CHIP_HDR_NAVI12 ) endif() +#if VKI_BUILD_NAVI32 + if(LLPC_BUILD_NAVI32) + target_compile_definitions(LLVMlgc PRIVATE + LLPC_BUILD_NAVI32 + CHIP_HDR_NAVI32 + ) + endif() +#endif if(LLPC_BUILD_REMBRANDT) target_compile_definitions(LLVMlgc PRIVATE LLPC_BUILD_REMBRANDT @@ -201,6 +212,7 @@ target_sources(LLVMlgc PRIVATE # lgc/patch target_sources(LLVMlgc PRIVATE patch/ConfigBuilderBase.cpp + patch/Continufy.cpp patch/FragColorExport.cpp patch/Gfx6Chip.cpp patch/Gfx6ConfigBuilder.cpp @@ -276,4 +288,4 @@ if (LLPC_BUILD_TESTS) add_subdirectory(unittests) endif() -target_link_libraries(LLVMlgc PRIVATE LLVMLgcCps) +target_link_libraries(LLVMlgc PRIVATE LLVMLgcCps LLVMLgcRt) diff --git a/lgc/builder/ArithBuilder.cpp b/lgc/builder/ArithBuilder.cpp index 6112d8ac60..f47d8b1ec5 100644 --- a/lgc/builder/ArithBuilder.cpp +++ b/lgc/builder/ArithBuilder.cpp @@ -114,7 +114,7 @@ Value *BuilderImpl::CreateFpTruncWithRounding(Value *value, Type *destTy, Roundi // RTN/RTP: Use fptrunc_round intrinsic. 
StringRef roundingModeStr = convertRoundingModeToStr(roundingMode).value(); Value *roundingMode = MetadataAsValue::get(getContext(), MDString::get(getContext(), roundingModeStr)); - Value *result = scalarize(value, [=](Value *inValue) { + Value *result = scalarize(value, [=, this](Value *inValue) { return CreateIntrinsic(Intrinsic::fptrunc_round, {getHalfTy(), inValue->getType()}, {inValue, roundingMode}); }); result->setName(instName); diff --git a/lgc/builder/BuilderImpl.cpp b/lgc/builder/BuilderImpl.cpp index 076244ab89..2a3197be06 100644 --- a/lgc/builder/BuilderImpl.cpp +++ b/lgc/builder/BuilderImpl.cpp @@ -30,8 +30,10 @@ */ #include "lgc/builder/BuilderImpl.h" #include "lgc/LgcContext.h" +#include "lgc/LgcDialect.h" #include "lgc/state/PipelineState.h" #include "lgc/state/TargetInfo.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsAMDGPU.h" using namespace lgc; @@ -338,23 +340,13 @@ BranchInst *BuilderImpl::createIf(Value *condition, bool wantElse, const Twine & // find the non-uniform index used in it. If that fails, we just use the // operand value as the index. // -// Note: this code has to cope with relocs as well, this is why we have to -// have a worklist of instructions to trace back -// through. Something like this: -// %1 = call .... @lgc.descriptor.set(...) ;; Known uniform base -// %2 = call .... @llvm.amdgcn.reloc.constant(...) ;; Known uniform reloc constant -// %3 = ptrtoint ... %1 to i64 -// %4 = zext ... %2 to i64 -// %5 = add i64 %3, %4 -// %6 = inttoptr i64 %5 to .... -// %7 = bitcast .... %6 to .... -// %8 = getelementptr .... %7, i64 %offset +// Note that this function may return null, which means that the given value has been shown to be uniform. // -// As long as the base pointer %7 can be traced back to a descriptor set and -// reloc we can infer that it is truly uniform and use the gep index as the waterfall index safely. 
+// This uses a fairly simple heuristic that nevertheless allows temporary expansion of the search breadth to handle +// the common case where a base pointer is assembled from separate high and low halves. // // @param nonUniformVal : Value representing non-uniform descriptor -// @return : Value representing the non-uniform index +// @return : Value representing the non-uniform index, or null if nonUniformVal could be proven to be uniform static Value *traceNonUniformIndex(Value *nonUniformVal) { auto load = dyn_cast(nonUniformVal); if (!load) { @@ -388,90 +380,128 @@ static Value *traceNonUniformIndex(Value *nonUniformVal) { } } - SmallVector worklist; - Value *base = load->getOperand(0); - Value *index = nullptr; + auto getSize = [](Value *value) -> uint64_t { + uint64_t size = value->getType()->getPrimitiveSizeInBits().getFixedValue(); + return size ? size : std::numeric_limits::max(); + }; - // Loop until a descriptor table reference or unexpected operation is reached. - // In the worst case this may visit all instructions in a function. - for (;;) { - if (auto bitcast = dyn_cast(base)) { - base = bitcast->getOperand(0); - continue; + uint64_t nonUniformValSize = getSize(nonUniformVal); + + // Loop until all nonUniforms have been found to be uniform or a heuristic abort criterion has been reached. + Value *candidateIndex = nullptr; + SmallVector nonUniforms; + nonUniforms.push_back(load); + + auto propagate = [&](Value *value) -> bool { + if (auto inst = dyn_cast(value)) { + if (nonUniforms.size() >= 2) + return false; + nonUniforms.push_back(inst); + return true; } - if (auto gep = dyn_cast(base)) { - if (gep->hasAllConstantIndices()) { - base = gep->getPointerOperand(); - continue; + return isa(value); + }; + + do { + Instruction *current = nonUniforms.pop_back_val(); + + // Immediately replace the current nonUniformVal by a strictly smaller one if possible. 
+ if (!candidateIndex && nonUniforms.empty() && current != nonUniformVal) { + uint64_t size = getSize(current); + if (size < nonUniformValSize) { + nonUniformVal = current; + nonUniformValSize = size; } - // Variable GEP, to provide the index for the waterfall. - if (index || gep->getNumIndices() != 1) - break; - index = *gep->idx_begin(); - base = gep->getPointerOperand(); + } + + // See if we can propagate the search further. + if (current->isCast() || current->isUnaryOp()) { + if (!propagate(current->getOperand(0))) + return nonUniformVal; continue; } - if (auto extract = dyn_cast(base)) { - if (extract->getIndices().size() == 1 && extract->getIndices()[0] == 0) { - base = extract->getAggregateOperand(); - continue; - } - break; + + if (current->isBinaryOp()) { + if (!propagate(current->getOperand(0)) || !propagate(current->getOperand(1))) + return nonUniformVal; + continue; } - if (auto insert = dyn_cast(base)) { - if (insert->getIndices()[0] != 0) { - base = insert->getAggregateOperand(); - continue; - } - if (insert->getIndices().size() == 1 && insert->getIndices()[0] == 0) { - base = insert->getInsertedValueOperand(); + + if (auto *load = dyn_cast(current)) { + Value *ptr = load->getPointerOperand(); + unsigned as = ptr->getType()->getPointerAddressSpace(); + if (as == ADDR_SPACE_FLAT || as == ADDR_SPACE_PRIVATE) + return nonUniformVal; // load is a source of divergence, can't propagate + + if (!propagate(ptr)) + return nonUniformVal; + continue; + } + + if (auto gep = dyn_cast(current)) { + if (gep->hasAllConstantIndices()) { + if (!propagate(gep->getPointerOperand())) + return nonUniformVal; continue; } - break; + + // Variable GEP, assume that the index is non-uniform. 
+ if (candidateIndex || gep->getNumIndices() != 1) + return nonUniformVal; + + if (!propagate(gep->getPointerOperand())) + return nonUniformVal; + + candidateIndex = *gep->idx_begin(); + if (getSize(candidateIndex) > nonUniformValSize) + return nonUniformVal; // propagating further is worthless + continue; } - if (auto intToPtr = dyn_cast(base)) { - base = intToPtr->getOperand(0); + + if (auto extract = dyn_cast(current)) { + if (!propagate(extract->getAggregateOperand())) + return nonUniformVal; continue; } - if (auto ptrToInt = dyn_cast(base)) { - base = ptrToInt->getOperand(0); + if (auto insert = dyn_cast(current)) { + if (!propagate(insert->getAggregateOperand()) || !propagate(insert->getInsertedValueOperand())) + return nonUniformVal; continue; } - if (auto zExt = dyn_cast(base)) { - base = zExt->getOperand(0); + if (auto extract = dyn_cast(current)) { + if (!isa(extract->getIndexOperand()) || !propagate(extract->getVectorOperand())) + return nonUniformVal; continue; } - if (auto call = dyn_cast(base)) { - if (index) { - if (auto calledFunc = call->getCalledFunction()) { - if (calledFunc->getName().startswith(lgcName::DescriptorTableAddr) || - calledFunc->getName().startswith("llvm.amdgcn.reloc.constant")) { - if (!worklist.empty()) { - base = worklist.pop_back_val(); - continue; - } - nonUniformVal = index; - break; - } - } - } + if (auto insert = dyn_cast(current)) { + if (!isa(insert->getOperand(2)) || !propagate(insert->getOperand(0)) || + !propagate(insert->getOperand(1))) + return nonUniformVal; + continue; } - if (auto addInst = dyn_cast(base)) { - // In this case we have to trace back both operands - // Set one to base for continued processing and put the other onto the worklist - // Give up if the worklist already has an entry - too complicated - if (addInst->isBinaryOp() && addInst->getOpcode() == Instruction::BinaryOps::Add) { - if (!worklist.empty()) - break; - base = addInst->getOperand(0); - worklist.push_back(addInst->getOperand(1)); - 
continue; + + if (auto call = dyn_cast(current)) { + if (auto intrinsic = dyn_cast(call)) { + unsigned id = intrinsic->getIntrinsicID(); + if (id == Intrinsic::amdgcn_readfirstlane || id == Intrinsic::amdgcn_s_getpc || + id == Intrinsic::amdgcn_reloc_constant) + continue; // is always uniform, no need to propagate + return nonUniformVal; } + + if (isa(call) || isa(call)) + continue; // is always uniform, no need to propagate + + return nonUniformVal; } - break; - } - return nonUniformVal; + // If we reach this point, it means we don't understand the instruction. It's likely a fairly complex instruction + // and we should heuristically abort the propagation anyway. It may even be a source of divergence, in which case + // propagating further would be incorrect. + return nonUniformVal; + } while (!nonUniforms.empty()); + + return candidateIndex; } // ===================================================================================================================== @@ -526,8 +556,11 @@ Instruction *BuilderImpl::createWaterfallLoop(Instruction *nonUniformInst, Array SmallVector nonUniformIndices; for (unsigned operandIdx : operandIdxs) { Value *nonUniformIndex = traceNonUniformIndex(nonUniformInst->getOperand(operandIdx)); - nonUniformIndices.push_back(nonUniformIndex); + if (nonUniformIndex) + nonUniformIndices.push_back(nonUniformIndex); } + if (nonUniformIndices.empty()) + return nonUniformInst; // For any index that is 64 bit, change it back to 32 bit for comparison at the top of the // waterfall loop. 
diff --git a/lgc/builder/BuilderRecorder.cpp b/lgc/builder/BuilderRecorder.cpp index f8db537f42..819b54d49e 100644 --- a/lgc/builder/BuilderRecorder.cpp +++ b/lgc/builder/BuilderRecorder.cpp @@ -228,8 +228,6 @@ StringRef BuilderRecorder::getCallName(BuilderOpcode opcode) { return "demote.to.helper.invocation"; case BuilderOpcode::IsHelperInvocation: return "is.helper.invocation"; - case BuilderOpcode::SetMeshOutputs: - return "set.mesh.outputs"; case BuilderOpcode::ImageLoad: return "image.load"; case BuilderOpcode::ImageLoadWithFmask: @@ -256,8 +254,6 @@ StringRef BuilderRecorder::getCallName(BuilderOpcode opcode) { return "image.get.lod"; case BuilderOpcode::ImageBvhIntersectRay: return "image.bvh.intersect.ray"; - case BuilderOpcode::Reserved2: - return "reserved2"; case BuilderOpcode::GetWaveSize: return "get.wave.size"; case BuilderOpcode::GetSubgroupSize: @@ -270,6 +266,8 @@ StringRef BuilderRecorder::getCallName(BuilderOpcode opcode) { return "subgroup.any"; case BuilderOpcode::SubgroupAllEqual: return "subgroup.all.equal"; + case BuilderOpcode::SubgroupRotate: + return "subgroup.rotate"; case BuilderOpcode::SubgroupBroadcast: return "subgroup.broadcast"; case BuilderOpcode::SubgroupBroadcastWaterfall: @@ -894,18 +892,6 @@ Value *Builder::CreateIsHelperInvocation(const Twine &instName) { return record(BuilderOpcode::IsHelperInvocation, getInt1Ty(), {}, instName); } -// ===================================================================================================================== -// In the mesh shader, set the actual output size of the primitives and vertices that the mesh shader workgroup will -// emit upon completion. 
-// -// @param vertexCount : Actual output size of the vertices -// @param primitiveCount : Actual output size of the primitives -// @param instName : Name to give final instruction -// @returns Instruction to set the actual size of mesh outputs -Instruction *Builder::CreateSetMeshOutputs(Value *vertexCount, Value *primitiveCount, const Twine &instName) { - return record(BuilderOpcode::SetMeshOutputs, nullptr, {vertexCount, primitiveCount}, instName); -} - // ===================================================================================================================== // Create "fclamp" operation. // @@ -1810,6 +1796,18 @@ Value *Builder::CreateSubgroupShuffleDown(Value *const value, Value *const offse return record(BuilderOpcode::SubgroupShuffleDown, value->getType(), {value, offset}, instName); } +// ===================================================================================================================== +// Create a subgroup rotate call. +// +// @param value : The value to read from the chosen rotated lane to all active lanes. +// @param delta : The delta/offset added to lane id. +// @param clusterSize : The cluster size if exists. +// @param instName : Name to give final instruction. +Value *Builder::CreateSubgroupRotate(Value *const value, Value *const delta, Value *const clusterSize, + const Twine &instName) { + return record(BuilderOpcode::SubgroupRotate, value->getType(), {value, delta, clusterSize}, instName); +} + // ===================================================================================================================== // Create a subgroup clustered reduction. 
// @@ -1854,9 +1852,10 @@ Value *Builder::CreateSubgroupClusteredExclusive(GroupArithOp groupArithOp, Valu // // @param value : The value to broadcast // @param index : The index within the quad to broadcast from +// @param inWQM : Whether it's in whole quad mode // @param instName : Name to give instruction(s) -Value *Builder::CreateSubgroupQuadBroadcast(Value *const value, Value *const index, const Twine &instName) { - return record(BuilderOpcode::SubgroupQuadBroadcast, value->getType(), {value, index}, instName); +Value *Builder::CreateSubgroupQuadBroadcast(Value *const value, Value *const index, bool inWQM, const Twine &instName) { + return record(BuilderOpcode::SubgroupQuadBroadcast, value->getType(), {value, index, getInt1(inWQM)}, instName); } // ===================================================================================================================== @@ -2072,6 +2071,7 @@ Instruction *Builder::record(BuilderOpcode opcode, Type *resultTy, ArrayRefCreateDebugBreak(); } - case BuilderOpcode::SetMeshOutputs: { - return m_builder->CreateSetMeshOutputs(args[0], args[1]); - } case BuilderOpcode::TransposeMatrix: { return m_builder->CreateTransposeMatrix(args[0]); } @@ -742,6 +739,9 @@ Value *BuilderReplayer::processCall(unsigned opcode, CallInst *call) { case BuilderOpcode::SubgroupAllEqual: { return m_builder->CreateSubgroupAllEqual(args[0]); } + case BuilderOpcode::SubgroupRotate: { + return m_builder->CreateSubgroupRotate(args[0], args[1], isa(args[2]) ? 
nullptr : &*args[2]); + } case BuilderOpcode::SubgroupBroadcast: { return m_builder->CreateSubgroupBroadcast(args[0], args[1]); } @@ -800,7 +800,7 @@ Value *BuilderReplayer::processCall(unsigned opcode, CallInst *call) { return m_builder->CreateSubgroupClusteredExclusive(groupArithOp, args[1], args[2]); } case BuilderOpcode::SubgroupQuadBroadcast: { - return m_builder->CreateSubgroupQuadBroadcast(args[0], args[1]); + return m_builder->CreateSubgroupQuadBroadcast(args[0], args[1], cast(args[2])->getZExtValue()); } case BuilderOpcode::SubgroupQuadSwapHorizontal: { return m_builder->CreateSubgroupQuadSwapHorizontal(args[0]); diff --git a/lgc/builder/DescBuilder.cpp b/lgc/builder/DescBuilder.cpp index c7e45f8d7c..088fb80307 100644 --- a/lgc/builder/DescBuilder.cpp +++ b/lgc/builder/DescBuilder.cpp @@ -35,6 +35,7 @@ #include "lgc/state/PalMetadata.h" #include "lgc/state/PipelineState.h" #include "lgc/state/TargetInfo.h" +#include "lgc/util/AddressExtender.h" #include "lgc/util/Internal.h" #include "llvm/IR/IntrinsicsAMDGPU.h" @@ -94,138 +95,88 @@ Value *BuilderImpl::CreateBufferDesc(uint64_t descSet, unsigned binding, Value * else if (flags & BufferFlagAddress) return64Address = true; - // Find the descriptor node. If doing a shader compilation with no user data layout provided, don't bother to - // look. Later code will use relocs. + // Find the descriptor node. 
+ ResourceNodeType abstractType = ResourceNodeType::Unknown; + if (flags & BufferFlagConst) + abstractType = ResourceNodeType::DescriptorConstBuffer; + else if (flags & BufferFlagNonConst) + abstractType = ResourceNodeType::DescriptorBuffer; + else if (flags & BufferFlagShaderResource) + abstractType = ResourceNodeType::DescriptorResource; + else if (flags & BufferFlagSampler) + abstractType = ResourceNodeType::DescriptorSampler; + else if (flags & BufferFlagAddress) + abstractType = ResourceNodeType::DescriptorBufferCompact; + const ResourceNode *topNode = nullptr; const ResourceNode *node = nullptr; - if (!m_pipelineState->isUnlinked() || !m_pipelineState->getUserDataNodes().empty()) { - // We have the user data layout. Find the node. - ResourceNodeType abstractType = ResourceNodeType::Unknown; - if (flags & BufferFlagConst) - abstractType = ResourceNodeType::DescriptorConstBuffer; - else if (flags & BufferFlagNonConst) - abstractType = ResourceNodeType::DescriptorBuffer; - else if (flags & BufferFlagShaderResource) - abstractType = ResourceNodeType::DescriptorResource; - else if (flags & BufferFlagSampler) - abstractType = ResourceNodeType::DescriptorSampler; - else if (flags & BufferFlagAddress) - abstractType = ResourceNodeType::DescriptorBufferCompact; - - std::tie(topNode, node) = m_pipelineState->findResourceNode(abstractType, descSet, binding, m_shaderStage); - if (!node) { - // If we can't find the node, assume mutable descriptor and search for any node. - std::tie(topNode, node) = - m_pipelineState->findResourceNode(ResourceNodeType::DescriptorMutable, descSet, binding, m_shaderStage); - } - - if (!node) - report_fatal_error("Resource node not found"); - - if (node == topNode && isa(descIndex) && node->concreteType != ResourceNodeType::InlineBuffer) { - // Handle a descriptor in the root table (a "dynamic descriptor") specially, as long as it is not variably - // indexed and is not an InlineBuffer. 
This lgc.root.descriptor call is by default lowered in - // PatchEntryPointMutate into a load from the spill table, but it might be able to "unspill" it to - // directly use shader entry SGPRs. - // TODO: Handle root InlineBuffer specially in a similar way to PushConst. The default handling is - // suboptimal as it always loads from the spill table. - Type *descTy = getDescTy(node->concreteType); - std::string callName = lgcName::RootDescriptor; - addTypeMangling(descTy, {}, callName); - unsigned dwordSize = descTy->getPrimitiveSizeInBits() / 32; - unsigned dwordOffset = cast(descIndex)->getZExtValue() * dwordSize; - if (dwordOffset + dwordSize > node->sizeInDwords) { - // Index out of range - desc = PoisonValue::get(descTy); - } else { - dwordOffset += node->offsetInDwords; - dwordOffset += (binding - node->binding) * node->stride; - desc = CreateNamedCall(callName, descTy, getInt32(dwordOffset), Attribute::ReadNone); - } - if (return64Address) { - assert(node->concreteType == ResourceNodeType::DescriptorBufferCompact); - return CreateBitCast(desc, getInt64Ty()); - } - } else if (node->concreteType == ResourceNodeType::InlineBuffer) { - // Handle an inline buffer specially. Get a pointer to it, then expand to a descriptor. - Value *descPtr = getDescPtr(node->concreteType, node->abstractType, descSet, binding, topNode, node); - desc = buildInlineBufferDesc(descPtr); - } + std::tie(topNode, node) = m_pipelineState->findResourceNode(abstractType, descSet, binding, m_shaderStage); + if (!node) { + // If we can't find the node, assume mutable descriptor and search for any node. 
+ std::tie(topNode, node) = + m_pipelineState->findResourceNode(ResourceNodeType::DescriptorMutable, descSet, binding, m_shaderStage); } - if (!desc) { - if (node) { - ResourceNodeType resType = node->concreteType; - ResourceNodeType abstractType = node->abstractType; - // Handle mutable descriptors - if (resType == ResourceNodeType::DescriptorMutable) { - resType = ResourceNodeType::DescriptorBuffer; - } - if (abstractType == ResourceNodeType::DescriptorMutable) { - abstractType = ResourceNodeType::DescriptorBuffer; - } - Value *descPtr = getDescPtr(resType, abstractType, descSet, binding, topNode, node); - // Index it. - if (descIndex != getInt32(0)) { - descIndex = CreateMul(descIndex, getStride(resType, descSet, binding, node)); - descPtr = CreateGEP(getInt8Ty(), descPtr, descIndex); - } + if (!node) + report_fatal_error("Resource node not found"); - // The buffer may have an attached counter buffer descriptor which do not have a different set or binding. - if (flags & BufferFlagAttachedCounter) { - // The node stride must be large enough to hold 2 buffer descriptors. - assert(node->stride * sizeof(uint32_t) == 2 * DescriptorSizeBuffer); - descPtr = CreateGEP(getInt8Ty(), descPtr, getInt32(DescriptorSizeBuffer)); - } + if (node == topNode && isa(descIndex) && node->concreteType != ResourceNodeType::InlineBuffer) { + // Handle a descriptor in the root table (a "dynamic descriptor") specially, as long as it is not variably + // indexed and is not an InlineBuffer. + Type *descTy; + if (return64Address) { + assert(node->concreteType == ResourceNodeType::DescriptorBufferCompact); + descTy = getInt64Ty(); + } else { + descTy = getDescTy(node->concreteType); + } - // Cast it to the right type. - descPtr = CreateBitCast(descPtr, getDescPtrTy(resType)); - // Load the descriptor. 
- desc = CreateLoad(getDescTy(resType), descPtr); + unsigned dwordSize = descTy->getPrimitiveSizeInBits() / 32; + unsigned dwordOffset = cast(descIndex)->getZExtValue() * dwordSize; + if (dwordOffset + dwordSize > node->sizeInDwords) { + // Index out of range + desc = PoisonValue::get(descTy); } else { - // For shader compilation with no user data layout provided, we don't know if the buffer is dynamic descriptor, - // We need to load two dwords for DescriptorBufferCompact, 4 dwords for DescriptorBuffer. To avoid out of bound, - // we will use two loads and load two dwords for each time. If the resource type is really DescriptorBuffer, the - // address of the second load will add 8 bytes, otherwise the address is the same as the first, it means we load - // the same data twice, but the data is not used. - - // Get the descriptor pointer which is from ResourceMapping, ignore the resource type. - ResourceNodeType resType = ResourceNodeType::DescriptorBuffer; - ResourceNodeType abstractType = resType; - Value *descPtr = getDescPtr(resType, abstractType, descSet, binding, nullptr, nullptr); - // Index it. - if (descIndex != getInt32(0)) { - descIndex = CreateMul(descIndex, getStride(resType, descSet, binding, nullptr)); - descPtr = CreateGEP(getInt8Ty(), descPtr, descIndex); - } + dwordOffset += node->offsetInDwords; + dwordOffset += (binding - node->binding) * node->stride; + desc = create(descTy, dwordOffset * 4); + } + if (return64Address) + return desc; + } else if (node->concreteType == ResourceNodeType::InlineBuffer) { + // Handle an inline buffer specially. Get a pointer to it, then expand to a descriptor. 
+ Value *descPtr = getDescPtr(node->concreteType, topNode, node, binding); + desc = buildInlineBufferDesc(descPtr); + } else { + ResourceNodeType resType = node->concreteType; + ResourceNodeType abstractType = node->abstractType; + // Handle mutable descriptors + if (resType == ResourceNodeType::DescriptorMutable) { + resType = ResourceNodeType::DescriptorBuffer; + } + if (abstractType == ResourceNodeType::DescriptorMutable) { + abstractType = ResourceNodeType::DescriptorBuffer; + } + Value *descPtr = getDescPtr(resType, topNode, node, binding); + // Index it. + if (descIndex != getInt32(0)) { + descIndex = CreateMul(descIndex, getStride(resType, node)); + descPtr = CreateGEP(getInt8Ty(), descPtr, descIndex); + } - auto descPtrLo = CreateBitCast(descPtr, FixedVectorType::get(getInt32Ty(), 2)->getPointerTo(ADDR_SPACE_CONST)); - // The first load - auto descLo = CreateLoad(FixedVectorType::get(getInt32Ty(), 2), descPtrLo); - auto compactBufferDesc = buildBufferCompactDesc(descLo); - - // If descriptor set is InternalDescriptorSetId, this is a internal resource node, it is a root node - // and its type is ResourceNodeType::DescriptorBufferCompact. 
- if (descSet == InternalDescriptorSetId) { - assert(return64Address); - return CreateBitCast(descLo, getInt64Ty()); - } else { - // Add offset - Value *descPtrHi = CreateAddByteOffset(descPtr, getInt32(8)); - auto reloc = CreateRelocationConstant(reloc::CompactBuffer + Twine(descSet) + "_" + Twine(binding)); - auto isCompactBuffer = CreateICmpNE(reloc, getInt32(0)); - // Select the address - descPtrHi = CreateSelect(isCompactBuffer, descPtr, descPtrHi); - descPtrHi = CreateBitCast(descPtrHi, FixedVectorType::get(getInt32Ty(), 2)->getPointerTo(ADDR_SPACE_CONST)); - // The second load - auto descHi = CreateLoad(FixedVectorType::get(getInt32Ty(), 2), descPtrHi); - // Merge the whole descriptor for DescriptorBuffer - auto bufferDesc = CreateShuffleVector(descLo, descHi, {0, 1, 2, 3}); - // Select - desc = CreateSelect(isCompactBuffer, compactBufferDesc, bufferDesc); - } + // The buffer may have an attached counter buffer descriptor which do not have a different set or binding. + if (flags & BufferFlagAttachedCounter) { + // The node stride must be large enough to hold 2 buffer descriptors. + assert(node->stride * sizeof(uint32_t) == 2 * DescriptorSizeBuffer); + descPtr = CreateGEP(getInt8Ty(), descPtr, getInt32(DescriptorSizeBuffer)); } + + // Cast it to the right type. + descPtr = CreateBitCast(descPtr, getDescPtrTy(resType)); + // Load the descriptor. + desc = CreateLoad(getDescTy(resType), descPtr); } + if (node && (node->concreteType == ResourceNodeType::DescriptorBufferCompact || node->concreteType == ResourceNodeType::DescriptorConstBufferCompact)) desc = buildBufferCompactDesc(desc); @@ -248,28 +199,23 @@ Value *BuilderImpl::CreateBufferDesc(uint64_t descSet, unsigned binding, Value * // @param instName : Name to give instruction(s) Value *BuilderImpl::CreateGetDescStride(ResourceNodeType concreteType, ResourceNodeType abstractType, uint64_t descSet, unsigned binding, const Twine &instName) { - // Find the descriptor node. 
If doing a shader compilation with no user data layout provided, don't bother to - // look; we will use relocs instead. const ResourceNode *topNode = nullptr; const ResourceNode *node = nullptr; - if (!m_pipelineState->isUnlinked() || !m_pipelineState->getUserDataNodes().empty()) { - std::tie(topNode, node) = m_pipelineState->findResourceNode(abstractType, descSet, binding, m_shaderStage); - if (!node) { - // If we can't find the node, assume mutable descriptor and search for any node. - std::tie(topNode, node) = - m_pipelineState->findResourceNode(ResourceNodeType::DescriptorMutable, descSet, binding, m_shaderStage); - if (!node && - m_pipelineState->findResourceNode(ResourceNodeType::Unknown, descSet, binding, m_shaderStage).second) { - // NOTE: Resource node may be DescriptorTexelBuffer, but it is defined as OpTypeSampledImage in SPIRV, - // In this case, a caller may search for the DescriptorSampler and not find it. We return nullptr and - // expect the caller to handle it. - return PoisonValue::get(getInt32Ty()); - } - assert(node && "missing resource node"); + std::tie(topNode, node) = m_pipelineState->findResourceNode(abstractType, descSet, binding, m_shaderStage); + if (!node) { + // If we can't find the node, assume mutable descriptor and search for any node. + std::tie(topNode, node) = + m_pipelineState->findResourceNode(ResourceNodeType::DescriptorMutable, descSet, binding, m_shaderStage); + if (!node && m_pipelineState->findResourceNode(ResourceNodeType::Unknown, descSet, binding, m_shaderStage).second) { + // NOTE: Resource node may be DescriptorTexelBuffer, but it is defined as OpTypeSampledImage in SPIRV, + // In this case, a caller may search for the DescriptorSampler and not find it. We return poison and + // expect the caller to handle it. 
+ return PoisonValue::get(getInt32Ty()); } assert(node && "missing resource node"); } - return getStride(concreteType, descSet, binding, node); + assert(node && "missing resource node"); + return getStride(concreteType, node); } // ===================================================================================================================== @@ -334,7 +280,7 @@ Value *BuilderImpl::CreateGetDescPtr(ResourceNodeType concreteType, ResourceNode } } else { // Get a pointer to the descriptor. - descPtr = getDescPtr(concreteType, abstractType, descSet, binding, topNode, node); + descPtr = getDescPtr(concreteType, topNode, node, binding); } // Cast to the right pointer type. @@ -349,24 +295,15 @@ Value *BuilderImpl::CreateGetDescPtr(ResourceNodeType concreteType, ResourceNode // @param instName : Name to give instruction(s) Value *BuilderImpl::CreateLoadPushConstantsPtr(const Twine &instName) { Value *ptr; - const bool isIndirect = getPipelineState()->getOptions().resourceLayoutScheme == ResourceLayoutScheme::Indirect; - if (isIndirect) { - const ResourceNode *topNode = m_pipelineState->findPushConstantResourceNode(m_shaderStage); - assert(topNode); - const ResourceNode subNode = topNode->innerTable[0]; - Value *highHalf = getInt32(HighAddrPc); - ptr = CreateNamedCall(lgcName::DescriptorTableAddr, getPtrTy(ADDR_SPACE_CONST), - {getInt32(unsigned(ResourceNodeType::PushConst)), - getInt32(unsigned(ResourceNodeType::PushConst)), getInt64(subNode.set), - getInt32(subNode.binding), highHalf}, - Attribute::ReadNone); + const ResourceNode *topNode = m_pipelineState->findPushConstantResourceNode(m_shaderStage); + assert(topNode); + if (topNode->concreteType == ResourceNodeType::DescriptorTableVaPtr) { + AddressExtender extender(GetInsertBlock()->getParent()); + ptr = create(getInt32Ty(), topNode->offsetInDwords * 4); + ptr = extender.extendWithPc(ptr, getPtrTy(ADDR_SPACE_CONST), *this); } else { - // Get the push const pointer. 
If subsequent code only uses this with constant GEPs and loads, - // then PatchEntryPointMutate might be able to "unspill" it so the code uses shader entry SGPRs - // directly instead of loading from the spill table. - std::string callName = lgcName::PushConst; - addTypeMangling(getPtrTy(ADDR_SPACE_CONST), {}, callName); - ptr = CreateNamedCall(callName, getPtrTy(ADDR_SPACE_CONST), {}, Attribute::ReadOnly); + assert(topNode->concreteType == ResourceNodeType::PushConst); + ptr = create(topNode->offsetInDwords * 4); } ptr->setName(instName); return ptr; @@ -382,23 +319,16 @@ bool BuilderImpl::useVertexBufferDescArray() { // Get the stride (in bytes) of a descriptor. Returns an i32 value. // // @param descType : Descriptor type -// @param descSet : Descriptor set -// @param binding : Descriptor binding // @param node : The descriptor node (nullptr for shader compilation) -// @param instName : Name to give instruction(s) -Value *BuilderImpl::getStride(ResourceNodeType descType, uint64_t descSet, unsigned binding, const ResourceNode *node) { - if (node && node->immutableSize != 0 && descType == ResourceNodeType::DescriptorSampler) { +Value *BuilderImpl::getStride(ResourceNodeType descType, const ResourceNode *node) { + assert(node); + + if (node->immutableSize != 0 && descType == ResourceNodeType::DescriptorSampler) { // This is an immutable sampler. Because we put the immutable value into a static variable, the stride is // always the size of the descriptor. return getInt32(DescriptorSizeSampler); } - if (m_pipelineState->isUnlinked() && m_pipelineState->getUserDataNodes().empty()) { - // Shader compilation: Get byte stride using a reloc. - return CreateRelocationConstant(reloc::DescriptorStride + Twine(descSet) + "_" + Twine(binding)); - } - // Pipeline compilation: Get the stride from the node. 
- assert(node); return getInt32(node->stride * sizeof(uint32_t)); } @@ -406,82 +336,33 @@ Value *BuilderImpl::getStride(ResourceNodeType descType, uint64_t descSet, unsig // Get a pointer to a descriptor, as a pointer to i8 // // @param concreteType : Concrete resource type -// @param abstractType : Abstract Resource type -// @param descSet : Descriptor set -// @param binding : Binding // @param topNode : Node in top-level descriptor table (nullptr for shader compilation) // @param node : The descriptor node itself (nullptr for shader compilation) -Value *BuilderImpl::getDescPtr(ResourceNodeType concreteType, ResourceNodeType abstractType, uint64_t descSet, - unsigned binding, const ResourceNode *topNode, const ResourceNode *node) { - Value *descPtr = nullptr; - - auto GetSpillTablePtr = [this]() { - // The descriptor is in the top-level table. (This can only happen for a DescriptorBuffer.) Contrary - // to what used to happen, we just load from the spill table, so we can get a pointer to the descriptor. - // The spill table gets returned as a pointer to array of i8. - return CreateNamedCall(lgcName::SpillTable, getInt8Ty()->getPointerTo(ADDR_SPACE_CONST), {}, Attribute::ReadNone); - }; - - auto GetDescriptorSetPtr = [this, node, topNode, concreteType, abstractType, descSet, binding]() -> Value * { - // Get the descriptor table pointer for the descriptor at the given set and binding, which might be passed as a - // user SGPR to the shader. - // The args to the lgc.descriptor.table.addr call are: - // - requested descriptor type - // - descriptor set number - // - descriptor binding number - // - value for high 32 bits of the pointer; HighAddrPc to use PC - if (node || topNode || concreteType != ResourceNodeType::DescriptorFmask) { - unsigned highAddrOfFmask = m_pipelineState->getOptions().highAddrOfFmask; - bool isFmask = concreteType == ResourceNodeType::DescriptorFmask; - Value *highHalf = getInt32(isFmask ? 
highAddrOfFmask : HighAddrPc); - return CreateNamedCall(lgcName::DescriptorTableAddr, getInt8Ty()->getPointerTo(ADDR_SPACE_CONST), - {getInt32(unsigned(concreteType)), getInt32(unsigned(abstractType)), getInt64(descSet), - getInt32(binding), highHalf}, - Attribute::ReadNone); - } - // This should be an unlinked shader, and we will use a relocation for the high half of the address. - assert(m_pipelineState->isUnlinked() && - "Cannot add shadow descriptor relocations unless building an unlinked shader."); - - // Get the address when the shadow table is disabled. - Value *nonShadowAddr = CreateNamedCall(lgcName::DescriptorTableAddr, getInt8Ty()->getPointerTo(ADDR_SPACE_CONST), - {getInt32(unsigned(concreteType)), getInt32(unsigned(abstractType)), - getInt64(descSet), getInt32(binding), getInt32(HighAddrPc)}, - Attribute::ReadNone); - - // Get the address using a relocation when the shadow table is enabled. - Value *shadowDescriptorReloc = CreateRelocationConstant(reloc::ShadowDescriptorTable); - Value *shadowAddr = CreateNamedCall(lgcName::DescriptorTableAddr, getInt8Ty()->getPointerTo(ADDR_SPACE_CONST), - {getInt32(unsigned(concreteType)), getInt32(unsigned(abstractType)), - getInt64(descSet), getInt32(binding), shadowDescriptorReloc}, - Attribute::ReadNone); - - // Use a relocation to select between the two. - Value *useShadowReloc = CreateRelocationConstant(reloc::ShadowDescriptorTableEnabled); - Value *useShadowTable = CreateICmpNE(useShadowReloc, getInt32(0)); - return CreateSelect(useShadowTable, shadowAddr, nonShadowAddr); - }; - - // Get the descriptor table pointer. - if (node && node == topNode) { - // Ensure we mark spill table usage. 
- descPtr = GetSpillTablePtr(); - getPipelineState()->getPalMetadata()->setUserDataSpillUsage(node->offsetInDwords); - } else { - descPtr = GetDescriptorSetPtr(); - } +// @param binding : Binding +Value *BuilderImpl::getDescPtr(ResourceNodeType concreteType, const ResourceNode *topNode, const ResourceNode *node, + unsigned binding) { + assert(node && topNode); // Get the offset for the descriptor. Where we are getting the second part of a combined resource, // add on the size of the first part. - unsigned offsetInDwords = node->offsetInDwords; - offsetInDwords += (binding - node->binding) * node->stride; - + unsigned offsetInDwords = node->offsetInDwords + (binding - node->binding) * node->stride; unsigned offsetInBytes = offsetInDwords * 4; if (concreteType == ResourceNodeType::DescriptorSampler && node->concreteType == ResourceNodeType::DescriptorCombinedTexture) offsetInBytes += DescriptorSizeResource; - return CreateAddByteOffset(descPtr, getInt32(offsetInBytes)); + if (node == topNode) + return create(offsetInBytes); + + // Get the descriptor table pointer for the descriptor at the given set and binding, which might be passed as a + // user SGPR to the shader. + unsigned highAddrOfFmask = m_pipelineState->getOptions().highAddrOfFmask; + bool isFmask = concreteType == ResourceNodeType::DescriptorFmask; + Value *highHalf = getInt32(isFmask ? 
highAddrOfFmask : HighAddrPc); + AddressExtender extender(GetInsertBlock()->getParent()); + Value *descPtr = create(getInt32Ty(), topNode->offsetInDwords * 4); + descPtr = extender.extend(descPtr, highHalf, getPtrTy(ADDR_SPACE_CONST), *this); + return CreateConstGEP1_32(getInt8Ty(), descPtr, offsetInBytes); } // ===================================================================================================================== diff --git a/lgc/builder/ImageBuilder.cpp b/lgc/builder/ImageBuilder.cpp index ad4ccb1e9e..91eb11cb7f 100644 --- a/lgc/builder/ImageBuilder.cpp +++ b/lgc/builder/ImageBuilder.cpp @@ -1373,9 +1373,12 @@ Value *BuilderImpl::CreateImageQueryLevels(unsigned dim, unsigned flags, Value * // @param imageDesc : Image descriptor or texel buffer descriptor // @param instName : Name to give instruction(s) Value *BuilderImpl::CreateImageQuerySamples(unsigned dim, unsigned flags, Value *imageDesc, const Twine &instName) { - // Extract LAST_LEVEL (SQ_IMG_RSRC_WORD3, [19:16]) Value *descWord3 = CreateExtractElement(imageDesc, 3); - Value *lastLevel = CreateIntrinsic(Intrinsic::amdgcn_ubfe, getInt32Ty(), {descWord3, getInt32(16), getInt32(4)}); + Value *lastLevel = nullptr; + if (m_pipelineState->getTargetInfo().getGfxIpVersion().major <= 11) { + // Extract LAST_LEVEL (SQ_IMG_RSRC_WORD3, [19:16]) + lastLevel = CreateIntrinsic(Intrinsic::amdgcn_ubfe, getInt32Ty(), {descWord3, getInt32(16), getInt32(4)}); + } // Sample number = 1 << LAST_LEVEL Value *sampleNumber = CreateShl(getInt32(1), lastLevel); diff --git a/lgc/builder/InOutBuilder.cpp b/lgc/builder/InOutBuilder.cpp index d3eb53ffee..227b6c8d69 100644 --- a/lgc/builder/InOutBuilder.cpp +++ b/lgc/builder/InOutBuilder.cpp @@ -205,14 +205,16 @@ Value *BuilderImpl::readGenericInputOutput(bool isOutput, Type *resultTy, unsign // Fold constant locationOffset into location. (Currently a variable locationOffset is only supported in // TCS, TES, and FS custom interpolation.) 
+ bool isDynLocOffset = true; if (auto constLocOffset = dyn_cast(locationOffset)) { location += constLocOffset->getZExtValue(); locationOffset = getInt32(0); locationCount = (resultTy->getPrimitiveSizeInBits() + 127U) / 128U; + isDynLocOffset = false; } // Mark the usage of the input/output. - markGenericInputOutputUsage(isOutput, location, locationCount, inOutInfo, vertexIndex); + markGenericInputOutputUsage(isOutput, location, locationCount, inOutInfo, vertexIndex, isDynLocOffset); // Generate LLPC call for reading the input/output. Value *result = nullptr; @@ -290,14 +292,17 @@ Instruction *BuilderImpl::CreateWriteGenericOutput(Value *valueToWrite, unsigned // Fold constant locationOffset into location. (Currently a variable locationOffset is only supported in // TCS.) + bool isDynLocOffset = true; if (auto constLocOffset = dyn_cast(locationOffset)) { location += constLocOffset->getZExtValue(); locationOffset = getInt32(0); locationCount = (valueToWrite->getType()->getPrimitiveSizeInBits() + 127U) / 128U; + isDynLocOffset = false; } // Mark the usage of the output. - markGenericInputOutputUsage(/*isOutput=*/true, location, locationCount, outputInfo, vertexOrPrimitiveIndex); + markGenericInputOutputUsage(/*isOutput=*/true, location, locationCount, outputInfo, vertexOrPrimitiveIndex, + isDynLocOffset); // Set up the args for the llpc call. SmallVector args; @@ -370,8 +375,9 @@ Instruction *BuilderImpl::CreateWriteGenericOutput(Value *valueToWrite, unsigned // for mesh shader per-primitive output: primitive index; // for FS custom-interpolated input: auxiliary value; // else nullptr. 
+// @param isDynLocOffset : Whether the location offset is dynamic indexing void BuilderImpl::markGenericInputOutputUsage(bool isOutput, unsigned location, unsigned locationCount, - InOutInfo &inOutInfo, Value *vertexOrPrimIndex) { + InOutInfo &inOutInfo, Value *vertexOrPrimIndex, bool isDynLocOffset) { auto resUsage = getPipelineState()->getShaderResourceUsage(m_shaderStage); // Mark the input or output locations as in use. @@ -417,6 +423,8 @@ void BuilderImpl::markGenericInputOutputUsage(bool isOutput, unsigned location, keepAllLocations = true; } unsigned startLocation = (keepAllLocations ? 0 : location); + // NOTE: The non-invalid value as initial new Location info or new location is used to identify the dynamic indexing + // location. // Non-GS-output case. if (inOutLocInfoMap) { for (unsigned i = startLocation; i < location + locationCount; ++i) { @@ -424,16 +432,16 @@ void BuilderImpl::markGenericInputOutputUsage(bool isOutput, unsigned location, origLocationInfo.setLocation(i); origLocationInfo.setComponent(inOutInfo.getComponent()); auto &newLocationInfo = (*inOutLocInfoMap)[origLocationInfo]; - newLocationInfo.setData(InvalidValue); + newLocationInfo.setData(isDynLocOffset ? i : InvalidValue); } } if (perPatchInOutLocMap) { for (unsigned i = startLocation; i < location + locationCount; ++i) - (*perPatchInOutLocMap)[i] = InvalidValue; + (*perPatchInOutLocMap)[i] = isDynLocOffset ? i : InvalidValue; } if (perPrimitiveInOutLocMap) { for (unsigned i = startLocation; i < location + locationCount; ++i) - (*perPrimitiveInOutLocMap)[i] = InvalidValue; + (*perPrimitiveInOutLocMap)[i] = isDynLocOffset ? i : InvalidValue; } } else { // GS output. We include the stream ID with the location in the map key. @@ -451,6 +459,10 @@ void BuilderImpl::markGenericInputOutputUsage(bool isOutput, unsigned location, // Mark usage for interpolation info. 
markInterpolationInfo(inOutInfo); } + + if (isOutput && m_shaderStage == ShaderStageFragment && inOutInfo.isDualSourceBlendDynamic()) { + m_pipelineState->getColorExportState().dynamicDualSourceBlendEnable = true; + } } // ===================================================================================================================== @@ -792,7 +804,7 @@ Value *BuilderImpl::CreateReadBaryCoord(BuiltInKind builtIn, InOutInfo inputInfo const llvm::Twine &instName) { assert(builtIn == lgc::BuiltInBaryCoord || builtIn == lgc::BuiltInBaryCoordNoPerspKHR); - markBuiltInInputUsage(builtIn, 0); + markBuiltInInputUsage(builtIn, 0, inputInfo); // Force override to per-sample interpolation. if (getPipelineState()->getOptions().enableInterpModePatch && !auxInterpValue && @@ -865,7 +877,7 @@ Value *BuilderImpl::readBuiltIn(bool isOutput, BuiltInKind builtIn, InOutInfo in arraySize = constIndex->getZExtValue() + 1; if (!isOutput) - markBuiltInInputUsage(builtIn, arraySize); + markBuiltInInputUsage(builtIn, arraySize, inOutInfo); else markBuiltInOutputUsage(builtIn, arraySize, InvalidValue); @@ -1417,7 +1429,8 @@ Type *BuilderImpl::getBuiltInTy(BuiltInKind builtIn, InOutInfo inOutInfo) { // @param builtIn : Built-in ID // @param arraySize : Number of array elements for ClipDistance and CullDistance. 
(Multiple calls to this function for // this built-in might have different array sizes; we take the max) -void BuilderImpl::markBuiltInInputUsage(BuiltInKind &builtIn, unsigned arraySize) { +// @param inOutInfo : Extra input/output info (shader-defined array size) +void BuilderImpl::markBuiltInInputUsage(BuiltInKind &builtIn, unsigned arraySize, InOutInfo inOutInfo) { auto &usage = getPipelineState()->getShaderResourceUsage(m_shaderStage)->builtInUsage; assert((builtIn != BuiltInClipDistance && builtIn != BuiltInCullDistance) || arraySize != 0); switch (m_shaderStage) { @@ -1573,6 +1586,8 @@ void BuilderImpl::markBuiltInInputUsage(BuiltInKind &builtIn, unsigned arraySize switch (static_cast(builtIn)) { case BuiltInFragCoord: usage.fs.fragCoord = true; + if (inOutInfo.getInterpMode() == InOutInfo::InterpLocSample) + usage.fs.fragCoordIsSample = true; break; case BuiltInFrontFacing: usage.fs.frontFacing = true; diff --git a/lgc/builder/MiscBuilder.cpp b/lgc/builder/MiscBuilder.cpp index dbed19e786..3aaa0c74df 100644 --- a/lgc/builder/MiscBuilder.cpp +++ b/lgc/builder/MiscBuilder.cpp @@ -46,6 +46,10 @@ using namespace llvm; Instruction *BuilderImpl::CreateEmitVertex(unsigned streamId) { assert(m_shaderStage == ShaderStageGeometry); + // Mark this vertex stream as active if transform feedback is enabled or this is the rasterization stream. + if (m_pipelineState->enableXfb() || m_pipelineState->getRasterizerState().rasterStream == streamId) + m_pipelineState->setVertexStreamActive(streamId); + // Get GsWaveId std::string callName = lgcName::InputImportBuiltIn; callName += "GsWaveId.i32.i32"; @@ -64,6 +68,10 @@ Instruction *BuilderImpl::CreateEmitVertex(unsigned streamId) { Instruction *BuilderImpl::CreateEndPrimitive(unsigned streamId) { assert(m_shaderStage == ShaderStageGeometry); + // Mark this vertex stream as active if transform feedback is enabled or this is the rasterization stream. 
+ if (m_pipelineState->enableXfb() || m_pipelineState->getRasterizerState().rasterStream == streamId) + m_pipelineState->setVertexStreamActive(streamId); + // Get GsWaveId std::string callName = lgcName::InputImportBuiltIn; callName += "GsWaveId.i32.i32"; @@ -125,19 +133,6 @@ Value *BuilderImpl::CreateIsHelperInvocation(const Twine &instName) { return CreateNot(isLive); } -// ===================================================================================================================== -// In the mesh shader, set the actual output size of the primitives and vertices that the mesh shader workgroup will -// emit upon completion. -// -// @param vertexCount : Actual output size of the vertices -// @param primitiveCount : Actual output size of the primitives -// @param instName : Name to give final instruction -// @returns Instruction to set the actual size of mesh outputs -Instruction *BuilderImpl::CreateSetMeshOutputs(Value *vertexCount, Value *primitiveCount, const Twine &instName) { - assert(m_shaderStage == ShaderStageMesh); // Only valid for mesh shader - return CreateNamedCall(lgcName::MeshTaskSetMeshOutputs, getVoidTy(), {vertexCount, primitiveCount}, {}); -} - // ===================================================================================================================== // Create a "readclock". // diff --git a/lgc/builder/SubgroupBuilder.cpp b/lgc/builder/SubgroupBuilder.cpp index 1e74218f72..5d145ebe8b 100644 --- a/lgc/builder/SubgroupBuilder.cpp +++ b/lgc/builder/SubgroupBuilder.cpp @@ -143,6 +143,29 @@ Value *BuilderImpl::CreateSubgroupAllEqual(Value *const value, const Twine &inst return CreateSubgroupAll(compare, instName); } +// ===================================================================================================================== +// Create a subgroup rotate call. +// +// @param value : The value to read from the chosen rotated lane to all active lanes. +// @param delta : The delta/offset added to lane id. 
+// @param clusterSize : The cluster size if exists. +// @param instName : Name to give final instruction. +Value *BuilderImpl::CreateSubgroupRotate(Value *const value, Value *const delta, Value *const clusterSize, + const Twine &instName) { + // LocalId = SubgroupLocalInvocationId + // RotationGroupSize = hasClusterSIze? ClusterSize : SubgroupSize. + // Invocation ID = ((LocalId + Delta) & (RotationGroupSize - 1)) + (LocalId & ~(RotationGroupSize - 1)) + Value *localId = CreateSubgroupMbcnt(getInt64(UINT64_MAX), ""); + Value *invocationId = CreateAdd(localId, delta); + if (clusterSize != nullptr) { + Value *rotationGroupSize = CreateSub(clusterSize, getInt32(1)); + invocationId = + CreateOr(CreateAnd(invocationId, rotationGroupSize), CreateAnd(localId, CreateNot(rotationGroupSize))); + } + + return CreateSubgroupShuffle(value, invocationId, instName); +} + // ===================================================================================================================== // Create a subgroup broadcast call. // @@ -991,12 +1014,14 @@ Value *BuilderImpl::CreateSubgroupClusteredExclusive(GroupArithOp groupArithOp, } // ===================================================================================================================== -// Create a subgroup quad broadcast call. +// Create a quad broadcast call. // // @param value : The value to broadcast across the quad. // @param index : The index in the quad to broadcast the value from. +// @param inWqm : Whether it's in whole quad mode. // @param instName : Name to give final instruction. 
-Value *BuilderImpl::CreateSubgroupQuadBroadcast(Value *const value, Value *const index, const Twine &instName) { +Value *BuilderImpl::CreateSubgroupQuadBroadcast(Value *const value, Value *const index, bool inWqm, + const Twine &instName) { Value *result = PoisonValue::get(value->getType()); const unsigned indexBits = index->getType()->getPrimitiveSizeInBits(); @@ -1026,12 +1051,13 @@ Value *BuilderImpl::CreateSubgroupQuadBroadcast(Value *const value, Value *const compare = CreateICmpEQ(index, getIntN(indexBits, 3)); result = CreateSelect(compare, createDsSwizzle(value, getDsSwizzleQuadMode(3, 3, 3, 3)), result); } - - return createWqm(result); + if (inWqm) + result = createWqm(result); + return result; } // ===================================================================================================================== -// Create a subgroup quad swap horizontal call. +// Create a quad swap horizontal call. // // @param value : The value to swap. // @param instName : Name to give final instruction. @@ -1043,7 +1069,7 @@ Value *BuilderImpl::CreateSubgroupQuadSwapHorizontal(Value *const value, const T } // ===================================================================================================================== -// Create a subgroup quad swap vertical call. +// Create a quad swap vertical call. // // @param value : The value to swap. // @param instName : Name to give final instruction. @@ -1055,7 +1081,7 @@ Value *BuilderImpl::CreateSubgroupQuadSwapVertical(Value *const value, const Twi } // ===================================================================================================================== -// Create a subgroup quadswapdiagonal call. +// Create a quadswapdiagonal call. // // @param value : The value to swap. // @param instName : Name to give final instruction. 
@@ -1067,7 +1093,7 @@ Value *BuilderImpl::CreateSubgroupQuadSwapDiagonal(Value *const value, const Twi } // ===================================================================================================================== -// Create a subgroup quad swap swizzle. +// Create a quad swap swizzle. // // @param value : The value to swizzle. // @param offset : The value to specify the swizzle offsets. diff --git a/lgc/elfLinker/ColorExportShader.cpp b/lgc/elfLinker/ColorExportShader.cpp index 35158524d4..eb2180a72d 100644 --- a/lgc/elfLinker/ColorExportShader.cpp +++ b/lgc/elfLinker/ColorExportShader.cpp @@ -113,12 +113,6 @@ Module *ColorExportShader::generate() { auto ret = cast(colorExportFunc->back().getTerminator()); BuilderBase builder(ret); - if (m_pipelineState->getOptions().enableColorExportShader) { - // NOTE: See LowerFragColorExport::jumpColorExport. Fragment shader uses a call amdgpu_gfx. In the amdgpu_gfx - // calling convention, the callee is expected to have the necessary waitcnt instructions. - builder.CreateIntrinsic(Intrinsic::amdgcn_s_waitcnt, {}, {builder.getInt32(0)}); - } - SmallVector values(MaxColorTargets + 1, nullptr); for (unsigned idx = 0; idx != m_exports.size(); ++idx) { values[m_exports[idx].hwColorTarget] = colorExportFunc->getArg(idx); @@ -126,6 +120,13 @@ Module *ColorExportShader::generate() { bool dummyExport = m_lgcContext->getTargetInfo().getGfxIpVersion().major < 10 || m_killEnabled; fragColorExport.generateExportInstructions(m_exports, values, m_exportFormat, dummyExport, builder); + + if (m_pipelineState->getOptions().enableColorExportShader) { + builder.CreateIntrinsic(Intrinsic::amdgcn_endpgm, {}, {}); + builder.CreateUnreachable(); + ret->eraseFromParent(); + } + return colorExportFunc->getParent(); } @@ -147,7 +148,11 @@ Function *ColorExportShader::createColorExportFunc() { // Create the function. Mark SGPR inputs as "inreg". 
Function *func = Function::Create(funcTy, GlobalValue::ExternalLinkage, getGlueShaderName(), module); - func->setCallingConv(CallingConv::AMDGPU_PS); + if (m_pipelineState->getOptions().enableColorExportShader) + func->setCallingConv(CallingConv::AMDGPU_Gfx); + else + func->setCallingConv(CallingConv::AMDGPU_PS); + func->setDLLStorageClass(GlobalValue::DLLExportStorageClass); setShaderStage(func, ShaderStageFragment); @@ -195,4 +200,5 @@ void ColorExportShader::updatePalMetadata(PalMetadata &palMetadata) { palMetadata.updateSpiShaderColFormat(finalExportFormats); palMetadata.updateCbShaderMask(m_exports); + palMetadata.updateDbShaderControl(); } diff --git a/lgc/elfLinker/ElfLinker.cpp b/lgc/elfLinker/ElfLinker.cpp index cb5a9104c8..541f8ab204 100644 --- a/lgc/elfLinker/ElfLinker.cpp +++ b/lgc/elfLinker/ElfLinker.cpp @@ -173,9 +173,6 @@ class ElfLinkerImpl final : public ElfLinker { // Link the unlinked shader/part-pipeline ELFs and the compiled glue code into a pipeline ELF bool link(raw_pwrite_stream &outStream) override final; - // Returns true if the fragment shader uses a builtin input that gets mapped. - bool fragmentShaderUsesMappedBuiltInInputs() override final; - // ----------------------------------------------------------------------------------------------------------------- // Accessors @@ -640,12 +637,6 @@ bool ElfLinkerImpl::link(raw_pwrite_stream &outStream) { return m_pipelineState->getLastError() == ""; } -// ===================================================================================================================== -// Returns true if the fragment shader uses a builtin input that gets mapped. 
-bool ElfLinkerImpl::fragmentShaderUsesMappedBuiltInInputs() { - return m_pipelineState->getPalMetadata()->fragmentShaderUsesMappedBuiltInInputs(); -} - // ===================================================================================================================== // Get string index in output ELF, adding to string table if necessary unsigned ElfLinkerImpl::getStringIndex(StringRef string) { diff --git a/lgc/elfLinker/RelocHandler.cpp b/lgc/elfLinker/RelocHandler.cpp index 7215b0809f..a1890d06ac 100644 --- a/lgc/elfLinker/RelocHandler.cpp +++ b/lgc/elfLinker/RelocHandler.cpp @@ -162,22 +162,6 @@ bool RelocHandler::getValue(StringRef name, uint64_t &value) { } } - if (name.startswith(reloc::DescriptorStride)) { - // Descriptor stride in bytes. - unsigned descSet = 0; - unsigned binding = 0; - ResourceNodeType type = ResourceNodeType::Unknown; - if (parseDescSetBinding(name.drop_front(strlen(reloc::DescriptorStride)), descSet, binding, type)) { - const ResourceNode *outerNode = nullptr; - const ResourceNode *node = nullptr; - std::tie(outerNode, node) = getPipelineState()->findResourceNode(type, descSet, binding); - if (!node) - report_fatal_error("No resource node for " + name); - value = node->stride * sizeof(uint32_t); - return true; - } - } - if (name.startswith(reloc::CompactBuffer)) { // Descriptor stride in bytes. 
unsigned descSet = 0; @@ -207,15 +191,6 @@ bool RelocHandler::getValue(StringRef name, uint64_t &value) { getPipelineState()->getPalMetadata()->setUserDataSpillUsage(pushConstantNode->offsetInDwords); return true; } - if (name == reloc::ShadowDescriptorTableEnabled) { - value = m_pipelineState->getOptions().highAddrOfFmask != ShadowDescriptorTableDisable; - return true; - } - - if (name == reloc::ShadowDescriptorTable) { - value = m_pipelineState->getOptions().highAddrOfFmask; - return true; - } return false; } diff --git a/lgc/imported/chip/gfx9/gfx9_plus_merged_enum.h b/lgc/imported/chip/gfx9/gfx9_plus_merged_enum.h index 79a66aa60a..039ce8de3f 100644 --- a/lgc/imported/chip/gfx9/gfx9_plus_merged_enum.h +++ b/lgc/imported/chip/gfx9/gfx9_plus_merged_enum.h @@ -51,7 +51,7 @@ typedef enum BinningMode { FORCE_BINNING_ON = 0x00000001, DISABLE_BINNING_USE_NEW_SC__GFX09_10 = 0x00000002, DISABLE_BINNING_USE_LEGACY_SC__GFX09_10 = 0x00000003, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 BINNING_ONE_PRIM_PER_BATCH__GFX11 = 0x00000002, BINNING_DISABLED__GFX11 = 0x00000003, #endif @@ -87,7 +87,7 @@ typedef enum BlendOp { BLEND_INV_SRC1_ALPHA__GFX09_10 = 0x00000012, BLEND_CONSTANT_ALPHA__GFX09_10 = 0x00000013, BLEND_ONE_MINUS_CONSTANT_ALPHA__GFX09_10 = 0x00000014, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 BLEND_CONSTANT_COLOR__GFX11 = 0x0000000b, BLEND_ONE_MINUS_CONSTANT_COLOR__GFX11 = 0x0000000c, BLEND_SRC1_COLOR__GFX11 = 0x0000000d, @@ -160,7 +160,7 @@ typedef enum BUF_FMT { BUF_FMT_16_16_UINT = 0x0000001b, BUF_FMT_16_16_SINT = 0x0000001c, BUF_FMT_16_16_FLOAT = 0x0000001d, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 BUF_FMT_10_11_11_FLOAT__GFX104PLUS = 0x0000001e, BUF_FMT_11_11_10_FLOAT__GFX104PLUS = 
0x0000001f, BUF_FMT_10_10_10_2_UNORM__GFX104PLUS = 0x00000020, @@ -317,7 +317,7 @@ typedef enum CBMode { CB_FMASK_DECOMPRESS__GFX09_10 = 0x00000005, CB_DCC_DECOMPRESS__GFX09_10 = 0x00000006, CB_RESERVED__GFX10 = 0x00000007, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 CB_DCC_DECOMPRESS__GFX11 = 0x00000003, CB_RESERVED__GFX11 = 0x00000004, #endif @@ -864,7 +864,7 @@ typedef enum CBPerfSel { CB_PERF_SEL_CC_CACHE_256BS_SAVED_DUE_TO_QSB__GFX10CORE = 0x000001c2, CB_PERF_SEL_FC_CACHE_FMASK_NO_FETCH__GFX10CORE = 0x000001c3, CB_PERF_SEL_CC_CACHE_SECTOR_HIT__GFX10CORE = 0x000001c4, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 CB_PERF_SEL_DRAWN_BUSY__GFX11 = 0x00000002, CB_PERF_SEL_DRAWN_PIXEL__GFX11 = 0x00000003, CB_PERF_SEL_DRAWN_QUAD__GFX11 = 0x00000004, @@ -1133,11 +1133,11 @@ typedef enum CBPerfSel { constexpr unsigned int MaxCBPerfSelVg10_Vg12_Vg20_Rv1x_Rv2x = CB_PERF_SEL_CC_BB_BLEND_PIXEL_VLD__GFX09_10; constexpr unsigned int MaxCBPerfSelRn = CB_PERF_SEL_CC_DCC_KEY_VALUE__CONST_CLEAR_AC11__RN; constexpr unsigned int MaxCBPerfSelGfx10Core = CB_PERF_SEL_CC_CACHE_SECTOR_HIT__GFX10CORE; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 constexpr unsigned int MaxCBPerfSelGfx11 = CB_PERF_SEL_EXPORT_KILLED_BY_NULL_TARGET_SHADER_MASK__GFX11; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef enum CBRamList { CB_DCG_CCC_CAS_TAG_ARRAY = 0x00000000, CB_DCG_CCC_CAS_FRAG_PTR = 0x00000001, @@ -1385,7 +1385,7 @@ typedef enum CHA_PERF_SEL { CHA_PERF_SEL_STALL_RET_CONFLICT_CHC4__NV24 = 0x00000026, CHA_PERF_SEL_CYCLE__NV24 = 0x00000027, #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI31 || 
CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 CHA_PERF_SEL_STALL_CHC4__NV3X = 0x00000005, CHA_PERF_SEL_STALL_CHC5__NV3X = 0x00000006, CHA_PERF_SEL_REQUEST_CHC0__NV3X = 0x00000007, @@ -1479,7 +1479,7 @@ typedef enum CHA_PERF_SEL { } CHA_PERF_SEL; constexpr unsigned int MaxChaPerfSelGfx101 = CHA_PERF_SEL_CYCLE__GFX101; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 constexpr unsigned int MaxChaPerfSelNv3x = CHA_PERF_SEL_CYCLE__NV3X; #endif #if CHIP_HDR_NAVI24 @@ -1704,7 +1704,7 @@ typedef enum CHCG_PERF_SEL { CHCG_PERF_SEL_REQ_CLIENT18__NV24 = 0x00000026, CHCG_PERF_SEL_REQ_CLIENT19__NV24 = 0x00000027, #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 CHCG_PERF_SEL_CYCLE__NV3X = 0x00000000, CHCG_PERF_SEL_BUSY__NV3X = 0x00000001, CHCG_PERF_SEL_STARVE__NV3X = 0x00000002, @@ -1765,7 +1765,7 @@ constexpr unsigned int MaxChcgPerfSelNv22 = CHCG_PERF_SEL_REQ_CLIEN #if CHIP_HDR_NAVI21 constexpr unsigned int MaxChcgPerfSelNv21 = CHCG_PERF_SEL_REQ_CLIENT19__NV21; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 constexpr unsigned int MaxChcgPerfSelNv3x = CHCG_PERF_SEL_REQ_CLIENT23__NV3X; #endif @@ -1810,7 +1810,7 @@ typedef enum CHC_PERF_SEL { CHC_PERF_SEL_REQ_CLIENT17__GFX103PLUSEXCLUSIVE = 0x00000025, CHC_PERF_SEL_REQ_CLIENT18__GFX103PLUSEXCLUSIVE = 0x00000026, CHC_PERF_SEL_REQ_CLIENT19__GFX103PLUSEXCLUSIVE = 0x00000027, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 CHC_PERF_SEL_REQ_CLIENT20__GFX11 = 0x00000028, CHC_PERF_SEL_REQ_CLIENT21__GFX11 = 0x00000029, CHC_PERF_SEL_REQ_CLIENT22__GFX11 = 0x0000002a, @@ -1820,7 +1820,7 @@ typedef enum CHC_PERF_SEL { constexpr unsigned int MaxChcPerfSelGfx101 = CHC_PERF_SEL_REQ_CLIENT14; constexpr unsigned int MaxChcPerfSelGfx103Derivative = CHC_PERF_SEL_REQ_CLIENT19__GFX103PLUSEXCLUSIVE; -#if 
CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 constexpr unsigned int MaxChcPerfSelGfx11 = CHC_PERF_SEL_REQ_CLIENT23__GFX11; #endif @@ -2028,7 +2028,7 @@ typedef enum CPC_PERFCOUNT_SEL { CPC_PERF_SEL_MEC_INSTR_CACHE_MISS__GFX10COREPLUS = 0x0000002c, CPC_PERF_SEL_MES_THREAD0__GFX10COREPLUS = 0x0000002d, CPC_PERF_SEL_MES_THREAD1__GFX10COREPLUS = 0x0000002e, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 CPC_PERF_SEL_ME1_STALL_WAIT_ON_MEM_READ__GFX11 = 0x00000009, CPC_PERF_SEL_ME1_STALL_WAIT_ON_MEM_WRITE__GFX11 = 0x0000000a, CPC_PERF_SEL_ME2_STALL_WAIT_ON_MEM_READ__GFX11 = 0x00000011, @@ -2047,7 +2047,7 @@ typedef enum CPC_PERFCOUNT_SEL { constexpr unsigned int MaxCpcPerfcountSelGfx09 = CPC_PERF_SEL_ME2_DC1_SPI_BUSY__CORE; constexpr unsigned int MaxCpcPerfcountSelGfx10Core = CPC_PERF_SEL_MES_THREAD1__GFX10COREPLUS; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 constexpr unsigned int MaxCpcPerfcountSelGfx11 = CPC_PERF_SEL_MEC_THREAD3__GFX11; #endif @@ -2129,7 +2129,7 @@ typedef enum CPF_PERFCOUNT_SEL { CPF_PERF_SEL_CSF_BUSY_FOR_FETCHING_DB__GFX10COREPLUS = 0x00000025, CPF_PERF_SEL_CPF_UTCL2IU_XACK__GFX10COREPLUS = 0x00000026, CPF_PERF_SEL_CPF_UTCL2IU_XNACK__GFX10COREPLUS = 0x00000027, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 CPF_PERF_SEL_CP_SDMA_MNGR_DMA_REQ__GFX11 = 0x00000028, CPF_PERF_SEL_CP_SDMA_MNGR_DMA_DONE__GFX11 = 0x00000029, CPF_PERF_SEL_CP_SDMA_MNGR_LATENCY__GFX11 = 0x0000002a, @@ -2139,7 +2139,7 @@ typedef enum CPF_PERFCOUNT_SEL { constexpr unsigned int MaxCpfPerfcountSelGfx09 = CPF_PERF_SEL_CPF_UTCL2IU_STALL__GFX09; constexpr unsigned int MaxCpfPerfcountSelGfx10Core = 
CPF_PERF_SEL_CPF_UTCL2IU_XNACK__GFX10COREPLUS; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 constexpr unsigned int MaxCpfPerfcountSelGfx11 = CPF_PERF_SEL_CP_SDMA_MNGR_SDMABUSY__GFX11; #endif @@ -2313,7 +2313,7 @@ typedef enum CPG_PERFCOUNT_SEL { CPG_PERF_SEL_LOAD_STALLED_ON_SET_COHERENCY__GFX10PLUS = 0x0000001f, CPG_PERF_SEL_DYNAMIC_CLK_VALID__GFX10PLUS = 0x00000020, CPG_PERF_SEL_REGISTER_CLK_VALID__GFX10PLUS = 0x00000021, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 CPG_PERF_SEL_PFP_PWS_STALLED0__GFX11 = 0x00000052, CPG_PERF_SEL_ME_PWS_STALLED0__GFX11 = 0x00000053, CPG_PERF_SEL_PFP_VGTDMA_INDR_STRUCT_BYPASS0__GFX11 = 0x00000054, @@ -2420,7 +2420,7 @@ constexpr unsigned int MaxCpgPerfcountSelVg10_Vg12_Vg20_Rn = CPG_PERF_SEL_CPG_UT constexpr unsigned int MaxCpgPerfcountSelRv1x_Rv2x = CPG_PERF_SEL_CPG_UTCL2IU_STALL__RV1X_RV2X; constexpr unsigned int MaxCpgPerfcountSelGfx103 = CPG_PERF_SEL_DMA_FETCHER_STALLED_ON_ROQ_FULL__GFX10COREPLUS; constexpr unsigned int MaxCpgPerfcountSelGfx101 = CPG_PERF_SEL_DMA_FETCHER_STALLED_ON_ROQ_FULL__GFX10COREPLUS; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 constexpr unsigned int MaxCpgPerfcountSelGfx11 = CPG_PERF_SEL_PFP_VGTDMA_DB_ROQ_DATA_STALL1__GFX11; #endif @@ -2481,7 +2481,7 @@ typedef enum DepthFormat { DEPTH_X24_8_32_FLOAT = 0x00000007, } DepthFormat; -#if CHIP_HDR_NAVI21|| CHIP_HDR_NAVI22|| CHIP_HDR_NAVI23|| CHIP_HDR_NAVI24|| CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI21 || CHIP_HDR_NAVI22 || CHIP_HDR_NAVI23 || CHIP_HDR_NAVI24 || CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 typedef enum DF_MALL_PERF_SEL { #if CHIP_HDR_NAVI21 DF_MALL_PERF_SEL_ML_MTQ_OCC__NV21 = 0x00000000, @@ -2563,7 +2563,7 @@ typedef enum DF_MALL_PERF_SEL { 
DF_MALL_PERF_SEL_MALL_SDP_LAT_HIST_GT500__NV24 = 0x00000036, DF_MALL_PERF_SEL_MALL_SDP_LAT_HIST_GT1000__NV24 = 0x00000037, #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 DF_MALL_PERF_SEL_ML_MTQ_OCC__NV3X = 0x00000000, DF_MALL_PERF_SEL_ML_MRS_OCC__NV3X = 0x00000001, DF_MALL_PERF_SEL_ML_REQ__NV3X = 0x00000002, @@ -2585,7 +2585,7 @@ typedef enum DF_MALL_PERF_SEL { #endif } DF_MALL_PERF_SEL; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 constexpr unsigned int MaxDfMallPerfSelNv3x = DF_MALL_PERF_SEL_MALL_SDP_LAT_HIST_GT1000__NV3X; #endif #if CHIP_HDR_NAVI24 @@ -2770,7 +2770,7 @@ typedef enum GCRPerfSel { GCR_PERF_SEL_UTCL2_OUT_OF_CREDIT_EVENT__GFX10COREPLUS = 0x0000005b, GCR_PERF_SEL_UTCL2_INFLIGHT_REQ__GFX10COREPLUS = 0x0000005c, GCR_PERF_SEL_UTCL2_FILTERED_RET__GFX10COREPLUS = 0x0000005d, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 GCR_PERF_SEL_SDMA0_GL1_TLB_SHOOTDOWN_REQ__GFX11 = 0x00000010, GCR_PERF_SEL_CPC_GL1_TLB_SHOOTDOWN_REQ__GFX11 = 0x00000030, GCR_PERF_SEL_CPG_GL1_TLB_SHOOTDOWN_REQ__GFX11 = 0x00000040, @@ -2944,7 +2944,7 @@ typedef enum GCRPerfSel { GCR_PERF_SEL_PIO_TCP_REQ__NV24 = 0x0000006c, GCR_PERF_SEL_PIO_TCP_TLB_SHOOTDOWN_REQ__NV24 = 0x0000006d, #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 GCR_PERF_SEL_SDMA1_ALL_REQ__NV3X = 0x00000011, GCR_PERF_SEL_SDMA1_GL2_RANGE_REQ__NV3X = 0x00000012, GCR_PERF_SEL_SDMA1_GL2_RANGE_LT16K_REQ__NV3X = 0x00000013, @@ -3120,7 +3120,7 @@ constexpr unsigned int MaxGCRPerfSelNv23 = GCR_PERF_SEL_PIO_TCP_TL #if CHIP_HDR_NAVI22 constexpr unsigned int MaxGCRPerfSelNv22 = GCR_PERF_SEL_PIO_TCP_TLB_SHOOTDOWN_REQ__NV22; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 constexpr unsigned int MaxGCRPerfSelNv3x = 
GCR_PERF_SEL_PIO_GL1_TLB_SHOOTDOWN_REQ__NV3X; #endif #if CHIP_HDR_PHOENIX1 @@ -3491,7 +3491,7 @@ typedef enum GDS_PERFCOUNT_SELECT { GDS_PERF_SEL_SE3_SH1_GDS_SHORT_OP__GFX103 = 0x00000076, GDS_PERF_SEL_GWS_RELEASED__GFX103 = 0x00000077, GDS_PERF_SEL_GWS_BYPASS__GFX103 = 0x00000078, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 GDS_PERF_SEL_WR_COMP__GFX11 = 0x00000000, GDS_PERF_SEL_WBUF_WR__GFX11 = 0x00000001, GDS_PERF_SEL_SE0_NORET__GFX11 = 0x00000002, @@ -3646,7 +3646,7 @@ typedef enum GDS_PERFCOUNT_SELECT { constexpr unsigned int MaxGdsPerfcountSelectGfx103 = GDS_PERF_SEL_GWS_BYPASS__GFX103; constexpr unsigned int MaxGdsPerfcountSelectGfx09 = GDS_PERF_SEL_GWS_BYPASS__GFX09; constexpr unsigned int MaxGdsPerfcountSelectGfx101 = GDS_PERF_SEL_GWS_BYPASS__GFX101; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 constexpr unsigned int MaxGdsPerfcountSelectGfx11 = GDS_PERF_SEL_SE7_GS_WAVE_ID_VALID__GFX11; #endif @@ -3691,7 +3691,7 @@ typedef enum GE1_PERFCOUNT_SELECT { ge1_rbiu_di_fifo_starved_p1__GFX103COREPLUS = 0x00000025, ge1_rbiu_dr_fifo_stalled_p1__GFX103COREPLUS = 0x00000026, ge1_rbiu_dr_fifo_starved_p1__GFX103COREPLUS = 0x00000027, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 ge1_small_draws_one_instance__GFX11 = 0x00000020, ge1_unopt_multi_instance_draws__GFX11 = 0x00000023, #endif @@ -3763,7 +3763,7 @@ typedef enum GE2_DIST_PERFCOUNT_SELECT { ge_dist_hs_done_latency__GFX103DERIVATIVE = 0x0000003c, ge_dist_distributer_busy__GFX103DERIVATIVE = 0x0000003d, ge_tf_ret_data_stalling_hs_done__GFX103DERIVATIVE = 0x0000003e, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 ge_dist_hs_done_latency_se4__GFX11 = 
0x00000005, ge_dist_hs_done_latency_se5__GFX11 = 0x00000006, ge_dist_hs_done_latency_se6__GFX11 = 0x00000007, @@ -3864,7 +3864,7 @@ typedef enum GE2_DIST_PERFCOUNT_SELECT { } GE2_DIST_PERFCOUNT_SELECT; constexpr unsigned int MaxGe2DistPerfcountSelectGfx103Derivative = ge_tf_ret_data_stalling_hs_done__GFX103DERIVATIVE; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 constexpr unsigned int MaxGe2DistPerfcountSelectGfx11 = ge_agm_gcr_combine__GFX11; #endif @@ -4008,7 +4008,7 @@ typedef enum GE2_SE_PERFCOUNT_SELECT { ge_se_es_done__GFX103DERIVATIVE = 0x00000005, ge_se_es_done_latency__GFX103DERIVATIVE = 0x00000006, ge_se_es_flush__GFX103DERIVATIVE = 0x00000007, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 ge_se_ds_prims__GFX11 = 0x00000000, ge_se_es_thread_groups__GFX11 = 0x00000001, ge_se_esvert_stalled_gsprim__GFX11 = 0x00000002, @@ -4101,7 +4101,7 @@ typedef enum GE2_SE_PERFCOUNT_SELECT { } GE2_SE_PERFCOUNT_SELECT; constexpr unsigned int MaxGe2SePerfcountSelectGfx103 = ge_hs_stall_tfmm_fifo_full__GFX103; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 constexpr unsigned int MaxGe2SePerfcountSelectGfx11 = ge_ngg_busy_base__GFX11; #endif @@ -4605,7 +4605,7 @@ typedef enum GL1C_PERF_SEL { GL1C_PERF_SEL_UTCL0_TRANSLATION_HIT__GFX103PLUSEXCLUSIVE = 0x00000042, GL1C_PERF_SEL_UTCL0_TRANSLATION_MISS__GFX103PLUSEXCLUSIVE = 0x00000043, GL1C_PERF_SEL_UTCL0_PERMISSION_MISS__GFX103PLUSEXCLUSIVE = 0x00000044, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 GL1C_PERF_SEL_UTCL0_MISS_UNDER_MISS__GFX11 = 0x00000045, GL1C_PERF_SEL_UTCL0_LFIFO_FULL__GFX11 = 0x00000046, GL1C_PERF_SEL_UTCL0_STALL_INFLIGHT_MAX__GFX11 = 0x00000047, @@ 
-4626,7 +4626,7 @@ typedef enum GL1C_PERF_SEL { constexpr unsigned int MaxGl1cPerfSelGfx101 = GL1C_PERF_SEL_REQ_CLIENT27__GFX101; constexpr unsigned int MaxGl1cPerfSelGfx103Derivative = GL1C_PERF_SEL_UTCL0_UTCL1_XNACK_NO_RETRY_FAULT__GFX103DERIVATIVE; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 constexpr unsigned int MaxGl1cPerfSelGfx11 = GL1C_PERF_SEL_UTCL0_UTCL1_XNACK_NO_RETRY_FAULT__GFX11; #endif @@ -4733,7 +4733,7 @@ typedef enum GL2A_PERF_SEL { GL2A_PERF_SEL_REQ_BURST_CLIENT13 = 0x00000058, GL2A_PERF_SEL_REQ_BURST_CLIENT14 = 0x00000059, GL2A_PERF_SEL_REQ_BURST_CLIENT15 = 0x0000005a, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 GL2A_PERF_SEL_RTN_CREDIT_STALL_CLIENT0__GFX104PLUS = 0x0000005b, GL2A_PERF_SEL_RTN_CREDIT_STALL_CLIENT1__GFX104PLUS = 0x0000005c, GL2A_PERF_SEL_RTN_CREDIT_STALL_CLIENT2__GFX104PLUS = 0x0000005d, @@ -4754,7 +4754,7 @@ typedef enum GL2A_PERF_SEL { } GL2A_PERF_SEL; constexpr unsigned int MaxGl2aPerfSelGfx10Core = GL2A_PERF_SEL_REQ_BURST_CLIENT15; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 constexpr unsigned int MaxGl2aPerfSelGfx104Plus = GL2A_PERF_SEL_RTN_CREDIT_STALL_CLIENT15__GFX104PLUS; #endif @@ -5224,7 +5224,7 @@ typedef enum GL2C_PERF_SEL { GL2C_PERF_SEL_CM_DCC_OUT_2x1__GFX103PLUSEXCLUSIVE = 0x000000fa, GL2C_PERF_SEL_CM_DCC_OUT_2x2__GFX103PLUSEXCLUSIVE = 0x000000fb, GL2C_PERF_SEL_CM_DCC_OUT_UNCOMP__GFX103PLUSEXCLUSIVE = 0x000000fc, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 GL2C_PERF_SEL_CM_DCC_OUT_CONST2SINGLE__GFX11 = 0x000000fd, GL2C_PERF_SEL_CM_DCC_OUT_CONST2CLEAR__GFX11 = 0x000000fe, GL2C_PERF_SEL_HIT_PASS_MISS_IN_CLIENT16__GFX11 = 0x000000ff, @@ -5236,7 +5236,7 
@@ typedef enum GL2C_PERF_SEL { constexpr unsigned int MaxGl2cPerfSelGfx101 = GL2C_PERF_SEL_CM_DCC_STALL__GFX101; constexpr unsigned int MaxGl2cPerfSelGfx103 = GL2C_PERF_SEL_CM_DCC_OUT_CONST__GFX103DERIVATIVE; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 constexpr unsigned int MaxGl2cPerfSelGfx11 = GL2C_PERF_SEL_HIT_PASS_MISS_IN_CLIENT19__GFX11; #endif @@ -5301,7 +5301,7 @@ typedef enum GRBM_PERF_SEL { GRBM_PERF_SEL_CH_BUSY__GFX10PLUS = 0x0000002a, GRBM_PERF_SEL_PH_BUSY__GFX10PLUS = 0x0000002b, GRBM_PERF_SEL_GL1CC_BUSY__GFX10PLUS = 0x0000002e, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 GRBM_PERF_SEL_ANY_ACTIVE_F_BUSY__GFX11 = 0x0000002f, GRBM_PERF_SEL_GL1H_BUSY__GFX11 = 0x00000030, GRBM_PERF_SEL_PC_BUSY__GFX11 = 0x00000031, @@ -5311,7 +5311,7 @@ typedef enum GRBM_PERF_SEL { constexpr unsigned int MaxGrbmPerfSelGfx09 = GRBM_PERF_SEL_CPAXI_BUSY; constexpr unsigned int MaxGrbmPerfSelGfx10 = GRBM_PERF_SEL_GL1CC_BUSY__GFX10PLUS; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 constexpr unsigned int MaxGrbmPerfSelGfx11 = GRBM_PERF_SEL_PC_BUSY__GFX11; #endif @@ -5336,7 +5336,7 @@ typedef enum GRBM_SE0_PERF_SEL { GRBM_SE0_PERF_SEL_UTCL1_BUSY__GFX10PLUS = 0x00000010, GRBM_SE0_PERF_SEL_TCP_BUSY__GFX10PLUS = 0x00000011, GRBM_SE0_PERF_SEL_GL1CC_BUSY__GFX10PLUS = 0x00000012, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 GRBM_SE0_PERF_SEL_GL1H_BUSY__GFX11 = 0x00000013, GRBM_SE0_PERF_SEL_PC_BUSY__GFX11 = 0x00000014, #endif @@ -5344,7 +5344,7 @@ typedef enum GRBM_SE0_PERF_SEL { constexpr unsigned int MaxGrbmSe0PerfSelGfx09 = GRBM_SE0_PERF_SEL_RMI_BUSY; constexpr unsigned int MaxGrbmSe0PerfSelGfx10 = 
GRBM_SE0_PERF_SEL_GL1CC_BUSY__GFX10PLUS; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 constexpr unsigned int MaxGrbmSe0PerfSelGfx11 = GRBM_SE0_PERF_SEL_PC_BUSY__GFX11; #endif @@ -5371,7 +5371,7 @@ typedef enum GRBM_SE1_PERF_SEL { GRBM_SE1_PERF_SEL_UTCL1_BUSY__GFX10COREPLUS = 0x00000010, GRBM_SE1_PERF_SEL_TCP_BUSY__GFX10COREPLUS = 0x00000011, GRBM_SE1_PERF_SEL_GL1CC_BUSY__GFX10COREPLUS = 0x00000012, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 GRBM_SE1_PERF_SEL_GL1H_BUSY__GFX11 = 0x00000013, GRBM_SE1_PERF_SEL_PC_BUSY__GFX11 = 0x00000014, #endif @@ -5379,7 +5379,7 @@ typedef enum GRBM_SE1_PERF_SEL { constexpr unsigned int MaxGrbmSe1PerfSelGfx09 = GRBM_SE1_PERF_SEL_RMI_BUSY; constexpr unsigned int MaxGrbmSe1PerfSelGfx10Core = GRBM_SE1_PERF_SEL_GL1CC_BUSY__GFX10COREPLUS; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 constexpr unsigned int MaxGrbmSe1PerfSelGfx11 = GRBM_SE1_PERF_SEL_PC_BUSY__GFX11; #endif @@ -5406,7 +5406,7 @@ typedef enum GRBM_SE2_PERF_SEL { GRBM_SE2_PERF_SEL_UTCL1_BUSY__GFX10COREPLUS = 0x00000010, GRBM_SE2_PERF_SEL_TCP_BUSY__GFX10COREPLUS = 0x00000011, GRBM_SE2_PERF_SEL_GL1CC_BUSY__GFX10COREPLUS = 0x00000012, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 GRBM_SE2_PERF_SEL_GL1H_BUSY__GFX11 = 0x00000013, GRBM_SE2_PERF_SEL_PC_BUSY__GFX11 = 0x00000014, #endif @@ -5414,7 +5414,7 @@ typedef enum GRBM_SE2_PERF_SEL { constexpr unsigned int MaxGrbmSe2PerfSelGfx09 = GRBM_SE2_PERF_SEL_RMI_BUSY; constexpr unsigned int MaxGrbmSe2PerfSelGfx10Core = GRBM_SE2_PERF_SEL_GL1CC_BUSY__GFX10COREPLUS; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| 
CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 constexpr unsigned int MaxGrbmSe2PerfSelGfx11 = GRBM_SE2_PERF_SEL_PC_BUSY__GFX11; #endif @@ -5441,7 +5441,7 @@ typedef enum GRBM_SE3_PERF_SEL { GRBM_SE3_PERF_SEL_UTCL1_BUSY__GFX10COREPLUS = 0x00000010, GRBM_SE3_PERF_SEL_TCP_BUSY__GFX10COREPLUS = 0x00000011, GRBM_SE3_PERF_SEL_GL1CC_BUSY__GFX10COREPLUS = 0x00000012, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 GRBM_SE3_PERF_SEL_GL1H_BUSY__GFX11 = 0x00000013, GRBM_SE3_PERF_SEL_PC_BUSY__GFX11 = 0x00000014, #endif @@ -5449,11 +5449,11 @@ typedef enum GRBM_SE3_PERF_SEL { constexpr unsigned int MaxGrbmSe3PerfSelGfx09 = GRBM_SE3_PERF_SEL_RMI_BUSY; constexpr unsigned int MaxGrbmSe3PerfSelGfx10Core = GRBM_SE3_PERF_SEL_GL1CC_BUSY__GFX10COREPLUS; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 constexpr unsigned int MaxGrbmSe3PerfSelGfx11 = GRBM_SE3_PERF_SEL_PC_BUSY__GFX11; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef enum GRBM_SE4_PERF_SEL { GRBM_SE4_PERF_SEL_COUNT = 0x00000000, GRBM_SE4_PERF_SEL_USER_DEFINED = 0x00000001, @@ -5666,7 +5666,7 @@ typedef enum IMG_DATA_FORMAT { IMG_DATA_FORMAT_YCBCR__GFX103 = 0x0000001d, IMG_DATA_FORMAT_LOD_5P3_USCALED__GFX103 = 0x0000003d, IMG_DATA_FORMAT_7E3__GFX103COREPLUS = 0x0000001e, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 IMG_DATA_FORMAT_RESERVED_44__GFX104PLUS = 0x0000002c, IMG_DATA_FORMAT_RESERVED_45__GFX104PLUS = 0x0000002d, IMG_DATA_FORMAT_RESERVED_46__GFX104PLUS = 0x0000002e, @@ -5767,7 +5767,7 @@ typedef enum IMG_DATA_FORMAT { IMG_DATA_FORMAT_RESERVED_42__GFX10COREPLUS = 0x0000002a, IMG_DATA_FORMAT_RESERVED_62__GFX10COREPLUS = 0x0000003e, 
IMG_DATA_FORMAT_RESERVED_43__GFX10PLUS = 0x0000002b, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 IMG_DATA_FORMAT_5_9_9_9__GFX11 = 0x00000018, IMG_DATA_FORMAT_GB_GR__GFX11 = 0x00000019, IMG_DATA_FORMAT_BG_RG__GFX11 = 0x0000001a, @@ -5838,6 +5838,74 @@ typedef enum IMG_DATA_FORMAT { IMG_DATA_FORMAT_MM_12_IN_16_16__NV24 = 0x00000057, IMG_DATA_FORMAT_MM_12_IN_16_16_16_16__NV24 = 0x00000058, #endif +#if CHIP_HDR_NAVI32 + IMG_DATA_FORMAT_FMASK8_S2_F2__NV32 = 0x0000003e, + IMG_DATA_FORMAT_FMASK8_S4_F4__NV32 = 0x0000003f, + IMG_DATA_FORMAT_FMASK8_S2_F1__NV32 = 0x00000040, + IMG_DATA_FORMAT_FMASK8_S4_F1__NV32 = 0x00000041, + IMG_DATA_FORMAT_FMASK8_S8_F1__NV32 = 0x00000042, + IMG_DATA_FORMAT_FMASK8_S4_F2__NV32 = 0x00000043, + IMG_DATA_FORMAT_FMASK16_S16_F1__NV32 = 0x00000044, + IMG_DATA_FORMAT_FMASK16_S8_F2__NV32 = 0x00000045, + IMG_DATA_FORMAT_FMASK32_S16_F2__NV32 = 0x00000046, + IMG_DATA_FORMAT_FMASK32_S8_F4__NV32 = 0x00000047, + IMG_DATA_FORMAT_FMASK64_S16_F4__NV32 = 0x00000048, + IMG_DATA_FORMAT_FMASK64_S16_F8__NV32 = 0x00000049, + IMG_DATA_FORMAT_FMASK32_S8_F8__NV32 = 0x0000004a, + IMG_DATA_FORMAT_RESERVED_75__NV32 = 0x0000004b, + IMG_DATA_FORMAT_RESERVED_76__NV32 = 0x0000004c, + IMG_DATA_FORMAT_RESERVED_77__NV32 = 0x0000004d, + IMG_DATA_FORMAT_RESERVED_78__NV32 = 0x0000004e, + IMG_DATA_FORMAT_RESERVED_79__NV32 = 0x0000004f, + IMG_DATA_FORMAT_RESERVED_80__NV32 = 0x00000050, + IMG_DATA_FORMAT_RESERVED_81__NV32 = 0x00000051, + IMG_DATA_FORMAT_RESERVED_82__NV32 = 0x00000052, + IMG_DATA_FORMAT_RESERVED_83__NV32 = 0x00000053, + IMG_DATA_FORMAT_RESERVED_84__NV32 = 0x00000054, + IMG_DATA_FORMAT_RESERVED_85__NV32 = 0x00000055, + IMG_DATA_FORMAT_RESERVED_86__NV32 = 0x00000056, + IMG_DATA_FORMAT_RESERVED_87__NV32 = 0x00000057, + IMG_DATA_FORMAT_RESERVED_88__NV32 = 0x00000058, + IMG_DATA_FORMAT_RESERVED_89__NV32 = 0x00000059, + IMG_DATA_FORMAT_RESERVED_90__NV32 = 0x0000005a, + 
IMG_DATA_FORMAT_RESERVED_91__NV32 = 0x0000005b, + IMG_DATA_FORMAT_RESERVED_92__NV32 = 0x0000005c, + IMG_DATA_FORMAT_RESERVED_93__NV32 = 0x0000005d, + IMG_DATA_FORMAT_RESERVED_94__NV32 = 0x0000005e, + IMG_DATA_FORMAT_RESERVED_95__NV32 = 0x0000005f, + IMG_DATA_FORMAT_RESERVED_96__NV32 = 0x00000060, + IMG_DATA_FORMAT_RESERVED_97__NV32 = 0x00000061, + IMG_DATA_FORMAT_RESERVED_98__NV32 = 0x00000062, + IMG_DATA_FORMAT_RESERVED_99__NV32 = 0x00000063, + IMG_DATA_FORMAT_RESERVED_100__NV32 = 0x00000064, + IMG_DATA_FORMAT_RESERVED_101__NV32 = 0x00000065, + IMG_DATA_FORMAT_RESERVED_102__NV32 = 0x00000066, + IMG_DATA_FORMAT_RESERVED_103__NV32 = 0x00000067, + IMG_DATA_FORMAT_RESERVED_104__NV32 = 0x00000068, + IMG_DATA_FORMAT_RESERVED_105__NV32 = 0x00000069, + IMG_DATA_FORMAT_RESERVED_106__NV32 = 0x0000006a, + IMG_DATA_FORMAT_RESERVED_107__NV32 = 0x0000006b, + IMG_DATA_FORMAT_RESERVED_108__NV32 = 0x0000006c, + IMG_DATA_FORMAT_RESERVED_109__NV32 = 0x0000006d, + IMG_DATA_FORMAT_RESERVED_110__NV32 = 0x0000006e, + IMG_DATA_FORMAT_RESERVED_111__NV32 = 0x0000006f, + IMG_DATA_FORMAT_RESERVED_112__NV32 = 0x00000070, + IMG_DATA_FORMAT_RESERVED_113__NV32 = 0x00000071, + IMG_DATA_FORMAT_RESERVED_114__NV32 = 0x00000072, + IMG_DATA_FORMAT_RESERVED_115__NV32 = 0x00000073, + IMG_DATA_FORMAT_RESERVED_116__NV32 = 0x00000074, + IMG_DATA_FORMAT_RESERVED_117__NV32 = 0x00000075, + IMG_DATA_FORMAT_RESERVED_118__NV32 = 0x00000076, + IMG_DATA_FORMAT_RESERVED_119__NV32 = 0x00000077, + IMG_DATA_FORMAT_RESERVED_120__NV32 = 0x00000078, + IMG_DATA_FORMAT_RESERVED_121__NV32 = 0x00000079, + IMG_DATA_FORMAT_RESERVED_122__NV32 = 0x0000007a, + IMG_DATA_FORMAT_RESERVED_123__NV32 = 0x0000007b, + IMG_DATA_FORMAT_RESERVED_124__NV32 = 0x0000007c, + IMG_DATA_FORMAT_RESERVED_125__NV32 = 0x0000007d, + IMG_DATA_FORMAT_RESERVED_126__NV32 = 0x0000007e, + IMG_DATA_FORMAT_RESERVED_127__NV32 = 0x0000007f, +#endif #if CHIP_HDR_NAVI33 IMG_DATA_FORMAT_FMASK8_S2_F2__NV33 = 0x0000003e, IMG_DATA_FORMAT_FMASK8_S4_F4__NV33 = 
0x0000003f, @@ -5987,7 +6055,7 @@ typedef enum IMG_FMT { IMG_FMT_7E3_FLOAT__GFX103 = 0x0000011d, IMG_FMT_YCBCR_UNORM__GFX103 = 0x0000011e, IMG_FMT_YCBCR_SRGB__GFX103 = 0x0000011f, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 IMG_FMT_10_11_11_FLOAT__GFX104PLUS = 0x0000001e, IMG_FMT_11_11_10_FLOAT__GFX104PLUS = 0x0000001f, IMG_FMT_10_10_10_2_UNORM__GFX104PLUS = 0x00000020, @@ -6584,7 +6652,7 @@ typedef enum IMG_FMT { IMG_FMT_RESERVED_125__GFX10COREPLUS = 0x0000007d, IMG_FMT_RESERVED_126__GFX10COREPLUS = 0x0000007e, IMG_FMT_RESERVED_127__GFX10COREPLUS = 0x0000007f, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 IMG_FMT_MM_10_IN_16_UNORM__GFX11 = 0x0000005a, IMG_FMT_MM_10_IN_16_UINT__GFX11 = 0x0000005b, IMG_FMT_MM_10_IN_16_16_UNORM__GFX11 = 0x0000005c, @@ -6683,6 +6751,21 @@ typedef enum IMG_FMT { IMG_FMT_MM_12_IN_16_16_16_16_UNORM__NV24 = 0x00000124, IMG_FMT_MM_12_IN_16_16_16_16_UINT__NV24 = 0x00000125, #endif +#if CHIP_HDR_NAVI32 + IMG_FMT_FMASK8_S2_F2__NV32 = 0x00000060, + IMG_FMT_FMASK8_S4_F4__NV32 = 0x00000061, + IMG_FMT_FMASK8_S2_F1__NV32 = 0x00000062, + IMG_FMT_FMASK8_S4_F1__NV32 = 0x00000063, + IMG_FMT_FMASK8_S8_F1__NV32 = 0x00000064, + IMG_FMT_FMASK8_S4_F2__NV32 = 0x00000065, + IMG_FMT_FMASK16_S16_F1__NV32 = 0x00000066, + IMG_FMT_FMASK16_S8_F2__NV32 = 0x00000067, + IMG_FMT_FMASK32_S16_F2__NV32 = 0x00000068, + IMG_FMT_FMASK32_S8_F4__NV32 = 0x00000069, + IMG_FMT_FMASK64_S16_F4__NV32 = 0x0000006a, + IMG_FMT_FMASK64_S16_F8__NV32 = 0x0000006b, + IMG_FMT_FMASK32_S8_F8__NV32 = 0x0000006c, +#endif #if CHIP_HDR_NAVI33 IMG_FMT_FMASK8_S2_F2__NV33 = 0x00000060, IMG_FMT_FMASK8_S4_F4__NV33 = 0x00000061, @@ -6729,7 +6812,7 @@ typedef enum IMG_NUM_FORMAT { IMG_NUM_FORMAT_RESERVED_8__GFX09_10 = 0x00000008, IMG_NUM_FORMAT_RESERVED_14__GFX09_10 = 0x0000000e, IMG_NUM_FORMAT_RESERVED_15__GFX09_10 = 
0x0000000f, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 IMG_NUM_FORMAT_SRGB__GFX104PLUS = 0x00000006, #endif IMG_NUM_FORMAT_SNORM_NZ__GFX10CORE = 0x00000006, @@ -6860,7 +6943,7 @@ typedef enum MTYPE { MTYPE_C_RW_US__GFX10PLUS = 0x00000000, } MTYPE; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef enum OreoMode { OMODE_BLEND = 0x00000000, OMODE_O_THEN_B = 0x00000001, @@ -7263,7 +7346,7 @@ typedef enum PerfCounter_Vals { DB_PERF_SEL_DB_CB_lquad_quads_vrs_rate_2x2__GFX103 = 0x00000184, DB_PERF_SEL_prez_ps_invoked_pixel_cnt__GFX103 = 0x00000185, DB_PERF_SEL_postz_ps_invoked_pixel_cnt__GFX103 = 0x00000186, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 DB_PERF_SEL_esr_ps_vic_busy__GFX104PLUS = 0x000000c2, DB_PERF_SEL_esr_ps_vic_stall__GFX104PLUS = 0x000000c3, DB_PERF_SEL_CB_DB_rdreq_sends__GFX104PLUS = 0x00000109, @@ -7423,7 +7506,7 @@ typedef enum PerfCounter_Vals { DB_PERF_SEL_DB_SC_s_tile_rate__GFX10PLUS = 0x00000102, DB_PERF_SEL_DB_SC_c_tile_rate__GFX10PLUS = 0x00000103, DB_PERF_SEL_DB_SC_z_tile_rate__GFX10PLUS = 0x00000104, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 DB_PERF_SEL_DB_CB_export_events__GFX11 = 0x00000022, DB_PERF_SEL_DB_CB_export_sends__GFX11 = 0x0000002c, DB_PERF_SEL_DB_CB_export_busy__GFX11 = 0x0000002d, @@ -7492,7 +7575,7 @@ typedef enum PerfCounter_Vals { DB_PERF_SEL_OREO_Events_delayed__GFX11 = 0x00000182, DB_PERF_SEL_OREO_Events_stalls__GFX11 = 0x00000183, #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 DB_PERF_SEL_ts_events_pws_enable__HASPWS = 0x00000158, 
DB_PERF_SEL_ps_events_pws_enable__HASPWS = 0x00000159, DB_PERF_SEL_cs_events_pws_enable__HASPWS = 0x0000015a, @@ -7505,7 +7588,7 @@ typedef enum PerfCounter_Vals { constexpr unsigned int MaxPerfcounterValsGfx09 = DB_PERF_SEL_DB_SC_quad_quads_with_4_pixels__GFX09; constexpr unsigned int MaxPerfcounterValsGfx101 = DB_PERF_SEL_FG_LOB_FWDR_TIMEOUT_hits__GFX101; constexpr unsigned int MaxPerfcounterValsGfx103 = DB_PERF_SEL_postz_ps_invoked_pixel_cnt__GFX103; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 constexpr unsigned int MaxPerfcounterValsGfx11 = DB_PERF_SEL_OREO_Events_stalls__GFX11; #endif @@ -8690,7 +8773,7 @@ typedef enum PH_PERFCNT_SEL { PH_PERF_SEL_6_SC_ARB_STARVED_FROM_ABOVE_WITH_UNSELECTED_FIFO_FULL__GFX103PLUSEXCLUSIVE = 0x000003dd, PH_PERF_SEL_7_SC_ARB_STARVED_FROM_ABOVE_WITH_UNSELECTED_FIFO_FULL__GFX103PLUSEXCLUSIVE = 0x000003de, PH_PERF_SEL_8_SC_ARB_STARVED_FROM_ABOVE_WITH_UNSELECTED_FIFO_FULL__GFX103PLUSEXCLUSIVE = 0x000003df, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 PH_PERF_SEL_SC0_GFX_PIPE0_TO_1_TRANSITION__GFX11 = 0x00000014, PH_PERF_SEL_SC0_GFX_PIPE1_TO_0_TRANSITION__GFX11 = 0x00000015, PH_PERF_SEL_SC0_GFX_PIPE_PRIM_PROVOKED_TRANSITION__GFX11 = 0x00000016, @@ -9088,7 +9171,7 @@ typedef enum PH_PERFCNT_SEL { constexpr unsigned int MaxPhPerfcntSelGfx101 = PH_PERF_SEL_SC7_PA7_DEALLOC_4_0_RD__GFX10; constexpr unsigned int MaxPhPerfcntSelGfx103Derivative = PH_PERF_SEL_8_SC_ARB_STARVED_FROM_ABOVE_WITH_UNSELECTED_FIFO_FULL__GFX103PLUSEXCLUSIVE; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 constexpr unsigned int MaxPhPerfcntSelGfx11 = PH_PERF_SC7_FIFO_STATUS_3__GFX11; #endif @@ -9694,7 +9777,7 @@ typedef enum RMIPerfSel { RMI_PERF_SEL_RMI_RB_EARLY_WRACK_NACK2__GFX10CORE = 
0x000000ff, RMI_PERF_SEL_RMI_RB_EARLY_WRACK_NACK3__GFX10CORE = 0x00000100, RMI_PERF_SEL_UTCL0_UTCL1_PERM_FAULT__GFX10CORE = 0x00000101, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 RMI_PERF_SEL_RB_RMI_WRREQ_ALL_CID__GFX11 = 0x00000008, RMI_PERF_SEL_RB_RMI_WRREQ_TO_WRRET_BUSY__GFX11 = 0x00000009, RMI_PERF_SEL_RB_RMI_WRREQ_CID0__GFX11 = 0x0000000a, @@ -9831,7 +9914,7 @@ typedef enum RMIPerfSel { constexpr unsigned int MaxRMIPerfSelGfx09 = RMI_PERF_SEL_RMI_RB_EARLY_WRACK_NACK3__GFX09; constexpr unsigned int MaxRMIPerfSelGfx10Core = RMI_PERF_SEL_UTCL0_UTCL1_PERM_FAULT__GFX10CORE; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 constexpr unsigned int MaxRMIPerfSelGfx11 = RMI_PERF_SEL_CONSUMER_PROBEGEN_DB_RTS_RTR__GFX11; #endif @@ -10481,7 +10564,7 @@ typedef enum SC_PERFCNT_SEL { SC_SPI_FPOV_1__GFX10VRS = 0x00000153, SC_SPI_FPOV_2__GFX10VRS = 0x00000154, SC_SPI_FPOV_3__GFX10VRS = 0x00000155, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 SC_PERF_SEL_RESERVED_76__GFX11 = 0x0000004c, SC_PERF_SEL_RESERVED_77__GFX11 = 0x0000004d, SC_PERF_SEL_RESERVED_78__GFX11 = 0x0000004e, @@ -10835,6 +10918,10 @@ typedef enum SC_PERFCNT_SEL { SC_PKR_BCI_QUAD_NEW_PRIM__GFX11 = 0x00000297, SC_SPI_WAVE_STALLED_BY_SPI__GFX11 = 0x00000298, #endif +#if CHIP_HDR_NAVI32 + SC_VRC_REPROBE_XFR__NV32 = 0x00000299, + SC_VRC_REPROBE_FULL__NV32 = 0x0000029a, +#endif #if CHIP_HDR_NAVI33 SC_VRC_REPROBE_XFR__NV33 = 0x00000299, SC_VRC_REPROBE_FULL__NV33 = 0x0000029a, @@ -10855,8 +10942,11 @@ constexpr unsigned int MaxScPerfcntSelApu11 = SC_VRC_REPROBE_FULL__AP #if CHIP_HDR_NAVI33 constexpr unsigned int MaxScPerfcntSelNv33 = SC_VRC_REPROBE_FULL__NV33; #endif +#if CHIP_HDR_NAVI32 +constexpr unsigned int MaxScPerfcntSelNv32 = 
SC_VRC_REPROBE_FULL__NV32; +#endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef enum SDMA_PERFMON_SEL { SDMA_PERFMON_SEL_CYCLE = 0x00000000, SDMA_PERFMON_SEL_IDLE = 0x00000001, @@ -11129,7 +11219,7 @@ typedef enum SDMA_PERF_SEL { SDMA_PERF_SEL_TLBI_RTN__GFX10COREPLUS = 0x00000060, SDMA_PERF_SEL_GCR_SEND__GFX10COREPLUS = 0x00000061, SDMA_PERF_SEL_GCR_RTN__GFX10COREPLUS = 0x00000062, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 SDMA_PERF_SEL_DUMMY_0__GFX11 = 0x0000002f, SDMA_PERF_SEL_DUMMY_1__GFX11 = 0x00000030, SDMA_PERF_SEL_QUEUE0_SELECT__GFX11 = 0x00000035, @@ -11176,7 +11266,7 @@ typedef enum SDMA_PERF_SEL { constexpr unsigned int MaxSdmaPerfSelGfx09 = SDMA_PERF_SEL_MMHUB_TAG_DELAY_COUNTER__GFX09; constexpr unsigned int MaxSdmaPerfSelOss50 = SDMA_PERF_SEL_MMHUB_TAG_DELAY_COUNTER__GFX10CORE; constexpr unsigned int MaxSdmaPerfSelGfx103 = SDMA_PERF_SEL_CH_CE_RDRET_VALID__GFX103; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 constexpr unsigned int MaxSdmaPerfSelGfx11 = SDMA_PERF_SEL_QUEUE7_SELECT__GFX11; #endif @@ -11210,7 +11300,7 @@ typedef enum SPI_LB_WAVES_SELECT { CS_NA = 0x00000002, SPI_LB_WAVES_RSVD = 0x00000003, VS_PS__GFX10 = 0x00000001, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 PS__GFX11 = 0x00000001, #endif } SPI_LB_WAVES_SELECT; @@ -11486,7 +11576,7 @@ typedef enum SPI_PERFCNT_SEL { SPI_PERF_EXP_THROT_UPSTEP__GFX103 = 0x00000149, SPI_PERF_EXP_THROT_DOWNSTEP__GFX103 = 0x0000014a, SPI_PERF_EXP_THROT_CAUSALITY_DETECTED__GFX103 = 0x0000014b, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 
SPI_PERF_PS0_WINDOW_VALID__GFX104PLUS = 0x00000035, SPI_PERF_PS1_WINDOW_VALID__GFX104PLUS = 0x00000036, SPI_PERF_PS2_WINDOW_VALID__GFX104PLUS = 0x00000037, @@ -11760,7 +11850,7 @@ typedef enum SPI_PERFCNT_SEL { SPI_PERF_VWC_HS_WR__GFX10CORE = 0x00000130, SPI_PERF_VWC_CSGN_WR__GFX10CORE = 0x00000131, SPI_PERF_VWC_CSN_WR__GFX10CORE = 0x00000132, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 SPI_PERF_GS_WINDOW_VALID__GFX11 = 0x00000001, SPI_PERF_GS_BUSY__GFX11 = 0x00000002, SPI_PERF_GS_CRAWLER_STALL__GFX11 = 0x00000003, @@ -12125,6 +12215,9 @@ typedef enum SPI_PERFCNT_SEL { SPI_PERF_SWC_PS_WR__NV24 = 0x00000124, SPI_PERF_SWC_VS_WR__NV24 = 0x00000125, #endif +#if CHIP_HDR_NAVI32 + SPI_PERF_GS_GRP_LIFETIME_SAMPLE__NV32 = 0x00000012, +#endif #if CHIP_HDR_NAVI33 SPI_PERF_GS_GRP_LIFETIME_SAMPLE__NV33 = 0x00000012, #endif @@ -12139,7 +12232,7 @@ typedef enum SPI_PERFCNT_SEL { constexpr unsigned int MaxSpiPerfcntSelGfx09 = SPI_PERF_VWC_CSC_WR__GFX09; constexpr unsigned int MaxSpiPerfcntSelGfx101 = SPI_PERF_LS_PERS_UPD_FULL1__GFX101; constexpr unsigned int MaxSpiPerfcntSelGfx103 = SPI_PERF_EXP_THROT_CAUSALITY_DETECTED__GFX103; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 constexpr unsigned int MaxSpiPerfcntSelGfx11 = SPI_PERF_BUSY__GFX11; #endif @@ -12151,7 +12244,7 @@ typedef enum SPI_PNT_SPRITE_OVERRIDE { SPI_PNT_SPRITE_SEL_NONE = 0x00000004, } SPI_PNT_SPRITE_OVERRIDE; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef enum SPI_PS_LDS_GROUP_SIZE { SPI_PS_LDS_GROUP_1 = 0x00000000, SPI_PS_LDS_GROUP_2 = 0x00000001, @@ -12196,7 +12289,7 @@ typedef enum SPM_PERFMON_STATE { STRM_PERFMON_STATE_COUNT_AND_DUMP_PHANTOM = 0x00000005, } SPM_PERFMON_STATE; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 
+#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef enum SQG_PERF_SEL { SQG_PERF_SEL_NONE = 0x00000000, SQG_PERF_SEL_MSG_BUS_BUSY = 0x00000001, @@ -13051,7 +13144,7 @@ typedef enum SQ_PERF_SEL { SQC_PERF_SEL_DCACHE_GCR_INVALIDATE__GFX103DERIVATIVE = 0x000001a0, SQC_PERF_SEL_Reserved_0x1a0__GFX103DERIVATIVE = 0x000001a1, SQC_PERF_SEL_DCACHE_SPI_RETURN_STALL__GFX103DERIVATIVE = 0x000001a2, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 SQ_PERF_SEL_WAVES_INITIAL_PREFETCH__GFX104PLUS = 0x00000017, SQ_PERF_SEL_NONE2__GFX104PLUS = 0x000001ff, #endif @@ -13069,7 +13162,7 @@ typedef enum SQ_PERF_SEL { SQ_PERF_SEL_EVENTS__GFX10PLUS = 0x0000000c, SQ_PERF_SEL_MSG_INTERRUPT__GFX10PLUS = 0x00000016, SQC_PERF_SEL_DUMMY_LAST__GFX10VRS = 0x000001a3, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 SQ_PERF_SEL_PS_QUADS__GFX11 = 0x0000000b, SQ_PERF_SEL_WAVES_EQ_32__GFX11 = 0x0000000d, SQ_PERF_SEL_WAVES_EQ_64__GFX11 = 0x0000000e, @@ -13373,7 +13466,7 @@ typedef enum SQ_PERF_SEL { constexpr unsigned int MaxSqPerfSelGfx09 = SQC_PERF_SEL_DUMMY_LAST__GFX09; constexpr unsigned int MaxSqPerfSelGfx10Core = SP_PERF_SEL_DUMMY_LAST__GFX10CORE; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 constexpr unsigned int MaxSqPerfSelGfx104Plus = SQ_PERF_SEL_NONE2__GFX104PLUS; #endif @@ -13639,11 +13732,11 @@ typedef enum SQ_TT_TOKEN_MASK_REG_INCLUDE { SQ_TT_TOKEN_MASK_CONTEXT_BIT = 0x00000010, SQ_TT_TOKEN_MASK_CONFIG_BIT = 0x00000020, SQ_TT_TOKEN_MASK_READS_BIT__GFX10 = 0x00000080, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 SQ_TT_TOKEN_MASK_ALL_BIT__GFX104PLUS = 0x00000040, #endif 
SQ_TT_TOKEN_MASK_OTHER_BIT__GFX10CORE = 0x00000040, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 SQ_TT_TOKEN_MASK_RSVD_BIT__GFX11 = 0x00000080, #endif } SQ_TT_TOKEN_MASK_REG_INCLUDE; @@ -13656,11 +13749,11 @@ typedef enum SQ_TT_TOKEN_MASK_REG_INCLUDE_SHIFT { SQ_TT_TOKEN_MASK_CONTEXT_SHIFT = 0x00000004, SQ_TT_TOKEN_MASK_CONFIG_SHIFT = 0x00000005, SQ_TT_TOKEN_MASK_READS_SHIFT__GFX10 = 0x00000007, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 SQ_TT_TOKEN_MASK_ALL_SHIFT__GFX104PLUS = 0x00000006, #endif SQ_TT_TOKEN_MASK_OTHER_SHIFT__GFX10CORE = 0x00000006, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 SQ_TT_TOKEN_MASK_RSVD_SHIFT__GFX11 = 0x00000007, #endif } SQ_TT_TOKEN_MASK_REG_INCLUDE_SHIFT; @@ -13698,13 +13791,13 @@ typedef enum SQ_TT_WTYPE_INCLUDE { SQ_TT_WTYPE_INCLUDE_HS_BIT = 0x00000010, SQ_TT_WTYPE_INCLUDE_CS_BIT = 0x00000040, SQ_TT_WTYPE_INCLUDE_VS_BIT__GFX10 = 0x00000002, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 SQ_TT_WTYPE_INCLUDE_RSVD1_BIT__GFX104PLUS = 0x00000008, SQ_TT_WTYPE_INCLUDE_RSVD2_BIT__GFX104PLUS = 0x00000020, #endif SQ_TT_WTYPE_INCLUDE_ES_BIT__GFX10CORE = 0x00000008, SQ_TT_WTYPE_INCLUDE_LS_BIT__GFX10CORE = 0x00000020, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 SQ_TT_WTYPE_INCLUDE_RSVD0_BIT__GFX11 = 0x00000002, #endif } SQ_TT_WTYPE_INCLUDE; @@ -13715,13 +13808,13 @@ typedef enum SQ_TT_WTYPE_INCLUDE_SHIFT { SQ_TT_WTYPE_INCLUDE_HS_SHIFT = 0x00000004, SQ_TT_WTYPE_INCLUDE_CS_SHIFT = 0x00000006, SQ_TT_WTYPE_INCLUDE_VS_SHIFT__GFX10 = 0x00000001, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| 
CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 SQ_TT_WTYPE_INCLUDE_RSVD1_SHIFT__GFX104PLUS = 0x00000003, SQ_TT_WTYPE_INCLUDE_RSVD2_SHIFT__GFX104PLUS = 0x00000005, #endif SQ_TT_WTYPE_INCLUDE_ES_SHIFT__GFX10CORE = 0x00000003, SQ_TT_WTYPE_INCLUDE_LS_SHIFT__GFX10CORE = 0x00000005, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 SQ_TT_WTYPE_INCLUDE_RSVD0_SHIFT__GFX11 = 0x00000001, #endif } SQ_TT_WTYPE_INCLUDE_SHIFT; @@ -14374,7 +14467,7 @@ typedef enum SU_PERFCNT_SEL { PERF_ENGG_INDEX_PRIM_IF_FETCH_TO_PRIMIC_P_FIFO_WRITE__GFX10PLUS = 0x00000106, PERF_ENGG_INDEX_PRIM_IF_FETCH_TO_PRIMIC_P_FIFO_NO_WRITE__GFX10PLUS = 0x00000107, PERF_ENGG_POS_REQ_STARVED__GFX10PLUS = 0x00000108, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 PERF_CLPR_INPUT_PRIM__GFX11 = 0x00000008, PERF_CLPR_INPUT_NULL_PRIM__GFX11 = 0x00000009, PERF_CLPR_INPUT_EVENT__GFX11 = 0x0000000a, @@ -14432,7 +14525,7 @@ constexpr unsigned int MaxSuPerfcntSelGfx09_0 = PERF_CLIENT_UTCL1_INFLI constexpr unsigned int MaxSuPerfcntSelGfx09_1x = PERF_PA_PRIMIC_TO_CLPRIM_FIFO_FULL__GFX09_1X; constexpr unsigned int MaxSuPerfcntSelGfx101 = PERF_ENGG_POS_REQ_STALLED_BY_FULL_CLIPV_FIFO__GFX101; constexpr unsigned int MaxSuPerfcntSelGfx103Derivative = PERF_OUTPUT_PRIM_4_SC__GFX103PLUSEXCLUSIVE; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 constexpr unsigned int MaxSuPerfcntSelGfx11 = PERF_PA_BUSY__GFX11; #endif @@ -14475,7 +14568,7 @@ typedef enum SWIZZLE_MODE_ENUM { SW_VAR_R__GFX10CORE = 0x0000000f, SW_VAR_S_X__GFX10CORE = 0x0000001d, SW_VAR_D_X__GFX10CORE = 0x0000001e, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 
SW_256KB_Z__GFX11 = 0x0000000c, SW_256KB_S__GFX11 = 0x0000000d, SW_256KB_D__GFX11 = 0x0000000e, @@ -14760,7 +14853,7 @@ typedef enum SX_PERFCOUNTER_VALS { SX_PERF_SEL_RB1_STALL_DUE_TO_ORDERING__GFX10 = 0x000000de, SX_PERF_SEL_RB2_STALL_DUE_TO_ORDERING__GFX10 = 0x000000df, SX_PERF_SEL_RB3_STALL_DUE_TO_ORDERING__GFX10 = 0x000000e0, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 SX_PERF_SEL_DB0_MRT_BLEND_BYPASS__GFX11 = 0x00000022, SX_PERF_SEL_DB0_MRT_DONT_RD_DEST__GFX11 = 0x00000023, SX_PERF_SEL_DB0_MRT_DISCARD_SRC__GFX11 = 0x00000024, @@ -14814,7 +14907,7 @@ typedef enum SX_PERFCOUNTER_VALS { constexpr unsigned int MaxSxPerfcounterValsGfx09 = SX_PERF_SEL_DB3_SIZE__GFX09_10; constexpr unsigned int MaxSxPerfcounterValsGfx10Core = SX_PERF_SEL_RB3_STALL_DUE_TO_ORDERING__GFX10; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 constexpr unsigned int MaxSxPerfcounterValsGfx11 = SX_PERF_SEL_DB3_4X2_DISCARD__GFX11; #endif @@ -15873,7 +15966,7 @@ typedef enum TCP_PERFCOUNT_SELECT { TCP_PERF_SEL_WRITE_DATACONFLICT_STALL__GFX103 = 0x0000003a, TCP_PERF_SEL_TD_TCP_STALL__GFX103 = 0x0000003b, TCP_PERF_SEL_BACK_COMPAT_SWITCH__GFX103 = 0x0000003c, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 TCP_PERF_SEL_REQ_NON_READ__GFX104PLUS = 0x00000010, TCP_PERF_SEL_REQ_MISS__GFX104PLUS = 0x00000011, #endif @@ -15912,7 +16005,7 @@ typedef enum TCP_PERFCOUNT_SELECT { TCP_PERF_SEL_REQ_READ_MISS_EVICT__GFX10PLUS = 0x0000000d, TCP_PERF_SEL_REQ_WRITE__GFX10PLUS = 0x0000000e, TCP_PERF_SEL_REQ_WRITE_MISS_EVICT__GFX10PLUS = 0x0000000f, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 TCP_PERF_SEL_REQ_TAGBANK0_SET0__GFX11 = 0x00000012, 
TCP_PERF_SEL_REQ_TAGBANK0_SET1__GFX11 = 0x00000013, TCP_PERF_SEL_REQ_TAGBANK1_SET0__GFX11 = 0x00000014, @@ -15979,7 +16072,7 @@ typedef enum TCP_PERFCOUNT_SELECT { constexpr unsigned int MaxTcpPerfcountSelectGfx09 = TCP_PERF_SEL_TCC_DCC_REQ__GFX09; constexpr unsigned int MaxTcpPerfcountSelectGfx101 = TCP_PERF_SEL_BACK_COMPAT_SWITCH__GFX101; constexpr unsigned int MaxTcpPerfcountSelectGfx103 = TCP_PERF_SEL_BACK_COMPAT_SWITCH__GFX103; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 constexpr unsigned int MaxTcpPerfcountSelectGfx11 = TCP_PERF_SEL_BURST_BIN_READHIT_gt16__GFX11; #endif @@ -16020,7 +16113,7 @@ typedef enum TC_MICRO_TILE_MODE { MICRO_TILE_MODE_Z_3D__GFX09 = 0x00000007, MICRO_TILE_MODE_Z__GFX10COREPLUS = 0x00000006, MICRO_TILE_MODE_RENDER_TARGET__GFX10PLUS = 0x00000001, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 MICRO_TILE_MODE_Z_VAR__GFX11 = 0x00000007, #endif #if CHIP_HDR_NAVI21 @@ -16173,7 +16266,7 @@ typedef enum TC_OP { TC_OP_RESERVED_FOP_FLUSH_DENORM_RTN_32_1__GFX09_10 = 0x0000000d, TC_OP_RESERVED_FOP_32_1__GFX09_10 = 0x00000045, TC_OP_RESERVED_FOP_FLUSH_DENORM_32_1__GFX09_10 = 0x0000004d, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 TC_OP_RESERVED_FADD_RTN_32__GFX11 = 0x00000005, TC_OP_ATOMIC_FADD_FLUSH_DENORM_RTN_32__GFX11 = 0x0000000d, TC_OP_RESERVED_FADD_32__GFX11 = 0x00000045, @@ -16486,7 +16579,7 @@ typedef enum TD_PERFCOUNT_SEL { TD_PERF_SEL_nofilter_dword_cycling_2cycles__GFX103PLUSEXCLUSIVE = 0x000000bd, TD_PERF_SEL_nofilter_dword_cycling_4cycles__GFX103PLUSEXCLUSIVE = 0x000000be, TD_PERF_SEL_input_bp_due_to_done_scoreboard_full__GFX103PLUSEXCLUSIVE = 0x000000bf, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| 
CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 TD_PERF_SEL_sampler_preformatter_sclk_en__GFX104PLUS = 0x00000008, TD_PERF_SEL_ray_tracing_bvh4_sclk_en__GFX104PLUS = 0x00000016, TD_PERF_SEL_ray_tracing_bvh4_ip_sclk_en__GFX104PLUS = 0x00000017, @@ -16494,7 +16587,7 @@ typedef enum TD_PERFCOUNT_SEL { TD_PERF_SEL_sampler_lerp_busy__GFX10PLUS = 0x00000003, TD_PERF_SEL_sampler_out_busy__GFX10PLUS = 0x00000004, TD_PERF_SEL_nofilter_busy__GFX10PLUS = 0x00000005, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 TD_PERF_SEL_sampler_bilerp_sclk_en__GFX11 = 0x00000009, TD_PERF_SEL_sampler_bypass_sclk_en__GFX11 = 0x0000000a, TD_PERF_SEL_sampler_minmax_sclk_en__GFX11 = 0x0000000b, @@ -16516,7 +16609,7 @@ typedef enum TD_PERFCOUNT_SEL { constexpr unsigned int MaxTdPerfcountSelGfx09 = TD_PERF_SEL_texels_zeroed_out_by_blend_zero_prt__GFX09; constexpr unsigned int MaxTdPerfcountSelGfx101 = TD_PERF_SEL_nofilter_popcount_dmask_lt_num_comp_of_fmt__GFX101; constexpr unsigned int MaxTdPerfcountSelGfx103 = TD_PERF_SEL_input_bp_due_to_done_scoreboard_full__GFX103PLUSEXCLUSIVE; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 constexpr unsigned int MaxTdPerfcountSelGfx11 = TD_PERF_SEL_store_preempts_a_load__GFX11; #endif @@ -16943,7 +17036,7 @@ typedef enum UMC_PERFCOUNT_SELECT { UMC_PERF_SEL_TempOverThresh__NV24 = 0x00000052, UMC_PERF_SEL_TempCnt__NV24 = 0x00000053, #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 UMC_PERF_SEL_NONE__NV3X = 0x00000000, UMC_PERF_SEL_SdpPh__NV3X = 0x00000001, UMC_PERF_SEL_SdpPm__NV3X = 0x00000002, @@ -17091,7 +17184,7 @@ constexpr unsigned int MaxUmcPerfcountSelectNv22 = UMC_PERF_SEL_TempCnt__N #if CHIP_HDR_NAVI21 constexpr unsigned int MaxUmcPerfcountSelectNv21 = UMC_PERF_SEL_TempCnt__NV21; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 
+#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 constexpr unsigned int MaxUmcPerfcountSelectNv3x = UMC_PERF_SEL_ClockCount__NV3X; #endif @@ -17126,7 +17219,7 @@ typedef enum UTCL1PerfSel { UTCL1_PERF_SEL_UTCL2_UTCL1_INVREQS__GFX103DERIVATIVE = 0x00000012, UTCL1_PERF_SEL_RANGE_INVREQS__GFX103DERIVATIVE = 0x00000013, UTCL1_PERF_SEL_INV_ALL_VMID_INVREQS__GFX103DERIVATIVE = 0x00000014, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 UTCL1_PERF_SEL_MH_RECENT_BUF_HIT__GFX11 = 0x00000004, UTCL1_PERF_SEL_MH_DUPLICATE_DETECT__GFX11 = 0x00000005, UTCL1_PERF_SEL_UTCL2_REQS__GFX11 = 0x00000006, @@ -17223,7 +17316,7 @@ constexpr unsigned int MaxUTCL1PerfSelNv24 = UTCL1_PERF_SEL_UTCL2_RE #if CHIP_HDR_NAVI23 constexpr unsigned int MaxUTCL1PerfSelNv23 = UTCL1_PERF_SEL_UTCL2_REQS_OUTSTANDING_ACCUM__NV23; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 constexpr unsigned int MaxUTCL1PerfSelGfx11 = UTCL1_PERF_SEL_ALOG_STALL_PMM_CREDITS__GFX11; #endif @@ -17378,7 +17471,7 @@ typedef enum VGT_EVENT_TYPE { BIN_CONF_OVERRIDE_CHECK__GFX10PLUS = 0x0000001d, THREAD_TRACE_DRAW__GFX10PLUS = 0x00000036, DRAW_DONE__GFX10PLUS = 0x0000003f, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 WAIT_SYNC__GFX11 = 0x00000015, ENABLE_PIPELINE_NOT_USED__GFX11 = 0x0000003e, #endif @@ -17488,7 +17581,7 @@ typedef enum VGT_OUT_PRIM_TYPE { VGT_OUT_LINE_ADJ__GFX09_10 = 0x0000000c, VGT_OUT_TRI_ADJ__GFX09_10 = 0x0000000d, VGT_OUT_PATCH__GFX09_10 = 0x0000000e, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 VGT_OUT_2D_RECT__GFX11 = 0x00000003, VGT_OUT_RECT_V0__GFX11 = 0x00000004, VGT_OUT_DUMMY_1__GFX11 = 0x00000005, @@ -17718,7 +17811,7 @@ typedef 
enum VRSCombinerMode { VRS_COMB_MODE_SATURATE = 0x00000004, } VRSCombinerMode; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef enum VRSCombinerModeSC { SC_VRS_COMB_MODE_PASSTHRU = 0x00000000, SC_VRS_COMB_MODE_OVERRIDE = 0x00000001, @@ -17734,7 +17827,7 @@ typedef enum VRSHtileEncoding { VRS_HTILE_4BIT_ENCODING = 0x00000002, } VRSHtileEncoding; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef enum VRSrate { VRS_SHADING_RATE_1X1 = 0x00000000, VRS_SHADING_RATE_1X2 = 0x00000001, @@ -17761,7 +17854,7 @@ typedef enum WD_IA_DRAW_REG_XFER { WD_IA_DRAW_REG_XFER_VGT_INSTANCE_BASE_ID = 0x00000002, WD_IA_DRAW_REG_XFER_GE_CNTL__GFX10PLUS = 0x00000003, WD_IA_DRAW_REG_XFER_GE_USER_VGPR_EN__GFX10PLUS = 0x00000004, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 WD_IA_DRAW_REG_XFER_FL_MS_WG_DIM__GFX11 = 0x00000005, WD_IA_DRAW_REG_XFER_FL_MS_WG_DIM_1__GFX11 = 0x00000006, WD_IA_DRAW_REG_XFER_FL_MS_TG_SIZE__GFX11 = 0x00000007, @@ -19730,7 +19823,7 @@ namespace Gfx103PlusExclusive constexpr unsigned int SQ_WAVE_IB_DEP_HOLD_CNT_SIZE = 0x00000001; } // namespace Gfx103PlusExclusive -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 namespace Gfx104Plus { constexpr unsigned int SIMM16_WAITCNT_EXP_CNT_START = 0x00000000; @@ -19788,7 +19881,7 @@ namespace Gfx10Plus constexpr unsigned int SQ_WAVE_IB_DEP_VM_VSRC_SIZE = 0x00000004; } // namespace Gfx10Plus -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 namespace Gfx11 { constexpr unsigned int CONTEXT_SPACE_END = 0x0000a3ff; @@ -19977,7 +20070,7 @@ enum PerfCtrId CpcPerfcountSelId = 4, 
CpfPerfcountSelId = 5, CpgPerfcountSelId = 6, -#if CHIP_HDR_NAVI21|| CHIP_HDR_NAVI22|| CHIP_HDR_NAVI23|| CHIP_HDR_NAVI24|| CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI21 || CHIP_HDR_NAVI22 || CHIP_HDR_NAVI23 || CHIP_HDR_NAVI24 || CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 DfMallPerfSelId = 7, #endif GCRPerfSelId = 8, @@ -19995,7 +20088,7 @@ enum PerfCtrId GrbmSe1PerfSelId = 21, GrbmSe2PerfSelId = 22, GrbmSe3PerfSelId = 23, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 GrbmSe4PerfSelId = 24, GrbmSe5PerfSelId = 25, GrbmSe6PerfSelId = 26, @@ -20010,7 +20103,7 @@ enum PerfCtrId SdmaPerfSelId = 34, SpiPerfcntSelId = 35, SqPerfSelId = 36, -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 SqgPerfSelId = 37, #endif SuPerfcntSelId = 38, @@ -20726,6 +20819,61 @@ constexpr unsigned int Nv31MaxPerfEventIds[MaxPerfCtrId] = }; #endif +#if CHIP_HDR_NAVI32 +constexpr unsigned int Nv32MaxPerfEventIds[MaxPerfCtrId] = +{ + CB_PERF_SEL_EXPORT_KILLED_BY_NULL_TARGET_SHADER_MASK__GFX11, + CHA_PERF_SEL_CYCLE__NV3X, + CHC_PERF_SEL_REQ_CLIENT23__GFX11, + CHCG_PERF_SEL_REQ_CLIENT23__NV3X, + CPC_PERF_SEL_MEC_THREAD3__GFX11, + CPF_PERF_SEL_CP_SDMA_MNGR_SDMABUSY__GFX11, + CPG_PERF_SEL_PFP_VGTDMA_DB_ROQ_DATA_STALL1__GFX11, + DF_MALL_PERF_SEL_MALL_SDP_LAT_HIST_GT1000__NV3X, + GCR_PERF_SEL_PIO_GL1_TLB_SHOOTDOWN_REQ__NV3X, + GDS_PERF_SEL_SE7_GS_WAVE_ID_VALID__GFX11, + ge1_rbiu_dr_fifo_starved_p1__GFX103COREPLUS, + ge_agm_gcr_combine__GFX11, + ge_ngg_busy_base__GFX11, + 0, + GL1A_PERF_SEL_CYCLE__GFX103PLUSEXCLUSIVE, + GL1C_PERF_SEL_UTCL0_UTCL1_XNACK_NO_RETRY_FAULT__GFX11, + 0, + GL2A_PERF_SEL_RTN_CREDIT_STALL_CLIENT15__GFX104PLUS, + GL2C_PERF_SEL_HIT_PASS_MISS_IN_CLIENT19__GFX11, + GRBM_PERF_SEL_PC_BUSY__GFX11, + GRBM_SE0_PERF_SEL_PC_BUSY__GFX11, + GRBM_SE1_PERF_SEL_PC_BUSY__GFX11, + 
GRBM_SE2_PERF_SEL_PC_BUSY__GFX11, + GRBM_SE3_PERF_SEL_PC_BUSY__GFX11, + GRBM_SE4_PERF_SEL_PC_BUSY, + GRBM_SE5_PERF_SEL_PC_BUSY, + GRBM_SE6_PERF_SEL_PC_BUSY, + GRBM_SE7_PERF_SEL_PC_BUSY, + 0, + DB_PERF_SEL_OREO_Events_stalls__GFX11, + PH_PERF_SC7_FIFO_STATUS_3__GFX11, + RMI_PERF_SEL_CONSUMER_PROBEGEN_DB_RTS_RTR__GFX11, + RLC_PERF_SEL_SERDES_COMMAND_WRITE, + SC_VRC_REPROBE_FULL__NV32, + SDMA_PERF_SEL_QUEUE7_SELECT__GFX11, + SPI_PERF_BUSY__GFX11, + SQ_PERF_SEL_NONE2__GFX104PLUS, + SQG_PERF_SEL_DUMMY_LAST, + PERF_PA_BUSY__GFX11, + SX_PERF_SEL_DB3_4X2_DISCARD__GFX11, + TA_PERF_SEL_tcreq_clk_valid_cycles__GFX103PLUSEXCLUSIVE, + 0, + 0, + TCP_PERF_SEL_BURST_BIN_READHIT_gt16__GFX11, + TD_PERF_SEL_store_preempts_a_load__GFX11, + UTCL1_PERF_SEL_ALOG_STALL_PMM_CREDITS__GFX11, + UMC_PERF_SEL_ClockCount__NV3X, + 0, + 0, +}; +#endif + #if CHIP_HDR_NAVI33 constexpr unsigned int Nv33MaxPerfEventIds[MaxPerfCtrId] = { diff --git a/lgc/imported/chip/gfx9/gfx9_plus_merged_offset.h b/lgc/imported/chip/gfx9/gfx9_plus_merged_offset.h index 9fd5650fd4..20787c0d81 100644 --- a/lgc/imported/chip/gfx9/gfx9_plus_merged_offset.h +++ b/lgc/imported/chip/gfx9/gfx9_plus_merged_offset.h @@ -2544,7 +2544,7 @@ namespace Gfx103PlusExclusive constexpr unsigned int mmSX_PS_DOWNCONVERT_CONTROL = 0xA1D4; } // namespace Gfx103PlusExclusive -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 namespace Gfx104Plus { constexpr unsigned int mmGE_RATE_CNTL_1 = 0x2254; @@ -3023,7 +3023,7 @@ namespace Gfx10Vrs constexpr unsigned int mmRLC_SPM_SE_MUXSEL_ADDR_OFFSET = 0xDCA8; } // namespace Gfx10Vrs -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 namespace Gfx11 { constexpr unsigned int mmCB_DCC_CONFIG2 = 0x268B; @@ -5256,6 +5256,34 @@ namespace Nv31 } // namespace Nv31 #endif +#if CHIP_HDR_NAVI32 +namespace Nv32 +{ + constexpr unsigned int 
mmGC_ATC_L2_PERFCOUNTER0_CFG = 0xDD44; + constexpr unsigned int mmGC_ATC_L2_PERFCOUNTER1_CFG = 0xDD45; + constexpr unsigned int mmGC_ATC_L2_PERFCOUNTER2_HI = 0xD4F5; + constexpr unsigned int mmGC_ATC_L2_PERFCOUNTER2_LO = 0xD4F4; + constexpr unsigned int mmGC_ATC_L2_PERFCOUNTER2_MODE = 0xDD42; + constexpr unsigned int mmGC_ATC_L2_PERFCOUNTER2_SELECT = 0xDD40; + constexpr unsigned int mmGC_ATC_L2_PERFCOUNTER2_SELECT1 = 0xDD41; + constexpr unsigned int mmGC_ATC_L2_PERFCOUNTER_HI = 0xD4F9; + constexpr unsigned int mmGC_ATC_L2_PERFCOUNTER_LO = 0xD4F8; + constexpr unsigned int mmGC_ATC_L2_PERFCOUNTER_RSLT_CNTL = 0xDD46; + constexpr unsigned int mmGE_FED_STATUS = 0x224A; + constexpr unsigned int mmGRBM_SE3_PERFCOUNTER_HI = 0xD04C; + constexpr unsigned int mmGRBM_SE3_PERFCOUNTER_LO = 0xD04B; + constexpr unsigned int mmGRBM_SE3_PERFCOUNTER_SELECT = 0xD845; + constexpr unsigned int mmRPB_PERFCOUNTER0_CFG = 0x0C80; + constexpr unsigned int mmRPB_PERFCOUNTER1_CFG = 0x0C81; + constexpr unsigned int mmRPB_PERFCOUNTER2_CFG = 0x0C82; + constexpr unsigned int mmRPB_PERFCOUNTER3_CFG = 0x0C83; + constexpr unsigned int mmRPB_PERFCOUNTER_HI = 0x0C86; + constexpr unsigned int mmRPB_PERFCOUNTER_LO = 0x0C87; + constexpr unsigned int mmRPB_PERFCOUNTER_RSLT_CNTL = 0x0C84; + constexpr unsigned int mmRPB_PERF_COUNTER_CNTL = 0x0C85; +} // namespace Nv32 +#endif + #if CHIP_HDR_NAVI33 namespace Nv33 { @@ -5270,7 +5298,7 @@ namespace Nv33 } // namespace Nv33 #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 namespace Nv3x { constexpr unsigned int mmCHCG_PERFCOUNTER0_HI = 0xD3C9; diff --git a/lgc/imported/chip/gfx9/gfx9_plus_merged_registers.h b/lgc/imported/chip/gfx9/gfx9_plus_merged_registers.h index 3b24e5e795..033111d966 100644 --- a/lgc/imported/chip/gfx9/gfx9_plus_merged_registers.h +++ b/lgc/imported/chip/gfx9/gfx9_plus_merged_registers.h @@ -413,7 +413,7 @@ union CB_CACHE_EVICT_POINTS { unsigned int FC_CACHE_EVICT_POINT : 8; unsigned 
int : 16; } gfx10; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int CC_COLOR_EVICT_POINT : 8; unsigned int CC_FMASK_EVICT_POINT : 8; @@ -509,7 +509,7 @@ union CB_COLOR0_ATTRIB { unsigned int LIMIT_COLOR_FETCH_TO_256B_MAX : 1; unsigned int : 12; } gfx10Core; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int NUM_FRAGMENTS : 2; unsigned int FORCE_DST_ALPHA_1 : 1; @@ -703,7 +703,7 @@ union CB_COLOR0_DCC_CONTROL { unsigned int SKIP_LOW_COMP_RATIO : 1; unsigned int : 10; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int SAMPLE_MASK_TRACKER_DISABLE : 1; unsigned int SAMPLE_MASK_TRACKER_FEA_FORCE : 1; @@ -800,7 +800,7 @@ union CB_COLOR0_INFO { unsigned int LINEAR_GENERAL : 1; unsigned int : 24; } gfx10Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int FORMAT : 5; unsigned int : 27; @@ -883,7 +883,7 @@ union CB_COLOR1_ATTRIB { unsigned int LIMIT_COLOR_FETCH_TO_256B_MAX : 1; unsigned int : 12; } gfx10Core; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int NUM_FRAGMENTS : 2; unsigned int FORCE_DST_ALPHA_1 : 1; @@ -1077,7 +1077,7 @@ union CB_COLOR1_DCC_CONTROL { unsigned int SKIP_LOW_COMP_RATIO : 1; unsigned int : 10; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int SAMPLE_MASK_TRACKER_DISABLE : 1; unsigned int SAMPLE_MASK_TRACKER_FEA_FORCE : 1; @@ -1174,7 +1174,7 @@ union 
CB_COLOR1_INFO { unsigned int LINEAR_GENERAL : 1; unsigned int : 24; } gfx10Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int FORMAT : 5; unsigned int : 27; @@ -1257,7 +1257,7 @@ union CB_COLOR2_ATTRIB { unsigned int LIMIT_COLOR_FETCH_TO_256B_MAX : 1; unsigned int : 12; } gfx10Core; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int NUM_FRAGMENTS : 2; unsigned int FORCE_DST_ALPHA_1 : 1; @@ -1451,7 +1451,7 @@ union CB_COLOR2_DCC_CONTROL { unsigned int SKIP_LOW_COMP_RATIO : 1; unsigned int : 10; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int SAMPLE_MASK_TRACKER_DISABLE : 1; unsigned int SAMPLE_MASK_TRACKER_FEA_FORCE : 1; @@ -1548,7 +1548,7 @@ union CB_COLOR2_INFO { unsigned int LINEAR_GENERAL : 1; unsigned int : 24; } gfx10Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int FORMAT : 5; unsigned int : 27; @@ -1631,7 +1631,7 @@ union CB_COLOR3_ATTRIB { unsigned int LIMIT_COLOR_FETCH_TO_256B_MAX : 1; unsigned int : 12; } gfx10Core; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int NUM_FRAGMENTS : 2; unsigned int FORCE_DST_ALPHA_1 : 1; @@ -1825,7 +1825,7 @@ union CB_COLOR3_DCC_CONTROL { unsigned int SKIP_LOW_COMP_RATIO : 1; unsigned int : 10; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int SAMPLE_MASK_TRACKER_DISABLE : 1; unsigned int SAMPLE_MASK_TRACKER_FEA_FORCE : 1; 
@@ -1922,7 +1922,7 @@ union CB_COLOR3_INFO { unsigned int LINEAR_GENERAL : 1; unsigned int : 24; } gfx10Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int FORMAT : 5; unsigned int : 27; @@ -2005,7 +2005,7 @@ union CB_COLOR4_ATTRIB { unsigned int LIMIT_COLOR_FETCH_TO_256B_MAX : 1; unsigned int : 12; } gfx10Core; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int NUM_FRAGMENTS : 2; unsigned int FORCE_DST_ALPHA_1 : 1; @@ -2199,7 +2199,7 @@ union CB_COLOR4_DCC_CONTROL { unsigned int SKIP_LOW_COMP_RATIO : 1; unsigned int : 10; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int SAMPLE_MASK_TRACKER_DISABLE : 1; unsigned int SAMPLE_MASK_TRACKER_FEA_FORCE : 1; @@ -2296,7 +2296,7 @@ union CB_COLOR4_INFO { unsigned int LINEAR_GENERAL : 1; unsigned int : 24; } gfx10Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int FORMAT : 5; unsigned int : 27; @@ -2379,7 +2379,7 @@ union CB_COLOR5_ATTRIB { unsigned int LIMIT_COLOR_FETCH_TO_256B_MAX : 1; unsigned int : 12; } gfx10Core; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int NUM_FRAGMENTS : 2; unsigned int FORCE_DST_ALPHA_1 : 1; @@ -2573,7 +2573,7 @@ union CB_COLOR5_DCC_CONTROL { unsigned int SKIP_LOW_COMP_RATIO : 1; unsigned int : 10; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int SAMPLE_MASK_TRACKER_DISABLE : 1; unsigned int 
SAMPLE_MASK_TRACKER_FEA_FORCE : 1; @@ -2670,7 +2670,7 @@ union CB_COLOR5_INFO { unsigned int LINEAR_GENERAL : 1; unsigned int : 24; } gfx10Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int FORMAT : 5; unsigned int : 27; @@ -2753,7 +2753,7 @@ union CB_COLOR6_ATTRIB { unsigned int LIMIT_COLOR_FETCH_TO_256B_MAX : 1; unsigned int : 12; } gfx10Core; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int NUM_FRAGMENTS : 2; unsigned int FORCE_DST_ALPHA_1 : 1; @@ -2947,7 +2947,7 @@ union CB_COLOR6_DCC_CONTROL { unsigned int SKIP_LOW_COMP_RATIO : 1; unsigned int : 10; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int SAMPLE_MASK_TRACKER_DISABLE : 1; unsigned int SAMPLE_MASK_TRACKER_FEA_FORCE : 1; @@ -3044,7 +3044,7 @@ union CB_COLOR6_INFO { unsigned int LINEAR_GENERAL : 1; unsigned int : 24; } gfx10Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int FORMAT : 5; unsigned int : 27; @@ -3127,7 +3127,7 @@ union CB_COLOR7_ATTRIB { unsigned int LIMIT_COLOR_FETCH_TO_256B_MAX : 1; unsigned int : 12; } gfx10Core; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int NUM_FRAGMENTS : 2; unsigned int FORCE_DST_ALPHA_1 : 1; @@ -3321,7 +3321,7 @@ union CB_COLOR7_DCC_CONTROL { unsigned int SKIP_LOW_COMP_RATIO : 1; unsigned int : 10; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int 
SAMPLE_MASK_TRACKER_DISABLE : 1; unsigned int SAMPLE_MASK_TRACKER_FEA_FORCE : 1; @@ -3418,7 +3418,7 @@ union CB_COLOR7_INFO { unsigned int LINEAR_GENERAL : 1; unsigned int : 24; } gfx10Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int FORMAT : 5; unsigned int : 27; @@ -3544,7 +3544,7 @@ union CB_DCC_CONFIG { unsigned int READ_RETURN_SKID_FIFO_DEPTH : 9; unsigned int : 7; } gfx10Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int SAMPLE_MASK_TRACKER_DEPTH : 5; unsigned int SAMPLE_MASK_TRACKER_DISABLE : 1; @@ -3560,7 +3560,7 @@ union CB_DCC_CONFIG { float f32All; }; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union CB_DCC_CONFIG2 { struct { unsigned int INVALID_KEY_ERROR_CODE : 8; @@ -3604,7 +3604,7 @@ union CB_DCC_CONTROL { float f32All; }; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union CB_FDCC_CONTROL { struct { unsigned int SAMPLE_MASK_TRACKER_DISABLE : 1; @@ -3627,7 +3627,7 @@ union CB_FDCC_CONTROL { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union CB_FGCG_SRAM_OVERRIDE { struct { unsigned int DISABLE_FGCG : 20; @@ -3709,7 +3709,7 @@ union CB_HW_CONTROL { unsigned int ALLOW_MRT_WITH_DUAL_SOURCE : 1; unsigned int : 31; } gfx10Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 2; unsigned int DISABLE_SMT_WHEN_NO_FDCC_FIX : 1; @@ -3760,7 +3760,7 @@ union CB_HW_CONTROL_1 { unsigned int CM_TILE_FIFO_DEPTH : 9; unsigned int : 5; } 
gfx103Derivative; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int CC_CACHE_NUM_TAGS : 6; unsigned int : 26; @@ -3821,7 +3821,7 @@ union CB_HW_CONTROL_2 { unsigned int DRR_ASSUMED_FIFO_DEPTH_DIV8 : 6; unsigned int CHICKEN_BITS : 2; } gfx10; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int SPARE_4 : 8; unsigned int DRR_ASSUMED_FIFO_DEPTH_DIV8 : 6; @@ -3905,7 +3905,7 @@ union CB_HW_CONTROL_3 { unsigned int DISABLE_DCC_VRS_OPT : 1; unsigned int : 3; } gfx10Vrs; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int SPARE_5 : 1; unsigned int : 1; @@ -3964,7 +3964,7 @@ union CB_HW_CONTROL_4 { unsigned int DISABLE_BC_COLOR_CACHE_PREFETCH : 1; unsigned int : 17; } gfx10Core; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int COLOR_CACHE_FETCH_NUM_QB_LOG2 : 3; unsigned int COLOR_CACHE_FETCH_ALGORITHM : 2; @@ -4008,7 +4008,7 @@ union CB_HW_MEM_ARBITER_RD { unsigned int SEND_LASTS_WITHIN_GROUPS : 1; unsigned int : 2; } gfx09_10; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int WEIGHT_DC : 2; @@ -4048,7 +4048,7 @@ union CB_HW_MEM_ARBITER_WR { unsigned int SEND_LASTS_WITHIN_GROUPS : 1; unsigned int : 2; } gfx09_10; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int WEIGHT_DC : 2; @@ -4067,7 +4067,7 @@ union CB_HW_MEM_ARBITER_WR { float f32All; }; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| 
CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union CB_KEY_OVERRIDE_0 { struct { unsigned int OVERRIDE : 32; @@ -4079,7 +4079,7 @@ union CB_KEY_OVERRIDE_0 { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union CB_KEY_OVERRIDE_1 { struct { unsigned int OVERRIDE : 32; @@ -4091,7 +4091,7 @@ union CB_KEY_OVERRIDE_1 { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union CB_KEY_OVERRIDE_2 { struct { unsigned int OVERRIDE : 32; @@ -4103,7 +4103,7 @@ union CB_KEY_OVERRIDE_2 { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union CB_KEY_OVERRIDE_3 { struct { unsigned int OVERRIDE : 32; @@ -4115,7 +4115,7 @@ union CB_KEY_OVERRIDE_3 { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union CB_KEY_OVERRIDE_4 { struct { unsigned int OVERRIDE : 32; @@ -4127,7 +4127,7 @@ union CB_KEY_OVERRIDE_4 { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union CB_KEY_OVERRIDE_5 { struct { unsigned int OVERRIDE : 32; @@ -4139,7 +4139,7 @@ union CB_KEY_OVERRIDE_5 { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union CB_KEY_OVERRIDE_6 { struct { unsigned int OVERRIDE : 32; @@ -4151,7 +4151,7 @@ union CB_KEY_OVERRIDE_6 { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union CB_KEY_OVERRIDE_7 { struct { unsigned int OVERRIDE : 32; @@ -4284,7 +4284,7 @@ union 
CB_PERFCOUNTER0_SELECT { unsigned int PERF_SEL1 : 9; unsigned int : 13; } most; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int PERF_SEL : 10; unsigned int PERF_SEL1 : 10; @@ -4309,7 +4309,7 @@ union CB_PERFCOUNTER0_SELECT1 { unsigned int PERF_SEL3 : 9; unsigned int : 13; } most; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int PERF_SEL2 : 10; unsigned int PERF_SEL3 : 10; @@ -4351,7 +4351,7 @@ union CB_PERFCOUNTER1_SELECT { unsigned int PERF_SEL : 9; unsigned int : 23; } most; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int PERF_SEL : 10; unsigned int : 22; @@ -4392,7 +4392,7 @@ union CB_PERFCOUNTER2_SELECT { unsigned int PERF_SEL : 9; unsigned int : 23; } most; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int PERF_SEL : 10; unsigned int : 22; @@ -4433,7 +4433,7 @@ union CB_PERFCOUNTER3_SELECT { unsigned int PERF_SEL : 9; unsigned int : 23; } most; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int PERF_SEL : 10; unsigned int : 22; @@ -4508,7 +4508,7 @@ union CB_RMI_GL2_CACHE_CONTROL { unsigned int FMASK_BIG_PAGE : 1; unsigned int : 1; } gfx10; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int DCC_WR_POLICY : 2; unsigned int COLOR_WR_POLICY : 2; @@ -4547,7 +4547,7 @@ union CB_RMI_GL2_CACHE_CONTROL { unsigned int : 6; } nv24; #endif -#if CHIP_HDR_NAVI21|| CHIP_HDR_NAVI22|| CHIP_HDR_NAVI23|| CHIP_HDR_NAVI24|| 
CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI21|| CHIP_HDR_NAVI22|| CHIP_HDR_NAVI23|| CHIP_HDR_NAVI24|| CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33 struct { unsigned int : 26; unsigned int DCC_L3_BYPASS : 1; @@ -5290,7 +5290,7 @@ union COMPUTE_DISPATCH_INITIATOR { unsigned int CS_W32_EN : 1; unsigned int : 16; } gfx10Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 16; unsigned int AMP_SHADER_EN : 1; @@ -5304,7 +5304,7 @@ union COMPUTE_DISPATCH_INITIATOR { float f32All; }; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union COMPUTE_DISPATCH_INTERLEAVE { struct { unsigned int INTERLEAVE : 10; @@ -5384,7 +5384,7 @@ union COMPUTE_MISC_RESERVED { unsigned int RESERVED2 : 1; unsigned int : 29; } gfx09_10; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int SEND_SEID : 3; unsigned int : 29; @@ -5528,14 +5528,14 @@ union COMPUTE_PGM_RSRC3 { unsigned int SHARED_VGPR_CNT : 4; unsigned int : 28; } bits, bitfields; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 4; unsigned int INST_PREF_SIZE : 6; unsigned int : 22; } gfx104Plus; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 10; unsigned int TRAP_ON_START : 1; @@ -5840,7 +5840,7 @@ union COMPUTE_STATIC_THREAD_MGMT_SE3 { float f32All; }; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union COMPUTE_STATIC_THREAD_MGMT_SE4 { struct { unsigned int SA0_CU_EN : 16; @@ -5853,7 +5853,7 @@ 
union COMPUTE_STATIC_THREAD_MGMT_SE4 { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union COMPUTE_STATIC_THREAD_MGMT_SE5 { struct { unsigned int SA0_CU_EN : 16; @@ -5866,7 +5866,7 @@ union COMPUTE_STATIC_THREAD_MGMT_SE5 { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union COMPUTE_STATIC_THREAD_MGMT_SE6 { struct { unsigned int SA0_CU_EN : 16; @@ -5879,7 +5879,7 @@ union COMPUTE_STATIC_THREAD_MGMT_SE6 { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union COMPUTE_STATIC_THREAD_MGMT_SE7 { struct { unsigned int SA0_CU_EN : 16; @@ -5923,7 +5923,7 @@ union COMPUTE_TMPRING_SIZE { unsigned int WAVESIZE : 13; unsigned int : 7; } gfx09_10; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int WAVESIZE : 15; @@ -6286,7 +6286,7 @@ union CPC_PERFCOUNTER1_SELECT { unsigned int PERF_SEL : 10; unsigned int : 22; } gfx10Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 28; unsigned int CNTR_MODE : 4; @@ -6412,7 +6412,7 @@ union CPF_PERFCOUNTER1_SELECT { unsigned int PERF_SEL : 10; unsigned int : 22; } gfx10Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 28; unsigned int CNTR_MODE : 4; @@ -6538,7 +6538,7 @@ union CPG_PERFCOUNTER1_SELECT { unsigned int PERF_SEL : 10; unsigned int : 22; } gfx10Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 
struct { unsigned int : 28; unsigned int CNTR_MODE : 4; @@ -6960,7 +6960,7 @@ union CP_STRMOUT_CNTL { float f32All; }; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union CP_VGT_ASINVOC_COUNT_HI { struct { unsigned int ASINVOC_COUNT_HI : 32; @@ -6972,7 +6972,7 @@ union CP_VGT_ASINVOC_COUNT_HI { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union CP_VGT_ASINVOC_COUNT_LO { struct { unsigned int ASINVOC_COUNT_LO : 32; @@ -7188,7 +7188,7 @@ union DB_CGTT_CLK_CTRL_0 { unsigned int RESERVED : 4; unsigned int : 16; } gfx09; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int SOFT_OVERRIDE0 : 1; unsigned int SOFT_OVERRIDE1 : 1; @@ -7207,7 +7207,7 @@ union DB_CGTT_CLK_CTRL_0 { unsigned int RESERVED : 4; unsigned int : 16; } gfx10Core; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 struct { unsigned int : 9; unsigned int RESERVED : 23; @@ -7267,7 +7267,7 @@ union DB_CREDIT_LIMIT { unsigned int DB_CB_TILE_CREDITS : 7; unsigned int : 1; } gfx09_10; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 13; unsigned int DB_SC_WAVE_CREDITS : 5; @@ -7621,7 +7621,7 @@ union DB_FGCG_INTERFACES_CLK_CTRL { unsigned int DB_CB_LQUAD_OVERRIDE : 1; unsigned int : 29; } gfx10; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 2; unsigned int DB_CB_EXPORT_OVERRIDE : 1; @@ -7668,14 +7668,14 @@ union DB_FGCG_SRAMS_CLK_CTRL { unsigned int OVERRIDE26 : 1; unsigned int : 5; } most; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| 
CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 27; unsigned int OVERRIDE27 : 1; unsigned int : 4; } gfx104Plus; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 28; unsigned int OVERRIDE28 : 1; @@ -7741,7 +7741,7 @@ union DB_FIFO_DEPTH3 { unsigned int : 16; unsigned int QUAD_READ_REQS : 8; } bits, bitfields; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 8; unsigned int OSB_WAVE_TABLE_DEPTH : 8; @@ -7755,7 +7755,7 @@ union DB_FIFO_DEPTH3 { float f32All; }; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union DB_FIFO_DEPTH4 { struct { unsigned int OSB_SQUAD_TABLE_DEPTH : 8; @@ -8249,7 +8249,7 @@ union DB_RENDER_CONTROL { unsigned int : 1; unsigned int : 18; } gfx103; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 14; unsigned int : 1; @@ -8330,7 +8330,7 @@ union DB_RENDER_OVERRIDE2 { unsigned int CENTROID_COMPUTATION_MODE : 2; unsigned int : 3; } gfx103Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 29; unsigned int DISABLE_NOZ : 1; @@ -8462,7 +8462,7 @@ union DB_RMI_L2_CACHE_CONTROL { unsigned int S_BIG_PAGE : 1; unsigned int : 6; } bits, bitfields; -#if CHIP_HDR_NAVI21|| CHIP_HDR_NAVI22|| CHIP_HDR_NAVI23|| CHIP_HDR_NAVI24|| CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI21|| CHIP_HDR_NAVI22|| CHIP_HDR_NAVI23|| CHIP_HDR_NAVI24|| CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33 struct { unsigned int : 26; unsigned int Z_NOALLOC : 1; @@ 
-8509,7 +8509,7 @@ union DB_SHADER_CONTROL { unsigned int PRE_SHADER_DEPTH_COVERAGE_ENABLE : 1; unsigned int : 8; } gfx10Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 24; unsigned int OREO_BLEND_ENABLE : 1; @@ -10849,7 +10849,7 @@ union GCR_PERFCOUNTER0_SELECT { unsigned int PERF_SEL1 : 9; unsigned int : 13; } gfx10; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int PERF_SEL : 10; unsigned int PERF_SEL1 : 10; @@ -10874,7 +10874,7 @@ union GCR_PERFCOUNTER0_SELECT1 { unsigned int PERF_SEL3 : 9; unsigned int : 13; } gfx10; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int PERF_SEL2 : 10; unsigned int PERF_SEL3 : 10; @@ -10914,7 +10914,7 @@ union GCR_PERFCOUNTER1_SELECT { unsigned int PERF_MODE : 4; unsigned int CNTL_MODE : 4; } gfx10; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int PERF_SEL : 10; unsigned int : 10; @@ -11151,6 +11151,13 @@ union GC_ATC_L2_PERFCOUNTER2_SELECT { unsigned int : 18; unsigned int PERF_MODE : 4; } gfx101; +#if CHIP_HDR_NAVI32 + struct { + unsigned int PERF_SEL0 : 10; + unsigned int : 18; + unsigned int PERF_MODE0 : 4; + } nv32; +#endif struct { unsigned int PERF_SEL : 10; unsigned int : 18; @@ -11492,14 +11499,14 @@ union GE1_PERFCOUNTER0_SELECT { unsigned int PERF_SEL0 : 10; unsigned int : 22; } gfx103CorePlus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 24; unsigned int PERF_MODE1 : 4; unsigned int : 4; } gfx104Plus; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 
+#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 28; unsigned int PERF_MODE0 : 4; @@ -11522,7 +11529,7 @@ union GE1_PERFCOUNTER0_SELECT1 { unsigned int PERF_MODE2 : 4; unsigned int PERF_MODE3 : 4; } gfx103; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 24; unsigned int PERF_MODE3 : 4; @@ -11571,14 +11578,14 @@ union GE1_PERFCOUNTER1_SELECT { unsigned int PERF_SEL0 : 10; unsigned int : 22; } gfx103CorePlus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 24; unsigned int PERF_MODE1 : 4; unsigned int : 4; } gfx104Plus; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 28; unsigned int PERF_MODE0 : 4; @@ -11601,7 +11608,7 @@ union GE1_PERFCOUNTER1_SELECT1 { unsigned int PERF_MODE2 : 4; unsigned int PERF_MODE3 : 4; } gfx103; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 24; unsigned int PERF_MODE3 : 4; @@ -11650,14 +11657,14 @@ union GE1_PERFCOUNTER2_SELECT { unsigned int PERF_SEL0 : 10; unsigned int : 22; } gfx103CorePlus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 24; unsigned int PERF_MODE1 : 4; unsigned int : 4; } gfx104Plus; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 28; unsigned int PERF_MODE0 : 4; @@ -11680,7 +11687,7 @@ union GE1_PERFCOUNTER2_SELECT1 { unsigned int PERF_MODE2 : 4; unsigned int PERF_MODE3 : 4; } gfx103; 
-#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 24; unsigned int PERF_MODE3 : 4; @@ -11729,14 +11736,14 @@ union GE1_PERFCOUNTER3_SELECT { unsigned int PERF_SEL0 : 10; unsigned int : 22; } gfx103CorePlus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 24; unsigned int PERF_MODE1 : 4; unsigned int : 4; } gfx104Plus; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 28; unsigned int PERF_MODE0 : 4; @@ -11759,7 +11766,7 @@ union GE1_PERFCOUNTER3_SELECT1 { unsigned int PERF_MODE2 : 4; unsigned int PERF_MODE3 : 4; } gfx103; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 24; unsigned int PERF_MODE3 : 4; @@ -11808,14 +11815,14 @@ union GE2_DIST_PERFCOUNTER0_SELECT { unsigned int PERF_SEL0 : 10; unsigned int : 22; } gfx103CorePlus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 24; unsigned int PERF_MODE1 : 4; unsigned int : 4; } gfx104Plus; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 28; unsigned int PERF_MODE0 : 4; @@ -11838,7 +11845,7 @@ union GE2_DIST_PERFCOUNTER0_SELECT1 { unsigned int PERF_MODE2 : 4; unsigned int PERF_MODE3 : 4; } gfx103; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 24; unsigned int PERF_MODE3 : 4; @@ -11887,14 +11894,14 @@ union 
GE2_DIST_PERFCOUNTER1_SELECT { unsigned int PERF_SEL0 : 10; unsigned int : 22; } gfx103CorePlus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 24; unsigned int PERF_MODE1 : 4; unsigned int : 4; } gfx104Plus; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 28; unsigned int PERF_MODE0 : 4; @@ -11917,7 +11924,7 @@ union GE2_DIST_PERFCOUNTER1_SELECT1 { unsigned int PERF_MODE2 : 4; unsigned int PERF_MODE3 : 4; } gfx103; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 24; unsigned int PERF_MODE3 : 4; @@ -11966,14 +11973,14 @@ union GE2_DIST_PERFCOUNTER2_SELECT { unsigned int PERF_SEL0 : 10; unsigned int : 22; } gfx103CorePlus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 24; unsigned int PERF_MODE1 : 4; unsigned int : 4; } gfx104Plus; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 28; unsigned int PERF_MODE0 : 4; @@ -11996,7 +12003,7 @@ union GE2_DIST_PERFCOUNTER2_SELECT1 { unsigned int PERF_MODE2 : 4; unsigned int PERF_MODE3 : 4; } gfx103; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 24; unsigned int PERF_MODE3 : 4; @@ -12045,14 +12052,14 @@ union GE2_DIST_PERFCOUNTER3_SELECT { unsigned int PERF_SEL0 : 10; unsigned int : 22; } gfx103CorePlus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 
struct { unsigned int : 24; unsigned int PERF_MODE1 : 4; unsigned int : 4; } gfx104Plus; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 28; unsigned int PERF_MODE0 : 4; @@ -12075,7 +12082,7 @@ union GE2_DIST_PERFCOUNTER3_SELECT1 { unsigned int PERF_MODE2 : 4; unsigned int PERF_MODE3 : 4; } gfx103; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 24; unsigned int PERF_MODE3 : 4; @@ -12124,14 +12131,14 @@ union GE2_SE_PERFCOUNTER0_SELECT { unsigned int PERF_SEL0 : 10; unsigned int : 22; } gfx103CorePlus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 24; unsigned int PERF_MODE1 : 4; unsigned int : 4; } gfx104Plus; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 28; unsigned int PERF_MODE0 : 4; @@ -12154,7 +12161,7 @@ union GE2_SE_PERFCOUNTER0_SELECT1 { unsigned int PERF_MODE2 : 4; unsigned int PERF_MODE3 : 4; } gfx103; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 24; unsigned int PERF_MODE3 : 4; @@ -12203,14 +12210,14 @@ union GE2_SE_PERFCOUNTER1_SELECT { unsigned int PERF_SEL0 : 10; unsigned int : 22; } gfx103CorePlus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 24; unsigned int PERF_MODE1 : 4; unsigned int : 4; } gfx104Plus; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { 
unsigned int : 28; unsigned int PERF_MODE0 : 4; @@ -12233,7 +12240,7 @@ union GE2_SE_PERFCOUNTER1_SELECT1 { unsigned int PERF_MODE2 : 4; unsigned int PERF_MODE3 : 4; } gfx103; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 24; unsigned int PERF_MODE3 : 4; @@ -12282,14 +12289,14 @@ union GE2_SE_PERFCOUNTER2_SELECT { unsigned int PERF_SEL0 : 10; unsigned int : 22; } gfx103CorePlus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 24; unsigned int PERF_MODE1 : 4; unsigned int : 4; } gfx104Plus; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 28; unsigned int PERF_MODE0 : 4; @@ -12312,7 +12319,7 @@ union GE2_SE_PERFCOUNTER2_SELECT1 { unsigned int PERF_MODE2 : 4; unsigned int PERF_MODE3 : 4; } gfx103; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 24; unsigned int PERF_MODE3 : 4; @@ -12361,14 +12368,14 @@ union GE2_SE_PERFCOUNTER3_SELECT { unsigned int PERF_SEL0 : 10; unsigned int : 22; } gfx103CorePlus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 24; unsigned int PERF_MODE1 : 4; unsigned int : 4; } gfx104Plus; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 28; unsigned int PERF_MODE0 : 4; @@ -12391,7 +12398,7 @@ union GE2_SE_PERFCOUNTER3_SELECT1 { unsigned int PERF_MODE2 : 4; unsigned int PERF_MODE3 : 4; } gfx103; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if 
CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 24; unsigned int PERF_MODE3 : 4; @@ -12416,7 +12423,7 @@ union GE_CNTL { unsigned int BREAK_WAVE_AT_EOI : 1; unsigned int : 13; } gfx10; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int PRIMS_PER_SUBGRP : 9; unsigned int VERTS_PER_SUBGRP : 9; @@ -12460,7 +12467,22 @@ union GE_FAST_CLKS { float f32All; }; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI32 +union GE_FED_STATUS { + struct { + unsigned int DMA_C0_FED_ERROR : 1; + unsigned int DMA_C1_FED_ERROR : 1; + unsigned int TF_REQ_FED_ERROR : 1; + unsigned int : 29; + } bits, bitfields; + + unsigned int u32All; + signed int i32All; + float f32All; +}; +#endif + +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union GE_GS_FAST_LAUNCH_WG_DIM { struct { unsigned int GS_FL_DIM_X : 16; @@ -12473,7 +12495,7 @@ union GE_GS_FAST_LAUNCH_WG_DIM { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union GE_GS_FAST_LAUNCH_WG_DIM_1 { struct { unsigned int GS_FL_DIM_Z : 16; @@ -12533,7 +12555,7 @@ union GE_MULTI_PRIM_IB_RESET_EN { unsigned int MATCH_ALL_BITS : 1; unsigned int : 30; } bits, bitfields; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 2; unsigned int DISABLE_FOR_AUTO_INDEX : 1; @@ -12558,7 +12580,7 @@ union GE_NGG_SUBGRP_CNTL { float f32All; }; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union GE_PA_IF_SAFE_REG { struct { unsigned int GE_PA_CSB : 10; @@ -13075,7 +13097,7 @@ union GE_PRIV_CONTROL { unsigned int CLAMP_HS_OFFCHIP_PER_SE_OVERRIDE : 1; unsigned int 
: 15; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int RESERVED : 1; unsigned int : 16; @@ -13089,7 +13111,7 @@ union GE_PRIV_CONTROL { float f32All; }; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union GE_RATE_CNTL_1 { struct { unsigned int ADD_X_CLKS_LS_VERT : 4; @@ -13108,7 +13130,7 @@ union GE_RATE_CNTL_1 { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union GE_RATE_CNTL_2 { struct { unsigned int ADD_X_CLKS_VS_VERT : 4; @@ -13130,7 +13152,7 @@ union GE_RATE_CNTL_2 { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union GE_SPI_IF_SAFE_REG { struct { unsigned int GE_SPI_LS_ES_DATA : 6; @@ -14030,7 +14052,7 @@ union GRBM_PERFCOUNTER0_SELECT_HI { unsigned int GUS_BUSY_USER_DEFINED_MASK : 1; unsigned int : 24; } gfx10CorePlus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 9; unsigned int GL1H_BUSY_USER_DEFINED_MASK : 1; @@ -14134,7 +14156,7 @@ union GRBM_PERFCOUNTER1_SELECT_HI { unsigned int GUS_BUSY_USER_DEFINED_MASK : 1; unsigned int : 24; } gfx10CorePlus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 9; unsigned int GL1H_BUSY_USER_DEFINED_MASK : 1; @@ -14198,7 +14220,7 @@ union GRBM_SE0_PERFCOUNTER_SELECT { unsigned int GL1CC_BUSY_USER_DEFINED_MASK : 1; unsigned int : 6; } gfx10Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 
struct { unsigned int : 26; unsigned int GL1H_BUSY_USER_DEFINED_MASK : 1; @@ -14213,6 +14235,13 @@ union GRBM_SE0_PERFCOUNTER_SELECT { unsigned int : 3; } nv31; #endif +#if CHIP_HDR_NAVI32 + struct { + unsigned int : 28; + unsigned int SEDC_BUSY_USER_DEFINED_MASK : 1; + unsigned int : 3; + } nv32; +#endif unsigned int u32All; signed int i32All; @@ -14270,7 +14299,7 @@ union GRBM_SE1_PERFCOUNTER_SELECT { unsigned int GL1CC_BUSY_USER_DEFINED_MASK : 1; unsigned int : 6; } gfx10CorePlus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 26; unsigned int GL1H_BUSY_USER_DEFINED_MASK : 1; @@ -14285,6 +14314,13 @@ union GRBM_SE1_PERFCOUNTER_SELECT { unsigned int : 3; } nv31; #endif +#if CHIP_HDR_NAVI32 + struct { + unsigned int : 28; + unsigned int SEDC_BUSY_USER_DEFINED_MASK : 1; + unsigned int : 3; + } nv32; +#endif unsigned int u32All; signed int i32All; @@ -14349,7 +14385,14 @@ union GRBM_SE2_PERFCOUNTER_SELECT { unsigned int : 3; } nv31; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI32 + struct { + unsigned int : 28; + unsigned int SEDC_BUSY_USER_DEFINED_MASK : 1; + unsigned int : 3; + } nv32; +#endif +#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 struct { unsigned int : 23; unsigned int UTCL1_BUSY_USER_DEFINED_MASK : 1; @@ -14429,6 +14472,18 @@ union GRBM_SE3_PERFCOUNTER_SELECT { unsigned int : 3; } nv31; #endif +#if CHIP_HDR_NAVI32 + struct { + unsigned int : 23; + unsigned int UTCL1_BUSY_USER_DEFINED_MASK : 1; + unsigned int TCP_BUSY_USER_DEFINED_MASK : 1; + unsigned int GL1CC_BUSY_USER_DEFINED_MASK : 1; + unsigned int GL1H_BUSY_USER_DEFINED_MASK : 1; + unsigned int PC_BUSY_USER_DEFINED_MASK : 1; + unsigned int SEDC_BUSY_USER_DEFINED_MASK : 1; + unsigned int : 3; + } nv32; +#endif unsigned int u32All; signed int i32All; @@ -15013,7 +15068,7 @@ union IA_UTCL1_CNTL { unsigned int : 1; } nv24; #endif -#if CHIP_HDR_NAVI31 || 
CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 struct { unsigned int : 30; unsigned int LLC_NOALLOC_OVERRIDE : 1; @@ -15345,7 +15400,7 @@ union PA_CL_ENHANCE { unsigned int CLAMP_NEGATIVE_BB_TO_ZERO : 1; unsigned int : 7; } gfx103Derivative; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 18; unsigned int POLY_INNER_EDGE_FLAG_DISABLE : 1; @@ -16771,7 +16826,7 @@ union PA_CL_VS_OUT_CNTL { unsigned int USE_VTX_LINE_WIDTH : 1; unsigned int : 4; } gfx10Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 31; unsigned int USE_VTX_FSR_SELECT : 1; @@ -16824,7 +16879,7 @@ union PA_PH_ENHANCE { unsigned int DISABLE_USE_LAST_PH_ARBITER_PERFCOUNTER_SAMPLE_EVENT : 1; unsigned int : 18; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 14; unsigned int USE_PERFCOUNTER_START_STOP_EVENTS : 1; @@ -16832,7 +16887,7 @@ union PA_PH_ENHANCE { unsigned int : 16; } gfx104Plus; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 16; unsigned int PH_SPI_GE_THROTTLE_MODE : 1; @@ -17188,9 +17243,9 @@ union PA_PH_PERFCOUNTER7_SELECT { float f32All; }; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union PA_RATE_CNTL { -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int VERTEX_RATE : 4; unsigned int PRIM_RATE : 4; @@ -17524,7 +17579,7 @@ union PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_3 { float f32All; }; -#if 
CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union PA_SC_ATM_CNTL { struct { unsigned int SC_PC_IF_SIZE : 6; @@ -17596,7 +17651,7 @@ union PA_SC_BINNER_CNTL_1 { float f32All; }; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union PA_SC_BINNER_CNTL_2 { struct { unsigned int BIN_SIZE_X_MULT_BY_1P5X : 1; @@ -17611,7 +17666,7 @@ union PA_SC_BINNER_CNTL_2 { unsigned int DISABLE_NOPCEXPORT_BREAKBATCH_CONDITION : 1; unsigned int : 10; } bits, bitfields; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 struct { unsigned int : 4; unsigned int LIGHT_SHAFT_DRAW_CALL_LIMIT : 3; @@ -17708,7 +17763,7 @@ union PA_SC_BINNER_EVENT_CNTL_1 { unsigned int BIN_CONF_OVERRIDE_CHECK : 2; unsigned int : 4; } gfx10Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 10; unsigned int WAIT_SYNC : 2; @@ -17799,7 +17854,7 @@ union PA_SC_BINNER_EVENT_CNTL_3 { unsigned int : 16; unsigned int DRAW_DONE : 2; } gfx10Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 28; unsigned int ENABLE_PIPELINE_NOT_USED : 2; @@ -18197,7 +18252,7 @@ union PA_SC_ENHANCE_1 { unsigned int DEBUG_PIXEL_PICKER_COUNT_PIXELS : 1; unsigned int : 18; } gfx10CorePlus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 5; unsigned int DISABLE_NONBINNED_LIVE_PRIM_DG1_LS0_CL0_EOPKT_POKE : 1; @@ -18310,7 +18365,7 @@ union PA_SC_ENHANCE_3 { unsigned int DISABLE_CP_CONTEXT_DONE_PERFCOUNT_SAMPLE_EN : 1; unsigned int : 26; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || 
CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 6; unsigned int ENABLE_SINGLE_PA_EOPKT_FIRST_PHASE_FILTER : 1; @@ -18324,7 +18379,7 @@ union PA_SC_ENHANCE_3 { unsigned int : 18; } gfx104Plus; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 2; unsigned int DISABLE_RB_MASK_COPY_FOR_NONP2_SA_PAIR_HARVEST : 1; @@ -18673,7 +18728,7 @@ union PA_SC_NGG_MODE_CNTL { unsigned int MAX_FPOVS_IN_WAVE : 8; unsigned int : 8; } gfx10Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int DISABLE_FPOG_AND_DEALLOC_CONFLICT : 1; @@ -18756,7 +18811,7 @@ union PA_SC_P3D_TRAP_SCREEN_V { float f32All; }; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union PA_SC_PACKER_WAVE_ID_CNTL { struct { unsigned int WAVE_TABLE_SIZE : 10; @@ -19166,7 +19221,7 @@ union PA_SC_SHADER_CONTROL { unsigned int WAVE_BREAK_REGION_SIZE : 2; unsigned int : 25; } gfx10Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 7; unsigned int DISABLE_OREO_CONFLICT_QUAD : 1; @@ -19216,7 +19271,7 @@ union PA_SC_TILE_STEERING_OVERRIDE { unsigned int NUM_PACKER_PER_SC : 1; unsigned int : 11; } gfx101; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 20; unsigned int NUM_PACKER_PER_SC : 2; @@ -20077,7 +20132,7 @@ union PA_SC_VPORT_ZMIN_15 { float f32All; }; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| 
CHIP_HDR_PHOENIX1 union PA_SC_VRS_OVERRIDE_CNTL { struct { unsigned int VRS_OVERRIDE_RATE_COMBINER_MODE : 3; @@ -20096,7 +20151,7 @@ union PA_SC_VRS_OVERRIDE_CNTL { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union PA_SC_VRS_RATE_BASE { struct { unsigned int BASE_256B : 32; @@ -20108,7 +20163,7 @@ union PA_SC_VRS_RATE_BASE { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union PA_SC_VRS_RATE_BASE_EXT { struct { unsigned int BASE_256B : 8; @@ -20122,7 +20177,7 @@ union PA_SC_VRS_RATE_BASE_EXT { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union PA_SC_VRS_RATE_CACHE_CNTL { struct { unsigned int BIG_PAGE_RD : 1; @@ -20145,7 +20200,7 @@ union PA_SC_VRS_RATE_CACHE_CNTL { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union PA_SC_VRS_RATE_FEEDBACK_BASE { struct { unsigned int BASE_256B : 32; @@ -20157,7 +20212,7 @@ union PA_SC_VRS_RATE_FEEDBACK_BASE { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union PA_SC_VRS_RATE_FEEDBACK_BASE_EXT { struct { unsigned int BASE_256B : 8; @@ -20170,7 +20225,7 @@ union PA_SC_VRS_RATE_FEEDBACK_BASE_EXT { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union PA_SC_VRS_RATE_FEEDBACK_SIZE_XY { struct { unsigned int X_MAX : 11; @@ -20185,7 +20240,7 @@ union PA_SC_VRS_RATE_FEEDBACK_SIZE_XY { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union 
PA_SC_VRS_RATE_SIZE_XY { struct { unsigned int X_MAX : 11; @@ -20200,7 +20255,7 @@ union PA_SC_VRS_RATE_SIZE_XY { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union PA_SC_VRS_SURFACE_CNTL { struct { unsigned int : 6; @@ -20223,6 +20278,13 @@ union PA_SC_VRS_SURFACE_CNTL { unsigned int : 26; } apu11; #endif +#if CHIP_HDR_NAVI32 + struct { + unsigned int : 5; + unsigned int VRC_REPROBE_DISABLE : 1; + unsigned int : 26; + } nv32; +#endif #if CHIP_HDR_NAVI33 struct { unsigned int : 5; @@ -20237,7 +20299,7 @@ union PA_SC_VRS_SURFACE_CNTL { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union PA_SC_VRS_SURFACE_CNTL_1 { struct { unsigned int FORCE_SC_VRS_RATE_FINE : 1; @@ -21980,7 +22042,7 @@ union RLC_CGTT_MGCG_OVERRIDE { unsigned int RESERVED_15_9 : 7; unsigned int : 16; } gfx10Core; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int RLC_REPEATER_FGCG_OVERRIDE : 1; unsigned int : 8; @@ -22141,7 +22203,7 @@ union RLC_SPM_ACCUM_CTRLRAM_ADDR { unsigned int addr : 11; unsigned int RESERVED : 21; } gfx103; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 struct { unsigned int addr : 11; unsigned int RESERVED : 21; @@ -22307,7 +22369,15 @@ union RLC_SPM_ACCUM_MODE { unsigned int : 9; } nv31; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI32 + struct { + unsigned int : 15; + unsigned int SE2_LoadOverride : 1; + unsigned int SE2_SwaLoadOverride : 1; + unsigned int : 15; + } nv32; +#endif +#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 struct { unsigned int : 13; unsigned int SE1_LoadOverride : 1; @@ -22377,7 +22447,7 @@ union RLC_SPM_ACCUM_STATUS { unsigned int SwaAccumArmed : 1; unsigned int : 
12; } gfx103Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 23; unsigned int MultiSampleAborted : 1; @@ -22655,7 +22725,7 @@ union RLC_SPM_GLOBALS_SAMPLE_SKEW { float f32All; }; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union RLC_SPM_GLOBAL_DELAY_IND_ADDR { struct { unsigned int ADDR : 12; @@ -22668,7 +22738,7 @@ union RLC_SPM_GLOBAL_DELAY_IND_ADDR { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union RLC_SPM_GLOBAL_DELAY_IND_DATA { struct { unsigned int DATA : 6; @@ -22689,7 +22759,7 @@ union RLC_SPM_GLOBAL_MUXSEL_ADDR { unsigned int PERFMON_SEL_ADDR : 8; unsigned int RESERVED : 24; } gfx10; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int ADDR : 12; unsigned int : 20; @@ -22716,7 +22786,7 @@ union RLC_SPM_GLOBAL_MUXSEL_DATA { struct { unsigned int PERFMON_SEL_DATA : 32; } gfx09_10; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int SEL0 : 16; unsigned int SEL1 : 16; @@ -22877,7 +22947,7 @@ union RLC_SPM_MC_CNTL { unsigned int : 12; } nv24; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 struct { unsigned int : 13; unsigned int RLC_SPM_RO : 1; @@ -22903,7 +22973,7 @@ union RLC_SPM_MC_CNTL { float f32All; }; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union RLC_SPM_MODE { struct { unsigned int MODE : 1; @@ -22916,7 +22986,7 @@ union RLC_SPM_MODE { }; #endif -#if CHIP_HDR_NAVI31 || 
CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union RLC_SPM_PAUSE { struct { unsigned int PAUSE : 1; @@ -22962,7 +23032,7 @@ union RLC_SPM_PERFMON_CNTL { unsigned int RESERVED1 : 12; unsigned int : 20; } gfx10Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 14; unsigned int DISABLE_GFXCLOCK_COUNT : 1; @@ -23063,7 +23133,7 @@ union RLC_SPM_PERFMON_SEGMENT_SIZE { unsigned int SE2_NUM_LINE : 5; unsigned int RESERVED : 1; } gfx09_10; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int TOTAL_NUM_SEGMENT : 16; unsigned int GLOBAL_NUM_SEGMENT : 8; @@ -23149,7 +23219,7 @@ union RLC_SPM_RMI_PERFMON_SAMPLE_DELAY { float f32All; }; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union RLC_SPM_RSPM_CMD { struct { unsigned int CMD : 4; @@ -23162,7 +23232,7 @@ union RLC_SPM_RSPM_CMD { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union RLC_SPM_RSPM_CMD_ACK { struct { unsigned int SE0_ACK : 1; @@ -23183,7 +23253,7 @@ union RLC_SPM_RSPM_CMD_ACK { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union RLC_SPM_RSPM_REQ_DATA_HI { struct { unsigned int DATA : 12; @@ -23196,7 +23266,7 @@ union RLC_SPM_RSPM_REQ_DATA_HI { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union RLC_SPM_RSPM_REQ_DATA_LO { struct { unsigned int DATA : 32; @@ -23208,7 +23278,7 @@ union RLC_SPM_RSPM_REQ_DATA_LO { }; #endif -#if CHIP_HDR_NAVI31 
|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union RLC_SPM_RSPM_REQ_OP { struct { unsigned int OP : 4; @@ -23221,7 +23291,7 @@ union RLC_SPM_RSPM_REQ_OP { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union RLC_SPM_RSPM_RET_DATA { struct { unsigned int DATA : 32; @@ -23233,7 +23303,7 @@ union RLC_SPM_RSPM_RET_DATA { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union RLC_SPM_RSPM_RET_OP { struct { unsigned int OP : 4; @@ -23283,7 +23353,7 @@ union RLC_SPM_SEGMENT_THRESHOLD { float f32All; }; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union RLC_SPM_SE_DELAY_IND_ADDR { struct { unsigned int ADDR : 12; @@ -23296,7 +23366,7 @@ union RLC_SPM_SE_DELAY_IND_ADDR { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union RLC_SPM_SE_DELAY_IND_DATA { struct { unsigned int DATA : 6; @@ -23317,7 +23387,7 @@ union RLC_SPM_SE_MUXSEL_ADDR { unsigned int PERFMON_SEL_ADDR : 9; unsigned int RESERVED : 23; } gfx10; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int ADDR : 12; unsigned int : 20; @@ -23344,7 +23414,7 @@ union RLC_SPM_SE_MUXSEL_DATA { struct { unsigned int PERFMON_SEL_DATA : 32; } gfx09_10; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int SEL0 : 16; unsigned int SEL1 : 16; @@ -23367,7 +23437,7 @@ union RLC_SPM_SE_MUXSEL_SKEW { float f32All; }; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 
+#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union RLC_SPM_SE_RSPM_REQ_DATA_HI { struct { unsigned int DATA : 12; @@ -23380,7 +23450,7 @@ union RLC_SPM_SE_RSPM_REQ_DATA_HI { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union RLC_SPM_SE_RSPM_REQ_DATA_LO { struct { unsigned int DATA : 32; @@ -23392,7 +23462,7 @@ union RLC_SPM_SE_RSPM_REQ_DATA_LO { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union RLC_SPM_SE_RSPM_REQ_OP { struct { unsigned int OP : 4; @@ -23405,7 +23475,7 @@ union RLC_SPM_SE_RSPM_REQ_OP { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union RLC_SPM_SE_RSPM_RET_DATA { struct { unsigned int DATA : 32; @@ -23417,7 +23487,7 @@ union RLC_SPM_SE_RSPM_RET_DATA { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union RLC_SPM_SE_RSPM_RET_OP { struct { unsigned int OP : 4; @@ -23464,7 +23534,7 @@ union RLC_SPM_SE_SAMPLE_SKEW { float f32All; }; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union RLC_SPM_SPARE { struct { unsigned int SPARE : 32; @@ -23498,7 +23568,7 @@ union RLC_SPM_SQG_PERFMON_SAMPLE_DELAY { float f32All; }; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union RLC_SPM_STATUS { struct { unsigned int CTL_BUSY : 1; @@ -23710,7 +23780,7 @@ union RMI_PERFCOUNTER0_SELECT { unsigned int PERF_SEL1 : 9; unsigned int : 13; } most; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| 
CHIP_HDR_PHOENIX1 struct { unsigned int PERF_SEL : 10; unsigned int PERF_SEL1 : 10; @@ -23735,7 +23805,7 @@ union RMI_PERFCOUNTER0_SELECT1 { unsigned int PERF_SEL3 : 9; unsigned int : 13; } most; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int PERF_SEL2 : 10; unsigned int PERF_SEL3 : 10; @@ -23777,7 +23847,7 @@ union RMI_PERFCOUNTER1_SELECT { unsigned int PERF_SEL : 9; unsigned int : 23; } most; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int PERF_SEL : 10; unsigned int : 22; @@ -23822,7 +23892,7 @@ union RMI_PERFCOUNTER2_SELECT { unsigned int PERF_SEL1 : 9; unsigned int : 13; } most; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int PERF_SEL : 10; unsigned int PERF_SEL1 : 10; @@ -23847,7 +23917,7 @@ union RMI_PERFCOUNTER2_SELECT1 { unsigned int PERF_SEL3 : 9; unsigned int : 13; } most; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int PERF_SEL2 : 10; unsigned int PERF_SEL3 : 10; @@ -23889,7 +23959,7 @@ union RMI_PERFCOUNTER3_SELECT { unsigned int PERF_SEL : 9; unsigned int : 23; } most; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int PERF_SEL : 10; unsigned int : 22; @@ -24265,7 +24335,7 @@ union SDMA0_PERFMON_CNTL { float f32All; }; -#if CHIP_HDR_NAVI21|| CHIP_HDR_NAVI22|| CHIP_HDR_NAVI23|| CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI21 || CHIP_HDR_NAVI22 || CHIP_HDR_NAVI23 || CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 union SDMA1_PERFCNT_MISC_CNTL { #if CHIP_HDR_NAVI21 struct { @@ -24285,7 +24355,7 @@ 
union SDMA1_PERFCNT_MISC_CNTL { unsigned int : 16; } nv23; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 struct { unsigned int CMD_OP : 16; unsigned int : 16; @@ -24298,7 +24368,7 @@ union SDMA1_PERFCNT_MISC_CNTL { }; #endif -#if CHIP_HDR_NAVI21|| CHIP_HDR_NAVI22|| CHIP_HDR_NAVI23|| CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI21 || CHIP_HDR_NAVI22 || CHIP_HDR_NAVI23 || CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 union SDMA1_PERFCNT_PERFCOUNTER0_CFG { #if CHIP_HDR_NAVI21 struct { @@ -24333,7 +24403,7 @@ union SDMA1_PERFCNT_PERFCOUNTER0_CFG { unsigned int : 2; } nv23; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 struct { unsigned int PERF_SEL : 8; unsigned int PERF_SEL_END : 8; @@ -24351,7 +24421,7 @@ union SDMA1_PERFCNT_PERFCOUNTER0_CFG { }; #endif -#if CHIP_HDR_NAVI21|| CHIP_HDR_NAVI22|| CHIP_HDR_NAVI23|| CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI21 || CHIP_HDR_NAVI22 || CHIP_HDR_NAVI23 || CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 union SDMA1_PERFCNT_PERFCOUNTER1_CFG { #if CHIP_HDR_NAVI21 struct { @@ -24386,7 +24456,7 @@ union SDMA1_PERFCNT_PERFCOUNTER1_CFG { unsigned int : 2; } nv23; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 struct { unsigned int PERF_SEL : 8; unsigned int PERF_SEL_END : 8; @@ -24404,7 +24474,7 @@ union SDMA1_PERFCNT_PERFCOUNTER1_CFG { }; #endif -#if CHIP_HDR_NAVI21|| CHIP_HDR_NAVI22|| CHIP_HDR_NAVI23|| CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI21 || CHIP_HDR_NAVI22 || CHIP_HDR_NAVI23 || CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 union SDMA1_PERFCNT_PERFCOUNTER_HI { #if CHIP_HDR_NAVI21 struct { @@ -24424,7 +24494,7 @@ union SDMA1_PERFCNT_PERFCOUNTER_HI { unsigned int COMPARE_VALUE : 16; } nv23; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 
struct { unsigned int COUNTER_HI : 16; unsigned int COMPARE_VALUE : 16; @@ -24437,7 +24507,7 @@ union SDMA1_PERFCNT_PERFCOUNTER_HI { }; #endif -#if CHIP_HDR_NAVI21|| CHIP_HDR_NAVI22|| CHIP_HDR_NAVI23|| CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI21 || CHIP_HDR_NAVI22 || CHIP_HDR_NAVI23 || CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 union SDMA1_PERFCNT_PERFCOUNTER_LO { #if CHIP_HDR_NAVI21 struct { @@ -24454,7 +24524,7 @@ union SDMA1_PERFCNT_PERFCOUNTER_LO { unsigned int COUNTER_LO : 32; } nv23; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 struct { unsigned int COUNTER_LO : 32; } nv3x; @@ -24466,7 +24536,7 @@ union SDMA1_PERFCNT_PERFCOUNTER_LO { }; #endif -#if CHIP_HDR_NAVI21|| CHIP_HDR_NAVI22|| CHIP_HDR_NAVI23|| CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI21 || CHIP_HDR_NAVI22 || CHIP_HDR_NAVI23 || CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 union SDMA1_PERFCNT_PERFCOUNTER_RSLT_CNTL { #if CHIP_HDR_NAVI21 struct { @@ -24504,7 +24574,7 @@ union SDMA1_PERFCNT_PERFCOUNTER_RSLT_CNTL { unsigned int : 5; } nv23; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 struct { unsigned int PERF_COUNTER_SELECT : 4; unsigned int : 4; @@ -25131,7 +25201,7 @@ union SPI_ARB_PRIORITY { float f32All; }; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SPI_ATTRIBUTE_RING_BASE { struct { unsigned int BASE : 32; @@ -25143,7 +25213,7 @@ union SPI_ATTRIBUTE_RING_BASE { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SPI_ATTRIBUTE_RING_SIZE { struct { unsigned int MEM_SIZE : 8; @@ -25223,7 +25293,7 @@ union SPI_COMPUTE_WF_CTX_SAVE { float f32All; }; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| 
CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SPI_COMPUTE_WF_CTX_SAVE_STATUS { struct { unsigned int PIPE0_QUEUE0_SAVE_BUSY : 1; @@ -25340,7 +25410,7 @@ union SPI_CONFIG_CNTL_1 { unsigned int SA_SCREEN_MAP : 1; unsigned int : 9; } gfx103Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 8; unsigned int PS_GROUP_TIMEOUT_MODE : 1; @@ -25356,7 +25426,7 @@ union SPI_CONFIG_CNTL_1 { unsigned int EN_USER_ACCUM : 1; unsigned int : 10; } gfx10Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 9; unsigned int OREO_EXPALLOC_STALL : 1; @@ -25405,7 +25475,7 @@ union SPI_CONFIG_CNTL_2 { unsigned int CONTEXT_SAVE_WAIT_GDS_GRANT_CYCLE_OVHD : 4; unsigned int : 24; } bits, bitfields; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 8; unsigned int PWS_CSG_WAIT_DISABLE : 1; @@ -25492,7 +25562,7 @@ union SPI_CONFIG_PS_CU_EN { unsigned int PKR_OFFSET : 4; unsigned int : 28; } gfx103CorePlus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 4; unsigned int PKR2_OFFSET : 4; @@ -25727,7 +25797,7 @@ union SPI_FEATURE_CTRL { unsigned int BUS_ACTIVITY_THRESHOLD : 8; unsigned int : 4; } gfx101; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int TUNNELING_WAVE_LIMIT : 4; unsigned int RA_PROBE_IGNORE : 1; @@ -25817,7 +25887,7 @@ union SPI_GFX_CRAWLER_CONFIG { unsigned int VS_DEPTH : 5; unsigned int : 10; } gfx103Derivative; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| 
CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 17; unsigned int PS_ALLOC_DEPTH : 5; @@ -25833,7 +25903,7 @@ union SPI_GFX_CRAWLER_CONFIG { float f32All; }; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SPI_GFX_SCRATCH_BASE_HI { struct { unsigned int DATA : 8; @@ -25846,7 +25916,7 @@ union SPI_GFX_SCRATCH_BASE_HI { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SPI_GFX_SCRATCH_BASE_LO { struct { unsigned int DATA : 32; @@ -25858,7 +25928,7 @@ union SPI_GFX_SCRATCH_BASE_LO { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SPI_GS_THROTTLE_CNTL1 { struct { unsigned int PH_POLL_INTERVAL : 4; @@ -25877,7 +25947,7 @@ union SPI_GS_THROTTLE_CNTL1 { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SPI_GS_THROTTLE_CNTL2 { struct { unsigned int SPI_THROTTLE_MODE : 2; @@ -26013,7 +26083,7 @@ union SPI_LB_DATA_PERWGP_WAVE_HSGS { float f32All; }; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SPI_LB_DATA_PERWGP_WAVE_PS { struct { unsigned int WGP_USED_PS : 16; @@ -26544,7 +26614,7 @@ union SPI_PS_INPUT_CNTL_0 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -26584,7 +26654,7 @@ union SPI_PS_INPUT_CNTL_1 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if 
CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -26624,7 +26694,7 @@ union SPI_PS_INPUT_CNTL_2 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -26664,7 +26734,7 @@ union SPI_PS_INPUT_CNTL_3 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -26704,7 +26774,7 @@ union SPI_PS_INPUT_CNTL_4 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -26744,7 +26814,7 @@ union SPI_PS_INPUT_CNTL_5 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -26784,7 +26854,7 @@ union SPI_PS_INPUT_CNTL_6 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -26824,7 +26894,7 @@ union SPI_PS_INPUT_CNTL_7 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { 
unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -26864,7 +26934,7 @@ union SPI_PS_INPUT_CNTL_8 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -26904,7 +26974,7 @@ union SPI_PS_INPUT_CNTL_9 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -26944,7 +27014,7 @@ union SPI_PS_INPUT_CNTL_10 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -26984,7 +27054,7 @@ union SPI_PS_INPUT_CNTL_11 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -27024,7 +27094,7 @@ union SPI_PS_INPUT_CNTL_12 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -27064,7 +27134,7 @@ union SPI_PS_INPUT_CNTL_13 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -27104,7 +27174,7 @@ union 
SPI_PS_INPUT_CNTL_14 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -27144,7 +27214,7 @@ union SPI_PS_INPUT_CNTL_15 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -27184,7 +27254,7 @@ union SPI_PS_INPUT_CNTL_16 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -27224,7 +27294,7 @@ union SPI_PS_INPUT_CNTL_17 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -27264,7 +27334,7 @@ union SPI_PS_INPUT_CNTL_18 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -27304,7 +27374,7 @@ union SPI_PS_INPUT_CNTL_19 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -27338,7 +27408,7 @@ union SPI_PS_INPUT_CNTL_20 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } 
gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -27372,7 +27442,7 @@ union SPI_PS_INPUT_CNTL_21 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -27406,7 +27476,7 @@ union SPI_PS_INPUT_CNTL_22 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -27440,7 +27510,7 @@ union SPI_PS_INPUT_CNTL_23 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -27474,7 +27544,7 @@ union SPI_PS_INPUT_CNTL_24 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -27508,7 +27578,7 @@ union SPI_PS_INPUT_CNTL_25 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -27542,7 +27612,7 @@ union SPI_PS_INPUT_CNTL_26 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| 
CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -27576,7 +27646,7 @@ union SPI_PS_INPUT_CNTL_27 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -27610,7 +27680,7 @@ union SPI_PS_INPUT_CNTL_28 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -27644,7 +27714,7 @@ union SPI_PS_INPUT_CNTL_29 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -27678,7 +27748,7 @@ union SPI_PS_INPUT_CNTL_30 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -27712,7 +27782,7 @@ union SPI_PS_INPUT_CNTL_31 { unsigned int ROTATE_PC_PTR : 1; unsigned int : 20; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int PRIM_ATTR : 1; @@ -28405,7 +28475,7 @@ union SPI_SHADER_COL_FORMAT { float f32All; }; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SPI_SHADER_GS_MESHLET_DIM { struct 
{ unsigned int MESHLET_NUM_THREAD_X : 8; @@ -28420,7 +28490,7 @@ union SPI_SHADER_GS_MESHLET_DIM { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SPI_SHADER_GS_MESHLET_EXP_ALLOC { struct { unsigned int MAX_EXP_VERTS : 9; @@ -28523,7 +28593,7 @@ union SPI_SHADER_PGM_HI_GS { unsigned int MEM_BASE : 8; unsigned int : 24; } gfx09_10; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int MEM_BASE : 32; } gfx11; @@ -28539,7 +28609,7 @@ union SPI_SHADER_PGM_HI_HS { unsigned int MEM_BASE : 8; unsigned int : 24; } gfx09_10; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int MEM_BASE : 32; } gfx11; @@ -29179,7 +29249,7 @@ union SPI_SHADER_PGM_RSRC3_PS { unsigned int SIMD_DISABLE : 4; unsigned int : 2; } gfx09; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 22; unsigned int LDS_GROUP_SIZE : 2; @@ -29220,7 +29290,7 @@ union SPI_SHADER_PGM_RSRC4_GS { unsigned int CU_EN : 16; unsigned int : 16; } gfx10; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 23; unsigned int INST_PREF_SIZE : 6; @@ -29232,7 +29302,7 @@ union SPI_SHADER_PGM_RSRC4_GS { unsigned int SPI_SHADER_LATE_ALLOC_GS : 7; unsigned int : 9; } gfx10Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int CU_EN : 1; unsigned int RESERVED : 13; @@ -29255,7 +29325,7 @@ union SPI_SHADER_PGM_RSRC4_HS { unsigned int GROUP_FIFO_DEPTH : 7; unsigned int : 25; } gfx09; -#if 
CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 16; unsigned int INST_PREF_SIZE : 6; @@ -29266,7 +29336,7 @@ union SPI_SHADER_PGM_RSRC4_HS { unsigned int CU_EN : 16; unsigned int : 16; } gfx10Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 29; unsigned int TRAP_ON_START : 1; @@ -29285,14 +29355,14 @@ union SPI_SHADER_PGM_RSRC4_PS { unsigned int CU_EN : 16; unsigned int : 16; } bits, bitfields; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 16; unsigned int INST_PREF_SIZE : 6; unsigned int : 10; } gfx104Plus; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 29; unsigned int TRAP_ON_START : 1; @@ -32367,7 +32437,7 @@ union SPI_TMPRING_SIZE { unsigned int WAVESIZE : 13; unsigned int : 7; } gfx09_10; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int WAVESIZE : 15; @@ -32986,7 +33056,7 @@ union SQG_CONFIG { unsigned int UTCL0_RETRY_TIMER : 7; unsigned int : 21; } gfx10; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 13; unsigned int SQG_ICPFT_EN : 1; @@ -32994,7 +33064,7 @@ union SQG_CONFIG { unsigned int : 17; } gfx104Plus; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int GL1H_PREFETCH_PAGE : 4; unsigned int : 12; @@ -33031,7 +33101,7 @@ union SQG_CONFIG { float 
f32All; }; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQG_GL1H_STATUS { struct { unsigned int R0_ACK_ERR_DETECTED : 1; @@ -33047,7 +33117,7 @@ union SQG_GL1H_STATUS { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQG_PERFCOUNTER0_HI { struct { unsigned int PERFCOUNTER_HI : 32; @@ -33059,7 +33129,7 @@ union SQG_PERFCOUNTER0_HI { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQG_PERFCOUNTER0_LO { struct { unsigned int PERFCOUNTER_LO : 32; @@ -33071,7 +33141,7 @@ union SQG_PERFCOUNTER0_LO { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQG_PERFCOUNTER0_SELECT { struct { unsigned int PERF_SEL : 9; @@ -33087,7 +33157,7 @@ union SQG_PERFCOUNTER0_SELECT { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQG_PERFCOUNTER1_HI { struct { unsigned int PERFCOUNTER_HI : 32; @@ -33099,7 +33169,7 @@ union SQG_PERFCOUNTER1_HI { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQG_PERFCOUNTER1_LO { struct { unsigned int PERFCOUNTER_LO : 32; @@ -33111,7 +33181,7 @@ union SQG_PERFCOUNTER1_LO { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQG_PERFCOUNTER1_SELECT { struct { unsigned int PERF_SEL : 9; @@ -33127,7 +33197,7 @@ union SQG_PERFCOUNTER1_SELECT { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| 
CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQG_PERFCOUNTER2_HI { struct { unsigned int PERFCOUNTER_HI : 32; @@ -33139,7 +33209,7 @@ union SQG_PERFCOUNTER2_HI { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQG_PERFCOUNTER2_LO { struct { unsigned int PERFCOUNTER_LO : 32; @@ -33151,7 +33221,7 @@ union SQG_PERFCOUNTER2_LO { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQG_PERFCOUNTER2_SELECT { struct { unsigned int PERF_SEL : 9; @@ -33167,7 +33237,7 @@ union SQG_PERFCOUNTER2_SELECT { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQG_PERFCOUNTER3_HI { struct { unsigned int PERFCOUNTER_HI : 32; @@ -33179,7 +33249,7 @@ union SQG_PERFCOUNTER3_HI { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQG_PERFCOUNTER3_LO { struct { unsigned int PERFCOUNTER_LO : 32; @@ -33191,7 +33261,7 @@ union SQG_PERFCOUNTER3_LO { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQG_PERFCOUNTER3_SELECT { struct { unsigned int PERF_SEL : 9; @@ -33207,7 +33277,7 @@ union SQG_PERFCOUNTER3_SELECT { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQG_PERFCOUNTER4_HI { struct { unsigned int PERFCOUNTER_HI : 32; @@ -33219,7 +33289,7 @@ union SQG_PERFCOUNTER4_HI { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQG_PERFCOUNTER4_LO { struct { unsigned int PERFCOUNTER_LO : 32; 
@@ -33231,7 +33301,7 @@ union SQG_PERFCOUNTER4_LO { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQG_PERFCOUNTER4_SELECT { struct { unsigned int PERF_SEL : 9; @@ -33247,7 +33317,7 @@ union SQG_PERFCOUNTER4_SELECT { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQG_PERFCOUNTER5_HI { struct { unsigned int PERFCOUNTER_HI : 32; @@ -33259,7 +33329,7 @@ union SQG_PERFCOUNTER5_HI { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQG_PERFCOUNTER5_LO { struct { unsigned int PERFCOUNTER_LO : 32; @@ -33271,7 +33341,7 @@ union SQG_PERFCOUNTER5_LO { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQG_PERFCOUNTER5_SELECT { struct { unsigned int PERF_SEL : 9; @@ -33287,7 +33357,7 @@ union SQG_PERFCOUNTER5_SELECT { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQG_PERFCOUNTER6_HI { struct { unsigned int PERFCOUNTER_HI : 32; @@ -33299,7 +33369,7 @@ union SQG_PERFCOUNTER6_HI { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQG_PERFCOUNTER6_LO { struct { unsigned int PERFCOUNTER_LO : 32; @@ -33311,7 +33381,7 @@ union SQG_PERFCOUNTER6_LO { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQG_PERFCOUNTER6_SELECT { struct { unsigned int PERF_SEL : 9; @@ -33327,7 +33397,7 @@ union SQG_PERFCOUNTER6_SELECT { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| 
CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQG_PERFCOUNTER7_HI { struct { unsigned int PERFCOUNTER_HI : 32; @@ -33339,7 +33409,7 @@ union SQG_PERFCOUNTER7_HI { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQG_PERFCOUNTER7_LO { struct { unsigned int PERFCOUNTER_LO : 32; @@ -33351,7 +33421,7 @@ union SQG_PERFCOUNTER7_LO { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQG_PERFCOUNTER7_SELECT { struct { unsigned int PERF_SEL : 9; @@ -33367,7 +33437,7 @@ union SQG_PERFCOUNTER7_SELECT { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQG_PERFCOUNTER_CTRL { struct { unsigned int PS_EN : 1; @@ -33393,7 +33463,7 @@ union SQG_PERFCOUNTER_CTRL { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQG_PERFCOUNTER_CTRL2 { struct { unsigned int FORCE_EN : 1; @@ -33407,7 +33477,7 @@ union SQG_PERFCOUNTER_CTRL2 { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQG_PERF_SAMPLE_FINISH { struct { unsigned int STATUS : 7; @@ -33613,14 +33683,14 @@ union SQ_CLK_CTRL { unsigned int WCLK2DCLK_OVERRIDE : 1; unsigned int : 31; } gfx103Derivative; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 16; unsigned int SQ_LDS_DIRECT_FGCG_OVERRIDE : 1; unsigned int : 15; } gfx104Plus; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| 
CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int SQ_SP_CMD_FGCG_OVERRIDE : 1; @@ -33737,7 +33807,7 @@ union SQ_CONFIG { unsigned int WCLK_HYSTERESIS_CNT : 2; unsigned int : 9; } gfx103PlusExclusive; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 9; unsigned int DISABLE_VMEM_EXEC_ZERO_SKIP : 1; @@ -33746,7 +33816,7 @@ union SQ_CONFIG { unsigned int : 4; } gfx104Plus; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int ECO_SPARE : 8; unsigned int NEW_TRANS_ARB_SCHEME : 1; @@ -33980,7 +34050,7 @@ union SQ_FIFO_SIZES { unsigned int EXPORT_BUF_REDUCE : 2; unsigned int : 14; } gfx10Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 12; unsigned int EXPORT_BUF_GS_RESERVED : 2; @@ -35521,7 +35591,7 @@ union SQ_PERFCOUNTER_CTRL2 { unsigned int FORCE_EN : 1; unsigned int : 31; } bits, bitfields; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 1; unsigned int VMID_EN : 16; @@ -35545,7 +35615,7 @@ union SQ_PERFCOUNTER_MASK { float f32All; }; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQ_PERF_SNAPSHOT_CTRL { struct { unsigned int TIMER_ON_OFF : 1; @@ -35561,7 +35631,7 @@ union SQ_PERF_SNAPSHOT_CTRL { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQ_PERF_SNAPSHOT_DATA { struct { unsigned int VALID : 1; @@ -35580,7 +35650,7 @@ union SQ_PERF_SNAPSHOT_DATA { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| 
CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQ_PERF_SNAPSHOT_PC_HI { struct { unsigned int PC_HI : 16; @@ -35593,7 +35663,7 @@ union SQ_PERF_SNAPSHOT_PC_HI { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union SQ_PERF_SNAPSHOT_PC_LO { struct { unsigned int PC_LO : 32; @@ -36011,7 +36081,7 @@ union SQ_THREAD_TRACE_CTRL { unsigned int : 11; unsigned int DRAW_EVENT_EN : 1; } gfx10Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 3; unsigned int GL1_PERF_EN : 1; @@ -36101,7 +36171,7 @@ union SQ_THREAD_TRACE_MASK { unsigned int SQ_STALL_EN : 1; unsigned int : 16; } gfx09; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 17; unsigned int EXCLUDE_NONDETAIL_SHADERDATA : 1; @@ -36202,7 +36272,7 @@ union SQ_THREAD_TRACE_STATUS { unsigned int BUSY : 1; unsigned int : 6; } gfx10Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 24; unsigned int WRITE_ERROR : 1; @@ -36223,7 +36293,7 @@ union SQ_THREAD_TRACE_STATUS2 { unsigned int PACKET_LOST_BUF_NO_LOCKDOWN : 1; unsigned int : 27; } bits, bitfields; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 8; unsigned int BUF_ISSUE_STATUS : 5; @@ -36264,7 +36334,7 @@ union SQ_THREAD_TRACE_TOKEN_MASK { unsigned int : 5; unsigned int REG_DETAIL_ALL : 1; } gfx10Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 11; 
unsigned int TTRACE_EXEC : 1; @@ -37737,7 +37807,7 @@ union TA_CNTL { unsigned int TC_DATA_CREDIT : 3; unsigned int : 16; } gfx09; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int TA_SQ_XNACK_FGCG_DISABLE : 1; unsigned int : 31; @@ -37769,7 +37839,7 @@ union TA_CNTL { float f32All; }; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union TA_CNTL2 { struct { unsigned int : 16; @@ -37778,7 +37848,7 @@ union TA_CNTL2 { unsigned int ELIMINATE_UNLIT_QUAD_DIS : 1; unsigned int : 12; } bits, bitfields; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 17; unsigned int ELEMSIZE_HASH_DIS : 1; @@ -46489,7 +46559,7 @@ union UTCL1_PERFCOUNTER1_SELECT { float f32All; }; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union UTCL1_PERFCOUNTER2_HI { struct { unsigned int PERFCOUNTER_HI : 32; @@ -46501,7 +46571,7 @@ union UTCL1_PERFCOUNTER2_HI { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union UTCL1_PERFCOUNTER2_LO { struct { unsigned int PERFCOUNTER_LO : 32; @@ -46513,7 +46583,7 @@ union UTCL1_PERFCOUNTER2_LO { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union UTCL1_PERFCOUNTER2_SELECT { struct { unsigned int PERF_SEL : 10; @@ -46527,7 +46597,7 @@ union UTCL1_PERFCOUNTER2_SELECT { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union UTCL1_PERFCOUNTER3_HI { struct { unsigned int 
PERFCOUNTER_HI : 32; @@ -46539,7 +46609,7 @@ union UTCL1_PERFCOUNTER3_HI { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union UTCL1_PERFCOUNTER3_LO { struct { unsigned int PERFCOUNTER_LO : 32; @@ -46551,7 +46621,7 @@ union UTCL1_PERFCOUNTER3_LO { }; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 union UTCL1_PERFCOUNTER3_SELECT { struct { unsigned int PERF_SEL : 10; @@ -47872,7 +47942,7 @@ union VGT_SHADER_STAGES_EN { unsigned int GS_FAST_LAUNCH : 2; unsigned int : 11; } gfx09_1xPlus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 26; unsigned int PRIMGEN_PASSTHRU_NO_MSG : 1; @@ -48270,7 +48340,7 @@ union VGT_TF_PARAM { unsigned int MTYPE : 3; unsigned int : 6; } gfx10Plus; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int : 9; unsigned int NOT_USED : 1; @@ -48288,7 +48358,7 @@ union VGT_TF_RING_SIZE { unsigned int SIZE : 16; unsigned int : 16; } gfx09_10; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int SIZE : 17; unsigned int : 15; @@ -48438,7 +48508,7 @@ union WD_CNTL_STATUS { unsigned int TE0_BUSY : 1; unsigned int : 4; } gfx103Derivative; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 struct { unsigned int DIST_BUSY : 1; unsigned int DIST_BE_BUSY : 1; @@ -48782,7 +48852,7 @@ union WD_UTCL1_CNTL { unsigned int : 1; } nv24; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 struct { unsigned int : 
30; unsigned int LLC_NOALLOC_OVERRIDE : 1; diff --git a/lgc/imported/chip/gfx9/gfx9_plus_merged_typedef.h b/lgc/imported/chip/gfx9/gfx9_plus_merged_typedef.h index 36f3486281..8eb989b549 100644 --- a/lgc/imported/chip/gfx9/gfx9_plus_merged_typedef.h +++ b/lgc/imported/chip/gfx9/gfx9_plus_merged_typedef.h @@ -221,11 +221,11 @@ typedef union CB_COLOR7_VIEW regCB_COLOR7_VI typedef union CB_COLOR_CONTROL regCB_COLOR_CONTROL; typedef union CB_COVERAGE_OUT_CONTROL regCB_COVERAGE_OUT_CONTROL; typedef union CB_DCC_CONFIG regCB_DCC_CONFIG; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union CB_DCC_CONFIG2 regCB_DCC_CONFIG2; #endif typedef union CB_DCC_CONTROL regCB_DCC_CONTROL; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union CB_FDCC_CONTROL regCB_FDCC_CONTROL; typedef union CB_FGCG_SRAM_OVERRIDE regCB_FGCG_SRAM_OVERRIDE; #endif @@ -236,7 +236,7 @@ typedef union CB_HW_CONTROL_3 regCB_HW_CONTRO typedef union CB_HW_CONTROL_4 regCB_HW_CONTROL_4; typedef union CB_HW_MEM_ARBITER_RD regCB_HW_MEM_ARBITER_RD; typedef union CB_HW_MEM_ARBITER_WR regCB_HW_MEM_ARBITER_WR; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union CB_KEY_OVERRIDE_0 regCB_KEY_OVERRIDE_0; typedef union CB_KEY_OVERRIDE_1 regCB_KEY_OVERRIDE_1; typedef union CB_KEY_OVERRIDE_2 regCB_KEY_OVERRIDE_2; @@ -333,7 +333,7 @@ typedef union COMPUTE_DIM_Z regCOMPUTE_DIM_ typedef union COMPUTE_DISPATCH_END regCOMPUTE_DISPATCH_END; typedef union COMPUTE_DISPATCH_ID regCOMPUTE_DISPATCH_ID; typedef union COMPUTE_DISPATCH_INITIATOR regCOMPUTE_DISPATCH_INITIATOR; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union 
COMPUTE_DISPATCH_INTERLEAVE regCOMPUTE_DISPATCH_INTERLEAVE; #endif typedef union COMPUTE_DISPATCH_PKT_ADDR_HI regCOMPUTE_DISPATCH_PKT_ADDR_HI; @@ -372,7 +372,7 @@ typedef union COMPUTE_STATIC_THREAD_MGMT_SE0 regCOMPUTE_STAT typedef union COMPUTE_STATIC_THREAD_MGMT_SE1 regCOMPUTE_STATIC_THREAD_MGMT_SE1; typedef union COMPUTE_STATIC_THREAD_MGMT_SE2 regCOMPUTE_STATIC_THREAD_MGMT_SE2; typedef union COMPUTE_STATIC_THREAD_MGMT_SE3 regCOMPUTE_STATIC_THREAD_MGMT_SE3; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union COMPUTE_STATIC_THREAD_MGMT_SE4 regCOMPUTE_STATIC_THREAD_MGMT_SE4; typedef union COMPUTE_STATIC_THREAD_MGMT_SE5 regCOMPUTE_STATIC_THREAD_MGMT_SE5; typedef union COMPUTE_STATIC_THREAD_MGMT_SE6 regCOMPUTE_STATIC_THREAD_MGMT_SE6; @@ -461,7 +461,7 @@ typedef union CP_SC_PSINVOC_COUNT0_LO regCP_SC_PSINVO typedef union CP_SC_PSINVOC_COUNT1_HI regCP_SC_PSINVOC_COUNT1_HI; typedef union CP_SC_PSINVOC_COUNT1_LO regCP_SC_PSINVOC_COUNT1_LO; typedef union CP_STRMOUT_CNTL regCP_STRMOUT_CNTL; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union CP_VGT_ASINVOC_COUNT_HI regCP_VGT_ASINVOC_COUNT_HI; typedef union CP_VGT_ASINVOC_COUNT_LO regCP_VGT_ASINVOC_COUNT_LO; #endif @@ -509,7 +509,7 @@ typedef union DB_FGCG_SRAMS_CLK_CTRL regDB_FGCG_SRAM typedef union DB_FIFO_DEPTH1 regDB_FIFO_DEPTH1; typedef union DB_FIFO_DEPTH2 regDB_FIFO_DEPTH2; typedef union DB_FIFO_DEPTH3 regDB_FIFO_DEPTH3; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union DB_FIFO_DEPTH4 regDB_FIFO_DEPTH4; #endif typedef union DB_FREE_CACHELINES regDB_FREE_CACHELINES; @@ -759,7 +759,10 @@ typedef union GE2_SE_PERFCOUNTER3_SELECT1 regGE2_SE_PERFC typedef union GE_CNTL regGE_CNTL; typedef union GE_DMA_FIRST_INDEX 
regGE_DMA_FIRST_INDEX; typedef union GE_FAST_CLKS regGE_FAST_CLKS; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI32 +typedef union GE_FED_STATUS regGE_FED_STATUS; +#endif +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union GE_GS_FAST_LAUNCH_WG_DIM regGE_GS_FAST_LAUNCH_WG_DIM; typedef union GE_GS_FAST_LAUNCH_WG_DIM_1 regGE_GS_FAST_LAUNCH_WG_DIM_1; #endif @@ -769,7 +772,7 @@ typedef union GE_MAX_VTX_INDX regGE_MAX_VTX_I typedef union GE_MIN_VTX_INDX regGE_MIN_VTX_INDX; typedef union GE_MULTI_PRIM_IB_RESET_EN regGE_MULTI_PRIM_IB_RESET_EN; typedef union GE_NGG_SUBGRP_CNTL regGE_NGG_SUBGRP_CNTL; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union GE_PA_IF_SAFE_REG regGE_PA_IF_SAFE_REG; #endif typedef union GE_PC_ALLOC regGE_PC_ALLOC; @@ -815,11 +818,11 @@ typedef union GE_PERFCOUNTER11_HI regGE_PERFCOUNT typedef union GE_PERFCOUNTER11_LO regGE_PERFCOUNTER11_LO; typedef union GE_PERFCOUNTER11_SELECT regGE_PERFCOUNTER11_SELECT; typedef union GE_PRIV_CONTROL regGE_PRIV_CONTROL; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union GE_RATE_CNTL_1 regGE_RATE_CNTL_1; typedef union GE_RATE_CNTL_2 regGE_RATE_CNTL_2; #endif -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union GE_SPI_IF_SAFE_REG regGE_SPI_IF_SAFE_REG; #endif typedef union GE_STATUS regGE_STATUS; @@ -1128,7 +1131,7 @@ typedef union PA_PH_PERFCOUNTER6_SELECT regPA_PH_PERFCO typedef union PA_PH_PERFCOUNTER7_HI regPA_PH_PERFCOUNTER7_HI; typedef union PA_PH_PERFCOUNTER7_LO regPA_PH_PERFCOUNTER7_LO; typedef union PA_PH_PERFCOUNTER7_SELECT regPA_PH_PERFCOUNTER7_SELECT; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| 
CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union PA_RATE_CNTL regPA_RATE_CNTL; #endif typedef union PA_SC_AA_CONFIG regPA_SC_AA_CONFIG; @@ -1150,13 +1153,13 @@ typedef union PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0 regPA_SC_AA_SAM typedef union PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1 regPA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1; typedef union PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_2 regPA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_2; typedef union PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_3 regPA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_3; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union PA_SC_ATM_CNTL regPA_SC_ATM_CNTL; #endif typedef union PA_SC_BC_WAVE_BREAK regPA_SC_BC_WAVE_BREAK; typedef union PA_SC_BINNER_CNTL_0 regPA_SC_BINNER_CNTL_0; typedef union PA_SC_BINNER_CNTL_1 regPA_SC_BINNER_CNTL_1; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union PA_SC_BINNER_CNTL_2 regPA_SC_BINNER_CNTL_2; #endif typedef union PA_SC_BINNER_CNTL_OVERRIDE regPA_SC_BINNER_CNTL_OVERRIDE; @@ -1212,7 +1215,7 @@ typedef union PA_SC_P3D_TRAP_SCREEN_HV_EN regPA_SC_P3D_TR typedef union PA_SC_P3D_TRAP_SCREEN_HV_LOCK regPA_SC_P3D_TRAP_SCREEN_HV_LOCK; typedef union PA_SC_P3D_TRAP_SCREEN_OCCURRENCE regPA_SC_P3D_TRAP_SCREEN_OCCURRENCE; typedef union PA_SC_P3D_TRAP_SCREEN_V regPA_SC_P3D_TRAP_SCREEN_V; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union PA_SC_PACKER_WAVE_ID_CNTL regPA_SC_PACKER_WAVE_ID_CNTL; #endif typedef union PA_SC_PBB_OVERRIDE_FLAG regPA_SC_PBB_OVERRIDE_FLAG; @@ -1319,7 +1322,7 @@ typedef union PA_SC_VPORT_ZMIN_12 regPA_SC_VPORT_ typedef union PA_SC_VPORT_ZMIN_13 regPA_SC_VPORT_ZMIN_13; typedef union PA_SC_VPORT_ZMIN_14 regPA_SC_VPORT_ZMIN_14; typedef union PA_SC_VPORT_ZMIN_15 regPA_SC_VPORT_ZMIN_15; -#if CHIP_HDR_NAVI31 || 
CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union PA_SC_VRS_OVERRIDE_CNTL regPA_SC_VRS_OVERRIDE_CNTL; typedef union PA_SC_VRS_RATE_BASE regPA_SC_VRS_RATE_BASE; typedef union PA_SC_VRS_RATE_BASE_EXT regPA_SC_VRS_RATE_BASE_EXT; @@ -1457,7 +1460,7 @@ typedef union RLC_SPM_GLB_SAMPLEDELAY_IND_ADDR regRLC_SPM_GLB_ typedef union RLC_SPM_GLB_SAMPLEDELAY_IND_DATA regRLC_SPM_GLB_SAMPLEDELAY_IND_DATA; typedef union RLC_SPM_GLOBALS_MUXSEL_SKEW regRLC_SPM_GLOBALS_MUXSEL_SKEW; typedef union RLC_SPM_GLOBALS_SAMPLE_SKEW regRLC_SPM_GLOBALS_SAMPLE_SKEW; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union RLC_SPM_GLOBAL_DELAY_IND_ADDR regRLC_SPM_GLOBAL_DELAY_IND_ADDR; typedef union RLC_SPM_GLOBAL_DELAY_IND_DATA regRLC_SPM_GLOBAL_DELAY_IND_DATA; #endif @@ -1470,7 +1473,7 @@ typedef union RLC_SPM_INT_INFO_1 regRLC_SPM_INT_ typedef union RLC_SPM_INT_INFO_2 regRLC_SPM_INT_INFO_2; typedef union RLC_SPM_INT_STATUS regRLC_SPM_INT_STATUS; typedef union RLC_SPM_MC_CNTL regRLC_SPM_MC_CNTL; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union RLC_SPM_MODE regRLC_SPM_MODE; typedef union RLC_SPM_PAUSE regRLC_SPM_PAUSE; #endif @@ -1489,7 +1492,7 @@ typedef union RLC_SPM_PERFMON_SWA_SEGMENT_SIZE regRLC_SPM_PERF typedef union RLC_SPM_RING_RDPTR regRLC_SPM_RING_RDPTR; typedef union RLC_SPM_RING_WRPTR regRLC_SPM_RING_WRPTR; typedef union RLC_SPM_RMI_PERFMON_SAMPLE_DELAY regRLC_SPM_RMI_PERFMON_SAMPLE_DELAY; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union RLC_SPM_RSPM_CMD regRLC_SPM_RSPM_CMD; typedef union RLC_SPM_RSPM_CMD_ACK regRLC_SPM_RSPM_CMD_ACK; typedef union RLC_SPM_RSPM_REQ_DATA_HI regRLC_SPM_RSPM_REQ_DATA_HI; @@ -1501,7 
+1504,7 @@ typedef union RLC_SPM_RSPM_RET_OP regRLC_SPM_RSPM typedef union RLC_SPM_SAMPLE_CNT regRLC_SPM_SAMPLE_CNT; typedef union RLC_SPM_SC_PERFMON_SAMPLE_DELAY regRLC_SPM_SC_PERFMON_SAMPLE_DELAY; typedef union RLC_SPM_SEGMENT_THRESHOLD regRLC_SPM_SEGMENT_THRESHOLD; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union RLC_SPM_SE_DELAY_IND_ADDR regRLC_SPM_SE_DELAY_IND_ADDR; typedef union RLC_SPM_SE_DELAY_IND_DATA regRLC_SPM_SE_DELAY_IND_DATA; #endif @@ -1509,7 +1512,7 @@ typedef union RLC_SPM_SE_MUXSEL_ADDR regRLC_SPM_SE_M typedef union RLC_SPM_SE_MUXSEL_ADDR_OFFSET regRLC_SPM_SE_MUXSEL_ADDR_OFFSET; typedef union RLC_SPM_SE_MUXSEL_DATA regRLC_SPM_SE_MUXSEL_DATA; typedef union RLC_SPM_SE_MUXSEL_SKEW regRLC_SPM_SE_MUXSEL_SKEW; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union RLC_SPM_SE_RSPM_REQ_DATA_HI regRLC_SPM_SE_RSPM_REQ_DATA_HI; typedef union RLC_SPM_SE_RSPM_REQ_DATA_LO regRLC_SPM_SE_RSPM_REQ_DATA_LO; typedef union RLC_SPM_SE_RSPM_REQ_OP regRLC_SPM_SE_RSPM_REQ_OP; @@ -1519,12 +1522,12 @@ typedef union RLC_SPM_SE_RSPM_RET_OP regRLC_SPM_SE_R typedef union RLC_SPM_SE_SAMPLEDELAY_IND_ADDR regRLC_SPM_SE_SAMPLEDELAY_IND_ADDR; typedef union RLC_SPM_SE_SAMPLEDELAY_IND_DATA regRLC_SPM_SE_SAMPLEDELAY_IND_DATA; typedef union RLC_SPM_SE_SAMPLE_SKEW regRLC_SPM_SE_SAMPLE_SKEW; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union RLC_SPM_SPARE regRLC_SPM_SPARE; #endif typedef union RLC_SPM_SPI_PERFMON_SAMPLE_DELAY regRLC_SPM_SPI_PERFMON_SAMPLE_DELAY; typedef union RLC_SPM_SQG_PERFMON_SAMPLE_DELAY regRLC_SPM_SQG_PERFMON_SAMPLE_DELAY; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union 
RLC_SPM_STATUS regRLC_SPM_STATUS; #endif typedef union RLC_SPM_SX_PERFMON_SAMPLE_DELAY regRLC_SPM_SX_PERFMON_SAMPLE_DELAY; @@ -1581,7 +1584,7 @@ typedef union SDMA0_PERFCOUNTER1_SELECT regSDMA0_PERFCO typedef union SDMA0_PERFCOUNTER1_SELECT1 regSDMA0_PERFCOUNTER1_SELECT1; typedef union SDMA0_PERFCOUNTER_TAG_DELAY_RANGE regSDMA0_PERFCOUNTER_TAG_DELAY_RANGE; typedef union SDMA0_PERFMON_CNTL regSDMA0_PERFMON_CNTL; -#if CHIP_HDR_NAVI21|| CHIP_HDR_NAVI22|| CHIP_HDR_NAVI23|| CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33 +#if CHIP_HDR_NAVI21 || CHIP_HDR_NAVI22 || CHIP_HDR_NAVI23 || CHIP_HDR_NAVI31 || CHIP_HDR_NAVI32 || CHIP_HDR_NAVI33 typedef union SDMA1_PERFCNT_MISC_CNTL regSDMA1_PERFCNT_MISC_CNTL; typedef union SDMA1_PERFCNT_PERFCOUNTER0_CFG regSDMA1_PERFCNT_PERFCOUNTER0_CFG; typedef union SDMA1_PERFCNT_PERFCOUNTER1_CFG regSDMA1_PERFCNT_PERFCOUNTER1_CFG; @@ -1635,7 +1638,7 @@ typedef union SPI_ARB_CNTL_0 regSPI_ARB_CNTL typedef union SPI_ARB_CYCLES_0 regSPI_ARB_CYCLES_0; typedef union SPI_ARB_CYCLES_1 regSPI_ARB_CYCLES_1; typedef union SPI_ARB_PRIORITY regSPI_ARB_PRIORITY; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union SPI_ATTRIBUTE_RING_BASE regSPI_ATTRIBUTE_RING_BASE; typedef union SPI_ATTRIBUTE_RING_SIZE regSPI_ATTRIBUTE_RING_SIZE; #endif @@ -1643,7 +1646,7 @@ typedef union SPI_BARYC_CNTL regSPI_BARYC_CN typedef union SPI_BARYC_SSAA_CNTL regSPI_BARYC_SSAA_CNTL; typedef union SPI_COMPUTE_QUEUE_RESET regSPI_COMPUTE_QUEUE_RESET; typedef union SPI_COMPUTE_WF_CTX_SAVE regSPI_COMPUTE_WF_CTX_SAVE; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union SPI_COMPUTE_WF_CTX_SAVE_STATUS regSPI_COMPUTE_WF_CTX_SAVE_STATUS; #endif typedef union SPI_CONFIG_CNTL regSPI_CONFIG_CNTL; @@ -1672,7 +1675,7 @@ typedef union SPI_FEATURE_CTRL regSPI_FEATURE_ typedef union SPI_GDS_CREDITS regSPI_GDS_CREDITS; 
typedef union SPI_GFX_CNTL regSPI_GFX_CNTL; typedef union SPI_GFX_CRAWLER_CONFIG regSPI_GFX_CRAWLER_CONFIG; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union SPI_GFX_SCRATCH_BASE_HI regSPI_GFX_SCRATCH_BASE_HI; typedef union SPI_GFX_SCRATCH_BASE_LO regSPI_GFX_SCRATCH_BASE_LO; typedef union SPI_GS_THROTTLE_CNTL1 regSPI_GS_THROTTLE_CNTL1; @@ -1688,7 +1691,7 @@ typedef union SPI_LB_DATA_PERCU_WAVE_HSGS regSPI_LB_DATA_ typedef union SPI_LB_DATA_PERCU_WAVE_VSPS regSPI_LB_DATA_PERCU_WAVE_VSPS; typedef union SPI_LB_DATA_PERWGP_WAVE_CS regSPI_LB_DATA_PERWGP_WAVE_CS; typedef union SPI_LB_DATA_PERWGP_WAVE_HSGS regSPI_LB_DATA_PERWGP_WAVE_HSGS; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union SPI_LB_DATA_PERWGP_WAVE_PS regSPI_LB_DATA_PERWGP_WAVE_PS; #endif typedef union SPI_LB_DATA_PERWGP_WAVE_VSPS regSPI_LB_DATA_PERWGP_WAVE_VSPS; @@ -1800,7 +1803,7 @@ typedef union SPI_RESOURCE_RESERVE_EN_CU_13 regSPI_RESOURCE typedef union SPI_RESOURCE_RESERVE_EN_CU_14 regSPI_RESOURCE_RESERVE_EN_CU_14; typedef union SPI_RESOURCE_RESERVE_EN_CU_15 regSPI_RESOURCE_RESERVE_EN_CU_15; typedef union SPI_SHADER_COL_FORMAT regSPI_SHADER_COL_FORMAT; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union SPI_SHADER_GS_MESHLET_DIM regSPI_SHADER_GS_MESHLET_DIM; typedef union SPI_SHADER_GS_MESHLET_EXP_ALLOC regSPI_SHADER_GS_MESHLET_EXP_ALLOC; #endif @@ -2181,7 +2184,7 @@ typedef union SPI_WF_LIFETIME_STATUS_19 regSPI_WF_LIFET typedef union SPI_WF_LIFETIME_STATUS_20 regSPI_WF_LIFETIME_STATUS_20; typedef union SPI_WF_LIFETIME_STATUS_21 regSPI_WF_LIFETIME_STATUS_21; typedef union SQG_CONFIG regSQG_CONFIG; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| 
CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union SQG_GL1H_STATUS regSQG_GL1H_STATUS; typedef union SQG_PERFCOUNTER0_HI regSQG_PERFCOUNTER0_HI; typedef union SQG_PERFCOUNTER0_LO regSQG_PERFCOUNTER0_LO; @@ -2339,7 +2342,7 @@ typedef union SQ_PERFCOUNTER15_SELECT regSQ_PERFCOUNT typedef union SQ_PERFCOUNTER_CTRL regSQ_PERFCOUNTER_CTRL; typedef union SQ_PERFCOUNTER_CTRL2 regSQ_PERFCOUNTER_CTRL2; typedef union SQ_PERFCOUNTER_MASK regSQ_PERFCOUNTER_MASK; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union SQ_PERF_SNAPSHOT_CTRL regSQ_PERF_SNAPSHOT_CTRL; typedef union SQ_PERF_SNAPSHOT_DATA regSQ_PERF_SNAPSHOT_DATA; typedef union SQ_PERF_SNAPSHOT_PC_HI regSQ_PERF_SNAPSHOT_PC_HI; @@ -2476,7 +2479,7 @@ typedef union TA_BC_BASE_ADDR regTA_BC_BASE_A typedef union TA_BC_BASE_ADDR_HI regTA_BC_BASE_ADDR_HI; typedef union TA_CGTT_CTRL regTA_CGTT_CTRL; typedef union TA_CNTL regTA_CNTL; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union TA_CNTL2 regTA_CNTL2; #endif typedef union TA_CNTL_AUX regTA_CNTL_AUX; @@ -3130,7 +3133,7 @@ typedef union UTCL1_PERFCOUNTER0_SELECT regUTCL1_PERFCO typedef union UTCL1_PERFCOUNTER1_HI regUTCL1_PERFCOUNTER1_HI; typedef union UTCL1_PERFCOUNTER1_LO regUTCL1_PERFCOUNTER1_LO; typedef union UTCL1_PERFCOUNTER1_SELECT regUTCL1_PERFCOUNTER1_SELECT; -#if CHIP_HDR_NAVI31 || CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 +#if CHIP_HDR_NAVI31|| CHIP_HDR_NAVI32|| CHIP_HDR_NAVI33|| CHIP_HDR_PHOENIX1 typedef union UTCL1_PERFCOUNTER2_HI regUTCL1_PERFCOUNTER2_HI; typedef union UTCL1_PERFCOUNTER2_LO regUTCL1_PERFCOUNTER2_LO; typedef union UTCL1_PERFCOUNTER2_SELECT regUTCL1_PERFCOUNTER2_SELECT; diff --git a/lgc/include/lgc/builder/BuilderImpl.h b/lgc/include/lgc/builder/BuilderImpl.h index 166c1f755e..c758bb77ec 100644 --- a/lgc/include/lgc/builder/BuilderImpl.h +++ 
b/lgc/include/lgc/builder/BuilderImpl.h @@ -315,11 +315,11 @@ class BuilderImpl : public BuilderDefs { const ResourceNode *topNode, const ResourceNode *node, bool shadow); // Get the stride (in bytes) of a descriptor. - llvm::Value *getStride(ResourceNodeType descType, uint64_t descSet, unsigned binding, const ResourceNode *node); + llvm::Value *getStride(ResourceNodeType descType, const ResourceNode *node); // Get a pointer to a descriptor, as a pointer to i8 - llvm::Value *getDescPtr(ResourceNodeType concreteType, ResourceNodeType abstractType, uint64_t descSet, - unsigned binding, const ResourceNode *topNode, const ResourceNode *node); + llvm::Value *getDescPtr(ResourceNodeType concreteType, const ResourceNode *topNode, const ResourceNode *node, + unsigned binding); llvm::Value *scalarizeIfUniform(llvm::Value *value, bool isNonUniform); @@ -519,7 +519,7 @@ class BuilderImpl : public BuilderDefs { // Mark usage for a generic (user) input or output void markGenericInputOutputUsage(bool isOutput, unsigned location, unsigned locationCount, InOutInfo &inOutInfo, - llvm::Value *vertexOrPrimIndex); + llvm::Value *vertexOrPrimIndex, bool isDynLocOffset = false); // Mark interpolation info for FS input. void markInterpolationInfo(InOutInfo &interpInfo); @@ -551,7 +551,7 @@ class BuilderImpl : public BuilderDefs { llvm::Type *getBuiltInTy(BuiltInKind builtIn, InOutInfo inOutInfo); // Mark usage of a built-in input - void markBuiltInInputUsage(BuiltInKind &builtIn, unsigned arraySize); + void markBuiltInInputUsage(BuiltInKind &builtIn, unsigned arraySize, InOutInfo inOutInfo); // Mark usage of a built-in output void markBuiltInOutputUsage(BuiltInKind builtIn, unsigned arraySize, unsigned streamId); @@ -638,11 +638,6 @@ class BuilderImpl : public BuilderDefs { // Create a helper invocation query. Only allowed in a fragment shader. 
llvm::Value *CreateIsHelperInvocation(const llvm::Twine &instName = ""); - // In the mesh shader, set the actual output size of the primitives and vertices that the mesh shader workgroup will - // emit upon completion. - llvm::Instruction *CreateSetMeshOutputs(llvm::Value *vertexCount, llvm::Value *primitiveCount, - const llvm::Twine &instName = ""); - // ------------------------------------------------------------------------------------------------------------------- // Builder implementation subclass for subgroup operations public: @@ -664,6 +659,9 @@ class BuilderImpl : public BuilderDefs { // Create a subgroup all equal. llvm::Value *CreateSubgroupAllEqual(llvm::Value *const value, const llvm::Twine &instName = ""); + // Create a subgroup rotate. + llvm::Value *CreateSubgroupRotate(llvm::Value *const value, llvm::Value *const delta, llvm::Value *const clusterSize, + const llvm::Twine &instName = ""); // Create a subgroup broadcast. llvm::Value *CreateSubgroupBroadcast(llvm::Value *const value, llvm::Value *const index, const llvm::Twine &instName = ""); @@ -728,17 +726,17 @@ class BuilderImpl : public BuilderDefs { llvm::Value *CreateSubgroupClusteredExclusive(GroupArithOp groupArithOp, llvm::Value *const value, llvm::Value *const clusterSize, const llvm::Twine &instName = ""); - // Create a subgroup quad broadcast. - llvm::Value *CreateSubgroupQuadBroadcast(llvm::Value *const value, llvm::Value *const index, + // Create a quad broadcast. + llvm::Value *CreateSubgroupQuadBroadcast(llvm::Value *const value, llvm::Value *const index, bool inWQM = true, const llvm::Twine &instName = ""); - // Create a subgroup quad swap horizontal. + // Create a quad swap horizontal. llvm::Value *CreateSubgroupQuadSwapHorizontal(llvm::Value *const value, const llvm::Twine &instName = ""); - // Create a subgroup quad swap vertical. + // Create a quad swap vertical. 
llvm::Value *CreateSubgroupQuadSwapVertical(llvm::Value *const value, const llvm::Twine &instName = ""); - // Create a subgroup quad swap diagonal. + // Create a quad swap diagonal. llvm::Value *CreateSubgroupQuadSwapDiagonal(llvm::Value *const value, const llvm::Twine &instName = ""); // Create a subgroup swizzle quad. diff --git a/tool/dumper/vkgcRegisterDefs.h b/lgc/include/lgc/patch/Continufy.h similarity index 70% rename from tool/dumper/vkgcRegisterDefs.h rename to lgc/include/lgc/patch/Continufy.h index 39a376dff0..dbb52e72e4 100644 --- a/tool/dumper/vkgcRegisterDefs.h +++ b/lgc/include/lgc/patch/Continufy.h @@ -1,7 +1,7 @@ /* *********************************************************************************************************************** * - * Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All Rights Reserved. + * Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -24,20 +24,22 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file vkgcRegisterDefs.h - * @brief VKGC header file: contains register info to restore the integer value for a register + * @file Continufy.h + * @brief LGC header file : contains declaration of class lgc::Continufy *********************************************************************************************************************** */ #pragma once +#include "lgc/state/PipelineState.h" -namespace Vkgc { +namespace lgc { -// A single register in the pipelineDumperRegs table -struct PipelineDumperReg { - unsigned number; - const char *name; -}; +// 
===================================================================================================================== +// Pass to transform indirect call into continuation-style call. +class Continufy : public llvm::PassInfoMixin { -extern const std::vector PipelineDumperRegs; +public: + llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); + static llvm::StringRef name() { return "Continufy Indirect calls"; } +}; -} // namespace Vkgc +} // namespace lgc diff --git a/lgc/include/lgc/patch/Patch.h b/lgc/include/lgc/patch/Patch.h index 2d1e27dd22..7c52020eae 100644 --- a/lgc/include/lgc/patch/Patch.h +++ b/lgc/include/lgc/patch/Patch.h @@ -53,8 +53,7 @@ class Patch { virtual ~Patch() {} static void addPasses(PipelineState *pipelineState, lgc::PassManager &passMgr, llvm::Timer *patchTimer, - llvm::Timer *optTimer, Pipeline::CheckShaderCacheFunc checkShaderCacheFunc, - llvm::CodeGenOpt::Level optLevel); + llvm::Timer *optTimer, Pipeline::CheckShaderCacheFunc checkShaderCacheFunc, uint32_t optLevel); // Register all the patching passes into the given pass manager static void registerPasses(lgc::PassManager &passMgr); @@ -65,7 +64,7 @@ class Patch { static llvm::GlobalVariable *getLdsVariable(PipelineState *pipelineState, llvm::Module *module); protected: - static void addOptimizationPasses(lgc::PassManager &passMgr, llvm::CodeGenOpt::Level optLevel); + static void addOptimizationPasses(lgc::PassManager &passMgr, uint32_t optLevel); void init(llvm::Module *module); diff --git a/lgc/include/lgc/patch/PatchEntryPointMutate.h b/lgc/include/lgc/patch/PatchEntryPointMutate.h index 5481d8e3aa..8c28bf405c 100644 --- a/lgc/include/lgc/patch/PatchEntryPointMutate.h +++ b/lgc/include/lgc/patch/PatchEntryPointMutate.h @@ -41,6 +41,8 @@ namespace lgc { +class UserDataOp; + // ===================================================================================================================== // The entry-point mutation pass class 
PatchEntryPointMutate : public Patch, public llvm::PassInfoMixin { @@ -68,40 +70,37 @@ class PatchEntryPointMutate : public Patch, public llvm::PassInfoMixin users; }; + // Dword-aligned load from constant userdata offset. + struct UserDataLoad { + llvm::Instruction *load = nullptr; + unsigned dwordOffset = 0; + unsigned dwordSize = 0; + }; + // Per-merged-shader-stage gathered user data usage information. struct UserDataUsage { // Check if special user data value is used by lgc.special.user.data call generated before PatchEntryPointMutate bool isSpecialUserDataUsed(UserDataMapping kind); - - // List of lgc.spill.table calls - UserDataNodeUsage spillTable; - // List of lgc.push.const calls. There is no direct attempt to unspill these; instead we attempt to - // unspill the pushConstOffsets loads. - UserDataNodeUsage pushConst; - // True means that we did not succeed in putting all loads into pushConstOffsets, so lgc.push.const - // calls must be kept. - bool pushConstSpill = false; - // Per-push-const-offset lists of loads from push const. We attempt to unspill these. - llvm::SmallVector pushConstOffsets; - // Per-user-data-offset lists of lgc.root.descriptor calls - llvm::SmallVector rootDescriptors; - // Per-table lists of lgc.descriptor.table.addr calls - // When the user data nodes are available, a table is identifed by its - // index in the user data nodes. Using this index allows for the possibility that a descriptor - // set is split over multiple tables. When it is not available, a table is identified by the - // descriptor set it contains, which is consistent with the Vulkan binding model. - llvm::SmallVector descriptorTables; + void addLoad(unsigned dwordOffset, unsigned dwordSize); + + unsigned spillTableEntryArgIdx = 0; + // Whether there is any dynamic indexing into lgc.user.data pointers. 
+ bool haveDynamicUserDataLoads = false; + llvm::SmallVector userDataOps; + llvm::SmallVector loads; + // Minimum number of consecutive dwords for a statically known load *starting* at a given offset into user data + // (0 for dwords that aren't used) + llvm::SmallVector loadSizes; + // Entry argument index for each user data dword that has one. + llvm::SmallVector entryArgIdxs; // Per-UserDataMapping lists of lgc.special.user.data calls - llvm::SmallVector specialUserData; - // Minimum offset at which spill table is used. - unsigned spillUsage = UINT_MAX; + llvm::SmallVector specialUserData; // Usage of streamout table bool usesStreamOutTable = false; }; @@ -112,6 +111,9 @@ class PatchEntryPointMutate : public Patch, public llvm::PassInfoMixin &userDataArgs, llvm::SmallVectorImpl &specialUserDataArgs, llvm::IRBuilder<> &builder); - void addUserDataArgs(llvm::SmallVectorImpl &userDataArgs, llvm::IRBuilder<> &builder); - void addUserDataArg(llvm::SmallVectorImpl &userDataArgs, unsigned userDataValue, unsigned sizeInDwords, - const llvm::Twine &name, unsigned *argIndex, llvm::IRBuilder<> &builder); - void determineUnspilledUserDataArgs(llvm::ArrayRef userDataArgs, - llvm::ArrayRef specialUserDataArgs, llvm::IRBuilder<> &builder, - llvm::SmallVectorImpl &unspilledArgs); + void finalizeUserDataArgs(llvm::SmallVectorImpl &userDataArgs, + llvm::ArrayRef specialUserDataArgs, llvm::IRBuilder<> &builder); uint64_t pushFixedShaderArgTys(llvm::SmallVectorImpl &argTys) const; @@ -150,8 +148,8 @@ class PatchEntryPointMutate : public Patch, public llvm::PassInfoMixin userDataTys, llvm::ArrayRef argNames); - void lowerCpsJump(llvm::Function *parent, cps::JumpOp *jumpOp, llvm::BasicBlock *tailBlock, - llvm::SmallVectorImpl &exitInfos); + unsigned lowerCpsJump(llvm::Function *parent, cps::JumpOp *jumpOp, llvm::BasicBlock *tailBlock, + llvm::SmallVectorImpl &exitInfos); void lowerAsCpsReference(cps::AsContinuationReferenceOp &asCpsReferenceOp); // Get UserDataUsage struct for the 
merged shader stage that contains the given shader stage diff --git a/lgc/include/lgc/patch/PatchInOutImportExport.h b/lgc/include/lgc/patch/PatchInOutImportExport.h index 17729ec450..f77bc08f14 100644 --- a/lgc/include/lgc/patch/PatchInOutImportExport.h +++ b/lgc/include/lgc/patch/PatchInOutImportExport.h @@ -108,8 +108,7 @@ class PatchInOutImportExport : public Patch, public llvm::PassInfoMixin exps); + // Updates the DB shader control that depends on the CB state. + void updateDbShaderControl(); + // Sets the finalized 128-bit cache hash. The version identifies the version of LLPC used to generate the hash. void setFinalized128BitCacheHash(const lgc::Hash128 &finalizedCacheHash, const llvm::VersionTuple &version); @@ -198,9 +201,6 @@ class PalMetadata { // Erase the PAL metadata for FS input mappings. Used when finalizing the PAL metadata in the link. void eraseFragmentInputInfo(); - // Returns true if the fragment input info has an entry for a builtin. - bool fragmentShaderUsesMappedBuiltInInputs(); - // Returns the location of the fragment builtin or InvalidValue if the builtin is not found. unsigned getFragmentShaderBuiltInLoc(unsigned builtIn); @@ -248,7 +248,7 @@ class PalMetadata { void finalizeInputControlRegisterSetting(); // The maximum possible value for the spill threshold entry in the PAL metadata. 
- static constexpr uint64_t MAX_SPILL_THRESHOLD = UINT_MAX; + static constexpr uint64_t MAX_SPILL_THRESHOLD = USHRT_MAX; unsigned getUserDataCount(unsigned callingConv); unsigned getCallingConventionForFirstHardwareShaderStage(std::string &hwStageName); diff --git a/lgc/include/lgc/state/PipelineState.h b/lgc/include/lgc/state/PipelineState.h index c060146107..924d6a6710 100644 --- a/lgc/include/lgc/state/PipelineState.h +++ b/lgc/include/lgc/state/PipelineState.h @@ -103,8 +103,9 @@ struct NggControl { struct XfbStateMetadata { bool enableXfb; // Whether transform feedback is active bool enablePrimStats; // Whether to count generated primitives - std::array xfbStrides; // The strides of each XFB buffer. - std::array streamXfbBuffers; // The stream-out XFB buffers bit mask per stream. + std::array xfbStrides; // The strides of each XFB buffer + std::array streamXfbBuffers; // The stream-out XFB buffers bit mask per stream + std::array streamActive; // Flag indicating which vertex stream is active }; // ===================================================================================================================== @@ -283,7 +284,7 @@ class PipelineState final : public Pipeline { // Accessors for color export state const ColorExportFormat &getColorExportFormat(unsigned location); const bool hasColorExportFormats() { return !m_colorExportFormats.empty(); } - const ColorExportState &getColorExportState() { return m_colorExportState; } + ColorExportState &getColorExportState() { return m_colorExportState; } // Accessors for pipeline state unsigned getDeviceIndex() const { return m_deviceIndex; } @@ -392,12 +393,6 @@ class PipelineState final : public Pipeline { // Set transform feedback state metadata void setXfbStateMetadata(llvm::Module *module); - // Get XFB state metadata - const XfbStateMetadata &getXfbStateMetadata() const { return m_xfbStateMetadata; } - - // Get XFB state metadata - XfbStateMetadata &getXfbStateMetadata() { return m_xfbStateMetadata; } - // 
Check if transform feedback is active bool enableXfb() const { return m_xfbStateMetadata.enableXfb; } @@ -418,6 +413,16 @@ class PipelineState final : public Pipeline { // Get transform feedback buffers used for each stream std::array &getStreamXfbBuffers() { return m_xfbStateMetadata.streamXfbBuffers; } + // Set the activeness for a vertex stream + void setVertexStreamActive(unsigned streamId) { m_xfbStateMetadata.streamActive[streamId] = true; } + + // Get the activeness for a vertex stream + bool isVertexStreamActive(unsigned streamId) { + if (getRasterizerState().rasterStream == streamId) + return true; // Rasterization stream is always active + return m_xfbStateMetadata.streamActive[streamId]; + } + // Set user data for a specific shader stage void setUserDataMap(ShaderStage shaderStage, llvm::ArrayRef userDataValues) { m_userDataMaps[shaderStage].append(userDataValues.begin(), userDataValues.end()); @@ -582,7 +587,7 @@ class PipelineState final : public Pipeline { bool m_preRasterHasGs = false; // Whether pre-rasterization part has a geometry shader bool m_computeLibrary = false; // Whether pipeline is in fact a compute library std::string m_client; // Client name for PAL metadata - Options m_options; // Per-pipeline options + Options m_options = {}; // Per-pipeline options std::vector m_shaderOptions; // Per-shader options std::unique_ptr m_allocUserDataNodes; // Allocated buffer for user data llvm::ArrayRef m_userDataNodes; // Top-level user data node table diff --git a/lgc/include/lgc/state/ResourceUsage.h b/lgc/include/lgc/state/ResourceUsage.h index 3982562edd..2a9e4d67d4 100644 --- a/lgc/include/lgc/state/ResourceUsage.h +++ b/lgc/include/lgc/state/ResourceUsage.h @@ -298,6 +298,7 @@ struct ResourceUsage { unsigned custom : 1; // Whether custom interpolation is used // Input unsigned fragCoord : 1; // Whether gl_FragCoord is used + unsigned fragCoordIsSample : 1; // Whether gl_FragCoord is used with sample interpolation unsigned frontFacing : 1; // 
Whether gl_FrontFacing is used unsigned clipDistance : 4; // Array size of gl_ClipDistance[] (0 means unused) unsigned cullDistance : 4; // Array size of gl_CullDistance[] (0 means unused) diff --git a/lgc/include/lgc/util/AddressExtender.h b/lgc/include/lgc/util/AddressExtender.h index 2accc67068..878fd3534e 100644 --- a/lgc/include/lgc/util/AddressExtender.h +++ b/lgc/include/lgc/util/AddressExtender.h @@ -61,6 +61,14 @@ class AddressExtender { // @returns : 64-bit pointer value llvm::Instruction *extend(llvm::Value *addr32, llvm::Value *highHalf, llvm::Type *ptrTy, llvm::IRBuilder<> &builder); + // Extend an i32 into a 64-bit pointer using the high 32-bits of the PC + // + // @param addr32 : Address as 32-bit value + // @param ptrTy : Type to cast pointer to + // @param builder : IRBuilder to use, already set to the required insert point + // @returns : 64-bit pointer value + llvm::Instruction *extendWithPc(llvm::Value *addr32, llvm::Type *ptrTy, llvm::IRBuilder<> &builder); + private: // Get PC value as v2i32. llvm::Instruction *getPc(); diff --git a/lgc/include/lgc/util/CpsStackLowering.h b/lgc/include/lgc/util/CpsStackLowering.h index 215b999a3b..a632714ba7 100644 --- a/lgc/include/lgc/util/CpsStackLowering.h +++ b/lgc/include/lgc/util/CpsStackLowering.h @@ -49,6 +49,8 @@ class CpsStackLowering { public: CpsStackLowering(llvm::LLVMContext &context) : m_typeLowering(context) {} void lowerCpsStackOps(llvm::Function &function, llvm::Value *); + // Get continuation stack size (in bytes). 
+ unsigned getStackSize() { return m_stackSizeInBytes; } TypeLowering m_typeLowering; @@ -66,6 +68,7 @@ class CpsStackLowering { llvm::Module *m_module; llvm::Value *m_cpsStackAlloca; + unsigned m_stackSizeInBytes = 0; }; } // namespace lgc diff --git a/lgc/include/lgc/util/Internal.h b/lgc/include/lgc/util/Internal.h index caad44c1a8..5726619f56 100644 --- a/lgc/include/lgc/util/Internal.h +++ b/lgc/include/lgc/util/Internal.h @@ -96,4 +96,9 @@ bool isDontCareValue(llvm::Value *value); // type in a return value struct, ensuring it gets into VGPRs. llvm::Type *getVgprTy(llvm::Type *ty); +// Modify the function argument types, and return the new function. NOTE: the function does not do any uses +// replacement, so the caller should call replaceAllUsesWith() for the function and arguments afterwards. +llvm::Function *mutateFunctionArguments(llvm::Function &fn, llvm::Type *retTy, + const llvm::ArrayRef argTys, llvm::AttributeList attributes); + } // namespace lgc diff --git a/lgc/interface/lgc/Builder.h b/lgc/interface/lgc/Builder.h index e29ecfc6c3..3354f9672d 100644 --- a/lgc/interface/lgc/Builder.h +++ b/lgc/interface/lgc/Builder.h @@ -110,6 +110,11 @@ class InOutInfo { m_data.bits.component = component; } + bool isDualSourceBlendDynamic() const { return m_data.bits.dualSourceBlendDynamic; } + void setDualSourceBlendDynamic(bool dualSourceBlendDynamic = true) { + m_data.bits.dualSourceBlendDynamic = dualSourceBlendDynamic; + } + private: union { struct { @@ -125,6 +130,7 @@ class InOutInfo { // whole array or of an element with a variable index. 
unsigned perPrimitive : 1; // Mesh shader output: whether it is a per-primitive output unsigned component : 2; // Component offset, specifying which components within a location is consumed + unsigned dualSourceBlendDynamic : 1; // Fs output: whether it's dynamic dual source blend output } bits; unsigned u32All; } m_data; @@ -1368,16 +1374,6 @@ class Builder : public BuilderDefs { // @param instName : Name to give instruction(s) llvm::Value *CreateIsHelperInvocation(const llvm::Twine &instName = ""); - // In the mesh shader, set the actual output size of the primitives and vertices that the mesh shader workgroup will - // emit upon completion. - // - // @param vertexCount : Actual output size of the vertices - // @param primitiveCount : Actual output size of the primitives - // @param instName : Name to give final instruction - // @returns Instruction to set the actual size of mesh outputs - llvm::Instruction *CreateSetMeshOutputs(llvm::Value *vertexCount, llvm::Value *primitiveCount, // NOLINT - const llvm::Twine &instName = ""); - // ----------------------------------------------------------------------------------------------------------------- // Subgroup operations @@ -1414,6 +1410,15 @@ class Builder : public BuilderDefs { // @param instName : Name to give instruction(s) llvm::Value *CreateSubgroupAllEqual(llvm::Value *const value, const llvm::Twine &instName = ""); + // Create a subgroup rotate call. + // + // @param value : The value to read from the chosen rotated lane to all active lanes. + // @param delta : The delta/offset added to lane id. + // @param clusterSize : The cluster size if exists. + // @param instName : Name to give final instruction. + llvm::Value *CreateSubgroupRotate(llvm::Value *const value, llvm::Value *const delta, llvm::Value *const clusterSize, + const llvm::Twine &instName = ""); + // Create a subgroup broadcast. 
// // @param value : The value to broadcast @@ -1545,27 +1550,28 @@ class Builder : public BuilderDefs { llvm::Value *CreateSubgroupClusteredExclusive(GroupArithOp groupArithOp, llvm::Value *const value, llvm::Value *const clusterSize, const llvm::Twine &instName = ""); - // Create a subgroup quad broadcast. + // Create a quad broadcast. // // @param value : The value to broadcast // @param index : The index within the quad to broadcast from + // @param inWQM : Whether it's in whole quad mode // @param instName : Name to give instruction(s) - llvm::Value *CreateSubgroupQuadBroadcast(llvm::Value *const value, llvm::Value *const index, + llvm::Value *CreateSubgroupQuadBroadcast(llvm::Value *const value, llvm::Value *const index, bool inWQM = true, const llvm::Twine &instName = ""); - // Create a subgroup quad swap horizontal. + // Create a quad swap horizontal. // // @param value : The value to swap // @param instName : Name to give instruction(s) llvm::Value *CreateSubgroupQuadSwapHorizontal(llvm::Value *const value, const llvm::Twine &instName = ""); - // Create a subgroup quad swap vertical. + // Create a quad swap vertical. // // @param value : The value to swap // @param instName : Name to give instruction(s) llvm::Value *CreateSubgroupQuadSwapVertical(llvm::Value *const value, const llvm::Twine &instName = ""); - // Create a subgroup quad swap diagonal. + // Create a quad swap diagonal. // // @param value : The value to swap // @param instName : Name to give instruction(s) diff --git a/lgc/interface/lgc/BuiltIns.h b/lgc/interface/lgc/BuiltIns.h index f074b8377f..71b51d6450 100644 --- a/lgc/interface/lgc/BuiltIns.h +++ b/lgc/interface/lgc/BuiltIns.h @@ -36,6 +36,9 @@ namespace lgc { // Max spirv builtIn value static constexpr unsigned BuiltInInternalBase = 0x10000000; +// Max builtIn value = BuiltInInternalBase + 13 +static constexpr unsigned MaxBuiltIn = 0x1000000D; + // Define built-in kind enum. 
enum BuiltInKind : unsigned { #define BUILTIN(name, number, out, in, type) BuiltIn##name = number, diff --git a/lgc/interface/lgc/ElfLinker.h b/lgc/interface/lgc/ElfLinker.h index be32fed72f..7c1f7b012d 100644 --- a/lgc/interface/lgc/ElfLinker.h +++ b/lgc/interface/lgc/ElfLinker.h @@ -106,9 +106,6 @@ class ElfLinker { // getLastError() to get a textual representation of the error, for use in logging or in error // reporting in a command-line utility. virtual bool link(llvm::raw_pwrite_stream &outStream) = 0; - - // Returns true if the fragment input info has an entry for a builtin. - virtual bool fragmentShaderUsesMappedBuiltInInputs() = 0; }; } // namespace lgc diff --git a/lgc/interface/lgc/GpurtDialect.td b/lgc/interface/lgc/GpurtDialect.td index 8fca5625ca..7b221817e0 100644 --- a/lgc/interface/lgc/GpurtDialect.td +++ b/lgc/interface/lgc/GpurtDialect.td @@ -52,7 +52,7 @@ def GpurtStackReadOp : GpurtOp<"stack.read", [Memory<[(read)]>, WillReturn]> { let results = (outs I32:$result); let summary = "read a dword from stack"; let description = [{ - Read a dword from lds/(scratch buffer) stack at index position + Read a dword from lds/(scratch buffer) stack at index position }]; } @@ -61,7 +61,7 @@ def GpurtStackWriteOp : GpurtOp<"stack.write", [Memory<[(write)]>, WillReturn]> let results = (outs I32:$result); let summary = "write a dword to stack"; let description = [{ - Write a dword to lds/(scratch buffer) stack at index position + Write a dword to lds/(scratch buffer) stack at index position }]; } @@ -187,7 +187,7 @@ def GpurtSetTraceParamsOp : GpurtOp<"set.trace.params", [Memory<[(write Inaccess }]; } -def GpurtCallClosestHitShaderOp : GpurtOp<"call.closest.hit.shader", [WillReturn]> { +def GpurtCallClosestHitShaderOp : GpurtOp<"call.closest.hit.shader", [Memory<[(readwrite InaccessibleMem)]>, WillReturn]> { let arguments = (ins V2I32:$shaderId, I32:$tableIndex); let results = (outs I1:$result); @@ -203,7 +203,7 @@ def GpurtCallClosestHitShaderOp : 
GpurtOp<"call.closest.hit.shader", [WillReturn }]; } -def GpurtCallMissShaderOp : GpurtOp<"call.miss.shader", [WillReturn]> { +def GpurtCallMissShaderOp : GpurtOp<"call.miss.shader", [Memory<[(readwrite InaccessibleMem)]>, WillReturn]> { let arguments = (ins V2I32:$shaderId, I32:$tableIndex); let results = (outs I1:$result); @@ -219,7 +219,7 @@ def GpurtCallMissShaderOp : GpurtOp<"call.miss.shader", [WillReturn]> { }]; } -def GpurtCallTriangleAnyHitShaderOp : GpurtOp<"call.triangle.any.hit.shader", [WillReturn]> { +def GpurtCallTriangleAnyHitShaderOp : GpurtOp<"call.triangle.any.hit.shader", [Memory<[(readwrite InaccessibleMem)]>, WillReturn]> { let arguments = (ins V2I32:$shaderId, I32:$tableIndex, V2F32:$attr); let results = (outs); @@ -235,7 +235,7 @@ def GpurtCallTriangleAnyHitShaderOp : GpurtOp<"call.triangle.any.hit.shader", [W }]; } -def GpurtCallIntersectionShaderOp : GpurtOp<"call.intersection.shader", [WillReturn]> { +def GpurtCallIntersectionShaderOp : GpurtOp<"call.intersection.shader", [Memory<[(readwrite InaccessibleMem)]>, WillReturn]> { let arguments = (ins V2I32:$shaderId, V2I32:$anyHitShaderId, I32:$tableIndex); let results = (outs); diff --git a/lgc/interface/lgc/LgcContext.h b/lgc/interface/lgc/LgcContext.h index 03514ebcb4..efc48e4de1 100644 --- a/lgc/interface/lgc/LgcContext.h +++ b/lgc/interface/lgc/LgcContext.h @@ -84,7 +84,15 @@ class LgcContext { // @param gpuName : LLVM GPU name (e.g. "gfx900"); empty to use -mcpu option setting // @param optLevel : LLVM optimization level used to initialize target machine static std::unique_ptr createTargetMachine(llvm::StringRef gpuName, - llvm::CodeGenOpt::Level optLevel); +#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 474768 + // Old version of the code + llvm::CodeGenOpt::Level optLevel +#else + // New version of the code (also handles unknown + // version, which we treat as latest) + llvm::CodeGenOptLevel optLevel +#endif + ); // Create the LgcContext. 
// @@ -129,11 +137,21 @@ class LgcContext { // Adds target passes to pass manager, depending on "-filetype" and "-emit-llvm" options void addTargetPasses(lgc::LegacyPassManager &passMgr, llvm::Timer *codeGenTimer, llvm::raw_pwrite_stream &outStream); +#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 474768 + // Old version of the code // Returns the optimization level for the context. llvm::CodeGenOpt::Level getOptimizationLevel() const; // Returns the optimization level used for context initialization. llvm::CodeGenOpt::Level getInitialOptimizationLevel() const { return m_initialOptLevel; } +#else + // New version of the code (also handles unknown version, which we treat as latest) + // Returns the optimization level for the context. + llvm::CodeGenOptLevel getOptimizationLevel() const; + + // Returns the optimization level used for context initialization. + llvm::CodeGenOptLevel getInitialOptimizationLevel() const { return m_initialOptLevel; } +#endif // Utility method to create a start/stop timer pass static llvm::ModulePass *createStartStopTimer(llvm::Timer *timer, bool starting); @@ -147,6 +165,7 @@ class LgcContext { // statements in the middle-end output to that stream, giving a dump of LLVM IR at a // few strategic places in the pass flow, as well as information such as input/output // mapping. + // The pointer set here is thread local. 
static void setLlpcOuts(llvm::raw_ostream *stream) { m_llpcOuts = stream; } static llvm::raw_ostream *getLgcOuts() { return m_llpcOuts; } @@ -160,13 +179,19 @@ class LgcContext { LgcContext(llvm::LLVMContext &context, unsigned palAbiVersion); - static llvm::raw_ostream *m_llpcOuts; // nullptr or stream for LLPC_OUTS - llvm::LLVMContext &m_context; // LLVM context - llvm::TargetMachine *m_targetMachine = nullptr; // Target machine - TargetInfo *m_targetInfo = nullptr; // Target info - unsigned m_palAbiVersion = 0xFFFFFFFF; // PAL pipeline ABI version to compile for - PassManagerCache *m_passManagerCache = nullptr; // Pass manager cache and creator - llvm::CodeGenOpt::Level m_initialOptLevel; // Optimization level at initialization + static thread_local llvm::raw_ostream *m_llpcOuts; // nullptr or stream for LLPC_OUTS + llvm::LLVMContext &m_context; // LLVM context + llvm::TargetMachine *m_targetMachine = nullptr; // Target machine + TargetInfo *m_targetInfo = nullptr; // Target info + unsigned m_palAbiVersion = 0xFFFFFFFF; // PAL pipeline ABI version to compile for + PassManagerCache *m_passManagerCache = nullptr; // Pass manager cache and creator +#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 474768 + // Old version of the code + llvm::CodeGenOpt::Level m_initialOptLevel; // Optimization level at initialization +#else + // New version of the code (also handles unknown version, which we treat as latest) + llvm::CodeGenOptLevel m_initialOptLevel; // Optimization level at initialization +#endif }; } // namespace lgc diff --git a/lgc/interface/lgc/LgcDialect.td b/lgc/interface/lgc/LgcDialect.td index 056f356a3b..0aa034b6bd 100644 --- a/lgc/interface/lgc/LgcDialect.td +++ b/lgc/interface/lgc/LgcDialect.td @@ -123,6 +123,88 @@ def EmitMeshTasksOp : LgcOp<"emit.mesh.tasks", [Memory<[]>]> { }]; } +def SetMeshOutputsOp : LgcOp<"set.mesh.outputs", [Memory<[]>]> { + let arguments = (ins I32:$vertexCount, I32:$primitiveCount); + let results = (outs); + + let summary = "set 
the actual output size of the primitives and vertices that the mesh shader workgroup will emit"; + let description = [{ + In the mesh shader, set the actual output size of the primitives and vertices that the mesh shader workgroup will + emit upon completion. + + `vertexCount` is the actual output size of the vertices. + `primitiveCount` is the actual output size of the primitives. + }]; +} + +def SetMeshPrimitiveIndicesOp : LgcOp<"set.mesh.primitive.indices", [Memory<[]>]> { + let arguments = (ins I32:$primitiveIndex, (ScalarOrFixedVector I32):$primitiveIndices); + let results = (outs); + + let summary = "set primitive indices for mesh shader"; + let description = [{ + In the mesh shader, set primitive indices by forming primitive connectivity data and writing it to LDS. + + `primitiveIndex` is the primitive index specifying which primitive to set. + `primitiveIndices` are all vertex index values that are used to form this primitive. + }]; +} + +def SetMeshPrimitiveCulledOp : LgcOp<"set.mesh.primitive.culled", [Memory<[]>]> { + let arguments = (ins I32:$primitiveIndex, I1:$isCulled); + let results = (outs); + + let summary = "set primitive culled state for mesh shader"; + let description = [{ + In the mesh shader, set primitive culled state by writing the null primitive flag to LDS. + + `primitiveIndex` is the primitive index specifying which primitive to set. + `isCulled` is a boolean flag indicating whether this primitive is culled. + }]; +} + +def GetMeshBuiltinInputOp : LgcOp<"get.mesh.builtin.input", [Memory<[]>, WillReturn]> { + let arguments = (ins AttrI32:$builtin); + let results = (outs value:$result); + + let defaultBuilderHasExplicitResultType = true; + + let summary = "return the value of mesh built-in input"; + let description = [{ + Return the value of mesh built-in input. + + `builtIn` is the input built-in ID of mesh shader. 
+ }]; +} + +def WriteMeshVertexOutputOp : LgcOp<"write.mesh.vertex.output", [Memory<[]>]> { + let arguments = (ins I32:$outputOffset, I32:$vertexIndex, value:$outputValue); + let results = (outs); + + let summary = "Write mesh shader vertex outputs"; + let description = [{ + In the mesh shader, write mesh shader vertex outputs to LDS. + + `outputOffset` is the relative offset of this output (in dwords) within all outputs of the indexed vertex. + `vertexIndex` is the vertex index specifying which vertex to write. + `outputValue` is the output value to write. + }]; +} + +def WriteMeshPrimitiveOutputOp : LgcOp<"write.mesh.primitive.output", [Memory<[]>]> { + let arguments = (ins I32:$outputOffset, I32:$primitiveIndex, value:$outputValue); + let results = (outs); + + let summary = "Write mesh shader primitive outputs"; + let description = [{ + In the mesh shader, write mesh shader primitive outputs to LDS. + + `outputOffset` is the relative offset of this output (in dwords) within all outputs of the indexed primitive. + `primitiveIndex` is the primitive index specifying which primitive to write. + `outputValue` is the output value to write. + }]; +} + def GenericLocationOp : OpClass { let arguments = (ins AttrI1:$perPrimitive, AttrI32:$location, I32:$locOffset, I32:$elemIdx, I32:$arrayIndex); @@ -208,3 +290,35 @@ def InputImportInterpolatedOp : LgcOp<"input.import.interpolated", [Memory<[]>, this operation to map between HW and API. }]; } + +def LoadUserDataOp : LgcOp<"load.user.data", [Memory<[]>, WillReturn]> { + let arguments = (ins AttrI32:$offset); + let results = (outs value:$result); + + let defaultBuilderHasExplicitResultType = true; + + let verifier = [ + (or (I32 $result), + (I64 $result), + (FixedVectorType $result, I32, any)), + ]; + + let summary = "load from a constant offset in the user data"; + let description = [{ + `offset` is the offset into the user data table, in bytes. It must be a multiple of 4. 
+ }]; +} + +def UserDataOp : LgcOp<"user.data", [Memory<[]>, WillReturn]> { + let arguments = (ins AttrI32:$offset); + let results = (outs ConstantPointer:$result); + + let summary = "return a pointer into user data"; + let description = [{ + `offset` is a byte offset into user data + + Attempting to access user data before `offset` via a pointer returned by this operation is undefined behavior. + + This operation is used for push constants in Vulkan and in some cases by OpenGL. + }]; +} diff --git a/lgc/interface/lgc/Pipeline.h b/lgc/interface/lgc/Pipeline.h index e6598b8ea4..618d504b0e 100644 --- a/lgc/interface/lgc/Pipeline.h +++ b/lgc/interface/lgc/Pipeline.h @@ -103,6 +103,14 @@ enum class ThreadGroupSwizzleMode : unsigned { Count, }; +// Enumerate the ray tracing indirect modes. +enum class RayTracingIndirectMode : unsigned { + NotIndirect = 0, // Not in indirect mode (or not ray tracing pipeline) + Legacy = 1, // Legacy indirect mode + ContinuationsContinufy = 2, // Continuations flow that based on Continufy pass + Continuations = 3, // Continuations flow that based on LowerRaytracingPipeline pass +}; + // Value for shadowDescriptorTable pipeline option. static const unsigned ShadowDescriptorTableDisable = ~0U; @@ -110,65 +118,61 @@ static const char XfbStateMetadataName[] = "lgc.xfb.state"; static const char SampleShadingMetaName[] = "lgc.sample.shading"; // Middle-end per-pipeline options to pass to SetOptions. +// The front-end should zero-initialize a struct with "= {}" in case future changes add new fields. // Note: new fields must be added to the end of this structure to maintain test compatibility. -struct Options { - uint64_t hash[2]; // Pipeline hash to set in ELF PAL metadata - unsigned includeDisassembly; // If set, the disassembly for all compiled shaders will be included - // in the pipeline ELF. - unsigned reconfigWorkgroupLayout; // If set, allows automatic workgroup reconfigure to take place on - // compute shaders. 
- bool forceCsThreadIdSwizzling; // Force rearranges threadId within group into blocks of 8*8 or 8*4. - unsigned overrideThreadGroupSizeX; // Override value for thread group size.X - unsigned overrideThreadGroupSizeY; // Override value for thread group size.Y - unsigned overrideThreadGroupSizeZ; // Override value for thread group size.Z - unsigned includeIr; // If set, the IR for all compiled shaders will be included in the - // pipeline ELF. - unsigned nggFlags; // Flags to control NGG (NggFlag* values ored together) - unsigned nggBackfaceExponent; // Value from 1 to UINT32_MAX that will cause the backface culling - // algorithm to ignore area calculations that are less than - // (10 ^ -(backfaceExponent)) / abs(w0 * w1 * w2) - // Only valid if the NGG backface culler is enabled. - // A value of 0 will disable the threshold. - NggSubgroupSizing nggSubgroupSizing; // NGG subgroup sizing type - bool fullSubgroups; // Use full subgroup lanes - unsigned nggVertsPerSubgroup; // How to determine NGG verts per subgroup - unsigned nggPrimsPerSubgroup; // How to determine NGG prims per subgroup - unsigned highAddrOfFmask; // High dword of Fmask address - bool enableFmask; // Whether to use Fmasks when loading from MSAA images - unsigned allowNullDescriptor; // Allow and give defined behavior for null descriptor - unsigned disableImageResourceCheck; // Don't do image resource type check - unsigned reserved0f; // Reserved for future functionality - unsigned useResourceBindingRange; // A resource node binding is the start of a range whose size is - // sizeInDwords/stride. - unsigned optimizeTessFactor; // If set, we can determine either send HT_TessFactor message or write to TF buffer - // depending the values of tessellation factors. 
- unsigned enableInterpModePatch; // Enable to do per-sample interpolation for nonperspective and smooth input - unsigned pageMigrationEnabled; // Enable page migration - ResourceLayoutScheme resourceLayoutScheme; // Resource layout scheme - ThreadGroupSwizzleMode threadGroupSwizzleMode; // Thread group swizzle mode - unsigned reverseThreadGroupBufferDescSet; // Descriptor set ID of the internal buffer for reverse thread group - // optimization - unsigned reverseThreadGroupBufferBinding; // Binding ID of the internal buffer for reverse thread group optimization - bool internalRtShaders; // Enable internal RT shader intrinsics - bool enableUberFetchShader; // Enable UberShader - bool reserved16; - bool disableTruncCoordForGather; // If set, trunc_coord of sampler srd is disabled for gather4 - bool enableColorExportShader; // Explicitly build color export shader, UnlinkedStageFragment elf will return extra - // meta data. - Options() { - // The memory representation of this struct gets written into LLVM metadata. To prevent uninitialized values from - // being written, we force everything to 0, including alignment gaps. - memset(this, 0, sizeof(Options)); - } - - Options(const Options &opts) { *this = opts; } - - Options &operator=(const Options &opts) { - // Copy everything, including data in alignment because this is used to implement the copy constructor. - memcpy(this, &opts, sizeof(Options)); - return *this; - } +union Options { + unsigned u32All[34]; + struct { + uint64_t hash[2]; // Pipeline hash to set in ELF PAL metadata + unsigned includeDisassembly; // If set, the disassembly for all compiled shaders will be included + // in the pipeline ELF. + unsigned reconfigWorkgroupLayout; // If set, allows automatic workgroup reconfigure to take place on + // compute shaders. + bool forceCsThreadIdSwizzling; // Force rearranges threadId within group into blocks of 8*8 or 8*4. 
+ unsigned overrideThreadGroupSizeX; // Override value for thread group size.X + unsigned overrideThreadGroupSizeY; // Override value for thread group size.Y + unsigned overrideThreadGroupSizeZ; // Override value for thread group size.Z + unsigned includeIr; // If set, the IR for all compiled shaders will be included in the + // pipeline ELF. + unsigned nggFlags; // Flags to control NGG (NggFlag* values ored together) + unsigned nggBackfaceExponent; // Value from 1 to UINT32_MAX that will cause the backface culling + // algorithm to ignore area calculations that are less than + // (10 ^ -(backfaceExponent)) / abs(w0 * w1 * w2) + // Only valid if the NGG backface culler is enabled. + // A value of 0 will disable the threshold. + NggSubgroupSizing nggSubgroupSizing; // NGG subgroup sizing type + bool fullSubgroups; // Use full subgroup lanes + unsigned nggVertsPerSubgroup; // How to determine NGG verts per subgroup + unsigned nggPrimsPerSubgroup; // How to determine NGG prims per subgroup + unsigned highAddrOfFmask; // High dword of Fmask address + bool enableFmask; // Whether to use Fmasks when loading from MSAA images + unsigned allowNullDescriptor; // Allow and give defined behavior for null descriptor + unsigned disableImageResourceCheck; // Don't do image resource type check + unsigned reserved0f; // Reserved for future functionality + unsigned useResourceBindingRange; // A resource node binding is the start of a range whose size is + // sizeInDwords/stride. + unsigned optimizeTessFactor; // If set, we can determine either send HT_TessFactor message or write to TF buffer + // depending the values of tessellation factors. 
+ unsigned enableInterpModePatch; // Enable to do per-sample interpolation for nonperspective and smooth input + unsigned pageMigrationEnabled; // Enable page migration + ResourceLayoutScheme resourceLayoutScheme; // Resource layout scheme + ThreadGroupSwizzleMode threadGroupSwizzleMode; // Thread group swizzle mode + unsigned reverseThreadGroupBufferDescSet; // Descriptor set ID of the internal buffer for reverse thread group + // optimization + unsigned reverseThreadGroupBufferBinding; // Binding ID of the internal buffer for reverse thread group optimization + bool internalRtShaders; // Enable internal RT shader intrinsics + bool enableUberFetchShader; // Enable UberShader + bool reserved16; + bool disableTruncCoordForGather; // If set, trunc_coord of sampler srd is disabled for gather4 + bool enableColorExportShader; // Explicitly build color export shader, UnlinkedStageFragment elf will return extra + // meta data. + bool fragCoordUsesInterpLoc; // Determining fragCoord use InterpLoc + bool disableSampleMask; // Disable export of sample mask from PS + bool reserved20; + RayTracingIndirectMode rtIndirectMode; // Ray tracing indirect mode + }; }; +static_assert(sizeof(Options) == sizeof(Options::u32All)); /// Represent a pipeline option which can be automatic as well as explicitly set. enum InvariantLoadsOption : unsigned { Auto = 0, EnableOptimization = 1, DisableOptimization = 2, ClearInvariants = 3 }; @@ -184,120 +188,111 @@ struct ColorExportInfo { // Middle-end per-shader options to pass to SetShaderOptions. // Note: new fields must be added to the end of this structure to maintain test compatibility. -struct ShaderOptions { - uint64_t hash[2]; // Shader hash to set in ELF PAL metadata - unsigned trapPresent; // Indicates a trap handler will be present when this pipeline is executed, - // and any trap conditions encountered in this shader should call the trap - // handler. 
This could include an arithmetic exception, an explicit trap - // request from the host, or a trap after every instruction when in debug - // mode. - unsigned debugMode; // When set, this shader should cause the trap handler to be executed after - // every instruction. Only valid if trapPresent is set. - unsigned allowReZ; // Allow the DB ReZ feature to be enabled. This will cause an early-Z test - // to potentially kill PS waves before launch, and also issues a late-Z test - // in case the PS kills pixels. Only valid for pixel shaders. +// The front-end should zero-initialize this with "= {}" in case future changes add new fields. +union ShaderOptions { + unsigned u32All[34]; + struct { + uint64_t hash[2]; // Shader hash to set in ELF PAL metadata + unsigned trapPresent; // Indicates a trap handler will be present when this pipeline is executed, + // and any trap conditions encountered in this shader should call the trap + // handler. This could include an arithmetic exception, an explicit trap + // request from the host, or a trap after every instruction when in debug + // mode. + unsigned debugMode; // When set, this shader should cause the trap handler to be executed after + // every instruction. Only valid if trapPresent is set. + unsigned allowReZ; // Allow the DB ReZ feature to be enabled. This will cause an early-Z test + // to potentially kill PS waves before launch, and also issues a late-Z test + // in case the PS kills pixels. Only valid for pixel shaders. - // Maximum VGPR limit for this shader. The actual limit used by back-end for shader compilation is the smaller - // of this value and whatever the target GPU supports. To effectively disable this limit, set this to 0. - unsigned vgprLimit; + // Maximum VGPR limit for this shader. The actual limit used by back-end for shader compilation is the smaller + // of this value and whatever the target GPU supports. To effectively disable this limit, set this to 0. 
+ unsigned vgprLimit; - // Maximum SGPR limit for this shader. The actual limit used by back-end for shader compilation is the smaller - // of this value and whatever the target GPU supports. To effectively disable this limit, set this to 0. - unsigned sgprLimit; + // Maximum SGPR limit for this shader. The actual limit used by back-end for shader compilation is the smaller + // of this value and whatever the target GPU supports. To effectively disable this limit, set this to 0. + unsigned sgprLimit; - /// Overrides the number of CS thread-groups which the GPU will launch per compute-unit. This throttles the - /// shader, which can sometimes enable more graphics shader work to complete in parallel. A value of zero - /// disables limiting the number of thread-groups to launch. This field is ignored for graphics shaders. - unsigned maxThreadGroupsPerComputeUnit; + /// Overrides the number of CS thread-groups which the GPU will launch per compute-unit. This throttles the + /// shader, which can sometimes enable more graphics shader work to complete in parallel. A value of zero + /// disables limiting the number of thread-groups to launch. This field is ignored for graphics shaders. + unsigned maxThreadGroupsPerComputeUnit; - unsigned waveSize; // Control the number of threads per wavefront (GFX10+) - unsigned subgroupSize; // Override for the wave size when the shader uses gl_SubgroupSize, 0 for no override - unsigned wgpMode; // Whether to choose WGP mode or CU mode (GFX10+) - WaveBreak waveBreakSize; // Size of region to force the end of a wavefront (GFX10+). - // Only valid for fragment shaders. + unsigned waveSize; // Control the number of threads per wavefront (GFX10+) + unsigned subgroupSize; // Override for the wave size when the shader uses gl_SubgroupSize, 0 for no override + unsigned wgpMode; // Whether to choose WGP mode or CU mode (GFX10+) + WaveBreak waveBreakSize; // Size of region to force the end of a wavefront (GFX10+). 
+ // Only valid for fragment shaders. - // Vector size threshold for load scalarizer. 0 means do not scalarize loads at all. - unsigned loadScalarizerThreshold; + // Vector size threshold for load scalarizer. 0 means do not scalarize loads at all. + unsigned loadScalarizerThreshold; - // Use the LLVM backend's SI scheduler instead of the default scheduler. - bool useSiScheduler; + // Use the LLVM backend's SI scheduler instead of the default scheduler. + bool useSiScheduler; - // Disable various LLVM IR code sinking passes. - bool disableCodeSinking; + // Disable various LLVM IR code sinking passes. + bool disableCodeSinking; - // Schedule for latency even if it reduces occupancy. - bool favorLatencyHiding; + // Schedule for latency even if it reduces occupancy. + bool favorLatencyHiding; - // Default unroll threshold for LLVM. - unsigned unrollThreshold; + // Default unroll threshold for LLVM. + unsigned unrollThreshold; - /// Override FP32 denormal handling. - DenormalMode fp32DenormalMode; + /// Override FP32 denormal handling. + DenormalMode fp32DenormalMode; - /// Whether enable adjustment of the fragment shader depth import for the variable shading rate - bool adjustDepthImportVrs; + /// Whether enable adjustment of the fragment shader depth import for the variable shading rate + bool adjustDepthImportVrs; - // Unroll loops by specified amount. 0 is default, 1 is no unroll. - unsigned forceLoopUnrollCount; + // Unroll loops by specified amount. 0 is default, 1 is no unroll. + unsigned forceLoopUnrollCount; - // Disable loop unrolling. - bool disableLoopUnroll; + // Disable loop unrolling. + bool disableLoopUnroll; - // Threshold for minimum number of blocks in a loop to disable the LICM pass. - unsigned disableLicmThreshold; + // Threshold for minimum number of blocks in a loop to disable the LICM pass. + unsigned disableLicmThreshold; - // Threshold to use for loops with Unroll hint. 0 to use llvm.loop.unroll.full metadata. 
- unsigned unrollHintThreshold; + // Threshold to use for loops with Unroll hint. 0 to use llvm.loop.unroll.full metadata. + unsigned unrollHintThreshold; - // Threshold to use for loops with DontUnroll hint. 0 to use llvm.loop.unroll.disable metadata. - unsigned dontUnrollHintThreshold; + // Threshold to use for loops with DontUnroll hint. 0 to use llvm.loop.unroll.disable metadata. + unsigned dontUnrollHintThreshold; - // Maximum amount of LDS space to be used for spilling. - unsigned ldsSpillLimitDwords; + // Maximum amount of LDS space to be used for spilling. + unsigned ldsSpillLimitDwords; - // Attempt to scalarize waterfall descriptor loads. - bool scalarizeWaterfallLoads; + // Attempt to scalarize waterfall descriptor loads. + bool scalarizeWaterfallLoads; - /// Override value for ThreadGroupSizeX - unsigned overrideShaderThreadGroupSizeX; + /// Override value for ThreadGroupSizeX + unsigned overrideShaderThreadGroupSizeX; - /// Override value for ThreadGroupSizeY - unsigned overrideShaderThreadGroupSizeY; + /// Override value for ThreadGroupSizeY + unsigned overrideShaderThreadGroupSizeY; - /// Override value for ThreadGroupSizeZ - unsigned overrideShaderThreadGroupSizeZ; + /// Override value for ThreadGroupSizeZ + unsigned overrideShaderThreadGroupSizeZ; - // When there is a valid "feedback loop" in renderpass, lateZ needs to be enabled - // In Vulkan a "feedback loop" is described as a subpass where there is at least - // one input attachment that is also a color or depth/stencil attachment - // Feedback loops are allowed and their behavior is well defined under certain conditions. - // When there is a feedback loop it is possible for the shaders to read - // the contents of the color and depth/stencil attachments - // from the shader during draw. 
Because of that possibility you have to use late-z - bool forceLateZ; + // When there is a valid "feedback loop" in renderpass, lateZ needs to be enabled + // In Vulkan a "feedback loop" is described as a subpass where there is at least + // one input attachment that is also a color or depth/stencil attachment + // Feedback loops are allowed and their behavior is well defined under certain conditions. + // When there is a feedback loop it is possible for the shaders to read + // the contents of the color and depth/stencil attachments + // from the shader during draw. Because of that possibility you have to use late-z + bool forceLateZ; - /// Minimum number of addresses to use NSA encoding on GFX10+ (0 = backend decides). - unsigned nsaThreshold; + /// Minimum number of addresses to use NSA encoding on GFX10+ (0 = backend decides). + unsigned nsaThreshold; - /// Aggressively mark shader loads as invariant (where it is safe to do so). - InvariantLoadsOption aggressiveInvariantLoads; + /// Aggressively mark shader loads as invariant (where it is safe to do so). + InvariantLoadsOption aggressiveInvariantLoads; - bool reserved; - - ShaderOptions() { - // The memory representation of this struct gets written into LLVM metadata. To prevent uninitialized values from - // being written, we force everything to 0, including alignment gaps. 
- memset(this, 0, sizeof(ShaderOptions)); - } - - ShaderOptions(const ShaderOptions &opts) { *this = opts; } - - ShaderOptions &operator=(const ShaderOptions &opts) { - // Copy everything, including data in alignment because this is used to implement the copy constructor - memcpy(this, &opts, sizeof(ShaderOptions)); - return *this; - } + bool reserved; + }; }; +static_assert(sizeof(ShaderOptions) == sizeof(ShaderOptions::u32All)); // ===================================================================================================================== // Definitions for user data resource nodes @@ -393,6 +388,7 @@ enum BufDataFormat { BufDataFormat5_6_5_1_Bgra, BufDataFormat1_5_6_5, BufDataFormat5_9_9_9, + BufDataFormat8_A }; // Numeric format of vertex buffer entry. These match the GFX9 hardware encoding. @@ -405,6 +401,7 @@ enum BufNumFormat { BufNumFormatSint = 5, BufNumFormatSnorm_Ogl = 6, BufNumFormatFloat = 7, + BufNumFormatFixed = 8, // Extra formats not in GFX9 hardware encoding: BufNumFormatSrgb, BufNumFormatOther, @@ -461,8 +458,9 @@ struct ColorExportFormat { // Struct to pass to SetColorExportState struct ColorExportState { - unsigned alphaToCoverageEnable; // Enable alpha to coverage - unsigned dualSourceBlendEnable; // Blend state bound at draw time will use a dual source blend mode + unsigned alphaToCoverageEnable; // Enable alpha to coverage + unsigned dualSourceBlendEnable; // Blend state bound at draw time will use a dual source blend mode + unsigned dynamicDualSourceBlendEnable; // Dynamic dual source blend enable }; // Struct to pass to SetInputAssemblyState. @@ -506,6 +504,8 @@ struct RasterizerState { unsigned rasterStream; // Which vertex stream to rasterize ProvokingVertexMode provokingVertexMode; // Specifies which vertex of a primitive is the _provoking vertex_, // this impacts which vertex's "flat" VS outputs are passed to the PS. + unsigned pixelShaderSamples; // Controls the pixel shader execution rate. 
Must be less than or equal to + // coverageSamples. Valid values are 1, 2, 4, and 8. }; // Struct to pass to depth/stencil state @@ -673,15 +673,14 @@ enum class PipelineLink : unsigned { struct FsOutInfo { unsigned hwColorTarget; // HW color output index unsigned location; // Output location in resource layout - bool isSigned; // Whether is signed + unsigned isSigned; // Whether is signed char typeName[8]; // Output data type Name, like v3f32 }; // Represents shader meta data struct FragmentOutputs { - FsOutInfo *fsOutInfos; // The color export information. + unsigned discard; // Whether this fragment shader has kill enabled. unsigned fsOutInfoCount; // The number of color exports. - bool discard; // Whether this fragment shader has kill enabled. }; // ===================================================================================================================== diff --git a/lgc/patch/Continufy.cpp b/lgc/patch/Continufy.cpp new file mode 100644 index 0000000000..d07ef67014 --- /dev/null +++ b/lgc/patch/Continufy.cpp @@ -0,0 +1,205 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** + *********************************************************************************************************************** + * @file Continufy.cpp + * @brief LLPC source file: contains implementation of class lgc::Continufy. + * This pass translates indirect call into cps.await call, which will be lowered into continuation call. + *********************************************************************************************************************** + */ +#include "lgc/patch/Continufy.h" +#include "lgccps/LgcCpsDialect.h" +#include "lgcrt/LgcRtDialect.h" +#include "lgc/Builder.h" +#include "lgc/LgcDialect.h" +#include "lgc/patch/Patch.h" +#include "lgc/state/PalMetadata.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "lgc-continufy" + +using namespace llvm; +using namespace lgc; +using namespace lgc::cps; + +namespace lgc { +using RtStage = rt::RayTracingShaderStage; + +static Function *insertCpsArguments(Function &fn) { + // Mutate function arguments, add ({} %state, %rcr). 
+ LLVMContext &context = fn.getContext(); + SmallVector argTys = {StructType::get(context, {}), IntegerType::get(context, 32)}; + auto *fnTy = fn.getFunctionType(); + argTys.append(fnTy->params().begin(), fnTy->params().end()); + + auto *newFn = mutateFunctionArguments(fn, Type::getVoidTy(context), argTys, fn.getAttributes()); + + fn.replaceAllUsesWith(newFn); + for (unsigned idx = 0; idx < fn.arg_size(); idx++) { + Value *oldArg = fn.getArg(idx); + Value *newArg = newFn->getArg(idx + 2); + newArg->setName(oldArg->getName()); + oldArg->replaceAllUsesWith(newArg); + } + newFn->getArg(0)->setName("state"); + newFn->getArg(1)->setName("rcr"); + return newFn; +} + +/// Return the CPS levels mask of the ray-tracing stages that the input stage will return to. +/// NOTE: As Continufy pass will only be used to transform legacy indirect-call based ray-tracing shaders to lgccps +/// based continuation passing shader. The 'return stages' are just the possible callers of the input stage in typical +/// Vulkan ray-tracing pipeline. +static unsigned getReturnedLevels(int stage) { + // Traversal will return to RGS or CHS/MISS. 
+ if (stage == -1) + return 1u << (unsigned)CpsLevel::RayGen | 1u << (unsigned)CpsLevel::ClosestHit_Miss_Callable; + + RtStage rtStage = static_cast(stage); + switch (rtStage) { + case RtStage::ShaderStageRayGeneration: + llvm_unreachable("Raygen shader should not arrive here."); + case RtStage::ShaderStageClosestHit: + case RtStage::ShaderStageMiss: + // Traversal + return (1u << (unsigned)CpsLevel::Traversal); + case RtStage::ShaderStageCallable: + // CHS/Miss/Callable | RGS + return (1u << (unsigned)CpsLevel::ClosestHit_Miss_Callable | 1u << (unsigned)CpsLevel::RayGen); + case RtStage::ShaderStageAnyHit: + // IS | Traversal + return (1u << (unsigned)CpsLevel::Intersection | 1u << (unsigned)CpsLevel::Traversal); + case RtStage::ShaderStageIntersection: + // Traversal + return 1u << (unsigned)CpsLevel::Traversal; + default: + llvm_unreachable("Unknown raytracing shader type."); + } +} + +/// Return CPS level of the ray-tracing stage. +static CpsLevel getCpsLevelFromRtStage(int stage) { + // Traversal + if (stage == -1) + return CpsLevel::Traversal; + + RtStage rtStage = static_cast(stage); + switch (rtStage) { + case RtStage::ShaderStageRayGeneration: + return CpsLevel::RayGen; + case RtStage::ShaderStageClosestHit: + case RtStage::ShaderStageMiss: + case RtStage::ShaderStageCallable: + return CpsLevel::ClosestHit_Miss_Callable; + case RtStage::ShaderStageAnyHit: + return CpsLevel::AnyHit_CombinedIntersection_AnyHit; + case RtStage::ShaderStageIntersection: + return CpsLevel::Intersection; + default: + llvm_unreachable("Unknown raytracing shader type."); + } +} + +// ===================================================================================================================== +// Executes this LLVM patching pass on the specified LLVM module. 
+// +// @param [in/out] module : LLVM module to be run on +// @param [in/out] analysisManager : Analysis manager to use for this transformation +// @returns : The preserved analyses (The analyses that are still valid after this pass) +PreservedAnalyses Continufy::run(Module &module, ModuleAnalysisManager &analysisManager) { + LLVM_DEBUG(dbgs() << "Run the Continufy pass \n"); + LLVMContext &context = module.getContext(); + + llvm_dialects::Builder builder(context); + SmallVector tobeErased; + + for (auto &fn : make_early_inc_range(module.functions())) { + MDNode *continufyStage = fn.getMetadata("continufy.stage"); + Function *fnPtr = &fn; + std::optional currentRtStage; + if (continufyStage) { + fnPtr = insertCpsArguments(fn); + currentRtStage = mdconst::extract(continufyStage->getOperand(0))->getSExtValue(); + CpsLevel level = getCpsLevelFromRtStage(currentRtStage.value()); + setCpsFunctionLevel(*fnPtr, level); + } + + // Translate call instruction with %continufy.stage into lgc.cps.await() with continuation reference. + for (auto &block : *fnPtr) { + for (auto &inst : block) { + if (!isa(inst)) + continue; + auto *calleeStage = inst.getMetadata("continufy.stage"); + if (!calleeStage) + continue; + + auto &call = cast(inst); + assert(call.getCallingConv() == CallingConv::SPIR_FUNC); + auto *called = call.getCalledOperand(); + + builder.SetInsertPoint(&call); + auto *continuationRef = builder.CreatePtrToInt(called, IntegerType::get(context, 32)); + CpsLevel calleeLevel = + getCpsLevelFromRtStage(mdconst::extract(calleeStage->getOperand(0))->getSExtValue()); + // RayGen level is zero, so it does not need a logic OR here. 
+ if (calleeLevel != CpsLevel::RayGen) + continuationRef = builder.CreateOr(continuationRef, builder.getInt32((uint32_t)calleeLevel)); + + SmallVector callArgs(call.args()); + auto *newCall = builder.create(call.getType(), continuationRef, 1u << (unsigned)calleeLevel, callArgs); + call.replaceAllUsesWith(newCall); + tobeErased.push_back(&call); + } + + // Translate 'ret' into lgc.cps.jump for continufy stages. + if (!currentRtStage.has_value()) + continue; + // Skip the 'ret' in RGS. + if (currentRtStage.value() == (int32_t)RtStage::ShaderStageRayGeneration) + continue; + Instruction *term = block.getTerminator(); + if (auto *retInst = dyn_cast(term)) { + builder.SetInsertPoint(term); + auto *retValue = retInst->getReturnValue(); + // %rcr + SmallVector tailArgs = {PoisonValue::get(builder.getInt32Ty())}; + // return value + if (retValue) + tailArgs.push_back(retValue); + + builder.create(fnPtr->getArg(1), getReturnedLevels(currentRtStage.value()), + PoisonValue::get(StructType::get(context, {})) /* state */, tailArgs); + builder.CreateUnreachable(); + term->eraseFromParent(); + } + } + } + for (auto *inst : tobeErased) + inst->eraseFromParent(); + + return PreservedAnalyses::allInSet(); +} + +} // namespace lgc diff --git a/lgc/patch/FragColorExport.cpp b/lgc/patch/FragColorExport.cpp index dfd3aeddd8..cfe3891990 100644 --- a/lgc/patch/FragColorExport.cpp +++ b/lgc/patch/FragColorExport.cpp @@ -238,7 +238,8 @@ Value *FragColorExport::handleColorExportInstructions(Value *output, unsigned hw } if (m_pipelineState->getTargetInfo().getGfxIpVersion().major >= 11 && - m_pipelineState->getColorExportState().dualSourceBlendEnable) { + (m_pipelineState->getColorExportState().dualSourceBlendEnable || + m_pipelineState->getColorExportState().dynamicDualSourceBlendEnable)) { // Save them for later dual-source-swizzle m_blendSourceChannels = exportTy->isHalfTy() ? 
(compCount + 1) / 2 : compCount; assert(hwColorExport <= 1); @@ -698,8 +699,6 @@ llvm::Value *LowerFragColorExport::jumpColorExport(llvm::Function *fragEntryPoin m_pipelineState->getPalMetadata()->addColorExportInfo(m_info); m_pipelineState->getPalMetadata()->setDiscardState(m_resUsage->builtInUsage.fs.discard); - ReturnInst *retInst = cast(builder.GetInsertPoint()->getParent()->getTerminator()); - // First build the argument type for the fragment shader. SmallVector outputTypes; for (const ColorExportInfo &info : m_info) { @@ -729,15 +728,13 @@ llvm::Value *LowerFragColorExport::jumpColorExport(llvm::Function *fragEntryPoin auto funcTyPtr = funcTy->getPointerTo(ADDR_SPACE_CONST); auto colorShaderAddr = ShaderInputs::getSpecialUserData(UserDataMapping::ColorExportAddr, builder); AddressExtender addrExt(builder.GetInsertPoint()->getParent()->getParent()); - auto funcPtr = addrExt.extend(colorShaderAddr, builder.getInt32(HighAddrPc), funcTyPtr, builder); + auto funcPtr = addrExt.extendWithPc(colorShaderAddr, funcTyPtr, builder); // Jump auto callInst = builder.CreateCall(funcTy, funcPtr, argVal); callInst->setCallingConv(CallingConv::AMDGPU_Gfx); callInst->setDoesNotReturn(); callInst->setOnlyWritesMemory(); - builder.CreateUnreachable(); - retInst->eraseFromParent(); return callInst; } @@ -769,10 +766,12 @@ void LowerFragColorExport::collectExportInfoForBuiltinOutput(Function *module, B } case BuiltInSampleMask: { assert(output->getType()->isArrayTy()); + if (!m_pipelineState->getOptions().disableSampleMask) { + // NOTE: Only gl_SampleMask[0] is valid for us. + m_sampleMask = builder.CreateExtractValue(output, {0}); + m_sampleMask = builder.CreateBitCast(m_sampleMask, builder.getFloatTy()); + } - // NOTE: Only gl_SampleMask[0] is valid for us. 
- m_sampleMask = builder.CreateExtractValue(output, {0}); - m_sampleMask = builder.CreateBitCast(m_sampleMask, builder.getFloatTy()); break; } case BuiltInFragStencilRef: { @@ -1013,7 +1012,8 @@ void FragColorExport::generateExportInstructions(ArrayRef } if (m_pipelineState->getTargetInfo().getGfxIpVersion().major >= 11 && - m_pipelineState->getColorExportState().dualSourceBlendEnable) + (m_pipelineState->getColorExportState().dualSourceBlendEnable || + m_pipelineState->getColorExportState().dynamicDualSourceBlendEnable)) lastExport = dualSourceSwizzle(builder); if (!lastExport && dummyExport) { diff --git a/lgc/patch/Gfx6ConfigBuilder.cpp b/lgc/patch/Gfx6ConfigBuilder.cpp index bf06b3cb97..f11ea55b06 100644 --- a/lgc/patch/Gfx6ConfigBuilder.cpp +++ b/lgc/patch/Gfx6ConfigBuilder.cpp @@ -841,11 +841,15 @@ template void ConfigBuilder::buildPsRegConfig(ShaderStage shaderSta assert(shaderStage == ShaderStageFragment); const auto intfData = m_pipelineState->getShaderInterfaceData(shaderStage); + const auto &options = m_pipelineState->getOptions(); const auto &shaderOptions = m_pipelineState->getShaderOptions(shaderStage); const auto resUsage = m_pipelineState->getShaderResourceUsage(shaderStage); const auto &builtInUsage = resUsage->builtInUsage.fs; const auto &fragmentMode = m_pipelineState->getShaderModes()->getFragmentShaderMode(); + const bool useFloatLocationAtIteratedSampleNumber = + options.fragCoordUsesInterpLoc ? 
builtInUsage.fragCoordIsSample : builtInUsage.runAtSampleRate; + unsigned floatMode = setupFloatingPointMode(shaderStage); SET_REG_FIELD(&config->psRegs, SPI_SHADER_PGM_RSRC1_PS, FLOAT_MODE, floatMode); SET_REG_FIELD(&config->psRegs, SPI_SHADER_PGM_RSRC1_PS, DX10_CLAMP, true); // Follow PAL setting @@ -858,7 +862,7 @@ template void ConfigBuilder::buildPsRegConfig(ShaderStage shaderSta if (fragmentMode.pixelCenterInteger) { // TRUE - Force floating point position to upper left corner of pixel (X.0, Y.0) SET_REG_FIELD(&config->psRegs, SPI_BARYC_CNTL, POS_FLOAT_ULC, true); - } else if (builtInUsage.runAtSampleRate) { + } else if (useFloatLocationAtIteratedSampleNumber) { // 2 - Calculate per-pixel floating point position at iterated sample number SET_REG_FIELD(&config->psRegs, SPI_BARYC_CNTL, POS_FLOAT_LOCATION, 2); } else { diff --git a/lgc/patch/Gfx9ConfigBuilder.cpp b/lgc/patch/Gfx9ConfigBuilder.cpp index 5ce8802461..4509089cc9 100644 --- a/lgc/patch/Gfx9ConfigBuilder.cpp +++ b/lgc/patch/Gfx9ConfigBuilder.cpp @@ -1537,6 +1537,7 @@ template void ConfigBuilder::buildPsRegConfig(ShaderStage shaderSta assert(shaderStage == ShaderStageFragment); const auto intfData = m_pipelineState->getShaderInterfaceData(shaderStage); + const auto &options = m_pipelineState->getOptions(); const auto &shaderOptions = m_pipelineState->getShaderOptions(shaderStage); const auto resUsage = m_pipelineState->getShaderResourceUsage(shaderStage); const auto &builtInUsage = resUsage->builtInUsage.fs; @@ -1565,11 +1566,14 @@ template void ConfigBuilder::buildPsRegConfig(ShaderStage shaderSta SET_REG_GFX11_FIELD(&config->psRegs, SPI_SHADER_PGM_RSRC4_PS, IMAGE_OP, resUsage->useImageOp); } + const bool useFloatLocationAtIteratedSampleNumber = + options.fragCoordUsesInterpLoc ? 
builtInUsage.fragCoordIsSample : builtInUsage.runAtSampleRate; + SET_REG_FIELD(&config->psRegs, SPI_BARYC_CNTL, FRONT_FACE_ALL_BITS, true); if (fragmentMode.pixelCenterInteger) { // TRUE - Force floating point position to upper left corner of pixel (X.0, Y.0) SET_REG_FIELD(&config->psRegs, SPI_BARYC_CNTL, POS_FLOAT_ULC, true); - } else if (builtInUsage.runAtSampleRate) { + } else if (useFloatLocationAtIteratedSampleNumber) { // 2 - Calculate per-pixel floating point position at iterated sample number SET_REG_FIELD(&config->psRegs, SPI_BARYC_CNTL, POS_FLOAT_LOCATION, 2); } else { diff --git a/lgc/patch/MeshTaskShader.cpp b/lgc/patch/MeshTaskShader.cpp index 6e71b1edf5..d0fd411c7c 100644 --- a/lgc/patch/MeshTaskShader.cpp +++ b/lgc/patch/MeshTaskShader.cpp @@ -400,11 +400,11 @@ void MeshTaskShader::processMeshShader(Function *entryPoint) { // 2. Lower mesh shader specific calls: // - SetMeshOutputs -> Write vertex/primitive count to LDS and send message GS_ALLOC_REQ // (threadIdInSubgroup == 0) - // - SetPrimitiveIndices -> Write primitive connectivity data to LDS - // - SetPrimitiveCulled -> Write null primitive flag to LDS - // - GetMeshInput -> Lower mesh built-in input - // - Lower task payload pointer -> Transform task payload descriptor - // - Write primitive/vertex output -> Write output data to LDS + // - SetMeshPrimitiveIndices -> Write primitive connectivity data to LDS + // - SetMeshPrimitiveCulled -> Write null primitive flag to LDS + // - GetMeshBuiltinInput -> Lower mesh built-in input + // - TaskPayloadPtr -> Transform task payload descriptor + // - WriteMeshVertexOutput/WriteMeshPrimitiveOutput -> Write output data to LDS // } // // Barrier (if needBarrierFlag) @@ -1085,6 +1085,309 @@ void MeshTaskShader::lowerEmitMeshTasks(EmitMeshTasksOp &emitMeshTasksOp) { m_callsToRemove.push_back(&emitMeshTasksOp); } +// ===================================================================================================================== +// Lower set mesh 
outputs. Set the actual output size of the primitives and vertices that the mesh shader workgroup +// will emit. +// +// @param setMeshOutputsOp : Call instruction op to set mesh outputs +void MeshTaskShader::lowerSetMeshOutputs(SetMeshOutputsOp &setMeshOutputsOp) { + m_builder.SetInsertPoint(&setMeshOutputsOp); + + assert(getShaderStage(setMeshOutputsOp.getFunction()) == ShaderStageMesh); + + auto vertexCount = setMeshOutputsOp.getVertexCount(); + auto primitiveCount = setMeshOutputsOp.getPrimitiveCount(); + + auto setMeshOutputsCall = m_builder.GetInsertPoint(); + + auto checkSetMeshOutputsBlock = m_builder.GetInsertBlock(); + auto setMeshOutputsBlock = checkSetMeshOutputsBlock->splitBasicBlock(setMeshOutputsCall, ".setMeshOutputs"); + auto endSetMeshOutputsBlock = setMeshOutputsBlock->splitBasicBlock(setMeshOutputsCall, ".endSetMeshOutputs"); + + // Modify ".checkSetMeshOutputs" block + { + m_builder.SetInsertPoint(checkSetMeshOutputsBlock->getTerminator()); + + auto firstThreadInSubgroup = m_builder.CreateICmpEQ(m_waveThreadInfo.threadIdInSubgroup, m_builder.getInt32(0)); + m_builder.CreateCondBr(firstThreadInSubgroup, setMeshOutputsBlock, endSetMeshOutputsBlock); + checkSetMeshOutputsBlock->getTerminator()->eraseFromParent(); // Remove old terminator + } + + // Construct ".setMeshOutputs" block + { + m_builder.SetInsertPoint(setMeshOutputsBlock->getTerminator()); + + // Promote vertex/primitive count to SGPRs + vertexCount = m_builder.CreateIntrinsic(m_builder.getInt32Ty(), Intrinsic::amdgcn_readfirstlane, vertexCount); + primitiveCount = m_builder.CreateIntrinsic(m_builder.getInt32Ty(), Intrinsic::amdgcn_readfirstlane, primitiveCount); + + // Check if vertex count or primitive count is zero. If so, set both to zero in order to disable vertex/primitive + // exporting. 
+ auto zeroVertexCount = m_builder.CreateICmpEQ(vertexCount, m_builder.getInt32(0)); + auto zeroPrimitiveCount = m_builder.CreateICmpEQ(primitiveCount, m_builder.getInt32(0)); + auto hasZeroCount = m_builder.CreateOr(zeroVertexCount, zeroPrimitiveCount); + vertexCount = m_builder.CreateSelect(hasZeroCount, m_builder.getInt32(0), vertexCount); + primitiveCount = m_builder.CreateSelect(hasZeroCount, m_builder.getInt32(0), primitiveCount); + + // NOTE: Here, we promote vertex/primitive count to SGPRs once again because M0 implicitly used in s_sendmsg is + // SGPR. LLVM backend has issues of handling this because it doesn't use s_cselect to translate LLVM IR select + // instruction (which keeps the destination operand still in SGPR) and it doesn't use readfirstlane to promote + // VGPR to SGPR for M0. + vertexCount = m_builder.CreateIntrinsic(m_builder.getInt32Ty(), Intrinsic::amdgcn_readfirstlane, vertexCount); + primitiveCount = m_builder.CreateIntrinsic(m_builder.getInt32Ty(), Intrinsic::amdgcn_readfirstlane, primitiveCount); + + // M0[10:0] = vertexCount, M0[22:12] = primitiveCount + Value *m0 = m_builder.CreateShl(primitiveCount, 12); + m0 = m_builder.CreateOr(m0, vertexCount); + m_builder.CreateIntrinsic(Intrinsic::amdgcn_s_sendmsg, {}, {m_builder.getInt32(GsAllocReq), m0}); + + Value *ldsOffset = m_builder.getInt32(getMeshShaderLdsRegionStart(MeshLdsRegion::VertexCount)); + writeValueToLds(vertexCount, ldsOffset); + + ldsOffset = m_builder.getInt32(getMeshShaderLdsRegionStart(MeshLdsRegion::PrimitiveCount)); + writeValueToLds(primitiveCount, ldsOffset); + } + + // Construct ".endSetMeshOutputs" block + { + m_builder.SetInsertPoint(endSetMeshOutputsBlock->getTerminator()); + + // Currently, nothing to do + } + + m_callsToRemove.push_back(&setMeshOutputsOp); +} + +// ===================================================================================================================== +// Lower set mesh primitive indices. 
Set primitive indices by forming primitive connectivity data and writing it to LDS. +// +// @param setMeshPrimitiveIndicesOp : Call instruction op to set primitive indices for mesh shader +void MeshTaskShader::lowerSetMeshPrimitiveIndices(SetMeshPrimitiveIndicesOp &setMeshPrimitiveIndicesOp) { + m_builder.SetInsertPoint(&setMeshPrimitiveIndicesOp); + + assert(getShaderStage(setMeshPrimitiveIndicesOp.getFunction()) == ShaderStageMesh); + + auto primitiveIndex = setMeshPrimitiveIndicesOp.getPrimitiveIndex(); + auto primitiveIndices = setMeshPrimitiveIndicesOp.getPrimitiveIndices(); + + // + // HW requires the primitive connectivity data has the following bit layout: + // + // +----------------+---------------+---------------+---------------+ + // | Null Primitive | Vertex Index2 | Vertex Index1 | Vertex Index0 | + // | [31] | [28:20] | [18:10] | [8:0] | + // +----------------+---------------+---------------+---------------+ + // + auto &meshMode = m_pipelineState->getShaderModes()->getMeshShaderMode(); + Value *primitiveData = nullptr; + + if (meshMode.outputPrimitive == OutputPrimitives::Points) { + assert(primitiveIndices->getType() == m_builder.getInt32Ty()); // i32 + primitiveData = primitiveIndices; + } else if (meshMode.outputPrimitive == OutputPrimitives::Lines) { + assert(primitiveIndices->getType() == FixedVectorType::get(m_builder.getInt32Ty(), 2)); // v2i32 + Value *vertex0 = m_builder.CreateExtractElement(primitiveIndices, static_cast(0)); + Value *vertex1 = m_builder.CreateExtractElement(primitiveIndices, 1); + + if (m_gfxIp.major <= 11) { + primitiveData = m_builder.CreateShl(vertex1, 10); + primitiveData = m_builder.CreateOr(primitiveData, vertex0); + } else { + llvm_unreachable("Not implemented!"); + } + } else { + assert(meshMode.outputPrimitive == OutputPrimitives::Triangles); + Value *vertex0 = m_builder.CreateExtractElement(primitiveIndices, static_cast(0)); + Value *vertex1 = m_builder.CreateExtractElement(primitiveIndices, 1); + Value *vertex2 = 
m_builder.CreateExtractElement(primitiveIndices, 2); + + if (m_gfxIp.major <= 11) { + primitiveData = m_builder.CreateShl(vertex2, 10); + primitiveData = m_builder.CreateOr(primitiveData, vertex1); + primitiveData = m_builder.CreateShl(primitiveData, 10); + primitiveData = m_builder.CreateOr(primitiveData, vertex0); + } else { + llvm_unreachable("Not implemented!"); + } + } + + Value *ldsStart = m_builder.getInt32(getMeshShaderLdsRegionStart(MeshLdsRegion::PrimitiveIndices)); + Value *ldsOffset = m_builder.CreateAdd(ldsStart, primitiveIndex); + + // NOTE: We first clear old primitive connectivity data and use atomic OR operation to set new data. This is because + // the null primitive flag might be set via built-in CullPrimitive. + static const unsigned ClearMask = (1u << 31); + atomicOpWithLds(AtomicRMWInst::And, m_builder.getInt32(ClearMask), ldsOffset); + atomicOpWithLds(AtomicRMWInst::Or, primitiveData, ldsOffset); + + m_callsToRemove.push_back(&setMeshPrimitiveIndicesOp); +} + +// ===================================================================================================================== +// Lower get mesh built-in value. Return the value of mesh built-in input. 
+// +// @param getMeshBuiltinInputOp : Call instruction op to return the value of mesh built-in input +void MeshTaskShader::lowerGetMeshBuiltinInput(GetMeshBuiltinInputOp &getMeshBuiltinInputOp) { + m_builder.SetInsertPoint(&getMeshBuiltinInputOp); + + auto entryPoint = getMeshBuiltinInputOp.getFunction(); + assert(getShaderStage(entryPoint) == ShaderStageMesh); + + Value *input = PoisonValue::get(getMeshBuiltinInputOp.getType()); + auto builtin = getMeshBuiltinInputOp.getBuiltin(); + switch (builtin) { + case BuiltInDrawIndex: { + auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(ShaderStageMesh)->entryArgIdxs.mesh; + input = getFunctionArgument(entryPoint, entryArgIdxs.drawIndex); + break; + } + case BuiltInViewIndex: { + if (m_pipelineState->getInputAssemblyState().enableMultiView) { + auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(ShaderStageMesh)->entryArgIdxs.mesh; + input = getFunctionArgument(entryPoint, entryArgIdxs.viewIndex); + } else { + input = m_builder.getInt32(0); + } + break; + } + case BuiltInNumWorkgroups: { + input = getMeshNumWorkgroups(); + break; + } + case BuiltInWorkgroupId: { + input = getMeshWorkgroupId(); + break; + } + case BuiltInLocalInvocationId: { + input = getMeshLocalInvocationId(); + break; + } + case BuiltInGlobalInvocationId: { + input = getMeshGlobalInvocationId(); + break; + } + case BuiltInLocalInvocationIndex: { + input = getMeshLocalInvocationIndex(); + break; + } + case BuiltInSubgroupId: { + // subgroupId = localInvocationIndex / subgroupSize + auto localInvocationIndex = getMeshLocalInvocationIndex(); + unsigned subgroupSize = m_pipelineState->getShaderSubgroupSize(ShaderStageMesh); + assert(subgroupSize > 0 && subgroupSize % 32 == 0); + input = m_builder.CreateLShr(localInvocationIndex, m_builder.getInt32(Log2_32(subgroupSize))); + break; + } + case BuiltInNumSubgroups: { + // numSubgroups = numMeshThreads / subgroupSize + const auto &meshMode = 
m_pipelineState->getShaderModes()->getMeshShaderMode(); + const unsigned numMeshThreads = meshMode.workgroupSizeX * meshMode.workgroupSizeY * meshMode.workgroupSizeZ; + unsigned subgroupSize = m_pipelineState->getShaderSubgroupSize(ShaderStageMesh); + assert(subgroupSize > 0 && subgroupSize % 32 == 0); + const unsigned numSubgroups = alignTo(numMeshThreads, subgroupSize) / subgroupSize; + input = m_builder.getInt32(numSubgroups); + break; + } + default: { + llvm_unreachable("Unknown mesh built-in input!"); + break; + } + } + + assert(!isa(input)); + getMeshBuiltinInputOp.replaceAllUsesWith(input); + + m_callsToRemove.push_back(&getMeshBuiltinInputOp); +} + +// ===================================================================================================================== +// Lower set mesh primitive culled state. Set primitive culled state by writing the null primitive flag to LDS. +// +// @param setMeshPrimitiveIndicesOp : Call instruction op to set primitive indices for mesh shader +void MeshTaskShader::lowerSetMeshPrimitiveCulled(SetMeshPrimitiveCulledOp &setMeshPrimitiveCulledOp) { + m_builder.SetInsertPoint(&setMeshPrimitiveCulledOp); + + assert(getShaderStage(setMeshPrimitiveCulledOp.getFunction()) == ShaderStageMesh); + + auto primitiveIndex = setMeshPrimitiveCulledOp.getPrimitiveIndex(); + auto isCulled = setMeshPrimitiveCulledOp.getIsCulled(); + + // + // HW requires the primitive connectivity data has the following bit layout: + // [31] = Null primitive flag + // [28:20] = Index of vertex2 + // [18:10] = Index of vertex1 + // [8:0] = Index of vertex0 + // + assert(isCulled->getType()->isIntegerTy(1)); + + static const unsigned NullPrimitive = (1u << 31); + auto nullPrimitive = m_builder.CreateSelect(isCulled, m_builder.getInt32(NullPrimitive), m_builder.getInt32(0)); + + Value *ldsStart = m_builder.getInt32(getMeshShaderLdsRegionStart(MeshLdsRegion::PrimitiveIndices)); + Value *ldsOffset = m_builder.CreateAdd(ldsStart, primitiveIndex); + + // NOTE: 
We first clear null primitive flag and use atomic OR operation to set new flag. This is because the + // primitive connectivity data might be set via built-in PrimitiveXXXIndices. + static const unsigned ClearMask = ~(1u << 31); + atomicOpWithLds(AtomicRMWInst::And, m_builder.getInt32(ClearMask), ldsOffset); + atomicOpWithLds(AtomicRMWInst::Or, nullPrimitive, ldsOffset); + + m_callsToRemove.push_back(&setMeshPrimitiveCulledOp); +} + +// ===================================================================================================================== +// Lower write mesh vertex output. Write mesh shader vertex outputs to LDS. +// +// @param writeMeshVertexOutputOp : Call instruction op to write vertex output for mesh shader +void MeshTaskShader::lowerWriteMeshVertexOutput(WriteMeshVertexOutputOp &writeMeshVertexOutputOp) { + m_builder.SetInsertPoint(&writeMeshVertexOutputOp); + + assert(getShaderStage(writeMeshVertexOutputOp.getFunction()) == ShaderStageMesh); + + auto outputOffset = writeMeshVertexOutputOp.getOutputOffset(); + auto vertexIndex = writeMeshVertexOutputOp.getVertexIndex(); + auto outputValue = writeMeshVertexOutputOp.getOutputValue(); + + const auto resUsage = m_pipelineState->getShaderResourceUsage(ShaderStageMesh); + const unsigned vertexStride = 4 * resUsage->inOutUsage.outputMapLocCount; // Corresponds to vec4 output + + Value *ldsStart = m_builder.getInt32(getMeshShaderLdsRegionStart(MeshLdsRegion::VertexOutput)); + Value *ldsOffset = m_builder.CreateMul(vertexIndex, m_builder.getInt32(vertexStride)); + ldsOffset = m_builder.CreateAdd(ldsOffset, outputOffset); + ldsOffset = m_builder.CreateAdd(ldsStart, ldsOffset); + + writeValueToLds(outputValue, ldsOffset); + + m_callsToRemove.push_back(&writeMeshVertexOutputOp); +} + +// ===================================================================================================================== +// Lower write mesh primitive output. Write mesh shader primitive outputs to LDS. 
+// +// @param writeMeshPrimitiveOutputOp : Call instruction op to write primitive output for mesh shader +void MeshTaskShader::lowerWriteMeshPrimitiveOutput(WriteMeshPrimitiveOutputOp &writeMeshPrimitiveOutputOp) { + m_builder.SetInsertPoint(&writeMeshPrimitiveOutputOp); + + assert(getShaderStage(writeMeshPrimitiveOutputOp.getFunction()) == ShaderStageMesh); + + auto outputOffset = writeMeshPrimitiveOutputOp.getOutputOffset(); + auto primitiveIndex = writeMeshPrimitiveOutputOp.getPrimitiveIndex(); + auto outputValue = writeMeshPrimitiveOutputOp.getOutputValue(); + + const auto resUsage = m_pipelineState->getShaderResourceUsage(ShaderStageMesh); + const unsigned primitiveStride = 4 * resUsage->inOutUsage.perPrimitiveOutputMapLocCount; // Corresponds to vec4 output + + Value *ldsStart = m_builder.getInt32(getMeshShaderLdsRegionStart(MeshLdsRegion::PrimitiveOutput)); + Value *ldsOffset = m_builder.CreateMul(primitiveIndex, m_builder.getInt32(primitiveStride)); + ldsOffset = m_builder.CreateAdd(ldsOffset, outputOffset); + ldsOffset = m_builder.CreateAdd(ldsStart, ldsOffset); + + writeValueToLds(outputValue, ldsOffset); + + m_callsToRemove.push_back(&writeMeshPrimitiveOutputOp); +} + // ===================================================================================================================== // Initialize the wave/thread info from the entry-point. 
// @@ -1388,80 +1691,15 @@ void MeshTaskShader::lowerMeshShaderBody(BasicBlock *apiMeshEntryBlock, BasicBlo } // Lower mesh shader calls - auto module = entryPoint->getParent(); - for (auto &func : module->functions()) { - if (!func.isDeclaration()) - continue; // Not targets - - if (func.getName().startswith(lgcName::MeshTaskCallPrefix)) { - for (auto user : func.users()) { - CallInst *const call = cast(user); - - if (call->getFunction() != entryPoint) - continue; // Not belong to mesh shader - - m_builder.SetInsertPoint(call); - - if (func.getName().startswith(lgcName::MeshTaskSetMeshOutputs)) { - // Set mesh outputs - assert(call->arg_size() == 2); - auto vertexCount = call->getOperand(0); - auto primitiveCount = call->getOperand(1); - - setMeshOutputs(vertexCount, primitiveCount); - } else if (func.getName().startswith(lgcName::MeshTaskSetPrimitiveIndices)) { - // Set primitive indices - assert(call->arg_size() == 2); - auto primitiveIndex = call->getOperand(0); - auto primitiveIndices = call->getOperand(1); - - setPrimitiveIndices(primitiveIndex, primitiveIndices); - } else if (func.getName().startswith(lgcName::MeshTaskSetPrimitiveCulled)) { - // Set primitive culled - assert(call->arg_size() == 2); - auto primitiveIndex = call->getOperand(0); - auto isCulled = call->getOperand(1); - - setPrimitiveCulled(primitiveIndex, isCulled); - } else if (func.getName().startswith(lgcName::MeshTaskGetMeshInput)) { - // Get mesh input - assert(call->arg_size() == 1); - unsigned builtIn = cast(call->getOperand(0))->getZExtValue(); - - // NOTE: Mesh shader input lowering is supposed to happen at the beginning of API mesh shader. 
- m_builder.SetInsertPoint(&*apiMeshEntryBlock->getFirstNonPHIOrDbgOrAlloca()); - - auto meshInput = getMeshInput(static_cast(builtIn)); - assert(meshInput->getType() == call->getType()); - call->replaceAllUsesWith(meshInput); - } else if (func.getName().startswith(lgcName::MeshTaskWriteVertexOutput)) { - // Write vertex output - assert(call->arg_size() == 3); - auto outputOffset = call->getOperand(0); - auto vertexIndex = call->getOperand(1); - auto outputValue = call->getOperand(2); - - writeVertexOutput(outputOffset, vertexIndex, outputValue); - } else if (func.getName().startswith(lgcName::MeshTaskWritePrimitiveOutput)) { - // Write primitive output - assert(call->arg_size() == 3); - auto outputOffset = call->getOperand(0); - auto primitiveIndex = call->getOperand(1); - auto outputValue = call->getOperand(2); - - writePrimitiveOutput(outputOffset, primitiveIndex, outputValue); - } else { - llvm_unreachable("Unknown mesh shader call!"); - } - - m_callsToRemove.push_back(call); - } - } - } - static auto visitor = llvm_dialects::VisitorBuilder() .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) .add(&MeshTaskShader::lowerTaskPayloadPtr) + .add(&MeshTaskShader::lowerSetMeshOutputs) + .add(&MeshTaskShader::lowerSetMeshPrimitiveIndices) + .add(&MeshTaskShader::lowerSetMeshPrimitiveCulled) + .add(&MeshTaskShader::lowerGetMeshBuiltinInput) + .add(&MeshTaskShader::lowerWriteMeshVertexOutput) + .add(&MeshTaskShader::lowerWriteMeshPrimitiveOutput) .build(); visitor.visit(*this, *entryPoint); @@ -1473,253 +1711,6 @@ void MeshTaskShader::lowerMeshShaderBody(BasicBlock *apiMeshEntryBlock, BasicBlo m_callsToRemove.clear(); } -// ===================================================================================================================== -// Set the actual output size of the primitives and vertices that the mesh shader workgroup will emit. 
-// -// @param vertexCount : Actual output size of the vertices -// @param primitiveCount : Actual output size of the primitives -void MeshTaskShader::setMeshOutputs(Value *vertexCount, Value *primitiveCount) { - auto setMeshOutputsCall = m_builder.GetInsertPoint(); - - auto checkSetMeshOutputsBlock = m_builder.GetInsertBlock(); - auto setMeshOutputsBlock = checkSetMeshOutputsBlock->splitBasicBlock(setMeshOutputsCall, ".setMeshOutputs"); - auto endSetMeshOutputsBlock = setMeshOutputsBlock->splitBasicBlock(setMeshOutputsCall, ".endSetMeshOutputs"); - - // Modify ".checkSetMeshOutputs" block - { - m_builder.SetInsertPoint(checkSetMeshOutputsBlock->getTerminator()); - - auto firstThreadInSubgroup = m_builder.CreateICmpEQ(m_waveThreadInfo.threadIdInSubgroup, m_builder.getInt32(0)); - m_builder.CreateCondBr(firstThreadInSubgroup, setMeshOutputsBlock, endSetMeshOutputsBlock); - checkSetMeshOutputsBlock->getTerminator()->eraseFromParent(); // Remove old terminator - } - - // Construct ".setMeshOutputs" block - { - m_builder.SetInsertPoint(setMeshOutputsBlock->getTerminator()); - - // Promote vertex/primitive count to SGPRs - vertexCount = m_builder.CreateIntrinsic(m_builder.getInt32Ty(), Intrinsic::amdgcn_readfirstlane, vertexCount); - primitiveCount = m_builder.CreateIntrinsic(m_builder.getInt32Ty(), Intrinsic::amdgcn_readfirstlane, primitiveCount); - - // Check if vertex count or primitive count is zero. If so, set both to zero in order to disable vertex/primitive - // exporting. 
- auto zeroVertexCount = m_builder.CreateICmpEQ(vertexCount, m_builder.getInt32(0)); - auto zeroPrimitiveCount = m_builder.CreateICmpEQ(primitiveCount, m_builder.getInt32(0)); - auto hasZeroCount = m_builder.CreateOr(zeroVertexCount, zeroPrimitiveCount); - vertexCount = m_builder.CreateSelect(hasZeroCount, m_builder.getInt32(0), vertexCount); - primitiveCount = m_builder.CreateSelect(hasZeroCount, m_builder.getInt32(0), primitiveCount); - - // NOTE: Here, we promote vertex/primitive count to SGPRs once again because M0 implicitly used in s_sendmsg is - // SGPR. LLVM backend has issues of handling this because it doesn't use s_cselect to translate LLVM IR select - // instruction (which keeps the destination operand still in SGPR) and it doesn't use readfirstlane to promote - // VGPR to SGPR for M0. - vertexCount = m_builder.CreateIntrinsic(m_builder.getInt32Ty(), Intrinsic::amdgcn_readfirstlane, vertexCount); - primitiveCount = m_builder.CreateIntrinsic(m_builder.getInt32Ty(), Intrinsic::amdgcn_readfirstlane, primitiveCount); - - // M0[10:0] = vertexCount, M0[22:12] = primitiveCount - Value *m0 = m_builder.CreateShl(primitiveCount, 12); - m0 = m_builder.CreateOr(m0, vertexCount); - m_builder.CreateIntrinsic(Intrinsic::amdgcn_s_sendmsg, {}, {m_builder.getInt32(GsAllocReq), m0}); - - Value *ldsOffset = m_builder.getInt32(getMeshShaderLdsRegionStart(MeshLdsRegion::VertexCount)); - writeValueToLds(vertexCount, ldsOffset); - - ldsOffset = m_builder.getInt32(getMeshShaderLdsRegionStart(MeshLdsRegion::PrimitiveCount)); - writeValueToLds(primitiveCount, ldsOffset); - } - - // Construct ".endSetMeshOutputs" block - { - m_builder.SetInsertPoint(endSetMeshOutputsBlock->getTerminator()); - - // Currently, nothing to do - } -} - -// ===================================================================================================================== -// Set primitive indices by forming primitive connectivity data and writing it to LDS. 
-// -// @param primitiveIndex : Primitive indexing -// @param primitiveIndices : All vertex index values that are used to form this primitive -void MeshTaskShader::setPrimitiveIndices(Value *primitiveIndex, Value *primitiveIndices) { - // - // HW requires the primitive connectivity data has the following bit layout: - // - // +----------------+---------------+---------------+---------------+ - // | Null Primitive | Vertex Index2 | Vertex Index1 | Vertex Index0 | - // | [31] | [28:20] | [18:10] | [8:0] | - // +----------------+---------------+---------------+---------------+ - // - auto &meshMode = m_pipelineState->getShaderModes()->getMeshShaderMode(); - Value *primitiveData = nullptr; - - if (meshMode.outputPrimitive == OutputPrimitives::Points) { - assert(primitiveIndices->getType() == m_builder.getInt32Ty()); // i32 - primitiveData = primitiveIndices; - } else if (meshMode.outputPrimitive == OutputPrimitives::Lines) { - assert(primitiveIndices->getType() == FixedVectorType::get(m_builder.getInt32Ty(), 2)); // v2i32 - Value *vertex0 = m_builder.CreateExtractElement(primitiveIndices, static_cast(0)); - Value *vertex1 = m_builder.CreateExtractElement(primitiveIndices, 1); - - if (m_gfxIp.major <= 11) { - primitiveData = m_builder.CreateShl(vertex1, 10); - primitiveData = m_builder.CreateOr(primitiveData, vertex0); - } else { - llvm_unreachable("Not implemented!"); - } - } else { - assert(meshMode.outputPrimitive == OutputPrimitives::Triangles); - Value *vertex0 = m_builder.CreateExtractElement(primitiveIndices, static_cast(0)); - Value *vertex1 = m_builder.CreateExtractElement(primitiveIndices, 1); - Value *vertex2 = m_builder.CreateExtractElement(primitiveIndices, 2); - - if (m_gfxIp.major <= 11) { - primitiveData = m_builder.CreateShl(vertex2, 10); - primitiveData = m_builder.CreateOr(primitiveData, vertex1); - primitiveData = m_builder.CreateShl(primitiveData, 10); - primitiveData = m_builder.CreateOr(primitiveData, vertex0); - } else { - llvm_unreachable("Not 
implemented!"); - } - } - - Value *ldsStart = m_builder.getInt32(getMeshShaderLdsRegionStart(MeshLdsRegion::PrimitiveIndices)); - Value *ldsOffset = m_builder.CreateAdd(ldsStart, primitiveIndex); - - // NOTE: We first clear old primitive connectivity data and use atomic OR operation to set new data. This is because - // the null primitive flag might be set via built-in CullPrimitive. - static const unsigned ClearMask = (1u << 31); - atomicOpWithLds(AtomicRMWInst::And, m_builder.getInt32(ClearMask), ldsOffset); - atomicOpWithLds(AtomicRMWInst::Or, primitiveData, ldsOffset); -} - -// ===================================================================================================================== -// Set primitive culled state by writing the null primitive flag to LDS. -// -// @param primitiveIndex : Primitive indexing -// @param isCulled : Whether this primitive is culled -void MeshTaskShader::setPrimitiveCulled(Value *primitiveIndex, Value *isCulled) { - // - // HW requires the primitive connectivity data has the following bit layout: - // [31] = Null primitive flag - // [28:20] = Index of vertex2 - // [18:10] = Index of vertex1 - // [8:0] = Index of vertex0 - // - assert(isCulled->getType()->isIntegerTy(1)); - - static const unsigned NullPrimitive = (1u << 31); - auto nullPrimitive = m_builder.CreateSelect(isCulled, m_builder.getInt32(NullPrimitive), m_builder.getInt32(0)); - - Value *ldsStart = m_builder.getInt32(getMeshShaderLdsRegionStart(MeshLdsRegion::PrimitiveIndices)); - Value *ldsOffset = m_builder.CreateAdd(ldsStart, primitiveIndex); - - // NOTE: We first clear null primitive flag and use atomic OR operation to set new flag. This is because the - // primitive connectivity data might be set via built-in PrimitiveXXXIndices. 
- static const unsigned ClearMask = ~(1u << 31); - atomicOpWithLds(AtomicRMWInst::And, m_builder.getInt32(ClearMask), ldsOffset); - atomicOpWithLds(AtomicRMWInst::Or, nullPrimitive, ldsOffset); -} - -// ===================================================================================================================== -// Get mesh built-in input. -// -// @param builtIn : Input built-in ID of mesh shader -// @returns : Value of the specified input built-in -Value *MeshTaskShader::getMeshInput(BuiltInKind builtIn) { - auto entryPoint = m_builder.GetInsertBlock()->getParent(); - assert(getShaderStage(entryPoint) == ShaderStageMesh); - - switch (builtIn) { - case BuiltInDrawIndex: { - auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(ShaderStageMesh)->entryArgIdxs.mesh; - return getFunctionArgument(entryPoint, entryArgIdxs.drawIndex); - } - - case BuiltInViewIndex: { - if (m_pipelineState->getInputAssemblyState().enableMultiView) { - auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(ShaderStageMesh)->entryArgIdxs.mesh; - return getFunctionArgument(entryPoint, entryArgIdxs.viewIndex); - } - return m_builder.getInt32(0); - } - - case BuiltInNumWorkgroups: - return getMeshNumWorkgroups(); - - case BuiltInWorkgroupId: - return getMeshWorkgroupId(); - - case BuiltInLocalInvocationId: - return getMeshLocalInvocationId(); - - case BuiltInGlobalInvocationId: - return getMeshGlobalInvocationId(); - - case BuiltInLocalInvocationIndex: - return getMeshLocalInvocationIndex(); - - case BuiltInSubgroupId: { - // subgroupId = localInvocationIndex / subgroupSize - auto localInvocationIndex = getMeshLocalInvocationIndex(); - unsigned subgroupSize = m_pipelineState->getShaderSubgroupSize(ShaderStageMesh); - assert(subgroupSize > 0 && subgroupSize % 32 == 0); - return m_builder.CreateLShr(localInvocationIndex, m_builder.getInt32(Log2_32(subgroupSize))); - } - - case BuiltInNumSubgroups: { - // numSubgroups = numMeshThreads / subgroupSize - const auto &meshMode = 
m_pipelineState->getShaderModes()->getMeshShaderMode(); - const unsigned numMeshThreads = meshMode.workgroupSizeX * meshMode.workgroupSizeY * meshMode.workgroupSizeZ; - unsigned subgroupSize = m_pipelineState->getShaderSubgroupSize(ShaderStageMesh); - assert(subgroupSize > 0 && subgroupSize % 32 == 0); - const unsigned numSubgroups = alignTo(numMeshThreads, subgroupSize) / subgroupSize; - return m_builder.getInt32(numSubgroups); - } - - default: - llvm_unreachable("Unknown mesh input built-in!"); - return nullptr; - } -} - -// ===================================================================================================================== -// Write mesh shader vertex outputs to LDS. -// -// @param outputOffset : Relative offset of this output (in dwords) within all outputs of the indexed vertex -// @param vertexIndex : Vertex indexing -// @param outputValue : Output value to write -void MeshTaskShader::writeVertexOutput(Value *outputOffset, Value *vertexIndex, Value *outputValue) { - const auto resUsage = m_pipelineState->getShaderResourceUsage(ShaderStageMesh); - const unsigned vertexStride = 4 * resUsage->inOutUsage.outputMapLocCount; // Corresponds to vec4 output - - Value *ldsStart = m_builder.getInt32(getMeshShaderLdsRegionStart(MeshLdsRegion::VertexOutput)); - Value *ldsOffset = m_builder.CreateMul(vertexIndex, m_builder.getInt32(vertexStride)); - ldsOffset = m_builder.CreateAdd(ldsOffset, outputOffset); - ldsOffset = m_builder.CreateAdd(ldsStart, ldsOffset); - - writeValueToLds(outputValue, ldsOffset); -} - -// ===================================================================================================================== -// Write mesh shader primitive outputs to LDS. 
-// -// @param outputOffset : Relative offset of this output (in dwords) within all outputs of the indexed primitive -// @param vertexIndex : Primitive indexing -// @param outputValue : Output value to write -void MeshTaskShader::writePrimitiveOutput(Value *outputOffset, Value *primitiveIndex, Value *outputValue) { - const auto resUsage = m_pipelineState->getShaderResourceUsage(ShaderStageMesh); - const unsigned primitiveStride = 4 * resUsage->inOutUsage.perPrimitiveOutputMapLocCount; // Corresponds to vec4 output - - Value *ldsStart = m_builder.getInt32(getMeshShaderLdsRegionStart(MeshLdsRegion::PrimitiveOutput)); - Value *ldsOffset = m_builder.CreateMul(primitiveIndex, m_builder.getInt32(primitiveStride)); - ldsOffset = m_builder.CreateAdd(ldsOffset, outputOffset); - ldsOffset = m_builder.CreateAdd(ldsStart, ldsOffset); - - writeValueToLds(outputValue, ldsOffset); -} - // ===================================================================================================================== // Export primitive (primitive connectivity data, primitive payload, and primitive attributes). 
void MeshTaskShader::exportPrimitive() { diff --git a/lgc/patch/MeshTaskShader.h b/lgc/patch/MeshTaskShader.h index 456e57d33d..788ee5dfd0 100644 --- a/lgc/patch/MeshTaskShader.h +++ b/lgc/patch/MeshTaskShader.h @@ -82,6 +82,12 @@ class MeshTaskShader { void lowerTaskPayloadPtr(TaskPayloadPtrOp &taskPayloadPtrOp); void lowerEmitMeshTasks(EmitMeshTasksOp &emitMeshTasksOp); + void lowerSetMeshOutputs(SetMeshOutputsOp &setMeshOutputsOp); + void lowerSetMeshPrimitiveIndices(SetMeshPrimitiveIndicesOp &setMeshPrimitiveIndicesOp); + void lowerSetMeshPrimitiveCulled(SetMeshPrimitiveCulledOp &setMeshPrimitiveCulledOp); + void lowerGetMeshBuiltinInput(GetMeshBuiltinInputOp &getMeshBuiltinInputOp); + void lowerWriteMeshVertexOutput(WriteMeshVertexOutputOp &writeMeshVertexOutputOp); + void lowerWriteMeshPrimitiveOutput(WriteMeshPrimitiveOutputOp &writeMeshPrimitiveOutputOp); void initWaveThreadInfo(llvm::Function *entryPoint); llvm::Value *getShaderRingEntryIndex(llvm::Function *entryPoint); @@ -94,12 +100,6 @@ class MeshTaskShader { llvm::Function *mutateMeshShaderEntryPoint(llvm::Function *entryPoint); void lowerMeshShaderBody(llvm::BasicBlock *apiMeshEntryBlock, llvm::BasicBlock *apiMeshExitBlock); - void setMeshOutputs(llvm::Value *vertexCount, llvm::Value *primitiveCount); - void setPrimitiveIndices(llvm::Value *primitiveIndex, llvm::Value *primitiveIndices); - void setPrimitiveCulled(llvm::Value *primitiveIndex, llvm::Value *isCulled); - llvm::Value *getMeshInput(BuiltInKind builtIn); - void writeVertexOutput(llvm::Value *outputOffset, llvm::Value *vertexIndex, llvm::Value *outputValue); - void writePrimitiveOutput(llvm::Value *outputOffset, llvm::Value *primitiveIndex, llvm::Value *outputValue); void exportPrimitive(); void exportVertex(); diff --git a/lgc/patch/NggPrimShader.cpp b/lgc/patch/NggPrimShader.cpp index 3ef778a132..588c9e526b 100644 --- a/lgc/patch/NggPrimShader.cpp +++ b/lgc/patch/NggPrimShader.cpp @@ -2013,19 +2013,11 @@ void 
NggPrimShader::buildPrimShaderWithGs(Function *primShader) { { m_builder.SetInsertPoint(initPrimitiveDataBlock); - if (m_pipelineState->enableSwXfb()) { - const auto &streamXfbBuffers = m_pipelineState->getStreamXfbBuffers(); - for (unsigned i = 0; i < MaxGsStreams; ++i) { - // Treat the vertex stream as active if it is associated with XFB buffers or is the rasterization stream. - bool streamActive = streamXfbBuffers[i] != 0 || i == rasterStream; - if (streamActive) { // Initialize primitive connectivity data if the stream is active - writePerThreadDataToLds(m_builder.getInt32(NullPrim), m_nggInputs.threadIdInSubgroup, - PrimShaderLdsRegion::PrimitiveData, Gfx9::NggMaxThreadsPerSubgroup * i); - } + for (unsigned i = 0; i < MaxGsStreams; ++i) { + if (m_pipelineState->isVertexStreamActive(i)) { // Initialize primitive connectivity data if the stream is active + writePerThreadDataToLds(m_builder.getInt32(NullPrim), m_nggInputs.threadIdInSubgroup, + PrimShaderLdsRegion::PrimitiveData, Gfx9::NggMaxThreadsPerSubgroup * i); } - } else { - writePerThreadDataToLds(m_builder.getInt32(NullPrim), m_nggInputs.threadIdInSubgroup, - PrimShaderLdsRegion::PrimitiveData, Gfx9::NggMaxThreadsPerSubgroup * rasterStream); } m_builder.CreateBr(endInitPrimitiveDataBlock); @@ -4063,11 +4055,8 @@ Value *NggPrimShader::readGsOutput(Type *outputTy, unsigned location, unsigned c // @param [in/out] totalEmitVertsPtr : Pointer to the counter of GS emitted vertices for all stream void NggPrimShader::processGsEmit(unsigned streamId, Value *primitiveIndex, Value *emitVertsPtr, Value *outVertsPtr, Value *totalEmitVertsPtr) { - if (!m_pipelineState->enableSwXfb() && m_pipelineState->getRasterizerState().rasterStream != streamId) { - // NOTE: If SW-emulated stream-out is not enabled, only handle GS_EMIT message that belongs to the rasterization - // stream. 
- return; - } + if (!m_pipelineState->isVertexStreamActive(streamId)) + return; // Skip if this vertex stream is marked as inactive if (!m_gsHandlers.emit) m_gsHandlers.emit = createGsEmitHandler(); @@ -4082,11 +4071,8 @@ void NggPrimShader::processGsEmit(unsigned streamId, Value *primitiveIndex, Valu // @param streamId : ID of output vertex stream // @param [in/out] outVertsPtr : Pointer to the counter of GS output vertices of current primitive for this stream void NggPrimShader::processGsCut(unsigned streamId, Value *outVertsPtr) { - if (!m_pipelineState->enableSwXfb() && m_pipelineState->getRasterizerState().rasterStream != streamId) { - // NOTE: If SW-emulated stream-out is not enabled, only handle GS_CUT message that belongs to the rasterization - // stream. - return; - } + if (!m_pipelineState->isVertexStreamActive(streamId)) + return; // Skip if this vertex stream is marked as inactive if (!m_gsHandlers.cut) m_gsHandlers.cut = createGsCutHandler(); @@ -6581,16 +6567,11 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { lastActiveXfbBuffer = i; } - bool streamActive[MaxGsStreams] = {}; unsigned firstActiveStream = InvalidValue; unsigned lastActiveStream = InvalidValue; - const unsigned rasterStream = m_pipelineState->getRasterizerState().rasterStream; - for (unsigned i = 0; i < MaxGsStreams; ++i) { - // Treat the vertex stream as active if it is associated with XFB buffers or is the rasterization stream. 
- streamActive[i] = streamXfbBuffers[i] != 0 || i == rasterStream; - if (!streamActive[i]) + if (!m_pipelineState->isVertexStreamActive(i)) continue; // Stream is inactive if (firstActiveStream == InvalidValue) @@ -6672,7 +6653,7 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { BasicBlock *endCompactPrimitiveIndexBlock[MaxGsStreams] = {}; BasicBlock *insertPos = endAccumPrimitiveCountsBlock; for (unsigned i = 0; i < MaxGsStreams; ++i) { - if (streamActive[i]) { + if (m_pipelineState->isVertexStreamActive(i)) { compactPrimitiveIndexBlock[i] = createBlock(xfbEntryBlock->getParent(), ".compactPrimitiveIndexInStream" + std::to_string(i)); compactPrimitiveIndexBlock[i]->moveAfter(insertPos); @@ -6694,7 +6675,7 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { BasicBlock *endExportXfbOutputBlock[MaxGsStreams] = {}; insertPos = endPrepareXfbExportBlock; for (unsigned i = 0; i < MaxGsStreams; ++i) { - if (streamActive[i]) { + if (m_pipelineState->isVertexStreamActive(i)) { exportXfbOutputBlock[i] = createBlock(xfbEntryBlock->getParent(), ".exportXfbOutputInStream" + std::to_string(i)); exportXfbOutputBlock[i]->moveAfter(insertPos); insertPos = exportXfbOutputBlock[i]; @@ -6718,7 +6699,7 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { m_builder.SetInsertPoint(initPrimitiveCountsBlock); for (unsigned i = 0; i < MaxGsStreams; ++i) { - if (streamActive[i]) { + if (m_pipelineState->isVertexStreamActive(i)) { writePerThreadDataToLds(m_builder.getInt32(0), m_nggInputs.threadIdInSubgroup, PrimShaderLdsRegion::PrimitiveCounts, (Gfx9::NggMaxWavesPerSubgroup + 1) * i); } @@ -6743,7 +6724,7 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { m_builder.SetInsertPoint(checkPrimitiveDrawFlagBlock); for (unsigned i = 0; i < MaxGsStreams; ++i) { - if (streamActive[i]) { + if (m_pipelineState->isVertexStreamActive(i)) { // drawFlag = primData[N] != NullPrim auto primData = readPerThreadDataFromLds(m_builder.getInt32Ty(), m_nggInputs.threadIdInSubgroup, 
@@ -6763,14 +6744,14 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { // Update draw flags for (unsigned i = 0; i < MaxGsStreams; ++i) { - if (streamActive[i]) { + if (m_pipelineState->isVertexStreamActive(i)) { drawFlag[i] = createPhi( {{drawFlag[i], checkPrimitiveDrawFlagBlock}, {m_builder.getFalse(), endInitPrimitiveCountsBlock}}); } } for (unsigned i = 0; i < MaxGsStreams; ++i) { - if (streamActive[i]) { + if (m_pipelineState->isVertexStreamActive(i)) { drawMask[i] = ballot(drawFlag[i]); primCountInWave[i] = m_builder.CreateIntrinsic(Intrinsic::ctpop, m_builder.getInt64Ty(), drawMask[i]); @@ -6793,7 +6774,7 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { ldsOffset = m_builder.CreateAdd(ldsOffset, m_builder.getInt32(1)); for (unsigned i = 0; i < MaxGsStreams; ++i) { - if (streamActive[i]) { + if (m_pipelineState->isVertexStreamActive(i)) { atomicAdd( primCountInWave[i], m_builder.CreateAdd(ldsOffset, m_builder.getInt32(regionStart + (Gfx9::NggMaxWavesPerSubgroup + 1) * i))); @@ -6812,7 +6793,7 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { createFenceAndBarrier(); for (unsigned i = 0; i < MaxGsStreams; ++i) { - if (!streamActive[i]) + if (!m_pipelineState->isVertexStreamActive(i)) continue; auto primCountInWaves = @@ -6836,7 +6817,7 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { SmallVector xfbOutputExports; for (unsigned i = 0; i < MaxGsStreams; ++i) { - if (!streamActive[i]) + if (!m_pipelineState->isVertexStreamActive(i)) continue; // Construct ".compactPrimitiveIndexInStream[N]" block @@ -6875,7 +6856,7 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { m_builder.CreateCondBr(firstThreadInSubgroup, prepareXfbExportBlock, endPrepareXfbExportBlock); } else { unsigned nextActiveStream = i + 1; - while (!streamActive[nextActiveStream]) { + while (!m_pipelineState->isVertexStreamActive(nextActiveStream)) { ++nextActiveStream; } @@ -6995,7 +6976,7 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { } for 
(unsigned i = 0; i < MaxGsStreams; ++i) { - if (!streamActive[i]) + if (!m_pipelineState->isVertexStreamActive(i)) continue; writeValueToLds(numPrimsToWrite[i], m_builder.getInt32(regionStart + MaxTransformFeedbackBuffers + i)); @@ -7038,7 +7019,7 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { } for (unsigned i = 0; i < MaxGsStreams; ++i) { - if (streamActive[i]) { + if (m_pipelineState->isVertexStreamActive(i)) { numPrimsToWrite[i] = m_builder.CreateIntrinsic(m_builder.getInt32Ty(), Intrinsic::amdgcn_readlane, {xfbStatInfo, m_builder.getInt32(MaxTransformFeedbackBuffers + i)}); @@ -7051,7 +7032,7 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { } for (unsigned i = 0; i < MaxGsStreams; ++i) { - if (!streamActive[i]) + if (!m_pipelineState->isVertexStreamActive(i)) continue; // Construct ".exportXfbOutputInStream[N]" block @@ -7196,7 +7177,7 @@ void NggPrimShader::processSwXfbWithGs(ArrayRef args) { if (i != lastActiveStream) { unsigned nextActiveStream = i + 1; - while (!streamActive[nextActiveStream]) { + while (!m_pipelineState->isVertexStreamActive(nextActiveStream)) { ++nextActiveStream; } diff --git a/lgc/patch/PassRegistry.inc b/lgc/patch/PassRegistry.inc index d3b8c617cc..184dffe248 100644 --- a/lgc/patch/PassRegistry.inc +++ b/lgc/patch/PassRegistry.inc @@ -54,6 +54,7 @@ LLPC_MODULE_PASS("print", PipelineStatePrinter) LLPC_MODULE_PASS("lgc-pipeline-state-recorder", PipelineStateRecorder) LLPC_MODULE_PASS("lgc-builder-replayer", BuilderReplayer) +LLPC_MODULE_PASS("lgc-continufy", Continufy) LLPC_MODULE_PASS("lgc-patch-resource-collect", PatchResourceCollect) LLPC_MODULE_PASS("lgc-patch-initialize-workgroup-memory", PatchInitializeWorkgroupMemory) LLPC_MODULE_PASS("lgc-patch-image-derivatives", PatchImageDerivatives) diff --git a/lgc/patch/Patch.cpp b/lgc/patch/Patch.cpp index 866105e503..a92c9bd092 100644 --- a/lgc/patch/Patch.cpp +++ b/lgc/patch/Patch.cpp @@ -33,6 +33,7 @@ #include "lgc/LgcContext.h" #include "lgc/PassManager.h" 
#include "lgc/builder/BuilderReplayer.h" +#include "lgc/patch/Continufy.h" #include "lgc/patch/FragColorExport.h" #include "lgc/patch/LowerDebugPrintf.h" #include "lgc/patch/PatchBufferOp.h" @@ -116,7 +117,7 @@ namespace lgc { // @param optLevel : The optimization level uses to adjust the aggressiveness of // passes and which passes to add. void Patch::addPasses(PipelineState *pipelineState, lgc::PassManager &passMgr, Timer *patchTimer, Timer *optTimer, - Pipeline::CheckShaderCacheFunc checkShaderCacheFunc, CodeGenOpt::Level optLevel) { + Pipeline::CheckShaderCacheFunc checkShaderCacheFunc, uint32_t optLevel) { // Start timer for patching passes. if (patchTimer) LgcContext::createAndAddStartStopTimer(passMgr, patchTimer, true); @@ -339,7 +340,7 @@ void Patch::registerPasses(PassBuilder &passBuilder) { // @param [in/out] passMgr : Pass manager to add passes to // @param optLevel : The optimization level uses to adjust the aggressiveness of // passes and which passes to add. -void Patch::addOptimizationPasses(lgc::PassManager &passMgr, CodeGenOpt::Level optLevel) { +void Patch::addOptimizationPasses(lgc::PassManager &passMgr, uint32_t optLevel) { LLPC_OUTS("PassManager optimization level = " << optLevel << "\n"); passMgr.addPass(ForceFunctionAttrsPass()); diff --git a/lgc/patch/PatchCopyShader.cpp b/lgc/patch/PatchCopyShader.cpp index 5c96180c3d..cc00ac3a74 100644 --- a/lgc/patch/PatchCopyShader.cpp +++ b/lgc/patch/PatchCopyShader.cpp @@ -211,8 +211,6 @@ bool PatchCopyShader::runImpl(Module &module, PipelineShadersResult &pipelineSha } } - auto resUsage = m_pipelineState->getShaderResourceUsage(ShaderStageCopyShader); - if (!m_pipelineState->getNggControl()->enableNgg) { // If no NGG, the copy shader will become a real HW VS. Set the user data entries in the // PAL metadata here. 
@@ -246,13 +244,9 @@ bool PatchCopyShader::runImpl(Module &module, PipelineShadersResult &pipelineSha m_lds = Patch::getLdsVariable(m_pipelineState, &module); unsigned outputStreamCount = 0; - unsigned outputStreamId = InvalidValue; for (int i = 0; i < MaxGsStreams; ++i) { - if (resUsage->inOutUsage.gs.outLocCount[i] > 0) { + if (m_pipelineState->isVertexStreamActive(i)) outputStreamCount++; - if (outputStreamId == InvalidValue) - outputStreamId = i; - } } if (outputStreamCount > 1 && m_pipelineState->enableXfb()) { @@ -291,7 +285,7 @@ bool PatchCopyShader::runImpl(Module &module, PipelineShadersResult &pipelineSha auto switchInst = builder.CreateSwitch(streamId, endBlock, outputStreamCount); for (unsigned streamId = 0; streamId < MaxGsStreams; ++streamId) { - if (resUsage->inOutUsage.gs.outLocCount[streamId] > 0) { + if (m_pipelineState->isVertexStreamActive(streamId)) { std::string blockName = ".stream" + std::to_string(streamId); BasicBlock *streamBlock = BasicBlock::Create(*m_context, blockName, entryPoint, endBlock); builder.SetInsertPoint(streamBlock); @@ -304,7 +298,7 @@ bool PatchCopyShader::runImpl(Module &module, PipelineShadersResult &pipelineSha } } else { // NOTE: If NGG, the copy shader with stream-out is not a real HW VS and will be incorporated into NGG - // primitive shader later. Therefore, there is no multiple HW executions. + // primitive shader later. Therefore, there are no multiple HW executions. // // copyShader() { @@ -321,14 +315,14 @@ bool PatchCopyShader::runImpl(Module &module, PipelineShadersResult &pipelineSha assert(gfxIp.major >= 11); // Must be GFX11+ for (unsigned streamId = 0; streamId < MaxGsStreams; ++streamId) { - if (resUsage->inOutUsage.gs.outLocCount[streamId] > 0) + if (m_pipelineState->isVertexStreamActive(streamId)) exportOutput(streamId, builder); } builder.CreateBr(endBlock); } } else { - outputStreamId = outputStreamCount == 0 ? 
0 : outputStreamId; - exportOutput(outputStreamId, builder); + // Just export outputs of rasterization stream + exportOutput(m_pipelineState->getRasterizerState().rasterStream, builder); builder.CreateBr(endBlock); } diff --git a/lgc/patch/PatchEntryPointMutate.cpp b/lgc/patch/PatchEntryPointMutate.cpp index 6adf183d65..9f4b17701d 100644 --- a/lgc/patch/PatchEntryPointMutate.cpp +++ b/lgc/patch/PatchEntryPointMutate.cpp @@ -55,7 +55,9 @@ #include "lgc/patch/PatchEntryPointMutate.h" #include "lgc/LgcContext.h" +#include "lgc/LgcDialect.h" #include "lgc/patch/ShaderInputs.h" +#include "lgc/state/AbiMetadata.h" #include "lgc/state/AbiUnlinked.h" #include "lgc/state/IntrinsDefs.h" #include "lgc/state/PalMetadata.h" @@ -320,8 +322,11 @@ bool PatchEntryPointMutate::lowerCpsOps(Function *func) { IRBuilder<> builder(func->getContext()); // Lower cps jumps. - for (auto *jump : cpsJumps) - lowerCpsJump(func, jump, tailBlock, exitInfos); + unsigned stackSize = 0; + for (auto *jump : cpsJumps) { + unsigned stateSize = lowerCpsJump(func, jump, tailBlock, exitInfos); + stackSize = std::max(stackSize, stateSize); + } // Lower returns. for (auto *ret : retInstrs) { @@ -432,14 +437,26 @@ bool PatchEntryPointMutate::lowerCpsOps(Function *func) { // New version of the code (also handles unknown version, which we treat as // latest) Type *chainTys[] = {builder.getPtrTy(), builder.getIntNTy(waveSize), userDataVec->getType(), vgprArg->getType()}; - builder.CreateIntrinsic(Intrinsic::amdgcn_cs_chain, chainTys, chainArgs); + auto *chainCall = builder.CreateIntrinsic(Intrinsic::amdgcn_cs_chain, chainTys, chainArgs); + // Add inreg attribute for (fn, exec, sgprs). 
+ for (unsigned arg = 0; arg < 3; arg++) + chainCall->addParamAttr(arg, Attribute::InReg); #endif builder.CreateUnreachable(); + auto funcName = func->getName(); // Lower cps stack operations CpsStackLowering stackLowering(func->getContext()); stackLowering.lowerCpsStackOps(*func, m_funcCpsStackMap[func]); + stackSize += stackLowering.getStackSize(); + // Set per-function .frontend_stack_size PAL metadata. + auto &shaderFunctions = m_pipelineState->getPalMetadata() + ->getPipelineNode() + .getMap(true)[Util::Abi::PipelineMetadataKey::ShaderFunctions] + .getMap(true); + shaderFunctions[funcName].getMap(true)[Util::Abi::HardwareStageMetadataKey::FrontendStackSize] = stackSize; + return true; } @@ -533,14 +550,14 @@ Function *PatchEntryPointMutate::lowerCpsFunction(Function *func, ArrayRef &exitInfos) { +unsigned PatchEntryPointMutate::lowerCpsJump(Function *parent, cps::JumpOp *jumpOp, BasicBlock *tailBlock, + SmallVectorImpl &exitInfos) { IRBuilder<> builder(parent->getContext()); const DataLayout &layout = parent->getParent()->getDataLayout(); // Translate @lgc.cps.jump(CR %target, i32 %levels, T %state, ...) into: @@ -552,8 +569,9 @@ void PatchEntryPointMutate::lowerCpsJump(Function *parent, cps::JumpOp *jumpOp, Value *state = jumpOp->getState(); Value *vsp = builder.CreateAlignedLoad(builder.getPtrTy(getLoweredCpsStackAddrSpace()), m_funcCpsStackMap[parent], Align(getLoweredCpsStackPointerSize(layout))); + unsigned stateSize = 0; if (!state->getType()->isEmptyTy()) { - unsigned stateSize = layout.getTypeStoreSize(state->getType()); + stateSize = layout.getTypeStoreSize(state->getType()); builder.CreateStore(state, vsp); // Make vsp properly aligned across cps function. 
stateSize = alignTo(stateSize, continuationStackAlignment); @@ -580,12 +598,13 @@ void PatchEntryPointMutate::lowerCpsJump(Function *parent, cps::JumpOp *jumpOp, builder.CreateBr(tailBlock); jumpOp->eraseFromParent(); + return stateSize; } // ===================================================================================================================== // Set up compute-with-calls flag. It is set for either of these two cases: // 1. a compute library; -// 2. a compute pipeline that does indirect calls or calls to external functions. +// 2. a compute pipeline that does indirect calls or calls to external application shader functions. // // When set, this pass behaves differently, not attempting to omit unused shader inputs, since all shader inputs // are potentially used in other functions. It also modifies each call to pass the shader inputs between functions. @@ -608,20 +627,14 @@ void PatchEntryPointMutate::setupComputeWithCalls(Module *module) { return; } - // Search for indirect calls + // Search for indirect calls between application shaders. for (const BasicBlock &block : func) { for (const Instruction &inst : block) { if (auto *call = dyn_cast(&inst)) { - // If a function has a call to cps.jump, we need to treat it as `computeWithCalls`. - if (isa(call)) { + if (isa(call) || call->getCallingConv() == CallingConv::SPIR_FUNC) { m_computeWithCalls = true; return; } - Value *calledVal = call->getCalledOperand(); - if (isa(calledVal) || call->isInlineAsm()) - continue; - m_computeWithCalls = true; - return; } } } @@ -633,103 +646,89 @@ void PatchEntryPointMutate::setupComputeWithCalls(Module *module) { // // @param module : IR module void PatchEntryPointMutate::gatherUserDataUsage(Module *module) { - // Find lgc.spill.table, lgc.push.constants, lgc.root.descriptor, lgc.descriptor.set functions, and from - // there all calls to them. Add each call to the applicable list in the UserDataUsage struct for the - // (merged) shader stage. 
- // Find lgc.special.user.data functions, and from there all calls to them. Add each call to the applicable - // list in the UserDataUsage struct for the (merged) shader stage. - // Also find lgc.input.import.generic calls in VS, indicating that the vertex buffer table is needed. - // Also find lgc.output.export.xfb calls anywhere, indicating that the streamout table is needed in the - // last vertex-processing stage. - for (Function &func : *module) { - if (!func.isDeclaration()) - continue; - if (func.getName().startswith(lgcName::SpillTable)) { - for (User *user : func.users()) { - CallInst *call = cast(user); - ShaderStage stage = getShaderStage(call->getFunction()); - assert(stage != ShaderStageCopyShader); - getUserDataUsage(stage)->spillTable.users.push_back(call); - } - continue; - } - - if (func.getName().startswith(lgcName::PushConst)) { - for (User *user : func.users()) { - // For this call to lgc.push.const, attempt to find all loads with a constant dword-aligned offset and - // push into userDataUsage->pushConstOffsets. If we fail, set userDataUsage->pushConstSpill to indicate that - // we need to keep the pointer to the push const, derived as an offset into the spill table. - CallInst *call = cast(user); - ShaderStage stage = getShaderStage(call->getFunction()); - assert(stage != ShaderStageCopyShader); - auto userDataUsage = getUserDataUsage(stage); - userDataUsage->pushConst.users.push_back(call); - SmallVector, 4> users; - users.push_back({call, 0}); - for (unsigned i = 0; i != users.size(); ++i) { - Instruction *inst = users[i].first; - for (User *user : inst->users()) { - unsigned dwordOffset = users[i].second; - if (auto bitcast = dyn_cast(user)) { - // See through a bitcast. 
- users.push_back({bitcast, dwordOffset}); - continue; - } - if (isa(user) && !user->getType()->isAggregateType()) { - unsigned byteSize = module->getDataLayout().getTypeStoreSize(user->getType()); - if (byteSize % 4 == 0) { - // This is a scalar or vector load with dword-aligned size. We can attempt to unspill it, but, for - // a particular dword offset, we only attempt to unspill ones with the same (minimum) size. - unsigned dwordSize = byteSize / 4; - userDataUsage->pushConstOffsets.resize( - std::max(unsigned(userDataUsage->pushConstOffsets.size()), dwordOffset + 1)); - auto &pushConstOffset = userDataUsage->pushConstOffsets[dwordOffset]; - if (pushConstOffset.dwordSize == 0 || pushConstOffset.dwordSize >= dwordSize) { - if (pushConstOffset.dwordSize != 0 && pushConstOffset.dwordSize != dwordSize) { - // This load type is smaller than previously seen ones at this offset. Forget the earlier - // ones (and mark that some uses of the push const pointer remain). - userDataUsage->pushConstSpill = true; - pushConstOffset.users.clear(); - } - // Remember this load for possible unspilling. - pushConstOffset.dwordSize = dwordSize; - userDataUsage->pushConstOffsets[dwordOffset].users.push_back(cast(user)); + // Gather special ops requiring user data. + static const auto visitor = + llvm_dialects::VisitorBuilder() + .add([](PatchEntryPointMutate &self, UserDataOp &op) { + ShaderStage stage = getShaderStage(op.getFunction()); + assert(stage != ShaderStageCopyShader); + auto userDataUsage = self.getUserDataUsage(stage); + userDataUsage->userDataOps.push_back(&op); + + // Attempt to find all loads with a constant dword-aligned offset and push into + // userDataUsage->pushConstOffsets. If we fail, set userDataUsage->pushConstSpill to indicate that we need + // to keep the pointer to the push const, derived as an offset into the spill table. 
+ bool haveDynamicUser = false; + SmallVector, 4> worklist; + worklist.push_back({&op, op.getOffset()}); + while (!worklist.empty()) { + auto [inst, offset] = worklist.pop_back_val(); + for (User *user : inst->users()) { + if (auto bitcast = dyn_cast(user)) { + // See through a bitcast. + worklist.push_back({bitcast, offset}); continue; } - } - } else if (auto gep = dyn_cast(user)) { - // For a gep, calculate the new constant offset. - APInt gepOffset(64, 0); - if (gep->accumulateConstantOffset(module->getDataLayout(), gepOffset)) { - unsigned gepByteOffset = gepOffset.getZExtValue(); - if (gepByteOffset % 4 == 0) { - // We still have a constant offset that is 4-aligned. Push it so we look at its users. - dwordOffset += gepByteOffset / 4; - users.push_back({gep, dwordOffset}); + if (isa(user)) { + if (user->getType()->isAggregateType()) { + haveDynamicUser = true; + continue; + } + unsigned byteSize = self.m_module->getDataLayout().getTypeStoreSize(user->getType()); + if (byteSize % 4 != 0 || offset % 4 != 0) { + haveDynamicUser = true; + continue; + } + + // This is a scalar or vector load with dword-aligned size at a fixed dword offset. We may be able to + // get it from a user data argument + UserDataLoad load; + load.load = cast(user); + load.dwordOffset = offset / 4; + load.dwordSize = byteSize / 4; + userDataUsage->loads.push_back(load); + + userDataUsage->addLoad(load.dwordOffset, load.dwordSize); continue; } + if (auto gep = dyn_cast(user)) { + // For a gep, calculate the new constant offset. + APInt gepOffset(64, 0); + if (gep->accumulateConstantOffset(self.m_module->getDataLayout(), gepOffset)) { + unsigned gepByteOffset = gepOffset.getZExtValue(); + worklist.push_back({gep, offset + gepByteOffset}); + continue; + } + } + haveDynamicUser = true; } } - // We have found some user we can't handle. Mark that we need to keep the push const pointer. 
- userDataUsage->pushConstSpill = true; - } - } - } - continue; - } - if (func.getName().startswith(lgcName::RootDescriptor)) { - for (User *user : func.users()) { - CallInst *call = cast(user); - unsigned dwordOffset = cast(call->getArgOperand(0))->getZExtValue(); - ShaderStage stage = getShaderStage(call->getFunction()); - assert(stage != ShaderStageCopyShader); - auto &rootDescriptors = getUserDataUsage(stage)->rootDescriptors; - rootDescriptors.resize(std::max(rootDescriptors.size(), size_t(dwordOffset + 1))); - rootDescriptors[dwordOffset].users.push_back(call); - } + if (haveDynamicUser) { + userDataUsage->haveDynamicUserDataLoads = true; + self.m_pipelineState->getPalMetadata()->setUserDataSpillUsage(op.getOffset() / 4); + } + }) + .add([](PatchEntryPointMutate &self, LoadUserDataOp &op) { + ShaderStage stage = getShaderStage(op.getFunction()); + assert(stage != ShaderStageCopyShader); + auto *userDataUsage = self.getUserDataUsage(stage); + + UserDataLoad load; + load.load = &op; + load.dwordOffset = op.getOffset() / 4; + load.dwordSize = self.m_module->getDataLayout().getTypeStoreSize(op.getType()) / 4; + + userDataUsage->loads.push_back(load); + userDataUsage->addLoad(load.dwordOffset, load.dwordSize); + }) + .build(); + + visitor.visit(*this, *module); + + for (Function &func : *module) { + if (!func.isDeclaration()) continue; - } if (func.getName().startswith(lgcName::SpecialUserData)) { for (User *user : func.users()) { @@ -745,29 +744,7 @@ void PatchEntryPointMutate::gatherUserDataUsage(Module *module) { continue; } - if (func.getName().startswith(lgcName::DescriptorTableAddr)) { - for (User *user : func.users()) { - CallInst *call = cast(user); - ResourceNodeType searchType = ResourceNodeType(cast(call->getArgOperand(1))->getZExtValue()); - uint64_t set = cast(call->getArgOperand(2))->getZExtValue(); - unsigned binding = cast(call->getArgOperand(3))->getZExtValue(); - ShaderStage stage = getShaderStage(call->getFunction()); - assert(stage != 
ShaderStageCopyShader); - auto &descriptorTable = getUserDataUsage(stage)->descriptorTables; - - // We use the offset of the node as the index. - const ResourceNode *node = m_pipelineState->findResourceNode(searchType, set, binding, stage).first; - if (!node) { - // Handle mutable descriptors - node = m_pipelineState->findResourceNode(ResourceNodeType::DescriptorMutable, set, binding, stage).first; - } - assert(node && "Could not find resource node"); - uint32_t descTableIndex = node - &m_pipelineState->getUserDataNodes().front(); - descriptorTable.resize(std::max(descriptorTable.size(), size_t(descTableIndex + 1))); - descriptorTable[descTableIndex].users.push_back(call); - } - } else if ((func.getName().startswith(lgcName::OutputExportXfb) && !func.use_empty()) || - m_pipelineState->enableSwXfb()) { + if ((func.getName().startswith(lgcName::OutputExportXfb) && !func.use_empty()) || m_pipelineState->enableSwXfb()) { // NOTE: For GFX11+, SW emulated stream-out will always use stream-out buffer descriptors and stream-out buffer // offsets to calculate numbers of written primitives/dwords and update the counters. auto lastVertexStage = auto lastVertexStage = m_pipelineState->getLastVertexProcessingStage(); @@ -777,6 +754,49 @@ void PatchEntryPointMutate::gatherUserDataUsage(Module *module) { } } +// ===================================================================================================================== +// Load a value of a simple type from user data at the given dwordOffset. 
+Value *PatchEntryPointMutate::loadUserData(const UserDataUsage &userDataUsage, Value *spillTable, Type *type, + unsigned dwordOffset, BuilderBase &builder) { + Function *func = builder.GetInsertBlock()->getParent(); + unsigned dwordSize = m_module->getDataLayout().getTypeStoreSize(type) / 4; + if (dwordOffset + dwordSize <= userDataUsage.entryArgIdxs.size()) { + SmallVector dwords; + for (unsigned i = 0; i != dwordSize; ++i) { + unsigned entryArgIdx = userDataUsage.entryArgIdxs[dwordOffset + i]; + if (!entryArgIdx) + break; + dwords.push_back(getFunctionArgument(func, entryArgIdx)); + } + if (dwords.size() == dwordSize) { + Value *result; + if (dwords.size() > 1) { + result = PoisonValue::get(FixedVectorType::get(builder.getInt32Ty(), dwords.size())); + for (unsigned i = 0; i != dwords.size(); ++i) + result = builder.CreateInsertElement(result, dwords[i], i); + } else { + result = dwords[0]; + } + if (type != result->getType()) { + if (isa(type)) { + if (dwordSize != 1) + result = builder.CreateBitCast(result, builder.getIntNTy(32 * dwordSize)); + result = builder.CreateIntToPtr(result, type); + } else { + result = builder.CreateBitCast(result, type); + } + } + return result; + } + } + + assert(spillTable); + Value *ptr = builder.CreateConstGEP1_32(builder.getInt8Ty(), spillTable, dwordOffset * 4); + auto *load = builder.CreateLoad(type, ptr); + load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(*m_context, {})); + return load; +} + // ===================================================================================================================== // Fix up user data uses in all shaders: For unspilled ones, use the entry arg directly; for spilled ones, // insert a load from the spill table, shared for the function. @@ -806,194 +826,42 @@ void PatchEntryPointMutate::fixupUserDataUses(Module &module) { // If needed, generate code for the spill table pointer (as pointer to i8) at the start of the function. 
Instruction *spillTable = nullptr; AddressExtender addressExtender(&func); - if (userDataUsage->spillTable.entryArgIdx != 0) { + if (userDataUsage->spillTableEntryArgIdx != 0) { builder.SetInsertPoint(addressExtender.getFirstInsertionPt()); - Argument *arg = getFunctionArgument(&func, userDataUsage->spillTable.entryArgIdx); - spillTable = addressExtender.extend(arg, builder.getInt32(HighAddrPc), - builder.getInt8Ty()->getPointerTo(ADDR_SPACE_CONST), builder); + Argument *arg = getFunctionArgument(&func, userDataUsage->spillTableEntryArgIdx); + spillTable = addressExtender.extendWithPc(arg, builder.getPtrTy(ADDR_SPACE_CONST), builder); } // Handle direct uses of the spill table that were generated in DescBuilder. - for (Instruction *&call : userDataUsage->spillTable.users) { - if (call && call->getFunction() == &func) { - call->replaceAllUsesWith(spillTable); - call->eraseFromParent(); - call = nullptr; - } - } - - // Handle unspilled parts of the push constant. - for (unsigned dwordOffset = 0; dwordOffset != userDataUsage->pushConstOffsets.size(); ++dwordOffset) { - UserDataNodeUsage &pushConstOffset = userDataUsage->pushConstOffsets[dwordOffset]; - if (!pushConstOffset.users.empty()) { - if (pushConstOffset.entryArgIdx) { - // This offset into the push constant is unspilled. Replace the loads with the entry arg, with a - // bitcast. (We know that all loads are non-aggregates of the same size, so we can bitcast.) - Argument *arg = getFunctionArgument(&func, pushConstOffset.entryArgIdx); - for (Instruction *&load : pushConstOffset.users) { - if (load && load->getFunction() == &func) { - builder.SetInsertPoint(load); - Value *replacement = nullptr; - if (!isa(load->getType())) - replacement = builder.CreateBitCast(arg, load->getType()); - else { - // For a pointer, we need to bitcast to a single int first, then to the pointer. 
- replacement = builder.CreateBitCast(arg, builder.getIntNTy(arg->getType()->getPrimitiveSizeInBits())); - replacement = builder.CreateIntToPtr(replacement, load->getType()); - } - load->replaceAllUsesWith(replacement); - load->eraseFromParent(); - load = nullptr; - } - } - } else { - // This offset into the push constant is spilled. All we need to do is ensure that the push constant - // pointer (derived as an offset into the spill table) remains. - userDataUsage->pushConstSpill = true; - } - } - } + for (auto *&call : userDataUsage->userDataOps) { + if (!call || call->getFunction() != &func) + continue; - // Handle the push constant pointer, always do that for compute libraries. - if (!userDataUsage->pushConst.users.empty() || isComputeWithCalls()) { - // If all uses of the push constant pointer are unspilled, we can just replace the lgc.push.const call - // with undef, as the address is ultimately not used anywhere. - Value *replacementVal = nullptr; - if (userDataUsage->pushConstSpill) { - // At least one use of the push constant pointer remains. - const ResourceNode *node = m_pipelineState->findSingleRootResourceNode(ResourceNodeType::PushConst, stage); - Value *byteOffset = nullptr; - builder.SetInsertPoint(spillTable->getNextNode()); - if (node) { - byteOffset = builder.getInt32(node->offsetInDwords * 4); - // Ensure we mark spill table usage. - m_pipelineState->getPalMetadata()->setUserDataSpillUsage(node->offsetInDwords); - } else if (!m_pipelineState->isUnlinked()) { - byteOffset = PoisonValue::get(builder.getInt32Ty()); - } else { - // Unlinked shader compilation: Use a reloc. 
- byteOffset = builder.CreateRelocationConstant(reloc::Pushconst); - } - replacementVal = builder.CreateGEP(builder.getInt8Ty(), spillTable, byteOffset); - } - for (Instruction *&call : userDataUsage->pushConst.users) { - if (call && call->getFunction() == &func) { - Value *thisReplacementVal = replacementVal; - if (!thisReplacementVal) { - // No use of the push constant pointer remains. Just replace with undef. - thisReplacementVal = PoisonValue::get(call->getType()); - } else { - builder.SetInsertPoint(call); - thisReplacementVal = builder.CreateBitCast(thisReplacementVal, call->getType()); - } - call->replaceAllUsesWith(thisReplacementVal); - call->eraseFromParent(); - call = nullptr; - } - } - } + auto *op = cast(call); + call = nullptr; - // Root descriptors ("dynamic descriptors"). - for (unsigned dwordOffset = 0; dwordOffset != userDataUsage->rootDescriptors.size(); ++dwordOffset) { - auto &rootDescriptor = userDataUsage->rootDescriptors[dwordOffset]; - if (rootDescriptor.users.empty()) - continue; - if (rootDescriptor.entryArgIdx != 0) { - // The root descriptor is unspilled, and uses an entry arg. - Argument *arg = getFunctionArgument(&func, rootDescriptor.entryArgIdx); - for (Instruction *&call : rootDescriptor.users) { - if (call && call->getFunction() == &func) { - call->replaceAllUsesWith(arg); - call->eraseFromParent(); - call = nullptr; - } - } + if (spillTable) { + builder.SetInsertPoint(op); + Value *ptr = builder.CreateConstGEP1_32(builder.getInt8Ty(), spillTable, op->getOffset()); + op->replaceAllUsesWith(ptr); } else { - // The root descriptor is spilled. Ensure we mark spill table usage. 
- m_pipelineState->getPalMetadata()->setUserDataSpillUsage(dwordOffset); - Value *byteOffset = builder.getInt32(dwordOffset * 4); - for (Instruction *&call : rootDescriptor.users) { - if (call && call->getFunction() == &func) { - builder.SetInsertPoint(call); - Value *descPtr = builder.CreateGEP(builder.getInt8Ty(), spillTable, byteOffset); - descPtr = builder.CreateBitCast(descPtr, call->getType()->getPointerTo(ADDR_SPACE_CONST)); - Value *desc = builder.CreateLoad(call->getType(), descPtr); - desc->setName("rootDesc" + Twine(dwordOffset)); - call->replaceAllUsesWith(desc); - call->eraseFromParent(); - call = nullptr; - } - } + // We don't actually have a spill table, which means that all (transitive) users of this op are ultimately + // no-ops or fixed-offset loads that will be replaced separately. + op->replaceAllUsesWith(PoisonValue::get(op->getType())); } + op->eraseFromParent(); } - // Descriptor tables - Type *ptrType = builder.getInt8Ty()->getPointerTo(ADDR_SPACE_CONST); - for (unsigned userDataIdx = 0; userDataIdx != userDataUsage->descriptorTables.size(); ++userDataIdx) { - auto &descriptorTable = userDataUsage->descriptorTables[userDataIdx]; - Instruction *spillTableLoad = nullptr; - const bool isDescTableSpilled = descriptorTable.entryArgIdx == 0; - - SmallDenseMap addrExtMap[2]; - for (Instruction *&inst : descriptorTable.users) { - Value *descTableVal = nullptr; - if (inst && inst->getFunction() == &func) { - auto call = cast(inst); - assert(call->getType() == ptrType); - - if (isDescTableSpilled && !spillTableLoad) { - // The descriptor table is spilled. At the start of the function, create the GEP and load which are then - // shared by all users. 
- std::string namePrefix = "descTable"; - builder.SetInsertPoint(spillTable->getNextNode()); - const ResourceNode *node = &m_pipelineState->getUserDataNodes()[userDataIdx]; - m_pipelineState->getPalMetadata()->setUserDataSpillUsage(node->offsetInDwords); - Value *addr = builder.CreateConstGEP1_32(builder.getInt8Ty(), spillTable, node->offsetInDwords * 4); - addr = builder.CreateBitCast(addr, builder.getInt32Ty()->getPointerTo(ADDR_SPACE_CONST)); - spillTableLoad = builder.CreateLoad(builder.getInt32Ty(), addr); - spillTableLoad->setName(namePrefix + Twine(userDataIdx)); - } - - // The address extension code only depends on descriptorTable (which is constant for the lifetime of the map) - // and highHalf. Use map with highHalf keys to avoid creating redundant nodes for the extensions. - Value *highHalf = call->getArgOperand(4); - auto it = addrExtMap[isDescTableSpilled].find(highHalf); - if (it != addrExtMap[isDescTableSpilled].end()) { - descTableVal = it->second; - } else { - - if (!isDescTableSpilled) { - // The descriptor set is unspilled, and uses an entry arg. - descTableVal = getFunctionArgument(&func, descriptorTable.entryArgIdx); - if (isa(highHalf)) { - // Set builder to insert the 32-to-64 extension code at the start of the function. - builder.SetInsertPoint(addressExtender.getFirstInsertionPt()); - } else { - // Set builder to insert the 32-to-64 extension code after the instruction containing the high half. - Instruction *highHalfInst = cast(highHalf); - builder.SetInsertPoint(highHalfInst->getNextNode()); - } - } else { - // The descriptor table is spilled, the load at the start of the function has been created. - assert(descriptorTable.entryArgIdx == 0); - assert(spillTableLoad); - descTableVal = spillTableLoad; - // Set builder to insert the 32-to-64 extension code just after the load. 
- builder.SetInsertPoint(spillTableLoad->getNextNode()); - } - - // Now we want to extend the loaded 32-bit value to a 64-bit pointer, using either PC or the provided - // high half. - descTableVal = addressExtender.extend(descTableVal, highHalf, ptrType, builder); - addrExtMap[isDescTableSpilled].insert({highHalf, descTableVal}); - } + // Handle generic fixed-offset user data loads. + for (auto &load : userDataUsage->loads) { + if (!load.load || load.load->getFunction() != &func) + continue; - // Replace uses of the call and erase it. - call->replaceAllUsesWith(descTableVal); - call->eraseFromParent(); - inst = nullptr; - } - } + builder.SetInsertPoint(load.load); + load.load->replaceAllUsesWith( + loadUserData(*userDataUsage, spillTable, load.load->getType(), load.dwordOffset, builder)); + load.load->eraseFromParent(); + load.load = nullptr; } // Special user data from lgc.special.user.data calls @@ -1402,19 +1270,15 @@ uint64_t PatchEntryPointMutate::generateEntryPointArgTys(ShaderInputs *shaderInp addSpecialUserDataArgs(userDataArgs, specialUserDataArgs, builder); - addUserDataArgs(userDataArgs, builder); - - // Determine which user data args are going to be "unspilled", and put them in unspilledArgs. - SmallVector unspilledArgs; - determineUnspilledUserDataArgs(userDataArgs, specialUserDataArgs, builder, unspilledArgs); + finalizeUserDataArgs(userDataArgs, specialUserDataArgs, builder); - // Scan unspilledArgs: for each one: + // Scan userDataArgs: for each one: // * add it to the arg type array // * set user data PAL metadata // * store the arg index into the pointer provided to the xxxArgs.push() // * if it's special user data, also store the arg index into the specialUserData entry. 
unsigned userDataIdx = 0; - for (const auto &userDataArg : unspilledArgs) { + for (const auto &userDataArg : userDataArgs) { if (userDataArg.argIndex) *userDataArg.argIndex = argTys.size() + argOffset; unsigned dwordSize = userDataArg.argDwordSize; @@ -1484,7 +1348,7 @@ uint64_t PatchEntryPointMutate::generateEntryPointArgTys(ShaderInputs *shaderInp SmallVector userDataMap; userDataMap.resize(NumUserSgprs, InvalidMapVal); userDataIdx = 0; - for (const auto &userDataArg : unspilledArgs) { + for (const auto &userDataArg : userDataArgs) { unsigned dwordSize = userDataArg.argDwordSize; if (userDataArg.userDataValue != InvalidMapVal) { bool isSystemUserData = isSystemUserDataValue(userDataArg.userDataValue); @@ -1703,315 +1567,184 @@ void PatchEntryPointMutate::addSpecialUserDataArgs(SmallVectorImpl if (userDataUsage->usesStreamOutTable || userDataUsage->isSpecialUserDataUsed(UserDataMapping::StreamOutTable)) { if (enableNgg || !m_pipelineState->hasShaderStage(ShaderStageCopyShader) && m_pipelineState->enableXfb()) { // If no NGG, stream out table will be set to copy shader's user data entry, we should not set it duplicately. + unsigned *tablePtr = nullptr; + switch (m_shaderStage) { case ShaderStageVertex: - userDataArgs.push_back(UserDataArg(builder.getInt32Ty(), "streamOutTable", UserDataMapping::StreamOutTable, - &intfData->entryArgIdxs.vs.streamOutData.tablePtr)); - if (m_pipelineState->enableSwXfb()) { - // NOTE: For GFX11+, the SW stream-out needs an additional special user data SGPR to store the - // stream-out control buffer address. 
- specialUserDataArgs.push_back(UserDataArg(builder.getInt32Ty(), "streamOutControlBuf", - UserDataMapping::StreamOutControlBuf, - &intfData->entryArgIdxs.vs.streamOutData.controlBufPtr)); - } + tablePtr = &intfData->entryArgIdxs.vs.streamOutData.tablePtr; break; case ShaderStageTessEval: - userDataArgs.push_back(UserDataArg(builder.getInt32Ty(), "streamOutTable", UserDataMapping::StreamOutTable, - &intfData->entryArgIdxs.tes.streamOutData.tablePtr)); - if (m_pipelineState->enableSwXfb()) { - // NOTE: For GFX11+, the SW stream-out needs an additional special user data SGPR to store the - // stream-out control buffer address. - specialUserDataArgs.push_back(UserDataArg(builder.getInt32Ty(), "streamOutControlBuf", - UserDataMapping::StreamOutControlBuf, - &intfData->entryArgIdxs.tes.streamOutData.controlBufPtr)); - } + tablePtr = &intfData->entryArgIdxs.tes.streamOutData.tablePtr; break; case ShaderStageGeometry: - if (m_pipelineState->getTargetInfo().getGfxIpVersion().major <= 10) { + if (m_pipelineState->enableSwXfb()) { + tablePtr = &intfData->entryArgIdxs.gs.streamOutData.tablePtr; + } else { + assert(m_pipelineState->getTargetInfo().getGfxIpVersion().major <= 10); // Allocate dummy stream-out register for geometry shader userDataArgs.push_back(UserDataArg(builder.getInt32Ty(), "dummyStreamOut")); - } else if (m_pipelineState->enableSwXfb()) { - userDataArgs.push_back(UserDataArg(builder.getInt32Ty(), "streamOutTable", UserDataMapping::StreamOutTable, - &intfData->entryArgIdxs.gs.streamOutData.tablePtr)); - // NOTE: For GFX11+, the SW stream-out needs an additional special user data SGPR to store the - // stream-out control buffer address. 
- specialUserDataArgs.push_back(UserDataArg(builder.getInt32Ty(), "streamOutControlBuf", - UserDataMapping::StreamOutControlBuf, - &intfData->entryArgIdxs.gs.streamOutData.controlBufPtr)); } + break; default: llvm_unreachable("Should never be called!"); break; } + + if (tablePtr) { + userDataArgs.push_back( + UserDataArg(builder.getInt32Ty(), "streamOutTable", UserDataMapping::StreamOutTable, tablePtr)); + } } } -} -// ===================================================================================================================== -// Add a UserDataArg to the vector for each user data node needed in user data SGPRs. -// -// @param userDataArgs : Vector to add args to -// @param builder : IRBuilder to get types from -void PatchEntryPointMutate::addUserDataArgs(SmallVectorImpl &userDataArgs, IRBuilder<> &builder) { - auto userDataUsage = getUserDataUsage(m_shaderStage); - llvm::ArrayRef userDataNodes = m_pipelineState->getUserDataNodes(); - for (unsigned userDataNodeIdx = 0; userDataNodeIdx != userDataNodes.size(); ++userDataNodeIdx) { - const ResourceNode &node = userDataNodes[userDataNodeIdx]; - switch (node.concreteType) { + // NOTE: For GFX11+, the SW stream-out needs an additional special user data SGPR to store the stream-out control + // buffer address. + if (m_pipelineState->enableSwXfb()) { + unsigned *controlBufPtr = nullptr; - case ResourceNodeType::IndirectUserDataVaPtr: - case ResourceNodeType::StreamOutTableVaPtr: + switch (m_shaderStage) { + case ShaderStageVertex: + controlBufPtr = &intfData->entryArgIdxs.vs.streamOutData.controlBufPtr; break; - - case ResourceNodeType::DescriptorTableVaPtr: { - // Check if the descriptor set is in use. For compute with calls, enable it anyway. 
- UserDataNodeUsage *descSetUsage = nullptr; - if (userDataUsage->descriptorTables.size() > userDataNodeIdx) - descSetUsage = &userDataUsage->descriptorTables[userDataNodeIdx]; - if (!isComputeWithCalls() && (!descSetUsage || descSetUsage->users.empty())) - break; - - // Add the arg (descriptor set pointer) that we can potentially unspill. - unsigned *argIndex = descSetUsage == nullptr ? nullptr : &descSetUsage->entryArgIdx; - addUserDataArg(userDataArgs, node.offsetInDwords, node.sizeInDwords, "descTable" + Twine(userDataNodeIdx), - argIndex, builder); + case ShaderStageTessEval: + controlBufPtr = &intfData->entryArgIdxs.tes.streamOutData.controlBufPtr; break; - } - - case ResourceNodeType::PushConst: { - // Always spill for compute libraries. - if (!isComputeWithCalls()) { - // We add a potential unspilled arg for each separate dword offset of the push const at which there is a load. - // We already know that loads we have on our pushConstOffsets lists are at dword-aligned offset and - // dword-aligned size. We need to ensure that all loads are the same size, by removing ones that are bigger than - // the minimum size. - // - // First cope with the case that the app uses more push const than the size of the resource node. This is - // a workaround for an incorrect application; according to the Vulkan spec (version 1.2.151, section 14.6.1 - // "Push Constant Interface"): - // - // Each statically used member of a push constant block must be placed at an Offset such that the entire - // member is entirely contained within the VkPushConstantRange for each OpEntryPoint that uses it, and - // the stageFlags for that range must specify the appropriate VkShaderStageFlagBits for that stage. 
- unsigned dwordEndOffset = userDataUsage->pushConstOffsets.size(); - if (dwordEndOffset > node.sizeInDwords) { - userDataUsage->pushConstSpill = true; - dwordEndOffset = node.sizeInDwords; - } - - for (unsigned dwordOffset = 0; dwordOffset != dwordEndOffset; ++dwordOffset) { - UserDataNodeUsage &pushConstOffset = userDataUsage->pushConstOffsets[dwordOffset]; - if (pushConstOffset.users.empty()) - continue; - - // Check that the load size does not overlap with the next used offset in the push constant. - bool haveOverlap = false; - unsigned endOffset = - std::min(dwordOffset + pushConstOffset.dwordSize, unsigned(userDataUsage->pushConstOffsets.size())); - for (unsigned followingOffset = dwordOffset + 1; followingOffset != endOffset; ++followingOffset) { - if (!userDataUsage->pushConstOffsets[followingOffset].users.empty()) { - haveOverlap = true; - break; - } - } - if (haveOverlap) { - userDataUsage->pushConstSpill = true; - continue; - } - - // Add the arg (part of the push const) that we can potentially unspill. - addUserDataArg(userDataArgs, node.offsetInDwords + dwordOffset, pushConstOffset.dwordSize, - "pushConst" + Twine(dwordOffset), &pushConstOffset.entryArgIdx, builder); - } - } else { - // Mark push constant for spill for compute library. - userDataUsage->pushConstSpill = true; - } - - // Ensure we mark the push constant's part of the spill table as used. - if (userDataUsage->pushConstSpill) - userDataUsage->spillUsage = std::min(userDataUsage->spillUsage, node.offsetInDwords); - + case ShaderStageGeometry: + controlBufPtr = &intfData->entryArgIdxs.gs.streamOutData.controlBufPtr; break; - } - default: - if (isComputeWithCalls()) { - // Always spill for compute libraries. 
- break; - } - - for (unsigned dwordOffset = node.offsetInDwords; dwordOffset != node.offsetInDwords + node.sizeInDwords; - ++dwordOffset) { - if (userDataUsage->rootDescriptors.size() <= dwordOffset) - break; - auto &rootDescUsage = userDataUsage->rootDescriptors[dwordOffset]; - // Skip unused descriptor. - if (rootDescUsage.users.empty()) - continue; - unsigned dwordSize = rootDescUsage.users[0]->getType()->getPrimitiveSizeInBits() / 32; - // Add the arg (root descriptor) that we can potentially unspill. - addUserDataArg(userDataArgs, dwordOffset, dwordSize, "rootDesc" + Twine(dwordOffset), - &rootDescUsage.entryArgIdx, builder); - } + // Ignore other shader stages break; } - } -} -// ===================================================================================================================== -// Add a single UserDataArg -// -// @param userDataArgs : Vector to add UserDataArg to -// @param userDataValue : PAL metadata user data value, ~0U (UserDataMapping::Invalid) for none -// @param sizeInDwords : Size of argument in dwords -// @param argIndex : Where to store arg index once it is allocated, nullptr for none -// @param builder : IRBuilder (just for getting types) -void PatchEntryPointMutate::addUserDataArg(SmallVectorImpl &userDataArgs, unsigned userDataValue, - unsigned sizeInDwords, const Twine &name, unsigned *argIndex, - IRBuilder<> &builder) { - Type *argTy = builder.getInt32Ty(); - if (sizeInDwords != 1) - argTy = FixedVectorType::get(argTy, sizeInDwords); - userDataArgs.push_back(UserDataArg(argTy, name, userDataValue, argIndex)); + if (controlBufPtr) { + specialUserDataArgs.push_back(UserDataArg(builder.getInt32Ty(), "streamOutControlBuf", + UserDataMapping::StreamOutControlBuf, controlBufPtr)); + } + } } // ===================================================================================================================== -// Determine which user data args are going to be "unspilled" (passed in shader entry SGPRs rather than loaded -// from 
spill table) +// Determine the final list of user data args and whether we require a spill table. // -// @param userDataArgs : First array of UserDataArg structs for candidate args -// @param specialUserDataArgs : Second array of UserDataArg structs for candidate args +// @param [in/out] userDataArgs : Input the array of prefix "system value" user data arguments; outputs the final list +// of user data arguments +// @param specialUserDataArgs : list of suffix "system value" user data arguments // @param builder : IRBuilder to get types from -// @param [out] unspilledArgs : Output vector of UserDataArg structs that will be "unspilled". Mostly these are -// copied from the input arrays, plus an extra one for the spill table pointer if -// needed. -// @param [out] unspilledArgNames : Argument names of unspilled arguments. -void PatchEntryPointMutate::determineUnspilledUserDataArgs(ArrayRef userDataArgs, - ArrayRef specialUserDataArgs, - IRBuilder<> &builder, - SmallVectorImpl &unspilledArgs) { - - std::optional spillTableArg; - +void PatchEntryPointMutate::finalizeUserDataArgs(SmallVectorImpl &userDataArgs, + ArrayRef specialUserDataArgs, IRBuilder<> &builder) { auto userDataUsage = getUserDataUsage(m_shaderStage); - if (!userDataUsage->spillTable.users.empty() || userDataUsage->pushConstSpill || - userDataUsage->spillUsage != UINT_MAX) { - // Spill table is already in use by code added in DescBuilder, or by uses of the push const pointer not - // all being of the form that can be unspilled. - spillTableArg = UserDataArg(builder.getInt32Ty(), "spillTable", UserDataMapping::SpillTable, - &userDataUsage->spillTable.entryArgIdx); - - // Determine the lowest offset at which the spill table is used, so we can set PAL metadata accordingly. - // (This only covers uses of the spill table generated by DescBuilder. It excludes the push const and args - // that are unspill candidates but we decide to spill; those ones are separately set in userDataUsage->spillUsage.) 
- SmallVector spillUsers; - spillUsers.insert(spillUsers.end(), userDataUsage->spillTable.users.begin(), userDataUsage->spillTable.users.end()); - unsigned minByteOffset = UINT_MAX; - for (unsigned i = 0; i != spillUsers.size(); ++i) { - for (User *user : spillUsers[i]->users()) { - auto inst = cast(user); - if (isa(inst)) { - spillUsers.push_back(inst); - continue; - } - if (auto gep = dyn_cast(inst)) { - APInt gepOffset(64, 0); - if (gep->accumulateConstantOffset(m_module->getDataLayout(), gepOffset)) { - minByteOffset = std::min(minByteOffset, unsigned(gepOffset.getZExtValue())); - continue; - } - } - minByteOffset = 0; - break; - } - } - // In relocatable shader compilation userDataUsage is unknown until linking. - if (minByteOffset != UINT_MAX && !m_pipelineState->isUnlinked()) - m_pipelineState->getPalMetadata()->setUserDataSpillUsage(std::min(userDataUsage->spillUsage, minByteOffset / 4)); - } // In compute-with-calls, we need to ensure that the compute shader and library code agree that s15 is the spill // table pointer, even if it is not needed, because library code does not know whether a spill table pointer is // needed in the pipeline. Thus we cannot use s15 for anything else. Using the single-arg UserDataArg // constructor like this means that the arg is not used, so it will not be set up in PAL metadata. - if (m_computeWithCalls && !spillTableArg.has_value()) - spillTableArg = UserDataArg(builder.getInt32Ty(), "spillTable", UserDataMapping::SpillTable, - &userDataUsage->spillTable.entryArgIdx); + bool spill = userDataUsage->haveDynamicUserDataLoads || m_computeWithCalls; // Figure out how many sgprs we have available for userDataArgs. // We have s0-s31 (s0-s15 for <=GFX8, or for a compute/task shader on any chip) for everything, so take off the number // of registers used by specialUserDataArgs. - unsigned userDataEnd = (m_shaderStage == ShaderStageCompute || m_shaderStage == ShaderStageTask) - ? 
InterfaceData::MaxCsUserDataCount - : m_pipelineState->getTargetInfo().getGpuProperty().maxUserDataCount; + unsigned userDataAvailable = (m_shaderStage == ShaderStageCompute || m_shaderStage == ShaderStageTask) + ? InterfaceData::MaxCsUserDataCount + : m_pipelineState->getTargetInfo().getGpuProperty().maxUserDataCount; // FIXME Restricting user data as the backend does not support more sgprs as arguments - if (isComputeWithCalls() && userDataEnd > 16) - userDataEnd = 16; + if (m_computeWithCalls && userDataAvailable > 16) + userDataAvailable = 16; - for (auto &userDataArg : specialUserDataArgs) - userDataEnd -= userDataArg.argDwordSize; + for (const auto &userDataArg : specialUserDataArgs) + userDataAvailable -= userDataArg.argDwordSize; // ... and the one used by the spill table if already added. - if (spillTableArg.has_value()) - userDataEnd -= 1; - - // See if we need to spill any user data nodes in userDataArgs, copying the unspilled ones across to unspilledArgs. - unsigned userDataIdx = 0; + if (spill) + userDataAvailable -= 1; + + unsigned userDataEnd = 0; + for (const auto &userDataArg : userDataArgs) + userDataEnd += userDataArg.argDwordSize; + assert(userDataEnd < userDataAvailable && "too many system value user data args"); + + if (m_computeWithCalls) { + // In compute with calls, the user data layout must be the same across all shaders and therefore cannot depend + // on an individual shader's usage pattern. 
+ unsigned userDataSgprs = userDataAvailable - userDataEnd; + unsigned userDataDwords = 0; + for (const auto &node : m_pipelineState->getUserDataNodes()) + userDataDwords = std::max(userDataDwords, node.offsetInDwords + node.sizeInDwords); + + userDataUsage->entryArgIdxs.resize(userDataDwords); + for (unsigned i = 0; i != userDataSgprs; ++i) { + if (i < userDataDwords) + userDataArgs.emplace_back(builder.getInt32Ty(), "userdata" + Twine(i), i, &userDataUsage->entryArgIdxs[i]); + else + userDataArgs.emplace_back(builder.getInt32Ty(), "pad" + Twine(i)); + } + if (userDataSgprs < userDataDwords) + m_pipelineState->getPalMetadata()->setUserDataSpillUsage(userDataSgprs); + + // We must conservatively assume that there are functions with dynamic push constant accesses, and that therefore + // the push constants must be fully available in the spill region even if they fit (partially) into SGPRs. + const ResourceNode *node = m_pipelineState->findSingleRootResourceNode(ResourceNodeType::PushConst, m_shaderStage); + if (node) + m_pipelineState->getPalMetadata()->setUserDataSpillUsage(node->offsetInDwords); + } else { + // Greedily fit as many generic user data arguments as possible. + // Pre-allocate entryArgIdxs since we rely on stable pointers. + userDataUsage->entryArgIdxs.resize(userDataUsage->loadSizes.size()); + + unsigned lastIdx = 0; + unsigned lastSize = 0; + for (unsigned i = 0; i < userDataUsage->loadSizes.size();) { + unsigned size = userDataUsage->loadSizes[i]; + if (size == 0) { + ++i; + continue; + } - for (const UserDataArg &userDataArg : userDataArgs) { - unsigned afterUserDataIdx = userDataIdx + userDataArg.argDwordSize; - if (afterUserDataIdx > userDataEnd) { - // Spill this node. Allocate the spill table arg.
- if (!spillTableArg.has_value()) { - spillTableArg = UserDataArg(builder.getInt32Ty(), "spillTable", UserDataMapping::SpillTable, - &userDataUsage->spillTable.entryArgIdx); - --userDataEnd; - - if (userDataIdx > userDataEnd) { - // We over-ran the available SGPRs by filling them up and then realizing we needed a spill table pointer. - // Remove the last unspilled node (and any padding arg before that), and ensure that spill usage is - // set correctly so that PAL metadata spill threshold is correct. - // (Note that this path cannot happen in compute-with-calls, because we pre-reserved a slot for the - // spill table pointer.) - userDataIdx -= unspilledArgs.back().argDwordSize; - userDataUsage->spillUsage = std::min(userDataUsage->spillUsage, unspilledArgs.back().userDataValue); - unspilledArgs.pop_back(); + if (userDataEnd + size > userDataAvailable) { + // We ran out of SGPR space -- need to spill. + unsigned spillUsage = i; + if (!spill) { + if (userDataEnd >= userDataAvailable) { + // No space left for the spill table, we need to backtrack. + assert(lastSize > 0); + userDataArgs.erase(userDataArgs.end() - lastSize, userDataArgs.end()); + userDataEnd -= lastSize; + spillUsage = lastIdx; + } + --userDataAvailable; + spill = true; } - } else if (!spillTableArg->argIndex) { - // This is the compute-with-calls case that we reserved s15 for the spill table pointer above, - // without setting its PAL metadata or spillTable.entryArgIdx, but now we find we do need to set - // them. - spillTableArg = UserDataArg(builder.getInt32Ty(), "spillTable", UserDataMapping::SpillTable, - &userDataUsage->spillTable.entryArgIdx); + m_pipelineState->getPalMetadata()->setUserDataSpillUsage(spillUsage); + break; } - // Ensure that spillUsage includes this offset. (We might be on a compute shader padding node, in which - // case userDataArg.userDataValue is Invalid, and this call has no effect.) 
- userDataUsage->spillUsage = std::min(userDataUsage->spillUsage, userDataArg.userDataValue); + lastSize = size; + lastIdx = i; + for (;;) { + userDataArgs.emplace_back(builder.getInt32Ty(), "userdata" + Twine(i), i, &userDataUsage->entryArgIdxs[i]); + ++userDataEnd; + ++i; + --size; - continue; - } - // Keep this node on the unspilled list. - userDataIdx = afterUserDataIdx; - unspilledArgs.push_back(userDataArg); - } + if (!size) + break; - // For compute-with-calls, add extra padding unspilled args until we get to s15. s15 will then be used for - // the spill table pointer below, even if we didn't appear to need one. - if (isComputeWithCalls()) { - while (userDataIdx < userDataEnd) { - unspilledArgs.push_back(UserDataArg(builder.getInt32Ty(), Twine())); - ++userDataIdx; + // Depending on the order in which loads were originally added, we may still have some unsplit overlapping + // loads registered. Split them now. + if (userDataUsage->loadSizes[i] && userDataUsage->loadSizes[i] > size) + userDataUsage->addLoad(i + size, userDataUsage->loadSizes[i] - size); + } } } - // Add the special args and the spill table pointer (if any) to unspilledArgs. + // Add the special args and the spill table pointer (if any). // (specialUserDataArgs is empty for compute, and thus for compute-with-calls.) 
- unspilledArgs.insert(unspilledArgs.end(), specialUserDataArgs.begin(), specialUserDataArgs.end()); - if (spillTableArg.has_value()) - unspilledArgs.insert(unspilledArgs.end(), *spillTableArg); + userDataArgs.insert(userDataArgs.end(), specialUserDataArgs.begin(), specialUserDataArgs.end()); + if (spill) { + userDataArgs.emplace_back(builder.getInt32Ty(), "spillTable", UserDataMapping::SpillTable, + &userDataUsage->spillTableEntryArgIdx); + } } // ===================================================================================================================== @@ -2062,3 +1795,25 @@ bool PatchEntryPointMutate::UserDataUsage::isSpecialUserDataUsed(UserDataMapping unsigned index = static_cast(kind) - static_cast(UserDataMapping::GlobalTable); return specialUserData.size() > index && !specialUserData[index].users.empty(); } + +// ===================================================================================================================== +void PatchEntryPointMutate::UserDataUsage::addLoad(unsigned dwordOffset, unsigned dwordSize) { + assert(dwordOffset + dwordSize <= 256 && "shader uses a user data region that is too large"); + + if (dwordOffset + dwordSize > loadSizes.size()) + loadSizes.resize(dwordOffset + dwordSize); + + while (dwordSize != 0) { + if (!loadSizes[dwordOffset]) { + loadSizes[dwordOffset] = dwordSize; + return; + } + + // Split our load or the pre-existing load, whichever is larger. 
+ unsigned max = std::max(dwordSize, loadSizes[dwordOffset]); + unsigned min = std::min(dwordSize, loadSizes[dwordOffset]); + loadSizes[dwordOffset] = min; + dwordOffset += min; + dwordSize = max - min; + } +} diff --git a/lgc/patch/PatchInOutImportExport.cpp b/lgc/patch/PatchInOutImportExport.cpp index 9f107db5ed..5da4be0bf3 100644 --- a/lgc/patch/PatchInOutImportExport.cpp +++ b/lgc/patch/PatchInOutImportExport.cpp @@ -630,8 +630,8 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) { } case ShaderStageMesh: { assert(callInst.arg_size() == 2); - Value *elemIdx = isDontCareValue(callInst.getOperand(1)) ? nullptr : callInst.getOperand(1); - input = patchMeshBuiltInInputImport(inputTy, builtInId, elemIdx, builder); + assert(isDontCareValue(callInst.getOperand(1))); + input = patchMeshBuiltInInputImport(inputTy, builtInId, builder); break; } case ShaderStageFragment: { @@ -670,21 +670,31 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) { InOutLocationInfo origLocInfo; origLocInfo.setLocation(origLoc); - auto locInfoMapIt = resUsage->inOutUsage.inputLocInfoMap.find(origLocInfo); if (m_shaderStage == ShaderStageTessEval || (m_shaderStage == ShaderStageFragment && (m_pipelineState->getPrevShaderStage(m_shaderStage) == ShaderStageMesh || m_pipelineState->isUnlinked()))) { // NOTE: For generic inputs of tessellation evaluation shader or fragment shader whose previous shader stage // is mesh shader or is in unlinked pipeline, they could be per-patch ones or per-primitive ones. - if (locInfoMapIt != resUsage->inOutUsage.inputLocInfoMap.end()) { - loc = locInfoMapIt->second.getLocation(); - } else if (resUsage->inOutUsage.perPatchInputLocMap.find(origLoc) != - resUsage->inOutUsage.perPatchInputLocMap.end()) { - loc = resUsage->inOutUsage.perPatchInputLocMap[origLoc]; + const bool isPerPrimitive = genericLocationOp.getPerPrimitive(); + if (isPerPrimitive) { + auto &checkedMap = m_shaderStage == ShaderStageTessEval ? 
resUsage->inOutUsage.perPatchInputLocMap + : resUsage->inOutUsage.perPrimitiveInputLocMap; + auto locMapIt = checkedMap.find(origLoc); + if (locMapIt != checkedMap.end()) + loc = locMapIt->second; } else { - assert(resUsage->inOutUsage.perPrimitiveInputLocMap.find(origLoc) != - resUsage->inOutUsage.perPrimitiveInputLocMap.end()); - loc = resUsage->inOutUsage.perPrimitiveInputLocMap[origLoc]; + // NOTE: We need consider key if component index is constant. Because inputs within same + // location are compacted. + auto locInfoMapIt = resUsage->inOutUsage.inputLocInfoMap.find(origLocInfo); + if (locInfoMapIt != resUsage->inOutUsage.inputLocInfoMap.end()) { + loc = locInfoMapIt->second.getLocation(); + } else { + assert(isa(genericLocationOp.getElemIdx())); + origLocInfo.setComponent(cast(genericLocationOp.getElemIdx())->getZExtValue()); + auto locInfoMapIt = resUsage->inOutUsage.inputLocInfoMap.find(origLocInfo); + if (locInfoMapIt != resUsage->inOutUsage.inputLocInfoMap.end()) + loc = locInfoMapIt->second.getLocation(); + } } } else { if (m_pipelineState->canPackInput(m_shaderStage)) { @@ -695,15 +705,26 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) { assert(!isTcs || (isa(genericLocationOp.getLocOffset()) && isa(genericLocationOp.getElemIdx()))); origLocInfo.setComponent(cast(genericLocationOp.getElemIdx())->getZExtValue()); - locInfoMapIt = resUsage->inOutUsage.inputLocInfoMap.find(origLocInfo); + auto locInfoMapIt = resUsage->inOutUsage.inputLocInfoMap.find(origLocInfo); assert(locInfoMapIt != resUsage->inOutUsage.inputLocInfoMap.end()); loc = locInfoMapIt->second.getLocation(); elemIdx = builder.getInt32(locInfoMapIt->second.getComponent()); highHalf = locInfoMapIt->second.isHighHalf(); } else { - assert(locInfoMapIt != resUsage->inOutUsage.inputLocInfoMap.end()); - loc = locInfoMapIt->second.getLocation(); + // NOTE: We need consider key if component index is constant. Because inputs within same + // location are compacted. 
+ auto locInfoMapIt = resUsage->inOutUsage.inputLocInfoMap.find(origLocInfo); + if (locInfoMapIt != resUsage->inOutUsage.inputLocInfoMap.end()) { + loc = locInfoMapIt->second.getLocation(); + } else { + assert(isa(genericLocationOp.getElemIdx())); + origLocInfo.setComponent(cast(genericLocationOp.getElemIdx())->getZExtValue()); + auto locInfoMapIt = resUsage->inOutUsage.inputLocInfoMap.find(origLocInfo); + assert(locInfoMapIt != resUsage->inOutUsage.inputLocInfoMap.end()); + if (locInfoMapIt != resUsage->inOutUsage.inputLocInfoMap.end()) + loc = locInfoMapIt->second.getLocation(); + } } } assert(loc != InvalidValue); @@ -930,24 +951,39 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) { origLocInfo.setLocation(value); if (m_shaderStage == ShaderStageGeometry) origLocInfo.setStreamId(cast(callInst.getOperand(2))->getZExtValue()); - auto locInfoMapIt = resUsage->inOutUsage.outputLocInfoMap.find(origLocInfo); if (m_shaderStage == ShaderStageTessControl || m_shaderStage == ShaderStageMesh) { locOffset = callInst.getOperand(1); // NOTE: For generic outputs of tessellation control shader or mesh shader, they could be per-patch ones or // per-primitive ones. 
- if (locInfoMapIt != resUsage->inOutUsage.outputLocInfoMap.end()) { - exist = true; - loc = locInfoMapIt->second.getLocation(); - } else if (resUsage->inOutUsage.perPatchOutputLocMap.find(value) != - resUsage->inOutUsage.perPatchOutputLocMap.end()) { - exist = true; - loc = resUsage->inOutUsage.perPatchOutputLocMap[value]; - } else if (resUsage->inOutUsage.perPrimitiveOutputLocMap.find(value) != - resUsage->inOutUsage.perPrimitiveOutputLocMap.end()) { - exist = true; - loc = resUsage->inOutUsage.perPrimitiveOutputLocMap[value]; + if (m_shaderStage == ShaderStageMesh && cast(callInst.getOperand(4))->getZExtValue() != 0) { + auto locMapIt = resUsage->inOutUsage.perPrimitiveOutputLocMap.find(value); + if (locMapIt != resUsage->inOutUsage.perPrimitiveOutputLocMap.end()) { + loc = locMapIt->second; + exist = true; + } + } else if (m_shaderStage == ShaderStageTessControl && isDontCareValue(callInst.getOperand(3))) { + auto locMapIt = resUsage->inOutUsage.perPatchOutputLocMap.find(value); + if (locMapIt != resUsage->inOutUsage.perPatchOutputLocMap.end()) { + loc = locMapIt->second; + exist = true; + } + } else { + // NOTE: We need consider key if component index is constant. Because outputs within + // same location are compacted. 
+ auto locInfoMapIt = resUsage->inOutUsage.outputLocInfoMap.find(origLocInfo); + if (locInfoMapIt != resUsage->inOutUsage.outputLocInfoMap.end()) { + loc = locInfoMapIt->second.getLocation(); + exist = true; + } else if (isa(callInst.getOperand(2))) { + origLocInfo.setComponent(cast(callInst.getOperand(2))->getZExtValue()); + auto locInfoMapIt = resUsage->inOutUsage.outputLocInfoMap.find(origLocInfo); + if (locInfoMapIt != resUsage->inOutUsage.outputLocInfoMap.end()) { + loc = locInfoMapIt->second.getLocation(); + exist = true; + } + } } } else if (m_shaderStage == ShaderStageCopyShader) { exist = true; @@ -962,7 +998,7 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) { if (output->getType()->getScalarSizeInBits() == 64) component *= 2; // Component in location info is dword-based origLocInfo.setComponent(component); - locInfoMapIt = resUsage->inOutUsage.outputLocInfoMap.find(origLocInfo); + auto locInfoMapIt = resUsage->inOutUsage.outputLocInfoMap.find(origLocInfo); if (m_pipelineState->canPackOutput(m_shaderStage)) { if (locInfoMapIt != resUsage->inOutUsage.outputLocInfoMap.end()) { @@ -2062,9 +2098,10 @@ void PatchInOutImportExport::patchMeshGenericOutputExport(Value *output, unsigne outputOffset = builder.CreateAdd(outputOffset, compIdx); - std::string callName(isPerPrimitive ? 
lgcName::MeshTaskWritePrimitiveOutput : lgcName::MeshTaskWriteVertexOutput); - callName += getTypeName(outputTy); - builder.CreateNamedCall(callName, builder.getVoidTy(), {outputOffset, vertexOrPrimitiveIdx, output}, {}); + if (isPerPrimitive) + builder.create(outputOffset, vertexOrPrimitiveIdx, output); + else + builder.create(outputOffset, vertexOrPrimitiveIdx, output); } // ===================================================================================================================== @@ -2340,20 +2377,14 @@ Value *PatchInOutImportExport::patchGsBuiltInInputImport(Type *inputTy, unsigned // // @param inputTy : Type of input value // @param builtInId : ID of the built-in variable -// @param elemIdx : Index used for vector element indexing (could be null) // @param builder : The IR builder to create and insert IR instruction -Value *PatchInOutImportExport::patchMeshBuiltInInputImport(Type *inputTy, unsigned builtInId, Value *elemIdx, - BuilderBase &builder) { +Value *PatchInOutImportExport::patchMeshBuiltInInputImport(Type *inputTy, unsigned builtInId, BuilderBase &builder) { // Handle work group size built-in if (builtInId == BuiltInWorkgroupSize) { // WorkgroupSize is a constant vector supplied by mesh shader mode. 
const auto &meshMode = m_pipelineState->getShaderModes()->getMeshShaderMode(); - Value *input = - ConstantVector::get({builder.getInt32(meshMode.workgroupSizeX), builder.getInt32(meshMode.workgroupSizeY), - builder.getInt32(meshMode.workgroupSizeZ)}); - if (elemIdx) - input = builder.CreateExtractElement(input, elemIdx); - return input; + return ConstantVector::get({builder.getInt32(meshMode.workgroupSizeX), builder.getInt32(meshMode.workgroupSizeY), + builder.getInt32(meshMode.workgroupSizeZ)}); } // Handle other built-ins @@ -2363,51 +2394,37 @@ Value *PatchInOutImportExport::patchMeshBuiltInInputImport(Type *inputTy, unsign switch (builtInId) { case BuiltInDrawIndex: assert(builtInUsage.drawIndex); - assert(!elemIdx); // No vector element indexing break; case BuiltInViewIndex: assert(builtInUsage.viewIndex); - assert(!elemIdx); // No vector element indexing break; case BuiltInNumWorkgroups: assert(builtInUsage.numWorkgroups); - inputTy = elemIdx ? FixedVectorType::get(builder.getInt32Ty(), 3) : inputTy; break; case BuiltInWorkgroupId: assert(builtInUsage.workgroupId); - inputTy = elemIdx ? FixedVectorType::get(builder.getInt32Ty(), 3) : inputTy; break; case BuiltInLocalInvocationId: assert(builtInUsage.localInvocationId); - inputTy = elemIdx ? FixedVectorType::get(builder.getInt32Ty(), 3) : inputTy; break; case BuiltInGlobalInvocationId: assert(builtInUsage.globalInvocationId); - inputTy = elemIdx ? 
FixedVectorType::get(builder.getInt32Ty(), 3) : inputTy; break; case BuiltInLocalInvocationIndex: assert(builtInUsage.localInvocationIndex); - assert(!elemIdx); // No vector element indexing break; case BuiltInSubgroupId: assert(builtInUsage.subgroupId); - assert(!elemIdx); // No vector element indexing break; case BuiltInNumSubgroups: assert(builtInUsage.numSubgroups); - assert(!elemIdx); // No vector element indexing break; default: llvm_unreachable("Unknown mesh shader built-in!"); break; } - std::string callName(lgcName::MeshTaskGetMeshInput); - callName += getTypeName(inputTy); - Value *input = builder.CreateNamedCall(callName, inputTy, builder.getInt32(builtInId), {}); - if (elemIdx) - input = builder.CreateExtractElement(input, elemIdx); - return input; + return builder.create(inputTy, builtInId); } // ===================================================================================================================== @@ -2438,8 +2455,26 @@ Value *PatchInOutImportExport::patchFsBuiltInInputImport(Type *inputTy, unsigned Value *sampleMaskIn = sampleCoverage; if (m_pipelineState->getRasterizerState().perSampleShading || builtInUsage.runAtSampleRate) { - // gl_SampleMaskIn[0] = (SampleCoverage & (1 << gl_SampleID)) - sampleMaskIn = builder.CreateShl(builder.getInt32(1), sampleId); + unsigned baseMask = 1; + if (!builtInUsage.sampleId) { + // Fix the failure for multisample_shader_builtin.sample_mask cases "gl_SampleMaskIn" should contain one + // or multiple covered sample bit. + // (1) If the 4 samples is divided into 2 sub invocation groups, broadcast sample mask bit <0, 1> + // to sample <2, 3>. + // (2) If the 8 samples is divided into 2 sub invocation groups, broadcast sample mask bit <0, 1> + // to sample <2, 3>, then re-broadcast sample mask bit <0, 1, 2, 3> to sample <4, 5, 6, 7>. + // (3) If the 8 samples is divided into 4 sub invocation groups, patch to broadcast sample mask bit + // <0, 1, 2, 3> to sample <4, 5, 6, 7>. 
+ + unsigned baseMaskSamples = m_pipelineState->getRasterizerState().pixelShaderSamples; + while (baseMaskSamples < m_pipelineState->getRasterizerState().numSamples) { + baseMask |= baseMask << baseMaskSamples; + baseMaskSamples *= 2; + } + } + + // gl_SampleMaskIn[0] = (SampleCoverage & (baseMask << gl_SampleID)) + sampleMaskIn = builder.CreateShl(builder.getInt32(baseMask), sampleId); sampleMaskIn = builder.CreateAnd(sampleCoverage, sampleMaskIn); } @@ -2449,13 +2484,18 @@ Value *PatchInOutImportExport::patchFsBuiltInInputImport(Type *inputTy, unsigned break; } case BuiltInFragCoord: { - // TODO: Support layout qualifiers "pixel_center_integer" and "origin_upper_left". Value *fragCoord[4] = { getFunctionArgument(m_entryPoint, entryArgIdxs.fragCoord.x), getFunctionArgument(m_entryPoint, entryArgIdxs.fragCoord.y), getFunctionArgument(m_entryPoint, entryArgIdxs.fragCoord.z), getFunctionArgument(m_entryPoint, entryArgIdxs.fragCoord.w), }; + + if (m_pipelineState->getShaderModes()->getFragmentShaderMode().pixelCenterInteger) { + fragCoord[0] = builder.CreateFSub(fragCoord[0], ConstantFP::get(builder.getFloatTy(), 0.5)); + fragCoord[1] = builder.CreateFSub(fragCoord[1], ConstantFP::get(builder.getFloatTy(), 0.5)); + } + // Adjust gl_FragCoord.z value for the shading rate X, // // adjustedFragCoordZ = gl_FragCood.z + dFdxFine(gl_FragCood.z) * 1/16 @@ -3269,8 +3309,6 @@ void PatchInOutImportExport::patchMeshBuiltInOutputExport(Value *output, unsigne BuilderBase builder(*m_context); builder.SetInsertPoint(insertPos); - auto outputTy = output->getType(); - // Handle primitive indices built-ins if (builtInId == BuiltInPrimitivePointIndices || builtInId == BuiltInPrimitiveLineIndices || builtInId == BuiltInPrimitiveTriangleIndices) { @@ -3285,17 +3323,15 @@ void PatchInOutImportExport::patchMeshBuiltInOutputExport(Value *output, unsigne // whole, partial writes to the vector components for line and triangle primitives is not allowed." 
assert(!elemIdx); - builder.CreateNamedCall(lgcName::MeshTaskSetPrimitiveIndices + getTypeName(outputTy), builder.getVoidTy(), - {vertexOrPrimitiveIdx, output}, {}); + builder.create(vertexOrPrimitiveIdx, output); return; } // Handle cull primitive built-in if (builtInId == BuiltInCullPrimitive) { assert(isPerPrimitive); - assert(outputTy->isIntegerTy(1)); // Must be boolean - builder.CreateNamedCall(lgcName::MeshTaskSetPrimitiveCulled, builder.getVoidTy(), {vertexOrPrimitiveIdx, output}, - {}); + assert(output->getType()->isIntegerTy(1)); // Must be boolean + builder.create(vertexOrPrimitiveIdx, output); return; } @@ -3357,9 +3393,10 @@ void PatchInOutImportExport::patchMeshBuiltInOutputExport(Value *output, unsigne if (elemIdx) outputOffset = builder.CreateAdd(builder.getInt32(4 * loc), elemIdx); - std::string callName(isPerPrimitive ? lgcName::MeshTaskWritePrimitiveOutput : lgcName::MeshTaskWriteVertexOutput); - callName += getTypeName(outputTy); - builder.CreateNamedCall(callName, builder.getVoidTy(), {outputOffset, vertexOrPrimitiveIdx, output}, {}); + if (isPerPrimitive) + builder.create(outputOffset, vertexOrPrimitiveIdx, output); + else + builder.create(outputOffset, vertexOrPrimitiveIdx, output); } // ===================================================================================================================== diff --git a/lgc/patch/PatchPreparePipelineAbi.cpp b/lgc/patch/PatchPreparePipelineAbi.cpp index 8d79afcedb..03a0d1a919 100644 --- a/lgc/patch/PatchPreparePipelineAbi.cpp +++ b/lgc/patch/PatchPreparePipelineAbi.cpp @@ -227,8 +227,9 @@ void PatchPreparePipelineAbi::writeTessFactors(PipelineState *pipelineState, Val Value *tfBufferOffset = builder.CreateMul(relPatchId, builder.getInt32(calcFactor.tessFactorStride * sizeof(float))); CoherentFlag coherent = {}; - if (pipelineState->getTargetInfo().getGfxIpVersion().major <= 11) + if (pipelineState->getTargetInfo().getGfxIpVersion().major <= 11) { coherent.bits.glc = true; + } const auto 
numOuterTfs = cast(outerTf->getType())->getNumElements(); const auto numInnerTfs = innerTf ? cast(innerTf->getType())->getNumElements() diff --git a/lgc/patch/PatchResourceCollect.cpp b/lgc/patch/PatchResourceCollect.cpp index 8f09725a06..de9d889a25 100644 --- a/lgc/patch/PatchResourceCollect.cpp +++ b/lgc/patch/PatchResourceCollect.cpp @@ -2685,30 +2685,116 @@ void PatchResourceCollect::updateInputLocInfoMapWithUnpack() { if (prevStage == ShaderStageMesh) { eraseUnusedLocInfo = false; } + } else if (m_shaderStage == ShaderStageTessControl) { + // NOTE: If location offset or element index (64-bit element type) is dynamic, we keep all generic inputs of TCS. + for (auto call : m_inputCalls) { + auto locOffset = call->getLocOffset(); + if (!isa(locOffset)) { + eraseUnusedLocInfo = false; + break; + } + auto bitWidth = call->getType()->getScalarSizeInBits(); + if (bitWidth == 64) { + auto elemIdx = call->getElemIdx(); + if (!isa(elemIdx)) { + eraseUnusedLocInfo = false; + break; + } + } + } } if (eraseUnusedLocInfo) { + // Collect active locations + DenseSet activeLocs; for (auto call : m_inputCalls) { - InOutLocationInfo origLocInfo; - origLocInfo.setLocation(call->getLocation()); - auto mapIt = inputLocInfoMap.find(origLocInfo); - if (mapIt == inputLocInfoMap.end()) - inputLocInfoMap.erase(mapIt); + const unsigned loc = call->getLocation(); + activeLocs.insert(loc); + auto bitWidth = call->getType()->getPrimitiveSizeInBits(); + if (bitWidth > (8 * SizeOfVec4)) { + assert(bitWidth <= (8 * 2 * SizeOfVec4)); + activeLocs.insert(loc + 1); + } + } + // Clear per-vertex generic inputs + auto &locInfoMap = m_resUsage->inOutUsage.inputLocInfoMap; + for (auto iter = locInfoMap.begin(); iter != locInfoMap.end();) { + auto curIter = iter++; + if (activeLocs.count(curIter->first.getLocation()) == 0) + locInfoMap.erase(curIter); + } + + // clear per-patch inputs + auto &perPatchLocMap = m_resUsage->inOutUsage.perPatchInputLocMap; + for (auto iter = perPatchLocMap.begin(); iter != 
perPatchLocMap.end();) { + auto curIter = iter++; + if (activeLocs.count(curIter->first) == 0) + perPatchLocMap.erase(curIter); + } + + // Clear per-primitive inputs + auto &perPrimitiveLocMap = m_resUsage->inOutUsage.perPrimitiveInputLocMap; + for (auto iter = perPrimitiveLocMap.begin(); iter != perPrimitiveLocMap.end();) { + auto curIter = iter++; + if (activeLocs.count(curIter->first) == 0) + perPrimitiveLocMap.erase(curIter); + } + } + + // Special processing for TES/Mesh inputLocInfoMap and TES prePatchInputLocMap as their output location offset can be + // dynamic. The dynamic location offset is marked with non-invalid value in the output map. We should keep the + // corresponding input location in the next stage. For example, if TCS output has dynamic location indexing from + // [0,2], we need add the corresponding location info to TES input map. Otherwise, it will cause mismatch when the + // dynamic indexing is in a loop and TES only uses location 1. + auto preStage = m_pipelineState->getPrevShaderStage(m_shaderStage); + if (preStage == ShaderStageTessControl || preStage == ShaderStageMesh) { + if (!inputLocInfoMap.empty()) { + auto &outputLocInfoMap = m_pipelineState->getShaderResourceUsage(preStage)->inOutUsage.outputLocInfoMap; + for (auto &infoPair : outputLocInfoMap) { + if (infoPair.second != InvalidValue) { + inputLocInfoMap[infoPair.first] = InvalidValue; + infoPair.second = InvalidValue; + } + } + } + auto &perPatchInLocMap = inOutUsage.perPatchInputLocMap; + if (!perPatchInLocMap.empty()) { + auto &perPatchOutLocMap = m_pipelineState->getShaderResourceUsage(preStage)->inOutUsage.perPatchOutputLocMap; + for (auto &locPair : perPatchOutLocMap) { + if (locPair.second != InvalidValue) { + perPatchInLocMap[locPair.first] = InvalidValue; + locPair.second = InvalidValue; + } + } } } // Update the value of inputLocInfoMap if (!inputLocInfoMap.empty()) { unsigned nextMapLoc = 0; + DenseMap alreadyMappedLocs; // Map from original location to new location 
for (auto &locInfoPair : inputLocInfoMap) { auto &newLocationInfo = locInfoPair.second; if (m_shaderStage == ShaderStageVertex) { // NOTE: For vertex shader, use the original location as the remapped location newLocationInfo.setData(locInfoPair.first.getData()); } else { + const unsigned origLoc = locInfoPair.first.getLocation(); + unsigned mappedLoc = InvalidValue; // For other shaders, map the location to continuous locations + auto locMapIt = alreadyMappedLocs.find(origLoc); + if (locMapIt != alreadyMappedLocs.end()) { + mappedLoc = locMapIt->second; + } else { + mappedLoc = nextMapLoc++; + // NOTE: Record the map because we are handling multiple pairs of . Some pairs have the + // same location while the components are different. + alreadyMappedLocs.insert({origLoc, mappedLoc}); + } + newLocationInfo.setData(0); - newLocationInfo.setLocation(nextMapLoc++); + newLocationInfo.setLocation(mappedLoc); + newLocationInfo.setComponent(locInfoPair.first.getComponent()); } } } @@ -2717,10 +2803,8 @@ void PatchResourceCollect::updateInputLocInfoMapWithUnpack() { auto &perPatchInLocMap = inOutUsage.perPatchInputLocMap; if (!perPatchInLocMap.empty()) { unsigned nextMapLoc = 0; - for (auto &locPair : perPatchInLocMap) { - assert(locPair.second == InvalidValue); + for (auto &locPair : perPatchInLocMap) locPair.second = nextMapLoc++; - } } // Update the value of perPrimitiveInputLocMap @@ -2743,22 +2827,15 @@ void PatchResourceCollect::clearUnusedOutput() { auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage)->inOutUsage; auto &outputLocInfoMap = inOutUsage.outputLocInfoMap; if (nextStage != ShaderStageInvalid) { - // Collect the locations of TCS with dynamic indexing or as imported output - DenseSet dynIndexedOrImportOutputLocs; + // Collect the locations of TCS's imported outputs + DenseSet importOutputLocs; if (m_shaderStage == ShaderStageTessControl) { - // Generic output export calls - for (auto &call : m_outputCalls) { - if 
(!isa(call->getOperand(1)) || !isa(call->getOperand(2))) { - const unsigned loc = cast(call->getOperand(0))->getZExtValue(); - dynIndexedOrImportOutputLocs.insert(loc); - } - } // Imported output calls for (auto &outputImport : m_importedOutputCalls) { unsigned loc = outputImport->getLocation(); Value *const locOffset = outputImport->getLocOffset(); Value *const compIdx = outputImport->getElemIdx(); - dynIndexedOrImportOutputLocs.insert(loc); + importOutputLocs.insert(loc); // Location offset and component index are both constant if (isa(locOffset) && isa(compIdx)) { loc += cast(locOffset)->getZExtValue(); @@ -2766,7 +2843,7 @@ void PatchResourceCollect::clearUnusedOutput() { if (bitWidth == 64 && cast(compIdx)->getZExtValue() >= 2) { // NOTE: For the addressing of .z/.w component of 64-bit vector/scalar, the count of // occupied locations are two. - dynIndexedOrImportOutputLocs.insert(loc + 1); + importOutputLocs.insert(loc + 1); } } } @@ -2806,9 +2883,8 @@ void PatchResourceCollect::clearUnusedOutput() { if (!isOutputXfb && !foundInNextStage) { bool isActiveLoc = false; if (m_shaderStage == ShaderStageTessControl) { - // NOTE: If either dynamic indexing of generic outputs exists or the generic output involve in - // output import, we have to mark it as active. - isActiveLoc = dynIndexedOrImportOutputLocs.find(origLoc) != dynIndexedOrImportOutputLocs.end(); + // NOTE: if the output is used as imported in TCS, it is marked as active. + isActiveLoc = importOutputLocs.find(origLoc) != importOutputLocs.end(); } if (isActiveLoc) { // The assigned location must not overlap with those used by inputs of next shader stage. 
@@ -2831,12 +2907,12 @@ void PatchResourceCollect::clearUnusedOutput() { const auto &nextPerPatchInLocMap = nextResUsage->inOutUsage.perPatchInputLocMap; unsigned availPerPatchInMapLoc = nextResUsage->inOutUsage.perPatchInputMapLocCount; - // Collect locations of those outputs that are not used by next shader stage + // Collect locations of those outputs that are not used by next shader stage or read by TCS SmallVector unusedLocs; for (auto &locPair : perPatchOutputLocMap) { const unsigned loc = locPair.first; if (nextPerPatchInLocMap.find(loc) == nextPerPatchInLocMap.end()) { - if (dynIndexedOrImportOutputLocs.find(loc) != dynIndexedOrImportOutputLocs.end()) + if (importOutputLocs.find(loc) != importOutputLocs.end()) locPair.second = availPerPatchInMapLoc++; else unusedLocs.push_back(loc); @@ -2858,7 +2934,7 @@ void PatchResourceCollect::clearUnusedOutput() { for (auto &locPair : perPrimitiveOutputLocMap) { const unsigned loc = locPair.first; if (nextPerPrimitiveInLocMap.find(loc) == nextPerPrimitiveInLocMap.end()) { - if (dynIndexedOrImportOutputLocs.find(loc) != dynIndexedOrImportOutputLocs.end()) + if (importOutputLocs.find(loc) != importOutputLocs.end()) locPair.second = availPerPrimitiveInMapLoc++; else unusedLocs.push_back(loc); diff --git a/lgc/patch/RegisterMetadataBuilder.cpp b/lgc/patch/RegisterMetadataBuilder.cpp index efa1640ac3..58ab6c10e9 100644 --- a/lgc/patch/RegisterMetadataBuilder.cpp +++ b/lgc/patch/RegisterMetadataBuilder.cpp @@ -58,6 +58,7 @@ void RegisterMetadataBuilder::buildPalMetadata() { m_isNggMode = m_pipelineState->getNggControl()->enableNgg; Util::Abi::PipelineType pipelineType = Util::Abi::PipelineType::VsPs; + auto lastVertexProcessingStage = m_pipelineState->getLastVertexProcessingStage(); DenseMap apiHwShaderMap; if (m_hasTask || m_hasMesh) { @@ -81,7 +82,7 @@ void RegisterMetadataBuilder::buildPalMetadata() { if (m_hasVs) apiHwShaderMap[ShaderStageVertex] = Util::Abi::HwShaderHs; } - auto lastVertexProcessingStage = 
m_pipelineState->getLastVertexProcessingStage(); + if (lastVertexProcessingStage != ShaderStageInvalid) { if (lastVertexProcessingStage == ShaderStageCopyShader) lastVertexProcessingStage = ShaderStageGeometry; @@ -172,6 +173,38 @@ void RegisterMetadataBuilder::buildPalMetadata() { if (hwStageMask & (Util::Abi::HwShaderGs | Util::Abi::HwShaderVs)) buildPaSpecificRegisters(); + if (lastVertexProcessingStage != ShaderStageInvalid && m_pipelineState->isUnlinked()) { + // Fill ".preraster_output_semantic" + auto resUsage = m_pipelineState->getShaderResourceUsage(lastVertexProcessingStage); + auto &outputLocInfoMap = resUsage->inOutUsage.outputLocInfoMap; + auto &builtInOutputLocMap = resUsage->inOutUsage.builtInOutputLocMap; + // Collect semantic info for generic input and builtIns {gl_ClipDistance, gl_CulDistance, gl_Layer, + // gl_ViewportIndex} that exports via generic output as well. + if (!outputLocInfoMap.empty() || !builtInOutputLocMap.empty()) { + auto preRasterOutputSemanticNode = + getPipelineNode()[Util::Abi::PipelineMetadataKey::PrerasterOutputSemantic].getArray(true); + unsigned elemIdx = 0; + for (auto locInfoPair : outputLocInfoMap) { + auto preRasterOutputSemanticElem = preRasterOutputSemanticNode[elemIdx].getMap(true); + preRasterOutputSemanticElem[Util::Abi::PrerasterOutputSemanticMetadataKey::Semantic] = + MaxBuiltIn + locInfoPair.first.getLocation(); + preRasterOutputSemanticElem[Util::Abi::PrerasterOutputSemanticMetadataKey::Index] = + locInfoPair.second.getLocation(); + ++elemIdx; + } + + for (auto locPair : builtInOutputLocMap) { + if (locPair.first == BuiltInClipDistance || locPair.first == BuiltInCullDistance || + locPair.first == BuiltInLayer || locPair.first == BuiltInViewportIndex) { + auto preRasterOutputSemanticElem = preRasterOutputSemanticNode[elemIdx].getMap(true); + preRasterOutputSemanticElem[Util::Abi::PrerasterOutputSemanticMetadataKey::Semantic] = locPair.first; + 
preRasterOutputSemanticElem[Util::Abi::PrerasterOutputSemanticMetadataKey::Index] = locPair.second; + ++elemIdx; + } + } + } + } + } else { addApiHwShaderMapping(ShaderStageCompute, Util::Abi::HwShaderCs); setPipelineType(Util::Abi::PipelineType::Cs); @@ -186,10 +219,10 @@ void RegisterMetadataBuilder::buildLsHsRegisters() { assert(m_hasTcs); // VGT_HOS_MIN(MAX)_TESS_LEVEL // Minimum and maximum tessellation factors supported by the hardware. - constexpr float minTessFactor = 1.0f; - constexpr float maxTessFactor = 64.0f; - getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::VgtHosMinTessLevel] = bit_cast(minTessFactor); - getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::VgtHosMaxTessLevel] = bit_cast(maxTessFactor); + constexpr unsigned minTessFactor = 1; + constexpr unsigned maxTessFactor = 64; + getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::VgtHosMinTessLevel] = minTessFactor; + getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::VgtHosMaxTessLevel] = maxTessFactor; // VGT_LS_HS_CONFIG const auto &calcFactor = m_pipelineState->getShaderResourceUsage(ShaderStageTessControl)->inOutUsage.tcs.calcFactor; @@ -718,18 +751,22 @@ void RegisterMetadataBuilder::buildHwVsRegisters() { // Builds register configuration for hardware pixel shader. void RegisterMetadataBuilder::buildPsRegisters() { ShaderStage shaderStage = ShaderStageFragment; + const auto &options = m_pipelineState->getOptions(); const auto &shaderOptions = m_pipelineState->getShaderOptions(shaderStage); const auto &fragmentMode = m_pipelineState->getShaderModes()->getFragmentShaderMode(); const auto resUsage = m_pipelineState->getShaderResourceUsage(shaderStage); const auto &builtInUsage = resUsage->builtInUsage.fs; + const bool useFloatLocationAtIteratedSampleNumber = + options.fragCoordUsesInterpLoc ? 
builtInUsage.fragCoordIsSample : builtInUsage.runAtSampleRate; + // SPI_BARYC_CNTL auto spiBarycCntl = getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::SpiBarycCntl].getMap(true); spiBarycCntl[Util::Abi::SpiBarycCntlMetadataKey::FrontFaceAllBits] = true; if (fragmentMode.pixelCenterInteger) { // TRUE - Force floating point position to upper left corner of pixel (X.0, Y.0) spiBarycCntl[Util::Abi::SpiBarycCntlMetadataKey::PosFloatUlc] = true; - } else if (builtInUsage.runAtSampleRate) { + } else if (useFloatLocationAtIteratedSampleNumber) { // 2 - Calculate per-pixel floating point position at iterated sample number spiBarycCntl[Util::Abi::SpiBarycCntlMetadataKey::PosFloatLocation] = 2; } else { @@ -834,16 +871,13 @@ void RegisterMetadataBuilder::buildPsRegisters() { spiPsInputCntlInfo.flatShade = interpInfoElem.flat && !interpInfoElem.isPerPrimitive; if (m_gfxIp.major >= 11 && interpInfoElem.isPerPrimitive) { - const auto preStage = m_pipelineState->getPrevShaderStage(ShaderStageFragment); - if (preStage == ShaderStageMesh) { - // NOTE: HW allocates and manages attribute ring based on the register fields: VS_EXPORT_COUNT and - // PRIM_EXPORT_COUNT. When VS_EXPORT_COUNT = 0, HW assumes there is still a vertex attribute exported even - // though this is not what we want. Hence, we should reserve param0 as a dummy vertex attribute and all - // primitive attributes are moved after it. - bool hasNoVertexAttrib = m_pipelineState->getShaderResourceUsage(ShaderStageMesh)->inOutUsage.expCount == 0; - if (hasNoVertexAttrib) - ++spiPsInputCntlInfo.offset; - } + // NOTE: HW allocates and manages attribute ring based on the register fields: VS_EXPORT_COUNT and + // PRIM_EXPORT_COUNT. When VS_EXPORT_COUNT = 0, HW assumes there is still a vertex attribute exported even + // though this is not what we want. Hence, we should reserve param0 as a dummy vertex attribute and all + // primitive attributes are moved after it. 
+ bool hasNoVertexAttrib = m_pipelineState->getShaderResourceUsage(ShaderStageMesh)->inOutUsage.expCount == 0; + if (hasNoVertexAttrib) + ++spiPsInputCntlInfo.offset; spiPsInputCntlInfo.primAttr = true; } @@ -930,6 +964,33 @@ void RegisterMetadataBuilder::buildPsRegisters() { cbShaderMaskNode[Util::Abi::CbShaderMaskMetadataKey::Output5Enable] = (cbShaderMask >> 20) & 0xF; cbShaderMaskNode[Util::Abi::CbShaderMaskMetadataKey::Output6Enable] = (cbShaderMask >> 24) & 0xF; cbShaderMaskNode[Util::Abi::CbShaderMaskMetadataKey::Output7Enable] = (cbShaderMask >> 28) & 0xF; + + // Fill .ps_input_semantic for partial pipeline + if (m_pipelineState->isUnlinked()) { + // Collect semantic info for generic input and builtIns {gl_ClipDistance, gl_CulDistance, gl_Layer, + // gl_ViewportIndex} that exports via generic output as well. + auto &inputLocInfoMap = resUsage->inOutUsage.inputLocInfoMap; + auto &builtInInputLocMap = resUsage->inOutUsage.builtInInputLocMap; + if (!inputLocInfoMap.empty() || !builtInInputLocMap.empty()) { + auto psInputSemanticNode = getPipelineNode()[Util::Abi::PipelineMetadataKey::PsInputSemantic].getArray(true); + unsigned elemIdx = 0; + for (auto locInfoPair : inputLocInfoMap) { + auto psInputSemanticElem = psInputSemanticNode[elemIdx].getMap(true); + psInputSemanticElem[Util::Abi::PsInputSemanticMetadataKey::Semantic] = + MaxBuiltIn + locInfoPair.first.getLocation(); + ++elemIdx; + } + + for (auto locPair : builtInInputLocMap) { + if (locPair.first == BuiltInClipDistance || locPair.first == BuiltInCullDistance || + locPair.first == BuiltInLayer || locPair.first == BuiltInViewportIndex) { + auto psInputSemanticElem = psInputSemanticNode[elemIdx].getMap(true); + psInputSemanticElem[Util::Abi::PsInputSemanticMetadataKey::Semantic] = locPair.first; + ++elemIdx; + } + } + } + } } // ===================================================================================================================== @@ -1381,20 +1442,18 @@ void 
RegisterMetadataBuilder::setVgtShaderStagesEn(unsigned hwStageMask) { } if (hwStageMask & Util::Abi::HwShaderGs) { - unsigned esStageEn = ES_STAGE_REAL; ShaderStage apiStage = ShaderStageVertex; if (m_hasGs || m_hasMesh) { apiStage = m_hasGs ? ShaderStageGeometry : ShaderStageMesh; vgtShaderStagesEn[Util::Abi::VgtShaderStagesEnMetadataKey::GsStageEn] = GS_STAGE_ON; } else if (m_hasTes) { apiStage = ShaderStageTessEval; - esStageEn = ES_STAGE_DS; } const auto waveSize = m_pipelineState->getShaderWaveSize(apiStage); vgtShaderStagesEn[Util::Abi::VgtShaderStagesEnMetadataKey::GsW32En] = (waveSize == 32); if (m_gfxIp.major <= 11) { - vgtShaderStagesEn[Util::Abi::VgtShaderStagesEnMetadataKey::EsStageEn] = esStageEn; + vgtShaderStagesEn[Util::Abi::VgtShaderStagesEnMetadataKey::EsStageEn] = m_hasTes ? ES_STAGE_DS : ES_STAGE_REAL; if (m_isNggMode && !m_hasMesh) vgtShaderStagesEn[Util::Abi::VgtShaderStagesEnMetadataKey::VsStageEn] = VS_STAGE_REAL; } diff --git a/lgc/patch/VertexFetch.cpp b/lgc/patch/VertexFetch.cpp index 0d067684b1..a67948cfb4 100644 --- a/lgc/patch/VertexFetch.cpp +++ b/lgc/patch/VertexFetch.cpp @@ -97,7 +97,7 @@ class VertexFetchImpl : public VertexFetch { BuilderImpl &builderImpl) override; // Generate code to fetch a vertex value for uber shader - Value *fetchVertex(InputImportGenericOp *inst, Value *descPtr, BuilderBase &builder) override; + Value *fetchVertex(InputImportGenericOp *inst, Value *descPtr, Value *locMasks, BuilderBase &builder) override; private: void initialize(PipelineState *pipelineState); @@ -125,6 +125,8 @@ class VertexFetchImpl : public VertexFetch { bool needSecondVertexFetch(const VertexInputDescription *inputDesc) const; + bool needPatch32(const VertexInputDescription *inputDesc) const; + LgcContext *m_lgcContext = nullptr; // LGC context LLVMContext *m_context = nullptr; // LLVM context Value *m_vertexBufTablePtr = nullptr; // Vertex buffer table pointer @@ -133,8 +135,8 @@ class VertexFetchImpl : public VertexFetch { Value 
*m_instanceIndex = nullptr; // Instance index static const VertexCompFormatInfo m_vertexCompFormatInfo[]; // Info table of vertex component format - static const unsigned char m_vertexFormatMapGfx10[][8]; // Info table of vertex format mapping for GFX10 - static const unsigned char m_vertexFormatMapGfx11[][8]; // Info table of vertex format mapping for GFX11 + static const unsigned char m_vertexFormatMapGfx10[][9]; // Info table of vertex format mapping for GFX10 + static const unsigned char m_vertexFormatMapGfx11[][9]; // Info table of vertex format mapping for GFX11 // Default values for vertex fetch (<4 x i32> or <8 x i32>) struct { @@ -178,7 +180,7 @@ const VertexCompFormatInfo VertexFetchImpl::m_vertexCompFormatInfo[] = { }; // clang-format off -const unsigned char VertexFetchImpl::m_vertexFormatMapGfx10[][8] = { +const unsigned char VertexFetchImpl::m_vertexFormatMapGfx10[][9] = { // BUF_DATA_FORMAT // BUF_NUM_FORMAT_UNORM // BUF_NUM_FORMAT_SNORM @@ -188,6 +190,7 @@ const unsigned char VertexFetchImpl::m_vertexFormatMapGfx10[][8] = { // BUF_NUM_FORMAT_SINT // BUF_NUM_FORMAT_SNORM_NZ // BUF_NUM_FORMAT_FLOAT + // BUF_NUM_FORMAT_FIXED // BUF_DATA_FORMAT_INVALID {BUF_FORMAT_INVALID, @@ -197,6 +200,7 @@ const unsigned char VertexFetchImpl::m_vertexFormatMapGfx10[][8] = { BUF_FORMAT_INVALID, BUF_FORMAT_INVALID, BUF_FORMAT_INVALID, + BUF_FORMAT_INVALID, BUF_FORMAT_INVALID}, // BUF_DATA_FORMAT_8 @@ -207,6 +211,7 @@ const unsigned char VertexFetchImpl::m_vertexFormatMapGfx10[][8] = { BUF_FORMAT_8_UINT, BUF_FORMAT_8_SINT, BUF_FORMAT_INVALID, + BUF_FORMAT_INVALID, BUF_FORMAT_INVALID}, // BUF_DATA_FORMAT_16 @@ -217,7 +222,8 @@ const unsigned char VertexFetchImpl::m_vertexFormatMapGfx10[][8] = { BUF_FORMAT_16_UINT, BUF_FORMAT_16_SINT, BUF_FORMAT_INVALID, - BUF_FORMAT_16_FLOAT}, + BUF_FORMAT_16_FLOAT, + BUF_FORMAT_INVALID}, // BUF_DATA_FORMAT_8_8 {BUF_FORMAT_8_8_UNORM, @@ -227,17 +233,19 @@ const unsigned char VertexFetchImpl::m_vertexFormatMapGfx10[][8] = { 
BUF_FORMAT_8_8_UINT, BUF_FORMAT_8_8_SINT, BUF_FORMAT_INVALID, + BUF_FORMAT_INVALID, BUF_FORMAT_INVALID}, // BUF_DATA_FORMAT_32 - {BUF_FORMAT_INVALID, - BUF_FORMAT_INVALID, - BUF_FORMAT_INVALID, - BUF_FORMAT_INVALID, + {BUF_FORMAT_32_UINT, + BUF_FORMAT_32_SINT, + BUF_FORMAT_32_UINT, + BUF_FORMAT_32_SINT, BUF_FORMAT_32_UINT, BUF_FORMAT_32_SINT, BUF_FORMAT_INVALID, - BUF_FORMAT_32_FLOAT}, + BUF_FORMAT_32_FLOAT, + BUF_FORMAT_32_SINT}, // BUF_DATA_FORMAT_16_16 {BUF_FORMAT_16_16_UNORM, @@ -247,7 +255,8 @@ const unsigned char VertexFetchImpl::m_vertexFormatMapGfx10[][8] = { BUF_FORMAT_16_16_UINT, BUF_FORMAT_16_16_SINT, BUF_FORMAT_INVALID, - BUF_FORMAT_16_16_FLOAT}, + BUF_FORMAT_16_16_FLOAT, + BUF_FORMAT_INVALID}, // BUF_DATA_FORMAT_10_11_11 {BUF_FORMAT_10_11_11_UNORM_GFX10, @@ -257,7 +266,8 @@ const unsigned char VertexFetchImpl::m_vertexFormatMapGfx10[][8] = { BUF_FORMAT_10_11_11_UINT_GFX10, BUF_FORMAT_10_11_11_SINT_GFX10, BUF_FORMAT_INVALID, - BUF_FORMAT_10_11_11_FLOAT_GFX10}, + BUF_FORMAT_10_11_11_FLOAT_GFX10, + BUF_FORMAT_INVALID}, // BUF_DATA_FORMAT_11_11_10 {BUF_FORMAT_11_11_10_UNORM_GFX10, @@ -267,7 +277,8 @@ const unsigned char VertexFetchImpl::m_vertexFormatMapGfx10[][8] = { BUF_FORMAT_11_11_10_UINT_GFX10, BUF_FORMAT_11_11_10_SINT_GFX10, BUF_FORMAT_INVALID, - BUF_FORMAT_11_11_10_FLOAT_GFX10}, + BUF_FORMAT_11_11_10_FLOAT_GFX10, + BUF_FORMAT_INVALID}, // BUF_DATA_FORMAT_10_10_10_2 {BUF_FORMAT_10_10_10_2_UNORM_GFX10, @@ -277,6 +288,7 @@ const unsigned char VertexFetchImpl::m_vertexFormatMapGfx10[][8] = { BUF_FORMAT_10_10_10_2_UINT_GFX10, BUF_FORMAT_10_10_10_2_SINT_GFX10, BUF_FORMAT_INVALID, + BUF_FORMAT_INVALID, BUF_FORMAT_INVALID}, // BUF_DATA_FORMAT_2_10_10_10 @@ -287,6 +299,7 @@ const unsigned char VertexFetchImpl::m_vertexFormatMapGfx10[][8] = { BUF_FORMAT_2_10_10_10_UINT_GFX10, BUF_FORMAT_2_10_10_10_SINT_GFX10, BUF_FORMAT_INVALID, + BUF_FORMAT_INVALID, BUF_FORMAT_INVALID}, // BUF_DATA_FORMAT_8_8_8_8 @@ -297,17 +310,19 @@ const unsigned char 
VertexFetchImpl::m_vertexFormatMapGfx10[][8] = { BUF_FORMAT_8_8_8_8_UINT_GFX10, BUF_FORMAT_8_8_8_8_SINT_GFX10, BUF_FORMAT_INVALID, + BUF_FORMAT_INVALID, BUF_FORMAT_INVALID}, // BUF_DATA_FORMAT_32_32 - {BUF_FORMAT_INVALID, - BUF_FORMAT_INVALID, - BUF_FORMAT_INVALID, - BUF_FORMAT_INVALID, + {BUF_FORMAT_32_32_UINT_GFX10, + BUF_FORMAT_32_32_SINT_GFX10, + BUF_FORMAT_32_32_UINT_GFX10, + BUF_FORMAT_32_32_SINT_GFX10, BUF_FORMAT_32_32_UINT_GFX10, BUF_FORMAT_32_32_SINT_GFX10, BUF_FORMAT_INVALID, - BUF_FORMAT_32_32_FLOAT_GFX10}, + BUF_FORMAT_32_32_FLOAT_GFX10, + BUF_FORMAT_32_32_SINT_GFX10}, // BUF_DATA_FORMAT_16_16_16_16 {BUF_FORMAT_16_16_16_16_UNORM_GFX10, @@ -317,27 +332,30 @@ const unsigned char VertexFetchImpl::m_vertexFormatMapGfx10[][8] = { BUF_FORMAT_16_16_16_16_UINT_GFX10, BUF_FORMAT_16_16_16_16_SINT_GFX10, BUF_FORMAT_INVALID, - BUF_FORMAT_16_16_16_16_FLOAT_GFX10}, + BUF_FORMAT_16_16_16_16_FLOAT_GFX10, + BUF_FORMAT_INVALID}, // BUF_DATA_FORMAT_32_32_32 - {BUF_FORMAT_INVALID, - BUF_FORMAT_INVALID, - BUF_FORMAT_INVALID, - BUF_FORMAT_INVALID, + {BUF_FORMAT_32_32_32_UINT_GFX10, + BUF_FORMAT_32_32_32_SINT_GFX10, + BUF_FORMAT_32_32_32_UINT_GFX10, + BUF_FORMAT_32_32_32_SINT_GFX10, BUF_FORMAT_32_32_32_UINT_GFX10, BUF_FORMAT_32_32_32_SINT_GFX10, BUF_FORMAT_INVALID, - BUF_FORMAT_32_32_32_FLOAT_GFX10}, + BUF_FORMAT_32_32_32_FLOAT_GFX10, + BUF_FORMAT_32_32_32_SINT_GFX10}, // BUF_DATA_FORMAT_32_32_32_32 - {BUF_FORMAT_INVALID, - BUF_FORMAT_INVALID, - BUF_FORMAT_INVALID, - BUF_FORMAT_INVALID, + {BUF_FORMAT_32_32_32_32_UINT_GFX10, + BUF_FORMAT_32_32_32_32_SINT_GFX10, + BUF_FORMAT_32_32_32_32_UINT_GFX10, + BUF_FORMAT_32_32_32_32_SINT_GFX10, BUF_FORMAT_32_32_32_32_UINT_GFX10, BUF_FORMAT_32_32_32_32_SINT_GFX10, BUF_FORMAT_INVALID, - BUF_FORMAT_32_32_32_32_FLOAT_GFX10}, + BUF_FORMAT_32_32_32_32_FLOAT_GFX10, + BUF_FORMAT_32_32_32_32_SINT_GFX10}, // BUF_DATA_FORMAT_RESERVED_15 {BUF_FORMAT_INVALID, @@ -347,12 +365,13 @@ const unsigned char VertexFetchImpl::m_vertexFormatMapGfx10[][8] = { 
BUF_FORMAT_INVALID, BUF_FORMAT_INVALID, BUF_FORMAT_INVALID, + BUF_FORMAT_INVALID, BUF_FORMAT_INVALID}, }; // clang-format on // clang-format off -const unsigned char VertexFetchImpl::m_vertexFormatMapGfx11[][8] = { +const unsigned char VertexFetchImpl::m_vertexFormatMapGfx11[][9] = { // BUF_DATA_FORMAT // BUF_NUM_FORMAT_UNORM // BUF_NUM_FORMAT_SNORM @@ -362,6 +381,7 @@ const unsigned char VertexFetchImpl::m_vertexFormatMapGfx11[][8] = { // BUF_NUM_FORMAT_SINT // BUF_NUM_FORMAT_SNORM_NZ // BUF_NUM_FORMAT_FLOAT + // BUF_NUM_FORMAT_FIXED // BUF_DATA_FORMAT_INVALID {BUF_FORMAT_INVALID, @@ -371,6 +391,7 @@ const unsigned char VertexFetchImpl::m_vertexFormatMapGfx11[][8] = { BUF_FORMAT_INVALID, BUF_FORMAT_INVALID, BUF_FORMAT_INVALID, + BUF_FORMAT_INVALID, BUF_FORMAT_INVALID}, // BUF_DATA_FORMAT_8 @@ -381,6 +402,7 @@ const unsigned char VertexFetchImpl::m_vertexFormatMapGfx11[][8] = { BUF_FORMAT_8_UINT, BUF_FORMAT_8_SINT, BUF_FORMAT_INVALID, + BUF_FORMAT_INVALID, BUF_FORMAT_INVALID}, // BUF_DATA_FORMAT_16 @@ -391,7 +413,8 @@ const unsigned char VertexFetchImpl::m_vertexFormatMapGfx11[][8] = { BUF_FORMAT_16_UINT, BUF_FORMAT_16_SINT, BUF_FORMAT_INVALID, - BUF_FORMAT_16_FLOAT}, + BUF_FORMAT_16_FLOAT, + BUF_FORMAT_INVALID}, // BUF_DATA_FORMAT_8_8 {BUF_FORMAT_8_8_UNORM, @@ -401,17 +424,19 @@ const unsigned char VertexFetchImpl::m_vertexFormatMapGfx11[][8] = { BUF_FORMAT_8_8_UINT, BUF_FORMAT_8_8_SINT, BUF_FORMAT_INVALID, + BUF_FORMAT_INVALID, BUF_FORMAT_INVALID}, // BUF_DATA_FORMAT_32 - {BUF_FORMAT_INVALID, - BUF_FORMAT_INVALID, - BUF_FORMAT_INVALID, - BUF_FORMAT_INVALID, + {BUF_FORMAT_32_UINT, + BUF_FORMAT_32_SINT, + BUF_FORMAT_32_UINT, + BUF_FORMAT_32_SINT, BUF_FORMAT_32_UINT, BUF_FORMAT_32_SINT, BUF_FORMAT_INVALID, - BUF_FORMAT_32_FLOAT}, + BUF_FORMAT_32_FLOAT, + BUF_FORMAT_32_SINT}, // BUF_DATA_FORMAT_16_16 {BUF_FORMAT_16_16_UNORM, @@ -421,7 +446,8 @@ const unsigned char VertexFetchImpl::m_vertexFormatMapGfx11[][8] = { BUF_FORMAT_16_16_UINT, BUF_FORMAT_16_16_SINT, 
BUF_FORMAT_INVALID, - BUF_FORMAT_16_16_FLOAT}, + BUF_FORMAT_16_16_FLOAT, + BUF_FORMAT_INVALID}, // BUF_DATA_FORMAT_10_11_11 {BUF_FORMAT_INVALID, @@ -431,7 +457,8 @@ const unsigned char VertexFetchImpl::m_vertexFormatMapGfx11[][8] = { BUF_FORMAT_INVALID, BUF_FORMAT_INVALID, BUF_FORMAT_INVALID, - BUF_FORMAT_10_11_11_FLOAT_GFX11}, + BUF_FORMAT_10_11_11_FLOAT_GFX11, + BUF_FORMAT_INVALID}, // BUF_DATA_FORMAT_11_11_10 {BUF_FORMAT_INVALID, @@ -441,7 +468,8 @@ const unsigned char VertexFetchImpl::m_vertexFormatMapGfx11[][8] = { BUF_FORMAT_INVALID, BUF_FORMAT_INVALID, BUF_FORMAT_INVALID, - BUF_FORMAT_11_11_10_FLOAT_GFX11}, + BUF_FORMAT_11_11_10_FLOAT_GFX11, + BUF_FORMAT_INVALID}, // BUF_DATA_FORMAT_10_10_10_2 {BUF_FORMAT_10_10_10_2_UNORM_GFX11, @@ -451,6 +479,7 @@ const unsigned char VertexFetchImpl::m_vertexFormatMapGfx11[][8] = { BUF_FORMAT_10_10_10_2_UINT_GFX11, BUF_FORMAT_10_10_10_2_SINT_GFX11, BUF_FORMAT_INVALID, + BUF_FORMAT_INVALID, BUF_FORMAT_INVALID}, // BUF_DATA_FORMAT_2_10_10_10 @@ -461,6 +490,7 @@ const unsigned char VertexFetchImpl::m_vertexFormatMapGfx11[][8] = { BUF_FORMAT_2_10_10_10_UINT_GFX11, BUF_FORMAT_2_10_10_10_SINT_GFX11, BUF_FORMAT_INVALID, + BUF_FORMAT_INVALID, BUF_FORMAT_INVALID}, // BUF_DATA_FORMAT_8_8_8_8 @@ -471,17 +501,19 @@ const unsigned char VertexFetchImpl::m_vertexFormatMapGfx11[][8] = { BUF_FORMAT_8_8_8_8_UINT_GFX11, BUF_FORMAT_8_8_8_8_SINT_GFX11, BUF_FORMAT_INVALID, + BUF_FORMAT_INVALID, BUF_FORMAT_INVALID}, // BUF_DATA_FORMAT_32_32 - {BUF_FORMAT_INVALID, - BUF_FORMAT_INVALID, - BUF_FORMAT_INVALID, - BUF_FORMAT_INVALID, + {BUF_FORMAT_32_32_UINT_GFX11, + BUF_FORMAT_32_32_SINT_GFX11, + BUF_FORMAT_32_32_UINT_GFX11, + BUF_FORMAT_32_32_SINT_GFX11, BUF_FORMAT_32_32_UINT_GFX11, BUF_FORMAT_32_32_SINT_GFX11, BUF_FORMAT_INVALID, - BUF_FORMAT_32_32_FLOAT_GFX11}, + BUF_FORMAT_32_32_FLOAT_GFX11, + BUF_FORMAT_32_32_SINT_GFX11}, // BUF_DATA_FORMAT_16_16_16_16 {BUF_FORMAT_16_16_16_16_UNORM_GFX11, @@ -491,27 +523,30 @@ const unsigned char 
VertexFetchImpl::m_vertexFormatMapGfx11[][8] = { BUF_FORMAT_16_16_16_16_UINT_GFX11, BUF_FORMAT_16_16_16_16_SINT_GFX11, BUF_FORMAT_INVALID, - BUF_FORMAT_16_16_16_16_FLOAT_GFX11}, + BUF_FORMAT_16_16_16_16_FLOAT_GFX11, + BUF_FORMAT_INVALID}, // BUF_DATA_FORMAT_32_32_32 - {BUF_FORMAT_INVALID, - BUF_FORMAT_INVALID, - BUF_FORMAT_INVALID, - BUF_FORMAT_INVALID, + {BUF_FORMAT_32_32_32_UINT_GFX11, + BUF_FORMAT_32_32_32_SINT_GFX11, + BUF_FORMAT_32_32_32_UINT_GFX11, + BUF_FORMAT_32_32_32_SINT_GFX11, BUF_FORMAT_32_32_32_UINT_GFX11, BUF_FORMAT_32_32_32_SINT_GFX11, BUF_FORMAT_INVALID, - BUF_FORMAT_32_32_32_FLOAT_GFX11}, + BUF_FORMAT_32_32_32_FLOAT_GFX11, + BUF_FORMAT_32_32_32_SINT_GFX11}, // BUF_DATA_FORMAT_32_32_32_32 - {BUF_FORMAT_INVALID, - BUF_FORMAT_INVALID, - BUF_FORMAT_INVALID, - BUF_FORMAT_INVALID, + {BUF_FORMAT_32_32_32_32_UINT_GFX11, + BUF_FORMAT_32_32_32_32_SINT_GFX11, + BUF_FORMAT_32_32_32_32_UINT_GFX11, + BUF_FORMAT_32_32_32_32_SINT_GFX11, BUF_FORMAT_32_32_32_32_UINT_GFX11, BUF_FORMAT_32_32_32_32_SINT_GFX11, BUF_FORMAT_INVALID, - BUF_FORMAT_32_32_32_32_FLOAT_GFX11}, + BUF_FORMAT_32_32_32_32_FLOAT_GFX11, + BUF_FORMAT_32_32_32_32_SINT_GFX11}, // BUF_DATA_FORMAT_RESERVED_15 {BUF_FORMAT_INVALID, @@ -521,6 +556,7 @@ const unsigned char VertexFetchImpl::m_vertexFormatMapGfx11[][8] = { BUF_FORMAT_INVALID, BUF_FORMAT_INVALID, BUF_FORMAT_INVALID, + BUF_FORMAT_INVALID, BUF_FORMAT_INVALID}, }; // clang-format on @@ -570,13 +606,16 @@ bool LowerVertexFetch::runImpl(Module &module, PipelineState *pipelineState) { auto desc = builder.CreateLoadBufferDesc(InternalDescriptorSetId, FetchShaderInternalBufferBinding, builder.getInt32(0), Builder::BufferFlagAddress); - // The size of each input descriptor is sizeof(UberFetchShaderAttribInfo). 
vector4 - auto uberFetchAttrType = FixedVectorType::get(builder.getInt32Ty(), 4); - auto descPtr = builder.CreateIntToPtr(desc, PointerType::get(uberFetchAttrType, ADDR_SPACE_CONST)); + auto descPtr = builder.CreateIntToPtr(desc, builder.getPtrTy(ADDR_SPACE_CONST)); + Value *locationMasks = builder.getInt64(~0); +#if LLPC_CLIENT_INTERFACE_MAJOR_VERSION >= 67 + locationMasks = builder.CreateLoad(builder.getInt64Ty(), descPtr); + descPtr = builder.CreateGEP(builder.getInt64Ty(), descPtr, {builder.getInt32(1)}); +#endif for (InputImportGenericOp *inst : vertexFetches) { builder.SetInsertPoint(inst); - Value *vertex = vertexFetch->fetchVertex(inst, descPtr, BuilderBase::get(builder)); + Value *vertex = vertexFetch->fetchVertex(inst, descPtr, locationMasks, BuilderBase::get(builder)); // Replace and erase this instruction. inst->replaceAllUsesWith(vertex); inst->eraseFromParent(); @@ -684,13 +723,10 @@ bool LowerVertexFetch::runImpl(Module &module, PipelineState *pipelineState) { // // @param inst : the input instruction // @param descPtr : 64bit address of buffer +// @param locMasks : determine if the attribute data is valid. 
// @param builder : Builder to use to insert vertex fetch instructions // @returns : vertex -Value *VertexFetchImpl::fetchVertex(InputImportGenericOp *inst, llvm::Value *descPtr, BuilderBase &builder) { - unsigned location = inst->getLocation(); - unsigned compIdx = cast(inst->getElemIdx())->getZExtValue(); - auto zero = builder.getInt32(0); - +Value *VertexFetchImpl::fetchVertex(InputImportGenericOp *inst, Value *descPtr, Value *locMasks, BuilderBase &builder) { if (!m_vertexIndex) { IRBuilderBase::InsertPointGuard ipg(builder); builder.SetInsertPointPastAllocas(inst->getFunction()); @@ -720,18 +756,28 @@ Value *VertexFetchImpl::fetchVertex(InputImportGenericOp *inst, llvm::Value *des auto currentBlock = inst->getParent(); auto fetchEndBlock = currentBlock->splitBasicBlock(inst); - auto perCompEndBlock = createBlock(".perCompEnd", fetchEndBlock); + auto fetchUberEndBlock = createBlock(".fetchUberEndBlock", fetchEndBlock); + auto perCompEndBlock = createBlock(".perCompEnd", fetchUberEndBlock); auto comp3Block = createBlock(".comp3Block", perCompEndBlock); auto comp2Block = createBlock(".comp2Block", comp3Block); auto comp1Block = createBlock(".comp1Block", comp2Block); auto comp0Block = createBlock(".comp0Block", comp1Block); auto wholeVertexBlock = createBlock(".wholeVertex", comp0Block); - auto fetchStartBlock = createBlock("fetchStart", wholeVertexBlock); + auto fetchUberStartBlock = createBlock(".fetchUberStartBlock", wholeVertexBlock); + auto fetchStartBlock = createBlock(".fetchStart", fetchUberStartBlock); + unsigned location = inst->getLocation(); + auto zero = builder.getInt32(0); builder.SetInsertPoint(currentBlock->getTerminator()); builder.CreateBr(fetchStartBlock); currentBlock->getTerminator()->eraseFromParent(); builder.SetInsertPoint(fetchStartBlock); + + auto locationAnd = builder.CreateAnd(locMasks, builder.getInt64(1ull << location)); + auto isAttriValid = builder.CreateICmpNE(locationAnd, builder.getInt64(0)); + 
builder.CreateCondBr(isAttriValid, fetchUberStartBlock, fetchEndBlock); + + builder.SetInsertPoint(fetchUberStartBlock); // The size of each input descriptor is sizeof(UberFetchShaderAttribInfo). vector4 auto uberFetchAttrType = FixedVectorType::get(Type::getInt32Ty(*m_context), 4); descPtr = builder.CreateGEP(uberFetchAttrType, descPtr, {builder.getInt32(location)}); @@ -836,7 +882,7 @@ Value *VertexFetchImpl::fetchVertex(InputImportGenericOp *inst, llvm::Value *des auto secondFetch = builder.CreateIntrinsic(Intrinsic::amdgcn_struct_buffer_load_format, fetchType, args, {}); wholeVertex = builder.CreateShuffleVector(wholeVertex, secondFetch, ArrayRef{0, 1, 2, 3, 4, 5, 6, 7}); } - builder.CreateBr(fetchEndBlock); + builder.CreateBr(fetchUberEndBlock); } fetchType = FixedVectorType::get(builder.getInt32Ty(), is64bitFetch ? 8 : 4); @@ -970,11 +1016,11 @@ Value *VertexFetchImpl::fetchVertex(InputImportGenericOp *inst, llvm::Value *des auto fixedVertex = builder.CreateShuffleVector(lastVert, lastVert, ArrayRef{2, 1, 0, 3}); lastVert = builder.CreateSelect(isBgr, fixedVertex, lastVert); } - builder.CreateBr(fetchEndBlock); + builder.CreateBr(fetchUberEndBlock); } - // .fetchEnd - builder.SetInsertPoint(&*fetchEndBlock->getFirstInsertionPt()); + // .fetchUberEndBlock + builder.SetInsertPoint(fetchUberEndBlock); auto phiInst = builder.CreatePHI(lastVert->getType(), 2); phiInst->addIncoming(wholeVertex, wholeVertexBlock); phiInst->addIncoming(lastVert, perCompEndBlock); @@ -995,6 +1041,7 @@ Value *VertexFetchImpl::fetchVertex(InputImportGenericOp *inst, llvm::Value *des std::vector vertexValues(vertexCompCount); // NOTE: Original component index is based on the basic scalar type. + unsigned compIdx = cast(inst->getElemIdx())->getZExtValue(); compIdx *= (bitWidth == 64 ? 
2 : 1); // Vertex input might take values from vertex fetch values or default fetch values @@ -1036,7 +1083,17 @@ Value *VertexFetchImpl::fetchVertex(InputImportGenericOp *inst, llvm::Value *des if (vertex->getType() != inputTy) vertex = builder.CreateBitCast(vertex, inputTy); - vertex->setName("vertex"); + builder.CreateBr(fetchEndBlock); + + // .fetchEndBlock + { + builder.SetInsertPoint(&*fetchEndBlock->getFirstInsertionPt()); + auto phiInst = builder.CreatePHI(inputTy, 2); + phiInst->addIncoming(PoisonValue::get(inputTy), fetchStartBlock); + phiInst->addIncoming(vertex, fetchUberEndBlock); + vertex = phiInst; + vertex->setName("vertex"); + } return vertex; } @@ -1149,7 +1206,8 @@ Value *VertexFetchImpl::fetchVertex(Type *inputTy, const VertexInputDescription std::vector shuffleMask; bool postShuffle = needPostShuffle(description, shuffleMask); bool patchA2S = needPatchA2S(description); - if (postShuffle || patchA2S) { + bool patch32 = needPatch32(description); + if (postShuffle || patchA2S || patch32) { if (postShuffle) { // NOTE: If we are fetching a swizzled format, we have to add an extra "shufflevector" instruction to // get the components in the right order. @@ -1233,6 +1291,56 @@ Value *VertexFetchImpl::fetchVertex(Type *inputTy, const VertexInputDescription vertexFetches[0] = InsertElementInst::Create(vertexFetches[0], alpha, ConstantInt::get(Type::getInt32Ty(*m_context), 3), "", insertPos); } + + if (patch32) { + bool isSigned = (description->nfmt == BufNumFormatSscaled || description->nfmt == BufNumFormatSnorm || + description->nfmt == BufNumFormatFixed); + + // Whether need to do normalization emulation. + bool isNorm = (description->nfmt == BufNumFormatSnorm || description->nfmt == BufNumFormatUnorm); + + // Whether need to do fixed point emulation + bool isFixed = (description->nfmt == BufNumFormatFixed); + + // Whether need to tranlate from int bits to float bits. 
+ bool needTransToFp = (description->nfmt == BufNumFormatSscaled || description->nfmt == BufNumFormatSnorm || + description->nfmt == BufNumFormatUscaled || description->nfmt == BufNumFormatUnorm); + + // Only for 32 bits format patch and emulation. + for (unsigned i = 0; i < formatInfo.numChannels; ++i) { + Value *elemInstr = ExtractElementInst::Create(vertexFetches[0], + ConstantInt::get(Type::getInt32Ty(*m_context), i), "", insertPos); + if (needTransToFp) { + // A constant divisor for normalization emulation. + float normDiv = 2.14748365e+09f; + if (isSigned) { + // Signed int to float + elemInstr = new SIToFPInst(elemInstr, Type::getFloatTy(*m_context), "", insertPos); + } else { + // Unsigned int to float + elemInstr = new UIToFPInst(elemInstr, Type::getFloatTy(*m_context), "", insertPos); + normDiv = 4.29496730e+09f; + } + if (isNorm) { + // Normalization emulation. + elemInstr = BinaryOperator::CreateFDiv(elemInstr, ConstantFP::get(Type::getFloatTy(*m_context), normDiv), + "", insertPos); + } + } else if (isFixed) { + // A constant divisor to translate loaded float bits to fixed point format. + float fixedPointMul = 1.0f / 65536.0f; + elemInstr = new SIToFPInst(elemInstr, Type::getFloatTy(*m_context), "", insertPos); + elemInstr = BinaryOperator::CreateFMul( + elemInstr, ConstantFP::get(Type::getFloatTy(*m_context), fixedPointMul), "", insertPos); + } else { + llvm_unreachable("Should never be called!"); + } + + elemInstr = new BitCastInst(elemInstr, Type::getInt32Ty(*m_context), "", insertPos); + vertexFetches[0] = InsertElementInst::Create(vertexFetches[0], elemInstr, + ConstantInt::get(Type::getInt32Ty(*m_context), i), "", insertPos); + } + } } // Do the second vertex fetch operation @@ -1284,6 +1392,10 @@ Value *VertexFetchImpl::fetchVertex(Type *inputTy, const VertexInputDescription // Finalize vertex fetch Type *basicTy = inputTy->isVectorTy() ? 
cast(inputTy)->getElementType() : inputTy; + bool needDoubleEmulation = + description->dfmt >= BufDataFormat64 && description->dfmt <= BufDataFormat64_64_64_64 && basicTy->isFloatTy(); + if (needDoubleEmulation) + basicTy = Type::getDoubleTy(*m_context); const unsigned bitWidth = basicTy->getScalarSizeInBits(); assert(bitWidth == 8 || bitWidth == 16 || bitWidth == 32 || bitWidth == 64); @@ -1381,6 +1493,14 @@ Value *VertexFetchImpl::fetchVertex(Type *inputTy, const VertexInputDescription vertex = new TruncInst(vertex, truncTy, "", insertPos); } + if (needDoubleEmulation) { + // SPIR-V extended format emulation + // If input type is float32 but vertex attribute data format is float64, we need another float point trunc step. + int vecSize = cast(vertex->getType())->getNumElements() / 2; + vertex = new BitCastInst(vertex, FixedVectorType::get(Type::getDoubleTy(*m_context), vecSize), "", insertPos); + vertex = new FPTruncInst(vertex, FixedVectorType::get(Type::getFloatTy(*m_context), vecSize), "", insertPos); + } + if (vertex->getType() != inputTy) vertex = new BitCastInst(vertex, inputTy, "", insertPos); vertex->setName("vertex" + Twine(location) + "." + Twine(compIdx)); @@ -1456,7 +1576,7 @@ const VertexCompFormatInfo *VertexFetchImpl::getVertexComponentFormatInfo(unsign // @param nfmt : Numeric format unsigned VertexFetchImpl::mapVertexFormat(unsigned dfmt, unsigned nfmt) const { assert(dfmt < 16); - assert(nfmt < 8); + assert(nfmt < 9); unsigned format = 0; GfxIpVersion gfxIp = m_lgcContext->getTargetInfo().getGfxIpVersion(); @@ -1696,6 +1816,30 @@ bool VertexFetchImpl::needPostShuffle(const VertexInputDescription *inputDesc, return needShuffle; } +// ===================================================================================================================== +// Checks whether a patch (emulation) step is needed for some 32 bits vertex attribute formats. 
+// +// @param inputDesc : Vertex input description +bool VertexFetchImpl::needPatch32(const VertexInputDescription *inputDesc) const { + bool needPatch = false; + + switch (inputDesc->dfmt) { + case BufDataFormat32: + case BufDataFormat32_32: + case BufDataFormat32_32_32: + case BufDataFormat32_32_32_32: + if (inputDesc->nfmt == BufNumFormatSscaled || inputDesc->nfmt == BufNumFormatUscaled || + inputDesc->nfmt == BufNumFormatSnorm || inputDesc->nfmt == BufNumFormatUnorm || + inputDesc->nfmt == BufNumFormatFixed) + needPatch = true; + break; + default: + break; + } + + return needPatch; +} + // ===================================================================================================================== // Checks whether patching 2-bit signed alpha channel is required for vertex fetch operation. // diff --git a/lgc/state/Compiler.cpp b/lgc/state/Compiler.cpp index cc5c075abe..e0226c99eb 100644 --- a/lgc/state/Compiler.cpp +++ b/lgc/state/Compiler.cpp @@ -219,7 +219,7 @@ bool PipelineState::generate(Module *pipelineModule, raw_pwrite_stream &outStrea } else { // Patching. 
Patch::addPasses(this, *passMgr, patchTimer, optTimer, std::move(checkShaderCacheFunc), - getLgcContext()->getOptimizationLevel()); + static_cast(getLgcContext()->getOptimizationLevel())); // Add pass to clear pipeline state from IR passMgr->addPass(PipelineStateClearer()); diff --git a/lgc/state/LgcContext.cpp b/lgc/state/LgcContext.cpp index 2f3d3184b9..bf0c4705bf 100644 --- a/lgc/state/LgcContext.cpp +++ b/lgc/state/LgcContext.cpp @@ -61,7 +61,7 @@ static codegen::RegisterCodeGenFlags CGF; static bool Initialized; #endif -raw_ostream *LgcContext::m_llpcOuts; +thread_local raw_ostream *LgcContext::m_llpcOuts; // -emit-llvm: emit LLVM assembly instead of ISA static cl::opt EmitLlvm("emit-llvm", cl::desc("Emit LLVM assembly instead of AMD GPU ISA"), cl::init(false)); @@ -77,12 +77,23 @@ static cl::opt EmitLgc("emit-lgc", cl::desc("Emit LLVM assembly suitable f static cl::opt ShowEncoding("show-encoding", cl::desc("Show instruction encodings"), cl::init(false)); // -opt: Override the optimization level passed in to LGC with the given one. 
+#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 474768 +// Old version of the code static cl::opt OptLevel("opt", cl::desc("Set the optimization level for LGC:"), cl::init(CodeGenOpt::Default), values(clEnumValN(CodeGenOpt::None, "none", "no optimizations"), clEnumValN(CodeGenOpt::Less, "quick", "quick compilation time"), clEnumValN(CodeGenOpt::Default, "default", "default optimizations"), clEnumValN(CodeGenOpt::Aggressive, "fast", "fast execution time"))); +#else +// New version of the code (also handles unknown version, which we treat as latest) +static cl::opt + OptLevel("opt", cl::desc("Set the optimization level for LGC:"), cl::init(CodeGenOptLevel::Default), + values(clEnumValN(CodeGenOptLevel::None, "none", "no optimizations"), + clEnumValN(CodeGenOptLevel::Less, "quick", "quick compilation time"), + clEnumValN(CodeGenOptLevel::Default, "default", "default optimizations"), + clEnumValN(CodeGenOptLevel::Aggressive, "fast", "fast execution time"))); +#endif // ===================================================================================================================== // Set default for a command-line option, but only if command-line processing has not happened yet, or did not see @@ -216,7 +227,14 @@ bool LgcContext::isGpuNameValid(llvm::StringRef gpuName) { // // @param gpuName : LLVM GPU name (e.g. 
"gfx900"); empty to use -mcpu option setting // @param optLevel : LLVM optimization level used to initialize target machine -std::unique_ptr LgcContext::createTargetMachine(StringRef gpuName, CodeGenOpt::Level optLevel) { +#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 474768 +// Old version of the code +std::unique_ptr LgcContext::createTargetMachine(StringRef gpuName, CodeGenOpt::Level optLevel) +#else +// New version of the code (also handles unknown version, which we treat as latest) +std::unique_ptr LgcContext::createTargetMachine(StringRef gpuName, CodeGenOptLevel optLevel) +#endif +{ assert(Initialized && "Must call LgcContext::initialize before LgcContext::createTargetMachine"); std::string mcpuName = codegen::getMCPU(); // -mcpu setting from llvm/CodeGen/CommandFlags.h @@ -245,7 +263,7 @@ std::unique_ptr LgcContext::createTargetMachine(StringRef gpuName if (OptLevel.getPosition() != 0) optLevel = OptLevel; - LLPC_OUTS("TargetMachine optimization level = " << optLevel << "\n"); + LLPC_OUTS("TargetMachine optimization level = " << static_cast(optLevel) << "\n"); return std::unique_ptr(target->createTargetMachine(triple, gpuName, "", targetOpts, {}, {}, optLevel)); } @@ -358,7 +376,14 @@ void LgcContext::addTargetPasses(lgc::LegacyPassManager &passMgr, Timer *codeGen passMgr.add(createStartStopTimer(codeGenTimer, false)); } +#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 474768 +// Old version of the code llvm::CodeGenOpt::Level LgcContext::getOptimizationLevel() const { +#else +// New version of the code (also handles unknown version, which we treat as latest) +llvm::CodeGenOptLevel LgcContext::getOptimizationLevel() const { +#endif + return m_targetMachine->getOptLevel(); } diff --git a/lgc/state/PalMetadata.cpp b/lgc/state/PalMetadata.cpp index d2c54f88fd..fff1a5fb36 100644 --- a/lgc/state/PalMetadata.cpp +++ b/lgc/state/PalMetadata.cpp @@ -1113,6 +1113,26 @@ void PalMetadata::updateCbShaderMask(llvm::ArrayRef exps) { } } +// 
===================================================================================================================== +// Updates the DB shader control that depends on the CB state. +// +void PalMetadata::updateDbShaderControl() { + if (m_pipelineState->getTargetInfo().getGfxIpVersion().major >= 9) { + if (m_pipelineState->useRegisterFieldFormat()) { + auto dbShaderControl = m_pipelineNode[Util::Abi::PipelineMetadataKey::GraphicsRegisters] + .getMap(true)[Util::Abi::GraphicsRegisterMetadataKey::DbShaderControl] + .getMap(true); + dbShaderControl[Util::Abi::DbShaderControlMetadataKey::AlphaToMaskDisable] = + !m_pipelineState->getColorExportState().alphaToCoverageEnable; + } else { + DB_SHADER_CONTROL dbShaderControl = {}; + dbShaderControl.u32All = getRegister(mmDB_SHADER_CONTROL); + dbShaderControl.bitfields.ALPHA_TO_MASK_DISABLE = !m_pipelineState->getColorExportState().alphaToCoverageEnable; + setRegister(mmDB_SHADER_CONTROL, dbShaderControl.u32All); + } + } +} + // ===================================================================================================================== // Fills the xglCacheInfo section of the PAL metadata with the given data. // @@ -1252,17 +1272,6 @@ void PalMetadata::eraseFragmentInputInfo() { m_pipelineNode.erase(array3It); } -// ===================================================================================================================== -// Returns true if the fragment input info has an entry for a builtin. 
-bool PalMetadata::fragmentShaderUsesMappedBuiltInInputs() { - auto array2It = m_pipelineNode.find(m_document->getNode(PipelineMetadataKey::FragInputMapping2)); - if (array2It != m_pipelineNode.end()) { - auto fragInputMappingArray2 = array2It->second.getArray(true); - return !fragInputMappingArray2.empty(); - } - return false; -} - // ===================================================================================================================== // Returns the location of the fragment builtin or InvalidValue if the builtin is not found. // diff --git a/lgc/state/PipelineState.cpp b/lgc/state/PipelineState.cpp index aead3f2c5f..74e4e11952 100644 --- a/lgc/state/PipelineState.cpp +++ b/lgc/state/PipelineState.cpp @@ -98,6 +98,7 @@ static unsigned getMaxComponentBitCount(BufDataFormat dfmt) { case BufDataFormat8_8_8_Bgr: case BufDataFormat8_8_8_8: case BufDataFormat8_8_8_8_Bgra: + case BufDataFormat8_A: return 8; case BufDataFormat5_9_9_9: return 9; @@ -146,6 +147,7 @@ static bool hasAlpha(BufDataFormat dfmt) { case BufDataFormat5_6_5_1_Bgra: case BufDataFormat1_5_6_5: case BufDataFormat5_9_9_9: + case BufDataFormat8_A: return true; default: return false; @@ -164,6 +166,7 @@ static unsigned getNumChannels(BufDataFormat dfmt) { case BufDataFormat16: case BufDataFormat32: case BufDataFormat64: + case BufDataFormat8_A: return 1; case BufDataFormat4_4: case BufDataFormat8_8: @@ -324,7 +327,7 @@ ComputeShaderMode Pipeline::getComputeShaderMode(Module &module) { // @param emitLgc : Whether the option -emit-lgc is on PipelineState::PipelineState(LgcContext *builderContext, bool emitLgc) : Pipeline(builderContext), m_emitLgc(emitLgc), m_meshRowExport(EnableRowExport) { - m_registerFieldFormat = getTargetInfo().getGfxIpVersion().major >= 11 && UseRegisterFieldFormat; + m_registerFieldFormat = getTargetInfo().getGfxIpVersion().major >= 9 && UseRegisterFieldFormat; m_tessLevel.inner[0] = -1.0f; m_tessLevel.inner[1] = -1.0f; m_tessLevel.outer[0] = -1.0f; @@ -1184,7 +1187,7 
@@ void PipelineState::setColorExportState(ArrayRef formats, con // // @param location : Export location const ColorExportFormat &PipelineState::getColorExportFormat(unsigned location) { - if (getColorExportState().dualSourceBlendEnable) + if (getColorExportState().dualSourceBlendEnable || getColorExportState().dynamicDualSourceBlendEnable) location = 0; if (location >= m_colorExportFormats.size()) { @@ -1608,7 +1611,9 @@ unsigned PipelineState::computeExportFormat(Type *outputTy, unsigned location) { // When dual source blend is enabled, location 1 is location 0 index 1 in shader source. we need generate same export // format. const bool enableAlphaToCoverage = - (cbState->alphaToCoverageEnable && ((location == 0) || ((location == 1) && cbState->dualSourceBlendEnable))); + (cbState->alphaToCoverageEnable && + ((location == 0) || + ((location == 1) && (cbState->dualSourceBlendEnable || cbState->dynamicDualSourceBlendEnable)))); const bool blendEnabled = colorExportFormat->blendEnable; @@ -1628,7 +1633,7 @@ unsigned PipelineState::computeExportFormat(Type *outputTy, unsigned location) { const bool formatHasAlpha = hasAlpha(colorExportFormat->dfmt); const bool alphaExport = - (outputMask == 0xF && (formatHasAlpha || colorExportFormat->blendSrcAlphaToColor || enableAlphaToCoverage)); + (outputMask & 0x8 && (formatHasAlpha || colorExportFormat->blendSrcAlphaToColor || enableAlphaToCoverage)); const CompSetting compSetting = computeCompSetting(colorExportFormat->dfmt); @@ -1847,7 +1852,7 @@ unsigned PipelineState::getVerticesPerPrimitive() { return 1; if (tessMode.primitiveMode == PrimitiveMode::Isolines) return 2; - if (tessMode.primitiveMode == PrimitiveMode::Triangles) + if (tessMode.primitiveMode == PrimitiveMode::Triangles || tessMode.primitiveMode == PrimitiveMode::Quads) return 3; } else { auto primType = getInputAssemblyState().primitiveType; @@ -1910,25 +1915,27 @@ PrimitiveType PipelineState::getPrimitiveType() { void PipelineState::setXfbStateMetadata(Module 
*module) { // Read XFB state metadata for (auto &func : *module) { - if (isShaderEntryPoint(&func)) { - MDNode *xfbStateMetaNode = func.getMetadata(XfbStateMetadataName); - if (xfbStateMetaNode) { - auto &streamXfbBuffers = m_xfbStateMetadata.streamXfbBuffers; - auto &xfbStrides = m_xfbStateMetadata.xfbStrides; - for (unsigned xfbBuffer = 0; xfbBuffer < MaxTransformFeedbackBuffers; ++xfbBuffer) { - // Get the vertex streamId from metadata - auto metaOp = cast(xfbStateMetaNode->getOperand(2 * xfbBuffer)); - int streamId = cast(metaOp->getValue())->getSExtValue(); - if (streamId == InvalidValue) - continue; - streamXfbBuffers[streamId] |= 1 << xfbBuffer; // Bit mask of used xfbBuffers in a stream - // Get the stride from metadata - metaOp = cast(xfbStateMetaNode->getOperand(2 * xfbBuffer + 1)); - xfbStrides[xfbBuffer] = cast(metaOp->getValue())->getZExtValue(); - m_xfbStateMetadata.enableXfb = true; - } - m_xfbStateMetadata.enablePrimStats = !m_xfbStateMetadata.enableXfb; + if (!isShaderEntryPoint(&func)) + continue; + if (getShaderStage(&func) != getLastVertexProcessingStage()) + continue; + MDNode *xfbStateMetaNode = func.getMetadata(XfbStateMetadataName); + if (xfbStateMetaNode) { + auto &streamXfbBuffers = m_xfbStateMetadata.streamXfbBuffers; + auto &xfbStrides = m_xfbStateMetadata.xfbStrides; + for (unsigned xfbBuffer = 0; xfbBuffer < MaxTransformFeedbackBuffers; ++xfbBuffer) { + // Get the vertex streamId from metadata + auto metaOp = cast(xfbStateMetaNode->getOperand(2 * xfbBuffer)); + int streamId = cast(metaOp->getValue())->getSExtValue(); + if (streamId == InvalidValue) + continue; + streamXfbBuffers[streamId] |= 1 << xfbBuffer; // Bit mask of used xfbBuffers in a stream + // Get the stride from metadata + metaOp = cast(xfbStateMetaNode->getOperand(2 * xfbBuffer + 1)); + xfbStrides[xfbBuffer] = cast(metaOp->getValue())->getZExtValue(); + m_xfbStateMetadata.enableXfb = true; } + m_xfbStateMetadata.enablePrimStats = !m_xfbStateMetadata.enableXfb; } } } diff 
--git a/lgc/state/TargetInfo.cpp b/lgc/state/TargetInfo.cpp index 4e7eeacd69..5701968dfe 100644 --- a/lgc/state/TargetInfo.cpp +++ b/lgc/state/TargetInfo.cpp @@ -515,6 +515,19 @@ static void setGfx1100Info(TargetInfo *targetInfo) { targetInfo->getGpuProperty().numShaderEngines = 6; } +#if LLPC_BUILD_NAVI32 +// gfx1101 +// +// @param [in/out] targetInfo : Target info +static void setGfx1101Info(TargetInfo *targetInfo) { + setGfx11Info(targetInfo); + + targetInfo->getGpuWorkarounds().gfx11.waAtmPrecedesPos = 1; + + targetInfo->getGpuProperty().numShaderEngines = 3; +} +#endif + // gfx1102 // // @param [in/out] targetInfo : Target info @@ -585,6 +598,9 @@ bool TargetInfo::setTargetInfo(StringRef gpuName) { {"gfx1035", &setGfx1035Info}, // gfx1035, rembrandt {"gfx1036", &setGfx1036Info}, // gfx1036, raphael | mendocino {"gfx1100", &setGfx1100Info}, // gfx1100, navi31 +#if LLPC_BUILD_NAVI32 + {"gfx1101", &setGfx1101Info}, // gfx1101, navi32 +#endif {"gfx1102", &setGfx1102Info}, // gfx1102, navi33 #if LLPC_BUILD_PHOENIX1 {"gfx1103", &setGfx1103Info}, // gfx1103, phoenix1 diff --git a/lgc/test/BuiltIns/cs-numworkgroups.lgc b/lgc/test/BuiltIns/cs-numworkgroups.lgc index 528e404464..a8f0021c37 100644 --- a/lgc/test/BuiltIns/cs-numworkgroups.lgc +++ b/lgc/test/BuiltIns/cs-numworkgroups.lgc @@ -74,7 +74,7 @@ attributes #0 = { nounwind } ; CHECK-NEXT: - 0 ; CHECK-NEXT: .hardware_mapping: ; CHECK-NEXT: - .cs -; CHECK-NEXT: .spill_threshold: 0xffffffff +; CHECK-NEXT: .spill_threshold: 0xffff ; CHECK-NEXT: .type: Cs ; CHECK-NEXT: .user_data_limit: 0x3 ; CHECK-NEXT: amdpal.version: diff --git a/lgc/test/BuiltIns/cs-workgroupid.lgc b/lgc/test/BuiltIns/cs-workgroupid.lgc index 3d23a1ef0c..839f22feaa 100644 --- a/lgc/test/BuiltIns/cs-workgroupid.lgc +++ b/lgc/test/BuiltIns/cs-workgroupid.lgc @@ -70,7 +70,7 @@ attributes #0 = { nounwind } ; CHECK-NEXT: - 0 ; CHECK-NEXT: .hardware_mapping: ; CHECK-NEXT: - .cs -; CHECK-NEXT: .spill_threshold: 0xffffffff +; CHECK-NEXT: .spill_threshold: 
0xffff ; CHECK-NEXT: .type: Cs ; CHECK-NEXT: .user_data_limit: 0x3 ; CHECK-NEXT: amdpal.version: diff --git a/lgc/test/CMakeLists.txt b/lgc/test/CMakeLists.txt index 7918045ac3..88eed715fd 100644 --- a/lgc/test/CMakeLists.txt +++ b/lgc/test/CMakeLists.txt @@ -32,6 +32,7 @@ set(LGC_TEST_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) # required by configure_lit_site_cfg set(LLVM_LIT_OUTPUT_DIR ${LLVM_TOOLS_BINARY_DIR}) +get_target_property(LIT_DEFINITIONS LLVMlgc INTERFACE_COMPILE_DEFINITIONS) configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py diff --git a/lgc/test/CallLibFromCs-indirect.lgc b/lgc/test/CallLibFromCs-indirect.lgc index 2c354e9bfe..7eeb691b13 100644 --- a/lgc/test/CallLibFromCs-indirect.lgc +++ b/lgc/test/CallLibFromCs-indirect.lgc @@ -2,8 +2,8 @@ ; RUN: lgc -mcpu=gfx1010 -print-after=lgc-patch-entry-point-mutate -o /dev/null 2>&1 - <%s | FileCheck --check-prefixes=CHECK %s ; CHECK: IR Dump After Patch LLVM for entry-point mutation -; CHECK: define dllexport amdgpu_cs void @lgc.shader.CS.main(i32 inreg %globalTable, ptr addrspace(4) inreg %numWorkgroupsPtr, i32 inreg %descTable2, i32 inreg %0, i32 inreg %1, i32 inreg %2, i32 inreg %3, i32 inreg %4, i32 inreg %5, i32 inreg %6, i32 inreg %7, i32 inreg %8, i32 inreg %9, i32 inreg %10, i32 inreg %spillTable, <3 x i32> inreg %WorkgroupId, i32 inreg %MultiDispatchInfo, <3 x i32> %LocalInvocationId) #0 !lgc.shaderstage !7 { -; CHECK: call amdgpu_gfx i32 %func_ptr(i32 inreg %globalTable, ptr addrspace(4) inreg %numWorkgroupsPtr, i32 inreg %descTable2, i32 inreg %0, i32 inreg %1, i32 inreg %2, i32 inreg %3, i32 inreg %4, i32 inreg %5, i32 inreg %6, i32 inreg %7, i32 inreg %8, i32 inreg %9, i32 inreg %10, i32 inreg %spillTable, <3 x i32> inreg %WorkgroupId, i32 inreg %MultiDispatchInfo, <3 x i32> %LocalInvocationId) +; CHECK: define dllexport amdgpu_cs void @lgc.shader.CS.main(i32 inreg %globalTable, ptr addrspace(4) inreg %numWorkgroupsPtr, i32 inreg 
%userdata0, i32 inreg %userdata1, i32 inreg %userdata2, i32 inreg %userdata3, i32 inreg %userdata4, i32 inreg %userdata5, i32 inreg %userdata6, i32 inreg %userdata7, i32 inreg %userdata8, i32 inreg %userdata9, i32 inreg %userdata10, i32 inreg %userdata11, i32 inreg %spillTable, <3 x i32> inreg %WorkgroupId, i32 inreg %MultiDispatchInfo, <3 x i32> %LocalInvocationId) #0 !lgc.shaderstage !7 { +; CHECK: call amdgpu_gfx i32 %func_ptr(i32 inreg %globalTable, ptr addrspace(4) inreg %numWorkgroupsPtr, i32 inreg %userdata0, i32 inreg %userdata1, i32 inreg %userdata2, i32 inreg %userdata3, i32 inreg %userdata4, i32 inreg %userdata5, i32 inreg %userdata6, i32 inreg %userdata7, i32 inreg %userdata8, i32 inreg %userdata9, i32 inreg %userdata10, i32 inreg %userdata11, i32 inreg %spillTable, <3 x i32> inreg %WorkgroupId, i32 inreg %MultiDispatchInfo, <3 x i32> %LocalInvocationId) ; CHECK: !7 = !{i32 7} ; ModuleID = 'lgcPipeline' diff --git a/lgc/test/CallLibFromCs.lgc b/lgc/test/CallLibFromCs.lgc index dd1db1121b..4ae70ee130 100644 --- a/lgc/test/CallLibFromCs.lgc +++ b/lgc/test/CallLibFromCs.lgc @@ -3,8 +3,8 @@ ; RUN: lgc -mcpu=gfx1010 -print-after=lgc-patch-entry-point-mutate -o /dev/null 2>&1 - <%s | FileCheck --check-prefixes=CHECK %s ; CHECK: IR Dump After Patch LLVM for entry-point mutation ; CHECK: declare amdgpu_gfx i32 @compute_library_func() #0 -; CHECK: define dllexport amdgpu_cs void @lgc.shader.CS.main(i32 inreg %globalTable, ptr addrspace(4) inreg %numWorkgroupsPtr, i32 inreg %descTable2, i32 inreg %0, i32 inreg %1, i32 inreg %2, i32 inreg %3, i32 inreg %4, i32 inreg %5, i32 inreg %6, i32 inreg %7, i32 inreg %8, i32 inreg %9, i32 inreg %10, i32 inreg %spillTable, <3 x i32> inreg %WorkgroupId, i32 inreg %MultiDispatchInfo, <3 x i32> %LocalInvocationId) #1 !lgc.shaderstage !7 { -; CHECK: call amdgpu_gfx i32 @compute_library_func(i32 inreg %globalTable, ptr addrspace(4) inreg %numWorkgroupsPtr, i32 inreg %descTable2, i32 inreg %0, i32 inreg %1, i32 inreg %2, i32 inreg 
%3, i32 inreg %4, i32 inreg %5, i32 inreg %6, i32 inreg %7, i32 inreg %8, i32 inreg %9, i32 inreg %10, i32 inreg %spillTable, <3 x i32> inreg %WorkgroupId, i32 inreg %MultiDispatchInfo, <3 x i32> %LocalInvocationId) +; CHECK: define dllexport amdgpu_cs void @lgc.shader.CS.main(i32 inreg %globalTable, ptr addrspace(4) inreg %numWorkgroupsPtr, i32 inreg %userdata0, i32 inreg %userdata1, i32 inreg %userdata2, i32 inreg %userdata3, i32 inreg %userdata4, i32 inreg %userdata5, i32 inreg %userdata6, i32 inreg %userdata7, i32 inreg %userdata8, i32 inreg %userdata9, i32 inreg %userdata10, i32 inreg %userdata11, i32 inreg %spillTable, <3 x i32> inreg %WorkgroupId, i32 inreg %MultiDispatchInfo, <3 x i32> %LocalInvocationId) #1 !lgc.shaderstage !7 { +; CHECK: call amdgpu_gfx i32 @compute_library_func(i32 inreg %globalTable, ptr addrspace(4) inreg %numWorkgroupsPtr, i32 inreg %userdata0, i32 inreg %userdata1, i32 inreg %userdata2, i32 inreg %userdata3, i32 inreg %userdata4, i32 inreg %userdata5, i32 inreg %userdata6, i32 inreg %userdata7, i32 inreg %userdata8, i32 inreg %userdata9, i32 inreg %userdata10, i32 inreg %userdata11, i32 inreg %spillTable, <3 x i32> inreg %WorkgroupId, i32 inreg %MultiDispatchInfo, <3 x i32> %LocalInvocationId) ; CHECK: !7 = !{i32 7} ; ModuleID = 'lgcPipeline' diff --git a/lgc/test/CsComputeLibrary.lgc b/lgc/test/CsComputeLibrary.lgc index 390ed4a247..628aec321e 100644 --- a/lgc/test/CsComputeLibrary.lgc +++ b/lgc/test/CsComputeLibrary.lgc @@ -2,10 +2,10 @@ ; RUN: lgc -mcpu=gfx1010 -print-after=lgc-patch-entry-point-mutate -print-after=lgc-patch-prepare-pipeline-abi -print-after=lgc-patch-setup-target-features -o /dev/null 2>&1 - <%s | FileCheck --check-prefixes=CHECK %s ; CHECK: IR Dump After Patch LLVM for entry-point mutation -; CHECK: define amdgpu_gfx void @func(i32 inreg %globalTable, ptr addrspace(4) inreg %numWorkgroupsPtr, i32 inreg %descTable2, i32 inreg %0, i32 inreg %1, i32 inreg %2, i32 inreg %3, i32 inreg %4, i32 inreg %5, i32 inreg %6, 
i32 inreg %7, i32 inreg %8, i32 inreg %9, i32 inreg %10, i32 inreg %spillTable, <3 x i32> inreg %WorkgroupId, i32 inreg %MultiDispatchInfo, <3 x i32> %LocalInvocationId) #0 !lgc.shaderstage !7 { +; CHECK: define amdgpu_gfx void @func(i32 inreg %globalTable, ptr addrspace(4) inreg %numWorkgroupsPtr, i32 inreg %userdata0, i32 inreg %userdata1, i32 inreg %userdata2, i32 inreg %userdata3, i32 inreg %userdata4, i32 inreg %userdata5, i32 inreg %userdata6, i32 inreg %userdata7, i32 inreg %userdata8, i32 inreg %userdata9, i32 inreg %userdata10, i32 inreg %userdata11, i32 inreg %spillTable, <3 x i32> inreg %WorkgroupId, i32 inreg %MultiDispatchInfo, <3 x i32> %LocalInvocationId) #0 !lgc.shaderstage !7 { ; CHECK: !7 = !{i32 7} ; CHECK: IR Dump After Patch LLVM for preparing pipeline ABI -; CHECK: define amdgpu_gfx void @func(i32 inreg %globalTable, ptr addrspace(4) inreg %numWorkgroupsPtr, i32 inreg %descTable2, i32 inreg %0, i32 inreg %1, i32 inreg %2, i32 inreg %3, i32 inreg %4, i32 inreg %5, i32 inreg %6, i32 inreg %7, i32 inreg %8, i32 inreg %9, i32 inreg %10, i32 inreg %spillTable, <3 x i32> inreg %WorkgroupId, i32 inreg %MultiDispatchInfo, <3 x i32> %LocalInvocationId) #0 !lgc.shaderstage !7 { +; CHECK: define amdgpu_gfx void @func(i32 inreg %globalTable, ptr addrspace(4) inreg %numWorkgroupsPtr, i32 inreg %userdata0, i32 inreg %userdata1, i32 inreg %userdata2, i32 inreg %userdata3, i32 inreg %userdata4, i32 inreg %userdata5, i32 inreg %userdata6, i32 inreg %userdata7, i32 inreg %userdata8, i32 inreg %userdata9, i32 inreg %userdata10, i32 inreg %userdata11, i32 inreg %spillTable, <3 x i32> inreg %WorkgroupId, i32 inreg %MultiDispatchInfo, <3 x i32> %LocalInvocationId) #0 !lgc.shaderstage !7 { ; CHECK: IR Dump After Patch LLVM to set up target features ; CHECK: attributes #0 = { nounwind {{.*}}"amdgpu-flat-work-group-size"="6,6" diff --git a/lgc/test/CsComputeLibraryPayload.lgc b/lgc/test/CsComputeLibraryPayload.lgc index bfb41727bc..09c8a0e164 100644 --- 
a/lgc/test/CsComputeLibraryPayload.lgc +++ b/lgc/test/CsComputeLibraryPayload.lgc @@ -1,30 +1,17 @@ +; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --tool lgc +; RUN: lgc -mcpu=gfx1010 -o - - <%s | FileCheck --check-prefixes=CHECK %s + ; Define a compute library that can be called from a compute shader. ; Ensure that the first argument uses the same registers as the return value. ; The assembly should not have any movs of vector registers. -; RUN: lgc -mcpu=gfx1010 -o - - <%s | FileCheck --check-prefixes=CHECK %s - ; ModuleID = 'lgcPipeline' target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7" target triple = "amdgcn--amdpal" ; Function Attrs: nounwind define spir_func <10 x i32> @func(<10 x i32> %arg) local_unnamed_addr #0 !lgc.shaderstage !7 { -; CHECK-LABEL: func: -; CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK: s_getpc_b64 s[{{[0-9]+:[0-9]+}}] -; CHECK: s_mov_b32 s35, s{{[0-9]+}} -; CHECK-NEXT: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} -; CHECK-NEXT: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[34:35], 0x0 -; CHECK: v_mov_b32_e32 v12, s{{[0-9]+}} -; CHECK: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_dwordx3 v[{{[0-9]+:[0-9]+}}], off, s[{{[0-9]+:[0-9]+}}], 0 -; CHECK: s_setpc_b64 s[30:31] .entry: - %id = call <3 x i32> @lgc.shader.input.LocalInvocationId(i32 0) - %buf = call i8 addrspace(7)* (...) 
@lgc.create.load.buffer.desc.p7i8(i32 0, i32 2, i32 0, i32 2) - %buf2 = bitcast i8 addrspace(7)* %buf to <3 x i32> addrspace(7)* - store <3 x i32> %id, <3 x i32> addrspace(7)* %buf2, align 4 ret <10 x i32> %arg } @@ -49,3 +36,7 @@ attributes #1 = { nounwind readonly } !5 = !{!"DescriptorTableVaPtr", i32 0, i32 0, i32 20, i32 1, i32 1} !6 = !{!"DescriptorBuffer", i32 6, i32 0, i32 0, i32 4, i32 0, i32 2, i32 4} !7 = !{i32 7} + +; CHECK-LABEL: func: +; CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/lgc/test/FsComputeLibrary.lgc b/lgc/test/FsComputeLibrary.lgc deleted file mode 100644 index f005b5a481..0000000000 --- a/lgc/test/FsComputeLibrary.lgc +++ /dev/null @@ -1,37 +0,0 @@ -; Define a compute library that can be called from a compute shader. - -; RUN: lgc -mcpu=gfx1010 -print-after=lgc-patch-entry-point-mutate -print-after=lgc-patch-prepare-pipeline-abi -o /dev/null 2>&1 - <%s | FileCheck --check-prefixes=CHECK %s -; CHECK: IR Dump After Patch LLVM for entry-point mutation -; CHECK: define amdgpu_gfx void @func(i32 inreg %globalTable, ptr addrspace(4) inreg %numWorkgroupsPtr, i32 inreg %0, i32 inreg %1, i32 inreg %2, i32 inreg %3, i32 inreg %4, i32 inreg %5, i32 inreg %6, i32 inreg %7, i32 inreg %8, i32 inreg %9, i32 inreg %10, i32 inreg %11, i32 inreg %spillTable, <3 x i32> inreg %WorkgroupId, i32 inreg %MultiDispatchInfo, <3 x i32> %LocalInvocationId) #0 !lgc.shaderstage !5 { -; CHECK: !5 = !{i32 6} -; CHECK: IR Dump After Patch LLVM for preparing pipeline ABI -; CHECK: define amdgpu_gfx void @func(i32 inreg %globalTable, ptr addrspace(4) inreg %numWorkgroupsPtr, i32 inreg %0, i32 inreg %1, i32 inreg %2, i32 inreg %3, i32 inreg %4, i32 inreg %5, i32 inreg %6, i32 inreg %7, i32 inreg %8, i32 inreg %9, i32 inreg %10, i32 inreg %11, i32 inreg %spillTable, <3 x i32> inreg %WorkgroupId, i32 inreg %MultiDispatchInfo, <3 x i32> %LocalInvocationId) #0 !lgc.shaderstage !5 { - -; ModuleID = 'lgcPipeline' -target 
datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7" -target triple = "amdgcn--amdpal" - -; Function Attrs: nounwind -define spir_func void @func() local_unnamed_addr #0 !lgc.shaderstage !5 { -.entry: - ret void -} - -; Function Attrs: nounwind readonly -declare ptr addrspace(7) @lgc.create.load.buffer.desc.p7(...) local_unnamed_addr #1 - -attributes #0 = { nounwind } -attributes #1 = { nounwind readonly } - -!lgc.unlinked = !{!0} -!lgc.options = !{!1} -!lgc.options.FS = !{!2} -!lgc.color.export.formats = !{!3} -!lgc.input.assembly.state = !{!4} - -!0 = !{i32 1} -!1 = !{i32 -794913950, i32 -27741903, i32 1278784547, i32 441582842, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2} -!2 = !{i32 1072849668, i32 -352651751, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3} -!3 = !{i32 14, i32 7} -!4 = !{i32 0, i32 3} -!5 = !{i32 6} diff --git a/lgc/test/PatchInvalidImageDescriptor.lgc b/lgc/test/PatchInvalidImageDescriptor.lgc index 5652f50789..be099ba691 100644 --- a/lgc/test/PatchInvalidImageDescriptor.lgc +++ b/lgc/test/PatchInvalidImageDescriptor.lgc @@ -70,10 +70,13 @@ define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !lgc %.query.size = call <2 x i32> (...) @lgc.create.image.query.size.v2i32(i32 1, i32 128, <8 x i32> %.desc, i32 0) %.query.levels = call i32 (...) @lgc.create.image.query.levels.i32(i32 1, i32 128, <8 x i32> %.desc) + %lane = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) ; just some source of divergence + %ofs = mul i32 %lane, 32 + ; Use a waterfall loop with last.use to test that is also handled correctly %.desc2.ptr2 = call <8 x i32> addrspace(4)* (...) 
@lgc.create.get.desc.ptr.p4v8i32(i32 1, i32 1, i32 3, i32 4) %.desc2.ptr1 = bitcast <8 x i32> addrspace(4)* %.desc2.ptr2 to i8 addrspace(4)* - %.desc2.ptr0 = getelementptr i8, i8 addrspace(4)* %.desc2.ptr1, i64 0 + %.desc2.ptr0 = getelementptr i8, i8 addrspace(4)* %.desc2.ptr1, i32 %ofs %.desc2.ptr = bitcast i8 addrspace(4)* %.desc2.ptr0 to <8 x i32> addrspace(4)* %.desc2 = load <8 x i32>, <8 x i32> addrspace(4)* %.desc2.ptr, align 32 call void (...) @lgc.create.image.store(<4 x float> zeroinitializer, i32 0, i32 8, <8 x i32> %.desc2, i32 zeroinitializer) @@ -92,6 +95,8 @@ declare <2 x float> @lgc.create.image.get.lod.v2f32(...) #0 declare <2 x i32> @lgc.create.image.query.size.v2i32(...) #0 declare i32 @lgc.create.image.query.levels.i32(...) #0 +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) + attributes #0 = { nounwind } attributes #1 = { nounwind readonly } attributes #2 = { nounwind writeonly } diff --git a/lgc/test/TaskShaderRegConfig.lgc b/lgc/test/TaskShaderRegConfig.lgc index e3be43cfb1..fddcfe6e49 100644 --- a/lgc/test/TaskShaderRegConfig.lgc +++ b/lgc/test/TaskShaderRegConfig.lgc @@ -1,21 +1,7 @@ +; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --tool lgc --check-pal-metadata ; Test that relevant registers of task shader are built as expected. -; RUN: lgc -mcpu=gfx1030 -o=- - <%s | FileCheck --check-prefixes=CHECK %s - -; In this test case, we check if relevant registers of a task shader is built correctly. 
-; -; CHECK-LABEL: .registers: -; CHECK: (COMPUTE_NUM_THREAD_X): 0x20 -; CHECK: (COMPUTE_NUM_THREAD_Y): 0x1 -; CHECK: (COMPUTE_NUM_THREAD_Z): 0x1 -; CHECK: (COMPUTE_PGM_RSRC1): -; CHECK: (COMPUTE_PGM_RSRC2): -; CHECK: (COMPUTE_PGM_RSRC3): -; CHECK: (COMPUTE_SHADER_CHKSUM): -; CHECK: (COMPUTE_USER_DATA_0): 0x10000000 -; CHECK: (COMPUTE_USER_DATA_1): 0x10000012 -; CHECK: (COMPUTE_USER_DATA_4): 0x10000013 -; CHECK: (COMPUTE_USER_DATA_5): 0x10000014 +; RUN: lgc -mcpu=gfx1030 -o - -filetype=asm %s | FileCheck --check-prefixes=CHECK %s ; ModuleID = 'lgcPipeline' source_filename = "llpctask1" @@ -46,3 +32,99 @@ attributes #0 = { nounwind } !5 = !{i32 0, i32 3} !6 = !{!"\82\B0amdpal.pipelines\91\84\AA.registers\80\B0.spill_threshold\CE\FF\FF\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF*\B3\1C8\12\A8\7F\8D\CF\0B\01_tf\C7\1A\C5\AD.llpc_version\A452.2\AEamdpal.version\92\02\03"} !7 = !{i32 0} +; CHECK-LABEL: amdgpu_cs_main: +; CHECK: s_endpgm +; +; CHECK-LABEL: .amdgpu_pal_metadata +; CHECK-NEXT: --- +; CHECK-NEXT: amdpal.pipelines: +; CHECK-NEXT: - .api: Vulkan +; CHECK-NEXT: .compute_registers: +; CHECK-NEXT: .tg_size_en: true +; CHECK-NEXT: .tgid_x_en: true +; CHECK-NEXT: .tgid_y_en: true +; CHECK-NEXT: .tgid_z_en: true +; CHECK-NEXT: .tidig_comp_cnt: 0 +; CHECK-NEXT: .graphics_registers: +; CHECK-NEXT: .aa_coverage_to_shader_select: InputCoverage +; CHECK-NEXT: .pa_sc_shader_control: +; CHECK-NEXT: .wave_break_region_size: 0 +; CHECK-NEXT: .hardware_stages: +; CHECK-NEXT: .cs: +; CHECK-NEXT: .checksum_value: 0xbbc4ff6d +; CHECK-NEXT: .debug_mode: false +; CHECK-NEXT: .entry_point: _amdgpu_cs_main +; CHECK-NEXT: .excp_en: 0 +; CHECK-NEXT: .float_mode: 0xc0 +; CHECK-NEXT: .ieee_mode: false +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .mem_ordered: true +; CHECK-NEXT: .scratch_en: false +; CHECK-NEXT: .scratch_memory_size: 0 +; CHECK-NEXT: .sgpr_count: 0xa +; CHECK-NEXT: .sgpr_limit: 0x6a +; CHECK-NEXT: .threadgroup_dimensions: +; CHECK-NEXT: - 
0x20 +; CHECK-NEXT: - 0x1 +; CHECK-NEXT: - 0x1 +; CHECK-NEXT: .trap_present: false +; CHECK-NEXT: .user_data_reg_map: +; CHECK-NEXT: - 0x10000000 +; CHECK-NEXT: - 0x10000012 +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0x10000013 +; CHECK-NEXT: - 0x10000014 +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: .user_sgprs: 0x6 +; CHECK-NEXT: .vgpr_count: 0x3 +; CHECK-NEXT: .vgpr_limit: 0x100 +; CHECK-NEXT: .wavefront_size: 0x20 +; CHECK-NEXT: .wgp_mode: false +; CHECK-NEXT: .internal_pipeline_hash: +; CHECK-NEXT: - 0x{{[0-9a-f]+}} +; CHECK-NEXT: - 0x{{[0-9a-f]+}} +; CHECK-NEXT: .registers: {} +; CHECK-NEXT: .shaders: +; CHECK-NEXT: .task: +; CHECK-NEXT: .api_shader_hash: +; CHECK-NEXT: - 0x{{[0-9a-f]+}} +; CHECK-NEXT: - 0 +; CHECK-NEXT: .hardware_mapping: +; CHECK-NEXT: - .cs +; CHECK-NEXT: .spill_threshold: 0xffff +; CHECK-NEXT: .user_data_limit: 0x1 +; CHECK-NEXT: .xgl_cache_info: +; CHECK-NEXT: .128_bit_cache_hash: +; CHECK-NEXT: - 0x{{[0-9a-f]+}} +; CHECK-NEXT: - 0x{{[0-9a-f]+}} +; CHECK-NEXT: .llpc_version: {{.*}} +; CHECK-NEXT: amdpal.version: +; CHECK-NEXT: - 0x3 +; CHECK-NEXT: - 0 +; CHECK-NEXT: ... 
diff --git a/lgc/test/TextureRange.lgc b/lgc/test/TextureRange.lgc index 9c0d176c91..2ed306ce36 100644 --- a/lgc/test/TextureRange.lgc +++ b/lgc/test/TextureRange.lgc @@ -1,15 +1,16 @@ ; RUN: lgc %s -print-after=lgc-builder-replayer -o /dev/null 2>&1 - <%s | FileCheck --check-prefixes=CHECK %s -; CHECK: [[desc0:%[0-9]+]] = call ptr addrspace(4) @lgc.descriptor.table.addr(i32 6 -; CHECK-NEXT: %{{.*}} = getelementptr i8, ptr addrspace(4) [[desc0]], i32 16 -; CHECK: call <2 x i32> @lgc.root.descriptor.v2i32(i32 6) +; CHECK: call <2 x i32> @lgc.load.user.data.v2i32(i32 24) ; CHECK: call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> ; CHECK: [[varindex0:%[0-9]+]] = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> ; CHECK: [[varindex1:%[0-9]+]] = load i32, ptr addrspace(7) [[varindex0]], align 4 ; CHECK-NEXT: [[varindex2:%[0-9]+]] = sext i32 [[varindex1]] to i64 ; CHECK-NEXT: getelementptr <{ [4294967295 x float] }>, ptr addrspace(7) %{{.*}}, i64 0, i32 0, i64 [[varindex2]] -; CHECK: [[desc1:%[0-9]+]] = call ptr addrspace(4) @lgc.descriptor.table.addr(i32 1 +; CHECK: [[desc1lo:%[0-9]+]] = call i32 @lgc.load.user.data.i32(i32 4 +; CHECK-NEXT: [[desc1vec:%[0-9]+]] = insertelement <2 x i32> %{{[^,]+}}, i32 [[desc1lo]], i64 0 +; CHECK-NEXT: [[desc1lohi:%[0-9]+]] = bitcast <2 x i32> [[desc1vec]] to i64 +; CHECK-NEXT: [[desc1:%[0-9]+]] = inttoptr i64 [[desc1lohi]] to ptr addrspace(4) ; CHECK-NEXT: %{{.*}} = getelementptr i8, ptr addrspace(4) [[desc1]], i32 32 ; RUN: lgc -mcpu=gfx1030 -o - - <%s | FileCheck --check-prefixes=SHADER_TEST %s @@ -20,9 +21,41 @@ ; SHADER_TEST: buffer_load_dword {{.*}}, {{.*}}, s[12:15], 0 offen ; SHADER_TEST: image_sample {{.*}}, {{.*}}, [[desc]], {{.*}} dmask:0xf ; SHADER_TEST: s_load_dwordx4 {{.*}}, {{.*}}, 0x10 -; SHADER_TEST: .registers: -; SHADER_TEST: (SPI_SHADER_USER_DATA_PS_3): 0x6 -; SHADER_TEST: (SPI_SHADER_USER_DATA_PS_4): 0x7 +; SHADER_TEST: .hardware_stages: +; SHADER_TEST: .ps: +; SHADER_TEST: .user_data_reg_map: +; 
SHADER_TEST: - 0x10000000 +; SHADER_TEST: - 0 +; SHADER_TEST: - 0x1 +; SHADER_TEST: - 0x6 +; SHADER_TEST: - 0x7 +; SHADER_TEST: - 0x10000002 +; SHADER_TEST: - 0xffffffff +; SHADER_TEST: - 0xffffffff +; SHADER_TEST: - 0xffffffff +; SHADER_TEST: - 0xffffffff +; SHADER_TEST: - 0xffffffff +; SHADER_TEST: - 0xffffffff +; SHADER_TEST: - 0xffffffff +; SHADER_TEST: - 0xffffffff +; SHADER_TEST: - 0xffffffff +; SHADER_TEST: - 0xffffffff +; SHADER_TEST: - 0xffffffff +; SHADER_TEST: - 0xffffffff +; SHADER_TEST: - 0xffffffff +; SHADER_TEST: - 0xffffffff +; SHADER_TEST: - 0xffffffff +; SHADER_TEST: - 0xffffffff +; SHADER_TEST: - 0xffffffff +; SHADER_TEST: - 0xffffffff +; SHADER_TEST: - 0xffffffff +; SHADER_TEST: - 0xffffffff +; SHADER_TEST: - 0xffffffff +; SHADER_TEST: - 0xffffffff +; SHADER_TEST: - 0xffffffff +; SHADER_TEST: - 0xffffffff +; SHADER_TEST: - 0xffffffff +; SHADER_TEST: - 0xffffffff ; ModuleID = 'lgcPipeline' source_filename = "lgcPipeline" diff --git a/lgc/test/Transforms/Continufy/simple.lgc b/lgc/test/Transforms/Continufy/simple.lgc new file mode 100644 index 0000000000..81eaa6bd5a --- /dev/null +++ b/lgc/test/Transforms/Continufy/simple.lgc @@ -0,0 +1,92 @@ +; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --tool lgc --function-signature --check-globals +; RUN: lgc -mcpu=gfx1030 -o - -passes='require,lgc-continufy' %s | FileCheck --check-prefixes=CHECK %s + +define spir_func void @raygen() !lgc.shaderstage !{i32 7} !continufy.stage !{i32 0} { + %pushconst = call ptr addrspace(4) @lgc.user.data(i32 0) + %fn = load ptr, ptr addrspace(4) %pushconst + %p8 = getelementptr i8, ptr addrspace(4) %pushconst, i32 8 + %x = load i32, ptr addrspace(4) %p8 + %p16 = getelementptr i8, ptr addrspace(4) %pushconst, i32 16 + %dst = load ptr addrspace(1), ptr addrspace(4) %p16 + %r = call spir_func [2 x i32] %fn(i32 %x, ptr addrspace(1) %dst), !continufy.stage !{i32 -1} + store [2 x i32] %r, ptr addrspace(1) %dst + ret void +} + +define 
spir_func i32 @chs(i32 %x) !lgc.shaderstage !{i32 7} !continufy.stage !{i32 3} { + %pushconst = call ptr addrspace(4) @lgc.user.data(i32 24) + %fn = load ptr, ptr addrspace(4) %pushconst + %y = call spir_func i32 %fn(i32 %x), !continufy.stage !{i32 5} + ret i32 %y +} + +; Note: No !continufy.stage metadata here +define dllexport void @lgc.shader.CS.main() !lgc.shaderstage !{i32 7} { +entry: + %id = call i32 @lgc.shader.input.LocalInvocationId(i32 49) + %live = icmp ult i32 %id, 29 + br i1 %live, label %main, label %exit + +main: + %pushconst = call ptr addrspace(4) @lgc.user.data(i32 32) + %fn = load ptr, ptr addrspace(4) %pushconst + call spir_func void %fn(), !continufy.stage !{i32 0} + br label %exit + +exit: + ret void +} + +declare ptr addrspace(4) @lgc.user.data(i32) +declare i32 @lgc.shader.input.LocalInvocationId(i32) +; CHECK-LABEL: define {{[^@]+}}@raygen +; CHECK-SAME: ({} [[STATE:%.*]], i32 [[RCR:%.*]]) !lgc.shaderstage !2 !continufy.stage !3 !lgc.cps !3 { +; CHECK-NEXT: [[PUSHCONST:%.*]] = call ptr addrspace(4) @lgc.user.data(i32 0) +; CHECK-NEXT: [[FN:%.*]] = load ptr, ptr addrspace(4) [[PUSHCONST]], align 8 +; CHECK-NEXT: [[P8:%.*]] = getelementptr i8, ptr addrspace(4) [[PUSHCONST]], i32 8 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr addrspace(4) [[P8]], align 4 +; CHECK-NEXT: [[P16:%.*]] = getelementptr i8, ptr addrspace(4) [[PUSHCONST]], i32 16 +; CHECK-NEXT: [[DST:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[P16]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[FN]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = call [2 x i32] (...) 
@lgc.cps.await.a2i32(i32 [[TMP2]], i32 4, i32 [[X]], ptr addrspace(1) [[DST]]) +; CHECK-NEXT: store [2 x i32] [[TMP3]], ptr addrspace(1) [[DST]], align 4 +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: define {{[^@]+}}@chs +; CHECK-SAME: ({} [[STATE:%.*]], i32 [[RCR:%.*]], i32 [[X:%.*]]) !lgc.shaderstage !2 !continufy.stage !4 !lgc.cps !5 { +; CHECK-NEXT: [[PUSHCONST:%.*]] = call ptr addrspace(4) @lgc.user.data(i32 24) +; CHECK-NEXT: [[FN:%.*]] = load ptr, ptr addrspace(4) [[PUSHCONST]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[FN]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = call i32 (...) @lgc.cps.await.i32(i32 [[TMP2]], i32 2, i32 [[X]]) +; CHECK-NEXT: call void (...) @lgc.cps.jump(i32 [[RCR]], i32 4, {} poison, i32 poison, i32 [[TMP3]]) +; CHECK-NEXT: unreachable +; +; +; CHECK-LABEL: define {{[^@]+}}@lgc.shader.CS.main() !lgc.shaderstage !2 { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ID:%.*]] = call i32 @lgc.shader.input.LocalInvocationId(i32 49) +; CHECK-NEXT: [[LIVE:%.*]] = icmp ult i32 [[ID]], 29 +; CHECK-NEXT: br i1 [[LIVE]], label [[MAIN:%.*]], label [[EXIT:%.*]] +; CHECK: main: +; CHECK-NEXT: [[PUSHCONST:%.*]] = call ptr addrspace(4) @lgc.user.data(i32 32) +; CHECK-NEXT: [[FN:%.*]] = load ptr, ptr addrspace(4) [[PUSHCONST]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[FN]] to i32 +; CHECK-NEXT: call void (...) @lgc.cps.await.isVoid(i32 [[TMP0]], i32 1) +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind willreturn } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { noreturn } +;. +; CHECK: [[META0:![0-9]+]] = !{!""} +; CHECK: [[META1:![0-9]+]] = !{!"\82\B0amdpal.pipelines\91\82\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AEamdpal.version\92\03\00"} +; CHECK: [[META2:![0-9]+]] = !{i32 7} +; CHECK: [[META3:![0-9]+]] = !{i32 0} +; CHECK: [[META4:![0-9]+]] = !{i32 3} +; CHECK: [[META5:![0-9]+]] = !{i32 1} +;. 
diff --git a/lgc/test/Transforms/CpsLowering/continuation-basic.lgc b/lgc/test/Transforms/CpsLowering/continuation-basic.lgc index 858254810e..1acb197360 100644 --- a/lgc/test/Transforms/CpsLowering/continuation-basic.lgc +++ b/lgc/test/Transforms/CpsLowering/continuation-basic.lgc @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --tool lgc --function-signature ; RUN: lgc -mcpu=gfx1030 -o - -passes='require,lgc-patch-entry-point-mutate' %s | FileCheck --check-prefixes=CHECK %s -; REQUIRES: do-not-run-me declare void @lgc.cps.jump(i32 %target, i32 %levels, {i32} %state, ...) noreturn @@ -17,52 +16,68 @@ entry: } !0 = !{i32 1} ; level 1 -; CHECK-LABEL: define {{[^@]+}}@test.1 -; CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], i32 inreg [[PERSHADERTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[TMP0:%.*]], i32 inreg [[TMP1:%.*]], i32 inreg [[TMP2:%.*]], i32 inreg [[TMP3:%.*]], i32 inreg [[TMP4:%.*]], i32 inreg [[TMP5:%.*]], i32 inreg [[TMP6:%.*]], i32 inreg [[TMP7:%.*]], i32 inreg [[TMP8:%.*]], i32 inreg [[TMP9:%.*]], i32 inreg [[TMP10:%.*]], i32 inreg [[SPILLTABLE:%.*]], i32 [[VCR:%.*]], ptr addrspace(5) [[VSP:%.*]], i32 [[ARG:%.*]], ptr [[TABLE:%.*]]) #[[ATTR1:[0-9]+]] align 64 !lgc.cps !0 { +; CHECK-LABEL: define {{[^@]+}}@test +; CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[PAD0:%.*]], i32 inreg [[PAD1:%.*]], i32 inreg [[PAD2:%.*]], i32 inreg [[PAD3:%.*]], i32 inreg [[PAD4:%.*]], i32 inreg [[PAD5:%.*]], i32 inreg [[PAD6:%.*]], i32 inreg [[PAD7:%.*]], i32 inreg [[PAD8:%.*]], i32 inreg [[PAD9:%.*]], i32 inreg [[PAD10:%.*]], i32 inreg [[PAD11:%.*]], i32 inreg [[SPILLTABLE:%.*]], i32 [[VCR:%.*]], ptr addrspace(5) [[VSP:%.*]], i32 [[ARG:%.*]], ptr [[TABLE:%.*]]) #[[ATTR1:[0-9]+]] align 64 !lgc.cps !2 { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.amdgcn.s.getpc() -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i64 
[[TMP11]] to <2 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5) -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[VSP]], i32 -4 -; CHECK-NEXT: [[CPS_STATE:%.*]] = load { i32 }, ptr addrspace(5) [[TMP14]], align 4 -; CHECK-NEXT: store ptr addrspace(5) [[TMP14]], ptr addrspace(5) [[TMP13]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5) +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[VSP]], i32 -4 +; CHECK-NEXT: [[CPS_STATE:%.*]] = load { i32 }, ptr addrspace(5) [[TMP3]], align 4 +; CHECK-NEXT: store ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP0]], align 4 ; CHECK-NEXT: [[V:%.*]] = extractvalue { i32 } [[CPS_STATE]], 0 ; CHECK-NEXT: [[TABLE_0:%.*]] = getelementptr i32, ptr [[TABLE]], i32 0 ; CHECK-NEXT: [[CR_THEN:%.*]] = load i32, ptr [[TABLE_0]], align 4 ; CHECK-NEXT: [[THEN_ARG:%.*]] = add i32 [[ARG]], 1 ; CHECK-NEXT: [[V_THEN:%.*]] = mul i32 [[V]], 2 ; CHECK-NEXT: [[STATE_THEN:%.*]] = insertvalue { i32 } poison, i32 [[V_THEN]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(5), ptr addrspace(5) [[TMP13]], align 4 -; CHECK-NEXT: store { i32 } [[STATE_THEN]], ptr addrspace(5) [[TMP15]], align 4 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr ptr addrspace(5), ptr addrspace(5) [[TMP15]], i32 4 -; CHECK-NEXT: [[TMP17:%.*]] = insertvalue { i32, ptr addrspace(5), i32 } poison, i32 [[CR_THEN]], 0 -; CHECK-NEXT: [[TMP18:%.*]] = insertvalue { i32, ptr addrspace(5), i32 } [[TMP17]], ptr addrspace(5) [[TMP16]], 1 -; CHECK-NEXT: [[TMP19:%.*]] = insertvalue { i32, ptr addrspace(5), i32 } [[TMP18]], i32 [[THEN_ARG]], 2 -; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 -; CHECK-NEXT: [[TMP21:%.*]] = bitcast i64 [[TMP20]] to <2 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x 
i32> [[TMP21]], i64 0 -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i32> [[TMP21]], i64 1 -; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x i32> poison, i32 [[GLOBALTABLE]], i64 0 -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i32> [[TMP24]], i32 [[PERSHADERTABLE]], i64 1 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x i32> [[TMP25]], i32 [[TMP22]], i64 2 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i32> [[TMP26]], i32 [[TMP23]], i64 3 -; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[TMP0]], i64 4 -; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i32> [[TMP28]], i32 [[TMP1]], i64 5 -; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x i32> [[TMP29]], i32 [[TMP2]], i64 6 -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i32> [[TMP30]], i32 [[TMP3]], i64 7 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <16 x i32> [[TMP31]], i32 [[TMP4]], i64 8 -; CHECK-NEXT: [[TMP33:%.*]] = insertelement <16 x i32> [[TMP32]], i32 [[TMP5]], i64 9 -; CHECK-NEXT: [[TMP34:%.*]] = insertelement <16 x i32> [[TMP33]], i32 [[TMP6]], i64 10 -; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i32> [[TMP34]], i32 [[TMP7]], i64 11 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <16 x i32> [[TMP35]], i32 [[TMP8]], i64 12 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x i32> [[TMP36]], i32 [[TMP9]], i64 13 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <16 x i32> [[TMP37]], i32 [[TMP10]], i64 14 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <16 x i32> [[TMP38]], i32 [[SPILLTABLE]], i64 15 -; CHECK-NEXT: [[TMP40:%.*]] = and i32 [[CR_THEN]], -64 -; CHECK-NEXT: [[TMP41:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP40]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(5), ptr addrspace(5) [[TMP0]], align 4 +; CHECK-NEXT: store { i32 } [[STATE_THEN]], ptr addrspace(5) [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP4]], i32 4 +; CHECK-NEXT: br label [[TAIL_BLOCK:%.*]] +; CHECK: tail.block: +; CHECK-NEXT: 
[[TMP6:%.*]] = insertvalue { i32, ptr addrspace(5), i32 } poison, i32 [[CR_THEN]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { i32, ptr addrspace(5), i32 } [[TMP6]], ptr addrspace(5) [[TMP5]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { i32, ptr addrspace(5), i32 } [[TMP7]], i32 [[THEN_ARG]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP10]], i64 1 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i32> poison, i32 [[GLOBALTABLE]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i32> [[TMP13]], i32 [[TMP11]], i64 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[TMP12]], i64 2 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[PAD0]], i64 3 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i32> [[TMP16]], i32 [[PAD1]], i64 4 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[PAD2]], i64 5 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i32> [[TMP18]], i32 [[PAD3]], i64 6 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[PAD4]], i64 7 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i32> [[TMP20]], i32 [[PAD5]], i64 8 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[PAD6]], i64 9 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> [[TMP22]], i32 [[PAD7]], i64 10 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[PAD8]], i64 11 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i32> [[TMP24]], i32 [[PAD9]], i64 12 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x i32> [[TMP25]], i32 [[PAD10]], i64 13 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i32> [[TMP26]], i32 [[PAD11]], i64 14 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[SPILLTABLE]], 
i64 15 +; CHECK-NEXT: [[TMP29:%.*]] = extractvalue { i32, ptr addrspace(5), i32 } [[TMP8]], 0 +; CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.amdgcn.setinactive.chain.arg(i32 [[TMP29]], i32 [[VCR]]) +; CHECK-NEXT: [[TMP31:%.*]] = icmp ne i32 [[TMP30]], 0 +; CHECK-NEXT: [[TMP32:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP31]]) +; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP32]], i1 true) +; CHECK-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP30]], i32 [[TMP33]]) +; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i32 [[TMP30]], [[TMP34]] +; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP35]]) +; CHECK-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP34]]) +; CHECK-NEXT: [[TMP38:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP36]]) +; CHECK-NEXT: [[TMP39:%.*]] = icmp eq i32 [[TMP37]], 0 +; CHECK-NEXT: br i1 [[TMP39]], label [[RET_BLOCK:%.*]], label [[CHAIN_BLOCK:%.*]] +; CHECK: chain.block: +; CHECK-NEXT: [[TMP40:%.*]] = and i32 [[TMP37]], -64 +; CHECK-NEXT: [[TMP41:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP40]], i64 0 ; CHECK-NEXT: [[TMP42:%.*]] = bitcast <2 x i32> [[TMP41]] to i64 ; CHECK-NEXT: [[TMP43:%.*]] = inttoptr i64 [[TMP42]] to ptr -; CHECK-NEXT: [[TMP44:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 true) -; CHECK-NEXT: call void (ptr, i32, <16 x i32>, { i32, ptr addrspace(5), i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v16i32.sl_i32p5i32s(ptr [[TMP43]], i32 [[TMP44]], <16 x i32> [[TMP39]], { i32, ptr addrspace(5), i32 } [[TMP19]], i32 0) +; CHECK-NEXT: call void (ptr, i32, <16 x i32>, { i32, ptr addrspace(5), i32 }, i32, ...) 
@llvm.amdgcn.cs.chain.p0.i32.v16i32.sl_i32p5i32s(ptr inreg [[TMP43]], i32 inreg [[TMP38]], <16 x i32> inreg [[TMP28]], { i32, ptr addrspace(5), i32 } [[TMP8]], i32 0) ; CHECK-NEXT: unreachable +; CHECK: ret.block: +; CHECK-NEXT: ret void ; diff --git a/lgc/test/Transforms/CpsLowering/cps-entry-point.lgc b/lgc/test/Transforms/CpsLowering/cps-entry-point.lgc index 4f14c15788..bec0822026 100644 --- a/lgc/test/Transforms/CpsLowering/cps-entry-point.lgc +++ b/lgc/test/Transforms/CpsLowering/cps-entry-point.lgc @@ -9,9 +9,7 @@ declare ptr addrspace(32) @lgc.cps.get.vsp() #2 define dllexport spir_func void @lgc.shader.CS.main() local_unnamed_addr #0 !lgc.shaderstage !3 { .entry: - %table = call ptr addrspace(4) @lgc.descriptor.table.addr(i32 6, i32 6, i64 0, i32 0, i32 -1) #1 - %p_desc = getelementptr i8, ptr addrspace(4) %table, i32 0 - %desc = load <4 x i32>, ptr addrspace(4) %p_desc, align 16 + %desc = call <4 x i32> @lgc.load.user.data.v4i32(i32 0) %ptr = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %desc) %p0 = getelementptr i32, ptr addrspace(7) %ptr, i32 0 %i_vsp = load i32, ptr addrspace(7) %p0, align 4 @@ -32,7 +30,7 @@ define dllexport spir_func void @lgc.shader.CS.main() local_unnamed_addr #0 !lgc unreachable } -declare ptr addrspace(4) @lgc.descriptor.table.addr(i32, i32, i64, i32, i32) #4 +declare <4 x i32> @lgc.load.user.data.v4i32(i32) #4 declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>) #5 @@ -42,85 +40,83 @@ attributes #2 = { nounwind willreturn memory(inaccessiblemem: read) } attributes #4 = { nounwind memory(none) } attributes #5 = { nounwind willreturn memory(none) } -!lgc.user.data.nodes = !{!0, !1} +!lgc.user.data.nodes = !{!1} !llpc.compute.mode = !{!2} -!0 = !{!"DescriptorTableVaPtr", i32 0, i32 0, i32 2, i32 1, i32 1} -!1 = !{!"DescriptorBuffer", i32 6, i32 0, i32 0, i32 4, i64 0, i32 0, i32 4} +!1 = !{!"DescriptorBuffer", i32 6, i32 6, i32 0, i32 4, i64 0, i32 0, i32 4} !2 = !{i32 8, i32 4, i32 1} !3 = !{i32 7} ; CHECK-LABEL: 
define {{[^@]+}}@lgc.shader.CS.main -; CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[DESCTABLE0:%.*]], i32 inreg [[TMP0:%.*]], i32 inreg [[TMP1:%.*]], i32 inreg [[TMP2:%.*]], i32 inreg [[TMP3:%.*]], i32 inreg [[TMP4:%.*]], i32 inreg [[TMP5:%.*]], i32 inreg [[TMP6:%.*]], i32 inreg [[TMP7:%.*]], i32 inreg [[TMP8:%.*]], i32 inreg [[TMP9:%.*]], i32 inreg [[TMP10:%.*]], i32 inreg [[SPILLTABLE:%.*]], <3 x i32> inreg [[WORKGROUPID:%.*]], i32 inreg [[MULTIDISPATCHINFO:%.*]], <3 x i32> [[LOCALINVOCATIONID:%.*]]) #[[ATTR3:[0-9]+]] !lgc.shaderstage !5 { +; CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[USERDATA0:%.*]], i32 inreg [[USERDATA1:%.*]], i32 inreg [[USERDATA2:%.*]], i32 inreg [[USERDATA3:%.*]], i32 inreg [[PAD4:%.*]], i32 inreg [[PAD5:%.*]], i32 inreg [[PAD6:%.*]], i32 inreg [[PAD7:%.*]], i32 inreg [[PAD8:%.*]], i32 inreg [[PAD9:%.*]], i32 inreg [[PAD10:%.*]], i32 inreg [[PAD11:%.*]], i32 inreg [[SPILLTABLE:%.*]], <3 x i32> inreg [[WORKGROUPID:%.*]], i32 inreg [[MULTIDISPATCHINFO:%.*]], <3 x i32> [[LOCALINVOCATIONID:%.*]]) #[[ATTR3:[0-9]+]] !lgc.shaderstage !4 { ; CHECK-NEXT: .entry: -; CHECK-NEXT: [[TMP11:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5) -; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.amdgcn.s.getpc() -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i64 [[TMP12]] to <2 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[DESCTABLE0]], i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i32> [[TMP14]] to i64 -; CHECK-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP15]] to ptr addrspace(4) -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[SPILLTABLE]], i64 0 -; CHECK-NEXT: [[TMP18:%.*]] = bitcast <2 x i32> [[TMP17]] to i64 -; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr addrspace(4) -; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.amdgcn.s.getpc() -; CHECK-NEXT: [[TMP21:%.*]] = bitcast i64 
[[TMP20]] to <2 x i32> -; CHECK-NEXT: [[P_DESC:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP16]], i32 0 -; CHECK-NEXT: [[DESC:%.*]] = load <4 x i32>, ptr addrspace(4) [[P_DESC]], align 16 -; CHECK-NEXT: [[PTR:%.*]] = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> [[DESC]]) +; CHECK-NEXT: [[TMP0:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5) +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[SPILLTABLE]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr addrspace(4) +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64 [[TMP6]] to <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[USERDATA0]], i64 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[USERDATA1]], i64 1 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[USERDATA2]], i64 2 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[USERDATA3]], i64 3 +; CHECK-NEXT: [[PTR:%.*]] = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> [[TMP11]]) ; CHECK-NEXT: [[P0:%.*]] = getelementptr i32, ptr addrspace(7) [[PTR]], i32 0 ; CHECK-NEXT: [[I_VSP:%.*]] = load i32, ptr addrspace(7) [[P0]], align 4 -; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i32 [[I_VSP]] to ptr addrspace(5) -; CHECK-NEXT: store ptr addrspace(5) [[TMP22]], ptr addrspace(5) [[TMP11]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i32 [[I_VSP]] to ptr addrspace(5) +; CHECK-NEXT: store ptr addrspace(5) [[TMP12]], ptr addrspace(5) [[TMP0]], align 4 ; CHECK-NEXT: [[P1:%.*]] = getelementptr i32, ptr addrspace(7) [[PTR]], i32 1 ; CHECK-NEXT: [[CR:%.*]] = load i32, ptr addrspace(7) [[P1]], align 4 ; CHECK-NEXT: [[P2:%.*]] = getelementptr i32, ptr addrspace(7) [[PTR]], i32 2 ; 
CHECK-NEXT: [[ARG:%.*]] = load i32, ptr addrspace(7) [[P2]], align 4 ; CHECK-NEXT: [[STATE:%.*]] = insertvalue { i32 } poison, i32 [[ARG]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(5), ptr addrspace(5) [[TMP11]], align 4 -; CHECK-NEXT: [[TMP24:%.*]] = load ptr addrspace(5), ptr addrspace(5) [[TMP11]], align 4 -; CHECK-NEXT: store { i32 } [[STATE]], ptr addrspace(5) [[TMP24]], align 4 -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP24]], i32 4 -; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr addrspace(5) [[TMP23]] to i32 +; CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(5), ptr addrspace(5) [[TMP0]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(5), ptr addrspace(5) [[TMP0]], align 4 +; CHECK-NEXT: store { i32 } [[STATE]], ptr addrspace(5) [[TMP14]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP14]], i32 4 +; CHECK-NEXT: [[TMP16:%.*]] = ptrtoint ptr addrspace(5) [[TMP13]] to i32 ; CHECK-NEXT: br label [[TAIL_BLOCK:%.*]] ; CHECK: tail.block: -; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { i32, ptr addrspace(5), i32, i32 } poison, i32 [[CR]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { i32, ptr addrspace(5), i32, i32 } [[TMP27]], ptr addrspace(5) [[TMP25]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { i32, ptr addrspace(5), i32, i32 } [[TMP28]], i32 [[ARG]], 2 -; CHECK-NEXT: [[TMP30:%.*]] = insertvalue { i32, ptr addrspace(5), i32, i32 } [[TMP29]], i32 [[TMP26]], 3 -; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 -; CHECK-NEXT: [[TMP32:%.*]] = bitcast i64 [[TMP31]] to <2 x i32> -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x i32> [[TMP32]], i64 0 -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[TMP32]], i64 1 -; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i32> poison, i32 [[GLOBALTABLE]], i64 0 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <16 x i32> [[TMP35]], i32 [[TMP33]], i64 1 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x i32> 
[[TMP36]], i32 [[TMP34]], i64 2 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <16 x i32> [[TMP37]], i32 [[DESCTABLE0]], i64 3 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <16 x i32> [[TMP38]], i32 [[TMP0]], i64 4 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x i32> [[TMP39]], i32 [[TMP1]], i64 5 -; CHECK-NEXT: [[TMP41:%.*]] = insertelement <16 x i32> [[TMP40]], i32 [[TMP2]], i64 6 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <16 x i32> [[TMP41]], i32 [[TMP3]], i64 7 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <16 x i32> [[TMP42]], i32 [[TMP4]], i64 8 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <16 x i32> [[TMP43]], i32 [[TMP5]], i64 9 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <16 x i32> [[TMP44]], i32 [[TMP6]], i64 10 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <16 x i32> [[TMP45]], i32 [[TMP7]], i64 11 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <16 x i32> [[TMP46]], i32 [[TMP8]], i64 12 -; CHECK-NEXT: [[TMP48:%.*]] = insertelement <16 x i32> [[TMP47]], i32 [[TMP9]], i64 13 -; CHECK-NEXT: [[TMP49:%.*]] = insertelement <16 x i32> [[TMP48]], i32 [[TMP10]], i64 14 -; CHECK-NEXT: [[TMP50:%.*]] = insertelement <16 x i32> [[TMP49]], i32 [[SPILLTABLE]], i64 15 -; CHECK-NEXT: [[TMP51:%.*]] = extractvalue { i32, ptr addrspace(5), i32, i32 } [[TMP30]], 0 -; CHECK-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 -; CHECK-NEXT: [[TMP53:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP52]]) -; CHECK-NEXT: [[TMP54:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP53]], i1 true) -; CHECK-NEXT: [[TMP55:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP51]], i32 [[TMP54]]) -; CHECK-NEXT: [[TMP56:%.*]] = icmp eq i32 [[TMP51]], [[TMP55]] -; CHECK-NEXT: [[TMP57:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP56]]) -; CHECK-NEXT: [[TMP58:%.*]] = icmp eq i32 [[TMP55]], 0 -; CHECK-NEXT: br i1 [[TMP58]], label [[RET_BLOCK:%.*]], label [[CHAIN_BLOCK:%.*]] +; CHECK-NEXT: [[TMP17:%.*]] = insertvalue { i32, ptr addrspace(5), i32, i32 } poison, i32 [[CR]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = 
insertvalue { i32, ptr addrspace(5), i32, i32 } [[TMP17]], ptr addrspace(5) [[TMP15]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertvalue { i32, ptr addrspace(5), i32, i32 } [[TMP18]], i32 [[ARG]], 2 +; CHECK-NEXT: [[TMP20:%.*]] = insertvalue { i32, ptr addrspace(5), i32, i32 } [[TMP19]], i32 [[TMP16]], 3 +; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i64 [[TMP21]] to <2 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i32> [[TMP22]], i64 0 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[TMP22]], i64 1 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i32> poison, i32 [[GLOBALTABLE]], i64 0 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x i32> [[TMP25]], i32 [[TMP23]], i64 1 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i32> [[TMP26]], i32 [[TMP24]], i64 2 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[USERDATA0]], i64 3 +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i32> [[TMP28]], i32 [[USERDATA1]], i64 4 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x i32> [[TMP29]], i32 [[USERDATA2]], i64 5 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i32> [[TMP30]], i32 [[USERDATA3]], i64 6 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <16 x i32> [[TMP31]], i32 [[PAD4]], i64 7 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <16 x i32> [[TMP32]], i32 [[PAD5]], i64 8 +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <16 x i32> [[TMP33]], i32 [[PAD6]], i64 9 +; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i32> [[TMP34]], i32 [[PAD7]], i64 10 +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <16 x i32> [[TMP35]], i32 [[PAD8]], i64 11 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x i32> [[TMP36]], i32 [[PAD9]], i64 12 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <16 x i32> [[TMP37]], i32 [[PAD10]], i64 13 +; CHECK-NEXT: [[TMP39:%.*]] = insertelement <16 x i32> [[TMP38]], i32 [[PAD11]], i64 14 +; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x i32> 
[[TMP39]], i32 [[SPILLTABLE]], i64 15 +; CHECK-NEXT: [[TMP41:%.*]] = extractvalue { i32, ptr addrspace(5), i32, i32 } [[TMP20]], 0 +; CHECK-NEXT: [[TMP42:%.*]] = icmp ne i32 [[TMP41]], 0 +; CHECK-NEXT: [[TMP43:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP42]]) +; CHECK-NEXT: [[TMP44:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP43]], i1 true) +; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP41]], i32 [[TMP44]]) +; CHECK-NEXT: [[TMP46:%.*]] = icmp eq i32 [[TMP41]], [[TMP45]] +; CHECK-NEXT: [[TMP47:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP46]]) +; CHECK-NEXT: [[TMP48:%.*]] = icmp eq i32 [[TMP45]], 0 +; CHECK-NEXT: br i1 [[TMP48]], label [[RET_BLOCK:%.*]], label [[CHAIN_BLOCK:%.*]] ; CHECK: chain.block: -; CHECK-NEXT: [[TMP59:%.*]] = and i32 [[TMP55]], -64 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP59]], i64 0 -; CHECK-NEXT: [[TMP61:%.*]] = bitcast <2 x i32> [[TMP60]] to i64 -; CHECK-NEXT: [[TMP62:%.*]] = inttoptr i64 [[TMP61]] to ptr -; CHECK-NEXT: call void (ptr, i32, <16 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v16i32.sl_i32p5i32i32s(ptr [[TMP62]], i32 [[TMP57]], <16 x i32> [[TMP50]], { i32, ptr addrspace(5), i32, i32 } [[TMP30]], i32 0) +; CHECK-NEXT: [[TMP49:%.*]] = and i32 [[TMP45]], -64 +; CHECK-NEXT: [[TMP50:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP49]], i64 0 +; CHECK-NEXT: [[TMP51:%.*]] = bitcast <2 x i32> [[TMP50]] to i64 +; CHECK-NEXT: [[TMP52:%.*]] = inttoptr i64 [[TMP51]] to ptr +; CHECK-NEXT: call void (ptr, i32, <16 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) 
@llvm.amdgcn.cs.chain.p0.i32.v16i32.sl_i32p5i32i32s(ptr inreg [[TMP52]], i32 inreg [[TMP47]], <16 x i32> inreg [[TMP40]], { i32, ptr addrspace(5), i32, i32 } [[TMP20]], i32 0) ; CHECK-NEXT: unreachable ; CHECK: ret.block: ; CHECK-NEXT: ret void diff --git a/lgc/test/Transforms/CpsLowering/cps-stack-lowering.lgc b/lgc/test/Transforms/CpsLowering/cps-stack-lowering.lgc index b69b69f1f7..6af11d15ce 100644 --- a/lgc/test/Transforms/CpsLowering/cps-stack-lowering.lgc +++ b/lgc/test/Transforms/CpsLowering/cps-stack-lowering.lgc @@ -1,6 +1,5 @@ -; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --tool lgc --function-signature +; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --tool lgc --function-signature --check-globals ; RUN: lgc -mcpu=gfx1030 -o - -passes='require,lgc-patch-entry-point-mutate' %s | FileCheck --check-prefixes=CHECK %s -; REQUIRES: do-not-run-me declare void @lgc.cps.jump(...) noreturn declare ptr addrspace(32) @lgc.cps.alloc(i32) @@ -49,185 +48,201 @@ define void @test.2({ ptr addrspace(32) } %state) !lgc.cps !{i32 1} { ret void } ; CHECK-LABEL: define {{[^@]+}}@test.0 -; CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[TMP0:%.*]], i32 inreg [[TMP1:%.*]], i32 inreg [[TMP2:%.*]], i32 inreg [[TMP3:%.*]], i32 inreg [[TMP4:%.*]], i32 inreg [[TMP5:%.*]], i32 inreg [[TMP6:%.*]], i32 inreg [[TMP7:%.*]], i32 inreg [[TMP8:%.*]], i32 inreg [[TMP9:%.*]], i32 inreg [[TMP10:%.*]], i32 inreg [[TMP11:%.*]], i32 inreg [[SPILLTABLE:%.*]], i32 [[VCR:%.*]], ptr addrspace(5) [[VSP:%.*]]) #[[ATTR1:[0-9]+]] align 64 !lgc.cps !2 { -; CHECK-NEXT: [[TMP13:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5) -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.amdgcn.s.getpc() -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i64 [[TMP14]] to <2 x i32> -; CHECK-NEXT: store ptr addrspace(5) [[VSP]], ptr addrspace(5) [[TMP13]], align 4 -; CHECK-NEXT: [[TMP16:%.*]] 
= load ptr addrspace(5), ptr addrspace(5) [[TMP13]], align 4 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP16]], i32 12 -; CHECK-NEXT: store ptr addrspace(5) [[TMP17]], ptr addrspace(5) [[TMP13]], align 4 -; CHECK-NEXT: store i32 333, ptr addrspace(5) [[TMP16]], align 4 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr addrspace(5) [[TMP16]], i32 1 -; CHECK-NEXT: store i32 111, ptr addrspace(5) [[TMP18]], align 4 -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP16]], i32 9 -; CHECK-NEXT: store i8 99, ptr addrspace(5) [[TMP19]], align 1 -; CHECK-NEXT: [[Q1:%.*]] = ptrtoint ptr addrspace(5) [[TMP18]] to i32 -; CHECK-NEXT: [[STATE:%.*]] = insertvalue { ptr addrspace(5) } poison, ptr addrspace(5) [[TMP19]], 0 -; CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(5), ptr addrspace(5) [[TMP13]], align 4 -; CHECK-NEXT: store { ptr addrspace(5) } [[STATE]], ptr addrspace(5) [[TMP20]], align 4 -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP20]], i32 4 -; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(5) [[TMP19]] to i32 +; CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[PAD0:%.*]], i32 inreg [[PAD1:%.*]], i32 inreg [[PAD2:%.*]], i32 inreg [[PAD3:%.*]], i32 inreg [[PAD4:%.*]], i32 inreg [[PAD5:%.*]], i32 inreg [[PAD6:%.*]], i32 inreg [[PAD7:%.*]], i32 inreg [[PAD8:%.*]], i32 inreg [[PAD9:%.*]], i32 inreg [[PAD10:%.*]], i32 inreg [[PAD11:%.*]], i32 inreg [[SPILLTABLE:%.*]], i32 [[VCR:%.*]], ptr addrspace(5) [[VSP:%.*]]) #[[ATTR1:[0-9]+]] align 64 !lgc.cps !2 { +; CHECK-NEXT: [[TMP1:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5) +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[TMP2]] to <2 x i32> +; CHECK-NEXT: store ptr addrspace(5) [[VSP]], ptr addrspace(5) [[TMP1]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(5), ptr addrspace(5) [[TMP1]], align 4 +; CHECK-NEXT: 
[[TMP5:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP4]], i32 12 +; CHECK-NEXT: store ptr addrspace(5) [[TMP5]], ptr addrspace(5) [[TMP1]], align 4 +; CHECK-NEXT: store i32 333, ptr addrspace(5) [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr addrspace(5) [[TMP4]], i32 1 +; CHECK-NEXT: store i32 111, ptr addrspace(5) [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP4]], i32 9 +; CHECK-NEXT: store i8 99, ptr addrspace(5) [[TMP7]], align 1 +; CHECK-NEXT: [[Q1:%.*]] = ptrtoint ptr addrspace(5) [[TMP6]] to i32 +; CHECK-NEXT: [[STATE:%.*]] = insertvalue { ptr addrspace(5) } poison, ptr addrspace(5) [[TMP7]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = or i32 ptrtoint (ptr @test.1 to i32), 1 +; CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(5), ptr addrspace(5) [[TMP1]], align 4 +; CHECK-NEXT: store { ptr addrspace(5) } [[STATE]], ptr addrspace(5) [[TMP9]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP9]], i32 4 +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr addrspace(5) [[TMP7]] to i32 ; CHECK-NEXT: br label [[TAIL_BLOCK:%.*]] ; CHECK: tail.block: -; CHECK-NEXT: [[TMP23:%.*]] = insertvalue { i32, ptr addrspace(5), i32, i32 } { i32 or (i32 ptrtoint (ptr @test.1 to i32), i32 1), ptr addrspace(5) poison, i32 poison, i32 poison }, ptr addrspace(5) [[TMP21]], 1 -; CHECK-NEXT: [[TMP24:%.*]] = insertvalue { i32, ptr addrspace(5), i32, i32 } [[TMP23]], i32 [[TMP22]], 2 -; CHECK-NEXT: [[TMP25:%.*]] = insertvalue { i32, ptr addrspace(5), i32, i32 } [[TMP24]], i32 [[Q1]], 3 -; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 -; CHECK-NEXT: [[TMP27:%.*]] = bitcast i64 [[TMP26]] to <2 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x i32> [[TMP27]], i64 0 -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x i32> [[TMP27]], i64 1 -; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x i32> poison, i32 [[GLOBALTABLE]], i64 0 -; CHECK-NEXT: [[TMP31:%.*]] = insertelement 
<16 x i32> [[TMP30]], i32 [[TMP28]], i64 1 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <16 x i32> [[TMP31]], i32 [[TMP29]], i64 2 -; CHECK-NEXT: [[TMP33:%.*]] = insertelement <16 x i32> [[TMP32]], i32 [[TMP0]], i64 3 -; CHECK-NEXT: [[TMP34:%.*]] = insertelement <16 x i32> [[TMP33]], i32 [[TMP1]], i64 4 -; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i32> [[TMP34]], i32 [[TMP2]], i64 5 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <16 x i32> [[TMP35]], i32 [[TMP3]], i64 6 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x i32> [[TMP36]], i32 [[TMP4]], i64 7 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <16 x i32> [[TMP37]], i32 [[TMP5]], i64 8 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <16 x i32> [[TMP38]], i32 [[TMP6]], i64 9 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x i32> [[TMP39]], i32 [[TMP7]], i64 10 -; CHECK-NEXT: [[TMP41:%.*]] = insertelement <16 x i32> [[TMP40]], i32 [[TMP8]], i64 11 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <16 x i32> [[TMP41]], i32 [[TMP9]], i64 12 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <16 x i32> [[TMP42]], i32 [[TMP10]], i64 13 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <16 x i32> [[TMP43]], i32 [[TMP11]], i64 14 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <16 x i32> [[TMP44]], i32 [[SPILLTABLE]], i64 15 -; CHECK-NEXT: [[TMP46:%.*]] = extractvalue { i32, ptr addrspace(5), i32, i32 } [[TMP25]], 0 -; CHECK-NEXT: [[TMP47:%.*]] = call i32 @llvm.amdgcn.setinactive.chain.arg(i32 [[TMP46]], i32 [[VCR]]) -; CHECK-NEXT: [[TMP48:%.*]] = icmp ne i32 [[TMP47]], 0 -; CHECK-NEXT: [[TMP49:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP48]]) -; CHECK-NEXT: [[TMP50:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP49]], i1 true) -; CHECK-NEXT: [[TMP51:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP47]], i32 [[TMP50]]) -; CHECK-NEXT: [[TMP52:%.*]] = icmp eq i32 [[TMP47]], [[TMP51]] -; CHECK-NEXT: [[TMP53:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP52]]) -; CHECK-NEXT: [[TMP54:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP51]]) -; 
CHECK-NEXT: [[TMP55:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP53]]) -; CHECK-NEXT: [[TMP56:%.*]] = icmp eq i32 [[TMP54]], 0 -; CHECK-NEXT: br i1 [[TMP56]], label [[RET_BLOCK:%.*]], label [[CHAIN_BLOCK:%.*]] +; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { i32, ptr addrspace(5), i32, i32 } poison, i32 [[TMP8]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertvalue { i32, ptr addrspace(5), i32, i32 } [[TMP12]], ptr addrspace(5) [[TMP10]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = insertvalue { i32, ptr addrspace(5), i32, i32 } [[TMP13]], i32 [[TMP11]], 2 +; CHECK-NEXT: [[TMP15:%.*]] = insertvalue { i32, ptr addrspace(5), i32, i32 } [[TMP14]], i32 [[Q1]], 3 +; CHECK-NEXT: [[TMP16:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i64 [[TMP16]] to <2 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP17]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP17]], i64 1 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x i32> poison, i32 [[GLOBALTABLE]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i32> [[TMP20]], i32 [[TMP18]], i64 1 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[TMP19]], i64 2 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> [[TMP22]], i32 [[PAD0]], i64 3 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[PAD1]], i64 4 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i32> [[TMP24]], i32 [[PAD2]], i64 5 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x i32> [[TMP25]], i32 [[PAD3]], i64 6 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i32> [[TMP26]], i32 [[PAD4]], i64 7 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[PAD5]], i64 8 +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i32> [[TMP28]], i32 [[PAD6]], i64 9 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x i32> [[TMP29]], i32 [[PAD7]], i64 10 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i32> [[TMP30]], i32 [[PAD8]], i64 11 +; 
CHECK-NEXT: [[TMP32:%.*]] = insertelement <16 x i32> [[TMP31]], i32 [[PAD9]], i64 12 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <16 x i32> [[TMP32]], i32 [[PAD10]], i64 13 +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <16 x i32> [[TMP33]], i32 [[PAD11]], i64 14 +; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i32> [[TMP34]], i32 [[SPILLTABLE]], i64 15 +; CHECK-NEXT: [[TMP36:%.*]] = extractvalue { i32, ptr addrspace(5), i32, i32 } [[TMP15]], 0 +; CHECK-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.setinactive.chain.arg(i32 [[TMP36]], i32 [[VCR]]) +; CHECK-NEXT: [[TMP38:%.*]] = icmp ne i32 [[TMP37]], 0 +; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP38]]) +; CHECK-NEXT: [[TMP40:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP39]], i1 true) +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP37]], i32 [[TMP40]]) +; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i32 [[TMP37]], [[TMP41]] +; CHECK-NEXT: [[TMP43:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP42]]) +; CHECK-NEXT: [[TMP44:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP41]]) +; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP43]]) +; CHECK-NEXT: [[TMP46:%.*]] = icmp eq i32 [[TMP44]], 0 +; CHECK-NEXT: br i1 [[TMP46]], label [[RET_BLOCK:%.*]], label [[CHAIN_BLOCK:%.*]] ; CHECK: chain.block: -; CHECK-NEXT: [[TMP57:%.*]] = and i32 [[TMP54]], -64 -; CHECK-NEXT: [[TMP58:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP57]], i64 0 -; CHECK-NEXT: [[TMP59:%.*]] = bitcast <2 x i32> [[TMP58]] to i64 -; CHECK-NEXT: [[TMP60:%.*]] = inttoptr i64 [[TMP59]] to ptr -; CHECK-NEXT: call void (ptr, i32, <16 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) 
@llvm.amdgcn.cs.chain.p0.i32.v16i32.sl_i32p5i32i32s(ptr [[TMP60]], i32 [[TMP55]], <16 x i32> [[TMP45]], { i32, ptr addrspace(5), i32, i32 } [[TMP25]], i32 0) +; CHECK-NEXT: [[TMP47:%.*]] = and i32 [[TMP44]], -64 +; CHECK-NEXT: [[TMP48:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP47]], i64 0 +; CHECK-NEXT: [[TMP49:%.*]] = bitcast <2 x i32> [[TMP48]] to i64 +; CHECK-NEXT: [[TMP50:%.*]] = inttoptr i64 [[TMP49]] to ptr +; CHECK-NEXT: call void (ptr, i32, <16 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v16i32.sl_i32p5i32i32s(ptr inreg [[TMP50]], i32 inreg [[TMP45]], <16 x i32> inreg [[TMP35]], { i32, ptr addrspace(5), i32, i32 } [[TMP15]], i32 0) ; CHECK-NEXT: unreachable ; CHECK: ret.block: ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@test.1 -; CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[TMP0:%.*]], i32 inreg [[TMP1:%.*]], i32 inreg [[TMP2:%.*]], i32 inreg [[TMP3:%.*]], i32 inreg [[TMP4:%.*]], i32 inreg [[TMP5:%.*]], i32 inreg [[TMP6:%.*]], i32 inreg [[TMP7:%.*]], i32 inreg [[TMP8:%.*]], i32 inreg [[TMP9:%.*]], i32 inreg [[TMP10:%.*]], i32 inreg [[TMP11:%.*]], i32 inreg [[SPILLTABLE:%.*]], i32 [[VCR:%.*]], ptr addrspace(5) [[VSP:%.*]], ptr addrspace(5) [[P2:%.*]], i32 [[Q1:%.*]]) #[[ATTR1]] align 64 !lgc.cps !2 { -; CHECK-NEXT: [[TMP13:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5) -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.amdgcn.s.getpc() -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i64 [[TMP14]] to <2 x i32> -; CHECK-NEXT: store ptr addrspace(5) [[VSP]], ptr addrspace(5) [[TMP13]], align 4 -; CHECK-NEXT: [[TMP16:%.*]] = inttoptr i32 [[Q1]] to ptr addrspace(5) -; CHECK-NEXT: [[N111:%.*]] = load i32, ptr addrspace(5) [[TMP16]], align 4 +; CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[PAD0:%.*]], i32 inreg [[PAD1:%.*]], i32 inreg [[PAD2:%.*]], i32 inreg [[PAD3:%.*]], i32 inreg 
[[PAD4:%.*]], i32 inreg [[PAD5:%.*]], i32 inreg [[PAD6:%.*]], i32 inreg [[PAD7:%.*]], i32 inreg [[PAD8:%.*]], i32 inreg [[PAD9:%.*]], i32 inreg [[PAD10:%.*]], i32 inreg [[PAD11:%.*]], i32 inreg [[SPILLTABLE:%.*]], i32 [[VCR:%.*]], ptr addrspace(5) [[VSP:%.*]], ptr addrspace(5) [[P2:%.*]], i32 [[Q1:%.*]]) #[[ATTR1]] align 64 !lgc.cps !2 { +; CHECK-NEXT: [[TMP1:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5) +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[TMP2]] to <2 x i32> +; CHECK-NEXT: store ptr addrspace(5) [[VSP]], ptr addrspace(5) [[TMP1]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i32 [[Q1]] to ptr addrspace(5) +; CHECK-NEXT: [[N111:%.*]] = load i32, ptr addrspace(5) [[TMP4]], align 4 ; CHECK-NEXT: [[N99:%.*]] = load i8, ptr addrspace(5) [[P2]], align 1 -; CHECK-NEXT: [[TMP17:%.*]] = load ptr addrspace(5), ptr addrspace(5) [[TMP13]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = or i32 ptrtoint (ptr @test.2 to i32), 1 +; CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(5), ptr addrspace(5) [[TMP1]], align 4 ; CHECK-NEXT: br label [[TAIL_BLOCK:%.*]] ; CHECK: tail.block: -; CHECK-NEXT: [[TMP18:%.*]] = insertvalue { i32, ptr addrspace(5) } { i32 or (i32 ptrtoint (ptr @test.2 to i32), i32 1), ptr addrspace(5) poison }, ptr addrspace(5) [[TMP17]], 1 -; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 -; CHECK-NEXT: [[TMP20:%.*]] = bitcast i64 [[TMP19]] to <2 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP20]], i64 0 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[TMP20]], i64 1 -; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> poison, i32 [[GLOBALTABLE]], i64 0 -; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP21]], i64 1 -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i32> [[TMP24]], i32 [[TMP22]], i64 2 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x i32> [[TMP25]], i32 [[TMP0]], i64 3 -; CHECK-NEXT: 
[[TMP27:%.*]] = insertelement <16 x i32> [[TMP26]], i32 [[TMP1]], i64 4 -; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[TMP2]], i64 5 -; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i32> [[TMP28]], i32 [[TMP3]], i64 6 -; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x i32> [[TMP29]], i32 [[TMP4]], i64 7 -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i32> [[TMP30]], i32 [[TMP5]], i64 8 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <16 x i32> [[TMP31]], i32 [[TMP6]], i64 9 -; CHECK-NEXT: [[TMP33:%.*]] = insertelement <16 x i32> [[TMP32]], i32 [[TMP7]], i64 10 -; CHECK-NEXT: [[TMP34:%.*]] = insertelement <16 x i32> [[TMP33]], i32 [[TMP8]], i64 11 -; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i32> [[TMP34]], i32 [[TMP9]], i64 12 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <16 x i32> [[TMP35]], i32 [[TMP10]], i64 13 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x i32> [[TMP36]], i32 [[TMP11]], i64 14 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <16 x i32> [[TMP37]], i32 [[SPILLTABLE]], i64 15 -; CHECK-NEXT: [[TMP39:%.*]] = extractvalue { i32, ptr addrspace(5) } [[TMP18]], 0 -; CHECK-NEXT: [[TMP40:%.*]] = call i32 @llvm.amdgcn.setinactive.chain.arg.1(i32 [[TMP39]], i32 [[VCR]]) -; CHECK-NEXT: [[TMP41:%.*]] = icmp ne i32 [[TMP40]], 0 -; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP41]]) -; CHECK-NEXT: [[TMP43:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP42]], i1 true) -; CHECK-NEXT: [[TMP44:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP40]], i32 [[TMP43]]) -; CHECK-NEXT: [[TMP45:%.*]] = icmp eq i32 [[TMP40]], [[TMP44]] -; CHECK-NEXT: [[TMP46:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP45]]) -; CHECK-NEXT: [[TMP47:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP44]]) -; CHECK-NEXT: [[TMP48:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP46]]) -; CHECK-NEXT: [[TMP49:%.*]] = icmp eq i32 [[TMP47]], 0 -; CHECK-NEXT: br i1 [[TMP49]], label [[RET_BLOCK:%.*]], label [[CHAIN_BLOCK:%.*]] +; CHECK-NEXT: [[TMP7:%.*]] = 
insertvalue { i32, ptr addrspace(5) } poison, i32 [[TMP5]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertvalue { i32, ptr addrspace(5) } [[TMP7]], ptr addrspace(5) [[TMP6]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP10]], i64 1 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i32> poison, i32 [[GLOBALTABLE]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i32> [[TMP13]], i32 [[TMP11]], i64 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[TMP12]], i64 2 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[PAD0]], i64 3 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i32> [[TMP16]], i32 [[PAD1]], i64 4 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[PAD2]], i64 5 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i32> [[TMP18]], i32 [[PAD3]], i64 6 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[PAD4]], i64 7 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i32> [[TMP20]], i32 [[PAD5]], i64 8 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[PAD6]], i64 9 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> [[TMP22]], i32 [[PAD7]], i64 10 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[PAD8]], i64 11 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i32> [[TMP24]], i32 [[PAD9]], i64 12 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x i32> [[TMP25]], i32 [[PAD10]], i64 13 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i32> [[TMP26]], i32 [[PAD11]], i64 14 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[SPILLTABLE]], i64 15 +; CHECK-NEXT: [[TMP29:%.*]] = extractvalue { i32, ptr addrspace(5) } [[TMP8]], 0 +; CHECK-NEXT: [[TMP30:%.*]] = call i32 
@llvm.amdgcn.setinactive.chain.arg.1(i32 [[TMP29]], i32 [[VCR]]) +; CHECK-NEXT: [[TMP31:%.*]] = icmp ne i32 [[TMP30]], 0 +; CHECK-NEXT: [[TMP32:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP31]]) +; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP32]], i1 true) +; CHECK-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP30]], i32 [[TMP33]]) +; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i32 [[TMP30]], [[TMP34]] +; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP35]]) +; CHECK-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP34]]) +; CHECK-NEXT: [[TMP38:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP36]]) +; CHECK-NEXT: [[TMP39:%.*]] = icmp eq i32 [[TMP37]], 0 +; CHECK-NEXT: br i1 [[TMP39]], label [[RET_BLOCK:%.*]], label [[CHAIN_BLOCK:%.*]] ; CHECK: chain.block: -; CHECK-NEXT: [[TMP50:%.*]] = and i32 [[TMP47]], -64 -; CHECK-NEXT: [[TMP51:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP50]], i64 0 -; CHECK-NEXT: [[TMP52:%.*]] = bitcast <2 x i32> [[TMP51]] to i64 -; CHECK-NEXT: [[TMP53:%.*]] = inttoptr i64 [[TMP52]] to ptr -; CHECK-NEXT: call void (ptr, i32, <16 x i32>, { i32, ptr addrspace(5) }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v16i32.sl_i32p5s(ptr [[TMP53]], i32 [[TMP48]], <16 x i32> [[TMP38]], { i32, ptr addrspace(5) } [[TMP18]], i32 0) +; CHECK-NEXT: [[TMP40:%.*]] = and i32 [[TMP37]], -64 +; CHECK-NEXT: [[TMP41:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP40]], i64 0 +; CHECK-NEXT: [[TMP42:%.*]] = bitcast <2 x i32> [[TMP41]] to i64 +; CHECK-NEXT: [[TMP43:%.*]] = inttoptr i64 [[TMP42]] to ptr +; CHECK-NEXT: call void (ptr, i32, <16 x i32>, { i32, ptr addrspace(5) }, i32, ...) 
@llvm.amdgcn.cs.chain.p0.i32.v16i32.sl_i32p5s(ptr inreg [[TMP43]], i32 inreg [[TMP38]], <16 x i32> inreg [[TMP28]], { i32, ptr addrspace(5) } [[TMP8]], i32 0) ; CHECK-NEXT: unreachable ; CHECK: ret.block: ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@test.2 -; CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[TMP0:%.*]], i32 inreg [[TMP1:%.*]], i32 inreg [[TMP2:%.*]], i32 inreg [[TMP3:%.*]], i32 inreg [[TMP4:%.*]], i32 inreg [[TMP5:%.*]], i32 inreg [[TMP6:%.*]], i32 inreg [[TMP7:%.*]], i32 inreg [[TMP8:%.*]], i32 inreg [[TMP9:%.*]], i32 inreg [[TMP10:%.*]], i32 inreg [[TMP11:%.*]], i32 inreg [[SPILLTABLE:%.*]], i32 [[VCR:%.*]], ptr addrspace(5) [[VSP:%.*]]) #[[ATTR1]] align 64 !lgc.cps !2 { -; CHECK-NEXT: [[TMP13:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5) -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.amdgcn.s.getpc() -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i64 [[TMP14]] to <2 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[VSP]], i32 -4 -; CHECK-NEXT: [[CPS_STATE:%.*]] = load { ptr addrspace(5) }, ptr addrspace(5) [[TMP16]], align 4 -; CHECK-NEXT: store ptr addrspace(5) [[TMP16]], ptr addrspace(5) [[TMP13]], align 4 -; CHECK-NEXT: [[TMP17:%.*]] = load ptr addrspace(5), ptr addrspace(5) [[TMP13]], align 4 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP17]], i32 -12 +; CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[PAD0:%.*]], i32 inreg [[PAD1:%.*]], i32 inreg [[PAD2:%.*]], i32 inreg [[PAD3:%.*]], i32 inreg [[PAD4:%.*]], i32 inreg [[PAD5:%.*]], i32 inreg [[PAD6:%.*]], i32 inreg [[PAD7:%.*]], i32 inreg [[PAD8:%.*]], i32 inreg [[PAD9:%.*]], i32 inreg [[PAD10:%.*]], i32 inreg [[PAD11:%.*]], i32 inreg [[SPILLTABLE:%.*]], i32 [[VCR:%.*]], ptr addrspace(5) [[VSP:%.*]]) #[[ATTR1]] align 64 !lgc.cps !2 { +; CHECK-NEXT: [[TMP1:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5) 
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[TMP2]] to <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[VSP]], i32 -4 +; CHECK-NEXT: [[CPS_STATE:%.*]] = load { ptr addrspace(5) }, ptr addrspace(5) [[TMP4]], align 4 +; CHECK-NEXT: store ptr addrspace(5) [[TMP4]], ptr addrspace(5) [[TMP1]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(5), ptr addrspace(5) [[TMP1]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP5]], i32 -12 ; CHECK-NEXT: [[P2:%.*]] = extractvalue { ptr addrspace(5) } [[CPS_STATE]], 0 -; CHECK-NEXT: [[N333:%.*]] = load i32, ptr addrspace(5) [[TMP18]], align 4 +; CHECK-NEXT: [[N333:%.*]] = load i32, ptr addrspace(5) [[TMP6]], align 4 ; CHECK-NEXT: [[N99:%.*]] = load i8, ptr addrspace(5) [[P2]], align 1 -; CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(5), ptr addrspace(5) [[TMP13]], align 4 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP19]], i32 -12 -; CHECK-NEXT: store ptr addrspace(5) [[TMP20]], ptr addrspace(5) [[TMP13]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(5), ptr addrspace(5) [[TMP1]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP7]], i32 -12 +; CHECK-NEXT: store ptr addrspace(5) [[TMP8]], ptr addrspace(5) [[TMP1]], align 4 ; CHECK-NEXT: br label [[TAIL_BLOCK:%.*]] ; CHECK: tail.block: -; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 -; CHECK-NEXT: [[TMP22:%.*]] = bitcast i64 [[TMP21]] to <2 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i32> [[TMP22]], i64 0 -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[TMP22]], i64 1 -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i32> poison, i32 [[GLOBALTABLE]], i64 0 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x i32> [[TMP25]], i32 [[TMP23]], i64 1 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i32> [[TMP26]], i32 
[[TMP24]], i64 2 -; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[TMP0]], i64 3 -; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i32> [[TMP28]], i32 [[TMP1]], i64 4 -; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x i32> [[TMP29]], i32 [[TMP2]], i64 5 -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i32> [[TMP30]], i32 [[TMP3]], i64 6 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <16 x i32> [[TMP31]], i32 [[TMP4]], i64 7 -; CHECK-NEXT: [[TMP33:%.*]] = insertelement <16 x i32> [[TMP32]], i32 [[TMP5]], i64 8 -; CHECK-NEXT: [[TMP34:%.*]] = insertelement <16 x i32> [[TMP33]], i32 [[TMP6]], i64 9 -; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i32> [[TMP34]], i32 [[TMP7]], i64 10 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <16 x i32> [[TMP35]], i32 [[TMP8]], i64 11 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x i32> [[TMP36]], i32 [[TMP9]], i64 12 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <16 x i32> [[TMP37]], i32 [[TMP10]], i64 13 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <16 x i32> [[TMP38]], i32 [[TMP11]], i64 14 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x i32> [[TMP39]], i32 [[SPILLTABLE]], i64 15 -; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.amdgcn.setinactive.chain.arg.2(i32 0, i32 [[VCR]]) -; CHECK-NEXT: [[TMP42:%.*]] = icmp ne i32 [[TMP41]], 0 -; CHECK-NEXT: [[TMP43:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP42]]) -; CHECK-NEXT: [[TMP44:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP43]], i1 true) -; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP41]], i32 [[TMP44]]) -; CHECK-NEXT: [[TMP46:%.*]] = icmp eq i32 [[TMP41]], [[TMP45]] -; CHECK-NEXT: [[TMP47:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP46]]) -; CHECK-NEXT: [[TMP48:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP45]]) -; CHECK-NEXT: [[TMP49:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP47]]) -; CHECK-NEXT: [[TMP50:%.*]] = icmp eq i32 [[TMP48]], 0 -; CHECK-NEXT: br i1 [[TMP50]], label [[RET_BLOCK:%.*]], label [[CHAIN_BLOCK:%.*]] +; 
CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP10]], i64 1 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i32> poison, i32 [[GLOBALTABLE]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i32> [[TMP13]], i32 [[TMP11]], i64 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[TMP12]], i64 2 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[PAD0]], i64 3 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i32> [[TMP16]], i32 [[PAD1]], i64 4 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[PAD2]], i64 5 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i32> [[TMP18]], i32 [[PAD3]], i64 6 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[PAD4]], i64 7 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i32> [[TMP20]], i32 [[PAD5]], i64 8 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[PAD6]], i64 9 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> [[TMP22]], i32 [[PAD7]], i64 10 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[PAD8]], i64 11 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i32> [[TMP24]], i32 [[PAD9]], i64 12 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x i32> [[TMP25]], i32 [[PAD10]], i64 13 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i32> [[TMP26]], i32 [[PAD11]], i64 14 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[SPILLTABLE]], i64 15 +; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.amdgcn.setinactive.chain.arg.2(i32 0, i32 [[VCR]]) +; CHECK-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0 +; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP30]]) +; CHECK-NEXT: [[TMP32:%.*]] = call i32 @llvm.cttz.i32(i32 
[[TMP31]], i1 true) +; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP29]], i32 [[TMP32]]) +; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i32 [[TMP29]], [[TMP33]] +; CHECK-NEXT: [[TMP35:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP34]]) +; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP33]]) +; CHECK-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP35]]) +; CHECK-NEXT: [[TMP38:%.*]] = icmp eq i32 [[TMP36]], 0 +; CHECK-NEXT: br i1 [[TMP38]], label [[RET_BLOCK:%.*]], label [[CHAIN_BLOCK:%.*]] ; CHECK: chain.block: -; CHECK-NEXT: [[TMP51:%.*]] = and i32 [[TMP48]], -64 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP51]], i64 0 -; CHECK-NEXT: [[TMP53:%.*]] = bitcast <2 x i32> [[TMP52]] to i64 -; CHECK-NEXT: [[TMP54:%.*]] = inttoptr i64 [[TMP53]] to ptr -; CHECK-NEXT: call void (ptr, i32, <16 x i32>, { i32, ptr addrspace(5) }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v16i32.sl_i32p5s(ptr [[TMP54]], i32 [[TMP49]], <16 x i32> [[TMP40]], { i32, ptr addrspace(5) } { i32 0, ptr addrspace(5) poison }, i32 0) +; CHECK-NEXT: [[TMP39:%.*]] = and i32 [[TMP36]], -64 +; CHECK-NEXT: [[TMP40:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP39]], i64 0 +; CHECK-NEXT: [[TMP41:%.*]] = bitcast <2 x i32> [[TMP40]] to i64 +; CHECK-NEXT: [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr +; CHECK-NEXT: call void (ptr, i32, <16 x i32>, { i32, ptr addrspace(5) }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v16i32.sl_i32p5s(ptr inreg [[TMP42]], i32 inreg [[TMP37]], <16 x i32> inreg [[TMP28]], { i32, ptr addrspace(5) } { i32 0, ptr addrspace(5) poison }, i32 0) ; CHECK-NEXT: unreachable ; CHECK: ret.block: ; CHECK-NEXT: ret void ; +;. 
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { noreturn } +; CHECK: attributes #[[ATTR1]] = { memory(readwrite) "amdgpu-memory-bound"="false" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nocallback nofree nounwind willreturn memory(none) } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR5:[0-9]+]] = { convergent noreturn nounwind } +;. +; CHECK: [[META0:![0-9]+]] = !{!""} +; CHECK: [[META1:![0-9]+]] = !{!"\82\B0amdpal.pipelines\91\83\B1.shader_functions\83\A6test.0\81\B4.frontend_stack_size\10\A6test.1\81\B4.frontend_stack_size\00\A6test.2\81\B4.frontend_stack_size\00\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AEamdpal.version\92\03\00"} +; CHECK: [[META2:![0-9]+]] = !{i32 1} +;. diff --git a/lgc/test/Transforms/CpsLowering/cps-unify-exits.lgc b/lgc/test/Transforms/CpsLowering/cps-unify-exits.lgc index 7dd96eece6..048489b597 100644 --- a/lgc/test/Transforms/CpsLowering/cps-unify-exits.lgc +++ b/lgc/test/Transforms/CpsLowering/cps-unify-exits.lgc @@ -47,14 +47,14 @@ else: !0 = !{i32 1} ; level 1 ; CHECK-LABEL: define {{[^@]+}}@unify_jumps -; CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[TMP0:%.*]], i32 inreg [[TMP1:%.*]], i32 inreg [[TMP2:%.*]], i32 inreg [[TMP3:%.*]], i32 inreg [[TMP4:%.*]], i32 inreg [[TMP5:%.*]], i32 inreg [[TMP6:%.*]], i32 inreg [[TMP7:%.*]], i32 inreg [[TMP8:%.*]], i32 inreg [[TMP9:%.*]], i32 inreg [[TMP10:%.*]], i32 inreg [[TMP11:%.*]], i32 inreg [[SPILLTABLE:%.*]], i32 [[VCR:%.*]], ptr addrspace(5) [[VSP:%.*]], i32 [[ARG:%.*]], ptr [[TABLE:%.*]]) #[[ATTR1:[0-9]+]] align 64 !lgc.cps !2 { +; CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg 
[[PAD0:%.*]], i32 inreg [[PAD1:%.*]], i32 inreg [[PAD2:%.*]], i32 inreg [[PAD3:%.*]], i32 inreg [[PAD4:%.*]], i32 inreg [[PAD5:%.*]], i32 inreg [[PAD6:%.*]], i32 inreg [[PAD7:%.*]], i32 inreg [[PAD8:%.*]], i32 inreg [[PAD9:%.*]], i32 inreg [[PAD10:%.*]], i32 inreg [[PAD11:%.*]], i32 inreg [[SPILLTABLE:%.*]], i32 [[VCR:%.*]], ptr addrspace(5) [[VSP:%.*]], i32 [[ARG:%.*]], ptr [[TABLE:%.*]]) #[[ATTR1:[0-9]+]] align 64 !lgc.cps !2 { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP12:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5) -; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.amdgcn.s.getpc() -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP13]] to <2 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[VSP]], i32 -4 -; CHECK-NEXT: [[CPS_STATE:%.*]] = load { i32 }, ptr addrspace(5) [[TMP15]], align 4 -; CHECK-NEXT: store ptr addrspace(5) [[TMP15]], ptr addrspace(5) [[TMP12]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5) +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[VSP]], i32 -4 +; CHECK-NEXT: [[CPS_STATE:%.*]] = load { i32 }, ptr addrspace(5) [[TMP3]], align 4 +; CHECK-NEXT: store ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP0]], align 4 ; CHECK-NEXT: [[V:%.*]] = extractvalue { i32 } [[CPS_STATE]], 0 ; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[V]], 3 ; CHECK-NEXT: br i1 [[COND]], label [[THEN:%.*]], label [[ELSE:%.*]] @@ -64,78 +64,78 @@ else: ; CHECK-NEXT: [[THEN_ARG:%.*]] = add i32 [[ARG]], 1 ; CHECK-NEXT: [[V_THEN:%.*]] = mul i32 [[V]], 2 ; CHECK-NEXT: [[STATE_THEN:%.*]] = insertvalue { i32 } poison, i32 [[V_THEN]], 0 -; CHECK-NEXT: [[TMP16:%.*]] = load ptr addrspace(5), ptr addrspace(5) [[TMP12]], align 4 -; CHECK-NEXT: store { i32 } [[STATE_THEN]], ptr addrspace(5) [[TMP16]], align 4 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr 
addrspace(5) [[TMP16]], i32 4 +; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(5), ptr addrspace(5) [[TMP0]], align 4 +; CHECK-NEXT: store { i32 } [[STATE_THEN]], ptr addrspace(5) [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP4]], i32 4 ; CHECK-NEXT: br label [[TAIL_BLOCK:%.*]] ; CHECK: else: ; CHECK-NEXT: [[TABLE_1:%.*]] = getelementptr i32, ptr [[TABLE]], i32 1 ; CHECK-NEXT: [[CR_ELSE:%.*]] = load i32, ptr [[TABLE_1]], align 4 ; CHECK-NEXT: [[ELSE_ARG:%.*]] = uitofp i32 [[ARG]] to float -; CHECK-NEXT: [[TMP18:%.*]] = load ptr addrspace(5), ptr addrspace(5) [[TMP12]], align 4 -; CHECK-NEXT: [[TMP19:%.*]] = bitcast float [[ELSE_ARG]] to i32 +; CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(5), ptr addrspace(5) [[TMP0]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast float [[ELSE_ARG]] to i32 ; CHECK-NEXT: br label [[TAIL_BLOCK]] ; CHECK: tail.block: -; CHECK-NEXT: [[TMP20:%.*]] = phi i32 [ [[CR_ELSE]], [[ELSE]] ], [ [[CR_THEN]], [[THEN]] ] -; CHECK-NEXT: [[TMP21:%.*]] = phi ptr addrspace(5) [ [[TMP18]], [[ELSE]] ], [ [[TMP17]], [[THEN]] ] -; CHECK-NEXT: [[TMP22:%.*]] = phi i32 [ [[TMP19]], [[ELSE]] ], [ [[THEN_ARG]], [[THEN]] ] -; CHECK-NEXT: [[TMP23:%.*]] = phi i32 [ 5, [[ELSE]] ], [ poison, [[THEN]] ] -; CHECK-NEXT: [[TMP24:%.*]] = insertvalue { i32, ptr addrspace(5), i32, i32 } poison, i32 [[TMP20]], 0 -; CHECK-NEXT: [[TMP25:%.*]] = insertvalue { i32, ptr addrspace(5), i32, i32 } [[TMP24]], ptr addrspace(5) [[TMP21]], 1 -; CHECK-NEXT: [[TMP26:%.*]] = insertvalue { i32, ptr addrspace(5), i32, i32 } [[TMP25]], i32 [[TMP22]], 2 -; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { i32, ptr addrspace(5), i32, i32 } [[TMP26]], i32 [[TMP23]], 3 -; CHECK-NEXT: [[TMP28:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast i64 [[TMP28]] to <2 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <2 x i32> [[TMP29]], i64 0 -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i32> [[TMP29]], i64 1 
-; CHECK-NEXT: [[TMP32:%.*]] = insertelement <16 x i32> poison, i32 [[GLOBALTABLE]], i64 0 -; CHECK-NEXT: [[TMP33:%.*]] = insertelement <16 x i32> [[TMP32]], i32 [[TMP30]], i64 1 -; CHECK-NEXT: [[TMP34:%.*]] = insertelement <16 x i32> [[TMP33]], i32 [[TMP31]], i64 2 -; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i32> [[TMP34]], i32 [[TMP0]], i64 3 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <16 x i32> [[TMP35]], i32 [[TMP1]], i64 4 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x i32> [[TMP36]], i32 [[TMP2]], i64 5 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <16 x i32> [[TMP37]], i32 [[TMP3]], i64 6 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <16 x i32> [[TMP38]], i32 [[TMP4]], i64 7 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x i32> [[TMP39]], i32 [[TMP5]], i64 8 -; CHECK-NEXT: [[TMP41:%.*]] = insertelement <16 x i32> [[TMP40]], i32 [[TMP6]], i64 9 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <16 x i32> [[TMP41]], i32 [[TMP7]], i64 10 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <16 x i32> [[TMP42]], i32 [[TMP8]], i64 11 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <16 x i32> [[TMP43]], i32 [[TMP9]], i64 12 -; CHECK-NEXT: [[TMP45:%.*]] = insertelement <16 x i32> [[TMP44]], i32 [[TMP10]], i64 13 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <16 x i32> [[TMP45]], i32 [[TMP11]], i64 14 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <16 x i32> [[TMP46]], i32 [[SPILLTABLE]], i64 15 -; CHECK-NEXT: [[TMP48:%.*]] = extractvalue { i32, ptr addrspace(5), i32, i32 } [[TMP27]], 0 -; CHECK-NEXT: [[TMP49:%.*]] = call i32 @llvm.amdgcn.setinactive.chain.arg(i32 [[TMP48]], i32 [[VCR]]) -; CHECK-NEXT: [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0 -; CHECK-NEXT: [[TMP51:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP50]]) -; CHECK-NEXT: [[TMP52:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP51]], i1 true) -; CHECK-NEXT: [[TMP53:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP49]], i32 [[TMP52]]) -; CHECK-NEXT: [[TMP54:%.*]] = icmp eq i32 [[TMP49]], [[TMP53]] -; CHECK-NEXT: 
[[TMP55:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP54]]) -; CHECK-NEXT: [[TMP56:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP53]]) -; CHECK-NEXT: [[TMP57:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP55]]) -; CHECK-NEXT: [[TMP58:%.*]] = icmp eq i32 [[TMP56]], 0 -; CHECK-NEXT: br i1 [[TMP58]], label [[RET_BLOCK:%.*]], label [[CHAIN_BLOCK:%.*]] +; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ [[CR_ELSE]], [[ELSE]] ], [ [[CR_THEN]], [[THEN]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi ptr addrspace(5) [ [[TMP6]], [[ELSE]] ], [ [[TMP5]], [[THEN]] ] +; CHECK-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP7]], [[ELSE]] ], [ [[THEN_ARG]], [[THEN]] ] +; CHECK-NEXT: [[TMP11:%.*]] = phi i32 [ 5, [[ELSE]] ], [ poison, [[THEN]] ] +; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { i32, ptr addrspace(5), i32, i32 } poison, i32 [[TMP8]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertvalue { i32, ptr addrspace(5), i32, i32 } [[TMP12]], ptr addrspace(5) [[TMP9]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = insertvalue { i32, ptr addrspace(5), i32, i32 } [[TMP13]], i32 [[TMP10]], 2 +; CHECK-NEXT: [[TMP15:%.*]] = insertvalue { i32, ptr addrspace(5), i32, i32 } [[TMP14]], i32 [[TMP11]], 3 +; CHECK-NEXT: [[TMP16:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i64 [[TMP16]] to <2 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP17]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP17]], i64 1 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x i32> poison, i32 [[GLOBALTABLE]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i32> [[TMP20]], i32 [[TMP18]], i64 1 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[TMP19]], i64 2 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> [[TMP22]], i32 [[PAD0]], i64 3 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[PAD1]], i64 4 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i32> [[TMP24]], i32 [[PAD2]], i64 5 +; CHECK-NEXT: [[TMP26:%.*]] 
= insertelement <16 x i32> [[TMP25]], i32 [[PAD3]], i64 6 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i32> [[TMP26]], i32 [[PAD4]], i64 7 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[PAD5]], i64 8 +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i32> [[TMP28]], i32 [[PAD6]], i64 9 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x i32> [[TMP29]], i32 [[PAD7]], i64 10 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i32> [[TMP30]], i32 [[PAD8]], i64 11 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <16 x i32> [[TMP31]], i32 [[PAD9]], i64 12 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <16 x i32> [[TMP32]], i32 [[PAD10]], i64 13 +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <16 x i32> [[TMP33]], i32 [[PAD11]], i64 14 +; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i32> [[TMP34]], i32 [[SPILLTABLE]], i64 15 +; CHECK-NEXT: [[TMP36:%.*]] = extractvalue { i32, ptr addrspace(5), i32, i32 } [[TMP15]], 0 +; CHECK-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.setinactive.chain.arg(i32 [[TMP36]], i32 [[VCR]]) +; CHECK-NEXT: [[TMP38:%.*]] = icmp ne i32 [[TMP37]], 0 +; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP38]]) +; CHECK-NEXT: [[TMP40:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP39]], i1 true) +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP37]], i32 [[TMP40]]) +; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i32 [[TMP37]], [[TMP41]] +; CHECK-NEXT: [[TMP43:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP42]]) +; CHECK-NEXT: [[TMP44:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP41]]) +; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP43]]) +; CHECK-NEXT: [[TMP46:%.*]] = icmp eq i32 [[TMP44]], 0 +; CHECK-NEXT: br i1 [[TMP46]], label [[RET_BLOCK:%.*]], label [[CHAIN_BLOCK:%.*]] ; CHECK: chain.block: -; CHECK-NEXT: [[TMP59:%.*]] = and i32 [[TMP56]], -64 -; CHECK-NEXT: [[TMP60:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[TMP59]], i64 0 -; CHECK-NEXT: [[TMP61:%.*]] = bitcast <2 x 
i32> [[TMP60]] to i64 -; CHECK-NEXT: [[TMP62:%.*]] = inttoptr i64 [[TMP61]] to ptr -; CHECK-NEXT: call void (ptr, i32, <16 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v16i32.sl_i32p5i32i32s(ptr [[TMP62]], i32 [[TMP57]], <16 x i32> [[TMP47]], { i32, ptr addrspace(5), i32, i32 } [[TMP27]], i32 0) +; CHECK-NEXT: [[TMP47:%.*]] = and i32 [[TMP44]], -64 +; CHECK-NEXT: [[TMP48:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP47]], i64 0 +; CHECK-NEXT: [[TMP49:%.*]] = bitcast <2 x i32> [[TMP48]] to i64 +; CHECK-NEXT: [[TMP50:%.*]] = inttoptr i64 [[TMP49]] to ptr +; CHECK-NEXT: call void (ptr, i32, <16 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v16i32.sl_i32p5i32i32s(ptr inreg [[TMP50]], i32 inreg [[TMP45]], <16 x i32> inreg [[TMP35]], { i32, ptr addrspace(5), i32, i32 } [[TMP15]], i32 0) ; CHECK-NEXT: unreachable ; CHECK: ret.block: ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@unify_jump_ret -; CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[TMP0:%.*]], i32 inreg [[TMP1:%.*]], i32 inreg [[TMP2:%.*]], i32 inreg [[TMP3:%.*]], i32 inreg [[TMP4:%.*]], i32 inreg [[TMP5:%.*]], i32 inreg [[TMP6:%.*]], i32 inreg [[TMP7:%.*]], i32 inreg [[TMP8:%.*]], i32 inreg [[TMP9:%.*]], i32 inreg [[TMP10:%.*]], i32 inreg [[TMP11:%.*]], i32 inreg [[SPILLTABLE:%.*]], i32 [[VCR:%.*]], ptr addrspace(5) [[VSP:%.*]], i32 [[ARG:%.*]], ptr [[TABLE:%.*]]) #[[ATTR1]] align 64 !lgc.cps !2 { +; CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], ptr addrspace(4) inreg [[NUMWORKGROUPSPTR:%.*]], i32 inreg [[PAD0:%.*]], i32 inreg [[PAD1:%.*]], i32 inreg [[PAD2:%.*]], i32 inreg [[PAD3:%.*]], i32 inreg [[PAD4:%.*]], i32 inreg [[PAD5:%.*]], i32 inreg [[PAD6:%.*]], i32 inreg [[PAD7:%.*]], i32 inreg [[PAD8:%.*]], i32 inreg [[PAD9:%.*]], i32 inreg [[PAD10:%.*]], i32 inreg [[PAD11:%.*]], i32 inreg [[SPILLTABLE:%.*]], i32 [[VCR:%.*]], ptr addrspace(5) [[VSP:%.*]], i32 
[[ARG:%.*]], ptr [[TABLE:%.*]]) #[[ATTR1]] align 64 !lgc.cps !2 { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP12:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5) -; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.amdgcn.s.getpc() -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP13]] to <2 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[VSP]], i32 -4 -; CHECK-NEXT: [[CPS_STATE:%.*]] = load { i32 }, ptr addrspace(5) [[TMP15]], align 4 -; CHECK-NEXT: store ptr addrspace(5) [[TMP15]], ptr addrspace(5) [[TMP12]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5) +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[VSP]], i32 -4 +; CHECK-NEXT: [[CPS_STATE:%.*]] = load { i32 }, ptr addrspace(5) [[TMP3]], align 4 +; CHECK-NEXT: store ptr addrspace(5) [[TMP3]], ptr addrspace(5) [[TMP0]], align 4 ; CHECK-NEXT: [[V:%.*]] = extractvalue { i32 } [[CPS_STATE]], 0 ; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[V]], 3 ; CHECK-NEXT: br i1 [[COND]], label [[THEN:%.*]], label [[ELSE:%.*]] @@ -145,57 +145,57 @@ else: ; CHECK-NEXT: [[THEN_ARG:%.*]] = add i32 [[ARG]], 1 ; CHECK-NEXT: [[V_THEN:%.*]] = mul i32 [[V]], 2 ; CHECK-NEXT: [[STATE_THEN:%.*]] = insertvalue { i32 } poison, i32 [[V_THEN]], 0 -; CHECK-NEXT: [[TMP16:%.*]] = load ptr addrspace(5), ptr addrspace(5) [[TMP12]], align 4 -; CHECK-NEXT: store { i32 } [[STATE_THEN]], ptr addrspace(5) [[TMP16]], align 4 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP16]], i32 4 +; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(5), ptr addrspace(5) [[TMP0]], align 4 +; CHECK-NEXT: store { i32 } [[STATE_THEN]], ptr addrspace(5) [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(5) [[TMP4]], i32 4 ; CHECK-NEXT: br label [[TAIL_BLOCK:%.*]] ; CHECK: else: ; CHECK-NEXT: br label 
[[TAIL_BLOCK]] ; CHECK: tail.block: -; CHECK-NEXT: [[TMP18:%.*]] = phi i32 [ [[CR_THEN]], [[THEN]] ], [ 0, [[ELSE]] ] -; CHECK-NEXT: [[TMP19:%.*]] = phi ptr addrspace(5) [ [[TMP17]], [[THEN]] ], [ poison, [[ELSE]] ] -; CHECK-NEXT: [[TMP20:%.*]] = phi i32 [ [[THEN_ARG]], [[THEN]] ], [ poison, [[ELSE]] ] -; CHECK-NEXT: [[TMP21:%.*]] = insertvalue { i32, ptr addrspace(5), i32 } poison, i32 [[TMP18]], 0 -; CHECK-NEXT: [[TMP22:%.*]] = insertvalue { i32, ptr addrspace(5), i32 } [[TMP21]], ptr addrspace(5) [[TMP19]], 1 -; CHECK-NEXT: [[TMP23:%.*]] = insertvalue { i32, ptr addrspace(5), i32 } [[TMP22]], i32 [[TMP20]], 2 -; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 -; CHECK-NEXT: [[TMP25:%.*]] = bitcast i64 [[TMP24]] to <2 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x i32> [[TMP25]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x i32> [[TMP25]], i64 1 -; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i32> poison, i32 [[GLOBALTABLE]], i64 0 -; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i32> [[TMP28]], i32 [[TMP26]], i64 1 -; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x i32> [[TMP29]], i32 [[TMP27]], i64 2 -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i32> [[TMP30]], i32 [[TMP0]], i64 3 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <16 x i32> [[TMP31]], i32 [[TMP1]], i64 4 -; CHECK-NEXT: [[TMP33:%.*]] = insertelement <16 x i32> [[TMP32]], i32 [[TMP2]], i64 5 -; CHECK-NEXT: [[TMP34:%.*]] = insertelement <16 x i32> [[TMP33]], i32 [[TMP3]], i64 6 -; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i32> [[TMP34]], i32 [[TMP4]], i64 7 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <16 x i32> [[TMP35]], i32 [[TMP5]], i64 8 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x i32> [[TMP36]], i32 [[TMP6]], i64 9 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <16 x i32> [[TMP37]], i32 [[TMP7]], i64 10 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <16 x i32> [[TMP38]], i32 [[TMP8]], i64 11 -; CHECK-NEXT: [[TMP40:%.*]] = 
insertelement <16 x i32> [[TMP39]], i32 [[TMP9]], i64 12 -; CHECK-NEXT: [[TMP41:%.*]] = insertelement <16 x i32> [[TMP40]], i32 [[TMP10]], i64 13 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <16 x i32> [[TMP41]], i32 [[TMP11]], i64 14 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <16 x i32> [[TMP42]], i32 [[SPILLTABLE]], i64 15 -; CHECK-NEXT: [[TMP44:%.*]] = extractvalue { i32, ptr addrspace(5), i32 } [[TMP23]], 0 -; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.amdgcn.setinactive.chain.arg.1(i32 [[TMP44]], i32 [[VCR]]) -; CHECK-NEXT: [[TMP46:%.*]] = icmp ne i32 [[TMP45]], 0 -; CHECK-NEXT: [[TMP47:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP46]]) -; CHECK-NEXT: [[TMP48:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP47]], i1 true) -; CHECK-NEXT: [[TMP49:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP45]], i32 [[TMP48]]) -; CHECK-NEXT: [[TMP50:%.*]] = icmp eq i32 [[TMP45]], [[TMP49]] -; CHECK-NEXT: [[TMP51:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP50]]) -; CHECK-NEXT: [[TMP52:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP49]]) -; CHECK-NEXT: [[TMP53:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP51]]) -; CHECK-NEXT: [[TMP54:%.*]] = icmp eq i32 [[TMP52]], 0 -; CHECK-NEXT: br i1 [[TMP54]], label [[RET_BLOCK:%.*]], label [[CHAIN_BLOCK:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = phi i32 [ [[CR_THEN]], [[THEN]] ], [ 0, [[ELSE]] ] +; CHECK-NEXT: [[TMP7:%.*]] = phi ptr addrspace(5) [ [[TMP5]], [[THEN]] ], [ poison, [[ELSE]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ [[THEN_ARG]], [[THEN]] ], [ poison, [[ELSE]] ] +; CHECK-NEXT: [[TMP9:%.*]] = insertvalue { i32, ptr addrspace(5), i32 } poison, i32 [[TMP6]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = insertvalue { i32, ptr addrspace(5), i32 } [[TMP9]], ptr addrspace(5) [[TMP7]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertvalue { i32, ptr addrspace(5), i32 } [[TMP10]], i32 [[TMP8]], 2 +; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr addrspace(4) [[NUMWORKGROUPSPTR]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i64 [[TMP12]] to <2 x i32> +; 
CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP13]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP13]], i64 1 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x i32> poison, i32 [[GLOBALTABLE]], i64 0 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i32> [[TMP16]], i32 [[TMP14]], i64 1 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[TMP15]], i64 2 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i32> [[TMP18]], i32 [[PAD0]], i64 3 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[PAD1]], i64 4 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i32> [[TMP20]], i32 [[PAD2]], i64 5 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[PAD3]], i64 6 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> [[TMP22]], i32 [[PAD4]], i64 7 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[PAD5]], i64 8 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i32> [[TMP24]], i32 [[PAD6]], i64 9 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x i32> [[TMP25]], i32 [[PAD7]], i64 10 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i32> [[TMP26]], i32 [[PAD8]], i64 11 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[PAD9]], i64 12 +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i32> [[TMP28]], i32 [[PAD10]], i64 13 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x i32> [[TMP29]], i32 [[PAD11]], i64 14 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i32> [[TMP30]], i32 [[SPILLTABLE]], i64 15 +; CHECK-NEXT: [[TMP32:%.*]] = extractvalue { i32, ptr addrspace(5), i32 } [[TMP11]], 0 +; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.amdgcn.setinactive.chain.arg.1(i32 [[TMP32]], i32 [[VCR]]) +; CHECK-NEXT: [[TMP34:%.*]] = icmp ne i32 [[TMP33]], 0 +; CHECK-NEXT: [[TMP35:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP34]]) +; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP35]], i1 true) +; CHECK-NEXT: [[TMP37:%.*]] = call 
i32 @llvm.amdgcn.readlane(i32 [[TMP33]], i32 [[TMP36]]) +; CHECK-NEXT: [[TMP38:%.*]] = icmp eq i32 [[TMP33]], [[TMP37]] +; CHECK-NEXT: [[TMP39:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[TMP38]]) +; CHECK-NEXT: [[TMP40:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP37]]) +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP39]]) +; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i32 [[TMP40]], 0 +; CHECK-NEXT: br i1 [[TMP42]], label [[RET_BLOCK:%.*]], label [[CHAIN_BLOCK:%.*]] ; CHECK: chain.block: -; CHECK-NEXT: [[TMP55:%.*]] = and i32 [[TMP52]], -64 -; CHECK-NEXT: [[TMP56:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[TMP55]], i64 0 -; CHECK-NEXT: [[TMP57:%.*]] = bitcast <2 x i32> [[TMP56]] to i64 -; CHECK-NEXT: [[TMP58:%.*]] = inttoptr i64 [[TMP57]] to ptr -; CHECK-NEXT: call void (ptr, i32, <16 x i32>, { i32, ptr addrspace(5), i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.v16i32.sl_i32p5i32s(ptr [[TMP58]], i32 [[TMP53]], <16 x i32> [[TMP43]], { i32, ptr addrspace(5), i32 } [[TMP23]], i32 0) +; CHECK-NEXT: [[TMP43:%.*]] = and i32 [[TMP40]], -64 +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP43]], i64 0 +; CHECK-NEXT: [[TMP45:%.*]] = bitcast <2 x i32> [[TMP44]] to i64 +; CHECK-NEXT: [[TMP46:%.*]] = inttoptr i64 [[TMP45]] to ptr +; CHECK-NEXT: call void (ptr, i32, <16 x i32>, { i32, ptr addrspace(5), i32 }, i32, ...) 
@llvm.amdgcn.cs.chain.p0.i32.v16i32.sl_i32p5i32s(ptr inreg [[TMP46]], i32 inreg [[TMP41]], <16 x i32> inreg [[TMP31]], { i32, ptr addrspace(5), i32 } [[TMP11]], i32 0) ; CHECK-NEXT: unreachable ; CHECK: ret.block: ; CHECK-NEXT: ret void diff --git a/lgc/test/UberFetchShader.lgc b/lgc/test/UberFetchShader.lgc index fbd9cbecf6..fc489feaa8 100644 --- a/lgc/test/UberFetchShader.lgc +++ b/lgc/test/UberFetchShader.lgc @@ -2,10 +2,9 @@ ; CHECK-LABEL: define dllexport spir_func void @lgc.shader.VS.main() ; Get the descriptor of Uber Fetch Shader buffer -; CHECK: [[Desc:%[0-9]*]] = call <2 x i32> @lgc.root.descriptor.v2i32(i32 1) +; CHECK: [[Desc:%[0-9]*]] = call i64 @lgc.load.user.data.i64(i32 4) -; CHECK: [[INT64DESC:%[0-9]*]] = bitcast <2 x i32> [[Desc]] to i64 -; CHECK: [[DESCPTR:%[0-9]*]] = inttoptr i64 [[INT64DESC]] to ptr addrspace(4) +; CHECK: [[DESCPTR:%[0-9]*]] = inttoptr i64 [[Desc]] to ptr addrspace(4) ; CHECK: [[UBERINFOPTR:%[0-9]*]] = getelementptr <4 x i32>, ptr addrspace(4) [[DESCPTR]], i32 0 ; CHECK: [[UBERINFO:%[0-9]*]] = load <4 x i32>, ptr addrspace(4) [[UBERINFOPTR]], align 16 diff --git a/lgc/test/VsComputeLibrary.lgc b/lgc/test/VsComputeLibrary.lgc deleted file mode 100644 index 392c36daf4..0000000000 --- a/lgc/test/VsComputeLibrary.lgc +++ /dev/null @@ -1,36 +0,0 @@ -; Define a compute library that can be called from a vertex shader. 
- -; RUN: lgc -mcpu=gfx1010 -print-after=lgc-patch-entry-point-mutate -print-after=lgc-patch-prepare-pipeline-abi -o /dev/null 2>&1 - <%s | FileCheck --check-prefixes=CHECK %s -; CHECK: IR Dump After Patch LLVM for entry-point mutation -; CHECK: define amdgpu_gfx <4 x float> @func(<4 x float> %0, i32 inreg %globalTable, ptr addrspace(4) inreg %numWorkgroupsPtr, i32 inreg %1, i32 inreg %2, i32 inreg %3, i32 inreg %4, i32 inreg %5, i32 inreg %6, i32 inreg %7, i32 inreg %8, i32 inreg %9, i32 inreg %10, i32 inreg %11, i32 inreg %12, i32 inreg %spillTable, <3 x i32> inreg %WorkgroupId, i32 inreg %MultiDispatchInfo, <3 x i32> %LocalInvocationId) #1 !lgc.shaderstage !0 { -; CHECK: !0 = !{i32 1} -; CHECK: IR Dump After Patch LLVM for preparing pipeline ABI -; CHECK: define amdgpu_gfx <4 x float> @func(<4 x float> %0, i32 inreg %globalTable, ptr addrspace(4) inreg %numWorkgroupsPtr, i32 inreg %1, i32 inreg %2, i32 inreg %3, i32 inreg %4, i32 inreg %5, i32 inreg %6, i32 inreg %7, i32 inreg %8, i32 inreg %9, i32 inreg %10, i32 inreg %11, i32 inreg %12, i32 inreg %spillTable, <3 x i32> inreg %WorkgroupId, i32 inreg %MultiDispatchInfo, <3 x i32> %LocalInvocationId) #0 !lgc.shaderstage !0 { - -; ModuleID = 'lgcPipeline' -target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7" -target triple = "amdgcn--amdpal" - -; Function Attrs: nounwind -define spir_func <4 x float> @func() local_unnamed_addr #0 !lgc.shaderstage !0 { -.entry: - %0 = call <4 x i32> (...) @lgc.create.read.generic.input.v4i32(i32 5, i32 0, i32 0, i32 0, i32 0, i32 undef) - %bc = bitcast <4 x i32> %0 to <4 x float> - ret <4 x float> %bc -} - -; Function Attrs: nounwind readonly -declare <4 x i32> @lgc.create.read.generic.input.v4i32(...) 
local_unnamed_addr #1 - -attributes #0 = { nounwind } -attributes #1 = { nounwind readonly } - -!lgc.unlinked = !{!0} -!lgc.options = !{!1} -!lgc.options.VS = !{!2} -!lgc.input.assembly.state = !{!4} - -!0 = !{i32 1} -!1 = !{i32 628083063, i32 1661573491, i32 -2141117829, i32 766255606, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2} -!2 = !{i32 1951548461, i32 273960056, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3} -!4 = !{i32 2, i32 3} diff --git a/lgc/test/lgcdis.lgc b/lgc/test/lgcdis.lgc index 074f87d594..a2e3811168 100644 --- a/lgc/test/lgcdis.lgc +++ b/lgc/test/lgcdis.lgc @@ -9,8 +9,60 @@ ; CHECK-LABEL: _amdgpu_ps_main: ; CHECK: v_interp_p1 ; CHECK-LABEL: amdpal.pipelines: -; CHECK: .registers: -; CHECK: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): +; CHECK: .hardware_stages: +; CHECK: .ps: +; CHECK: .checksum_value: 0x759bd992 +; CHECK: .debug_mode: false +; CHECK: .entry_point: _amdgpu_ps_main +; CHECK: .float_mode: 0xc0 +; CHECK: .ieee_mode: false +; CHECK: .mem_ordered: true +; CHECK: .scratch_en: false +; CHECK: .scratch_memory_size: 0 +; CHECK: .sgpr_count: 0xe +; CHECK: .sgpr_limit: 0x6a +; CHECK: .trap_present: 0 +; CHECK: .user_data_reg_map: +; CHECK: - 0x10000000 +; CHECK: - 0xa +; CHECK: - 0x10 +; CHECK: - 0x11 +; CHECK: - 0x12 +; CHECK: - 0x13 +; CHECK: - 0xffffffff +; CHECK: - 0xffffffff +; CHECK: - 0xffffffff +; CHECK: - 0xffffffff +; CHECK: - 0xffffffff +; CHECK: - 0xffffffff +; CHECK: - 0xffffffff +; CHECK: - 0xffffffff +; CHECK: - 0xffffffff +; CHECK: - 0xffffffff +; CHECK: - 0xffffffff +; CHECK: - 0xffffffff +; CHECK: - 0xffffffff +; CHECK: - 0xffffffff +; CHECK: - 0xffffffff +; CHECK: - 0xffffffff +; CHECK: - 0xffffffff +; CHECK: - 0xffffffff +; CHECK: - 0xffffffff +; CHECK: - 0xffffffff +; CHECK: - 0xffffffff +; CHECK: - 0xffffffff +; CHECK: - 0xffffffff +; CHECK: - 0xffffffff +; CHECK: - 0xffffffff +; CHECK: - 0xffffffff +; CHECK: .user_sgprs: 0x6 +; CHECK: .uses_uavs: false +; CHECK: 
.vgpr_count: 0x4 +; CHECK: .vgpr_limit: 0x100 +; CHECK: .wavefront_size: 0x40 +; CHECK: .wgp_mode: false +; CHECK: .writes_depth: 0 +; CHECK: .writes_uavs: false target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn--amdpal" @@ -78,7 +130,7 @@ attributes #2 = { nounwind readnone } !4 = !{i32 -1843601953, i32 337452067, i32 -1234379640, i32 1173800166} !8 = !{!"DescriptorTableVaPtr", i32 0, i32 0, i32 10, i32 1, i32 1} !9 = !{!"DescriptorResource", i32 1, i32 0, i32 0, i32 16, i32 0, i32 1, i32 8} -!14 = !{!"DescriptorSampler", i32 2, i32 0, i32 -1, i32 4, i32 0, i32 2, i32 4, <4 x i32> } +!14 = !{!"DescriptorSampler", i32 2, i32 0, i32 16, i32 4, i32 0, i32 2, i32 4, <4 x i32> } !19 = !{i32 0, i32 0, i32 0, i32 0, i32 13, i32 7, i32 -1} !20 = !{i32 1, i32 0, i32 24, i32 0, i32 11, i32 7, i32 -1} !21 = !{i32 10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0} diff --git a/lgc/test/lit.site.cfg.py.in b/lgc/test/lit.site.cfg.py.in index 26390153a2..8be4ba2251 100644 --- a/lgc/test/lit.site.cfg.py.in +++ b/lgc/test/lit.site.cfg.py.in @@ -8,6 +8,12 @@ config.llvm_tools_dir = "@LLVM_TOOLS_DIR@" config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@" config.python_executable = "@PYTHON_EXECUTABLE@" +for d in "@LIT_DEFINITIONS@".split(";"): + def_split = d.split("=") + name = def_split[0].lower() + val = def_split[1] if len(def_split) > 1 else "ON" + config.available_features.add(name) + # Support substitution of the tools_dir with user parameters. This is # used when we can't determine the tool dir at configuration time. 
try: diff --git a/lgc/tool/lgc/lgc.cpp b/lgc/tool/lgc/lgc.cpp index 2863450e5b..adc2f70ed4 100644 --- a/lgc/tool/lgc/lgc.cpp +++ b/lgc/tool/lgc/lgc.cpp @@ -159,6 +159,8 @@ static bool runPassPipeline(Pipeline &pipeline, Module &module, raw_pwrite_strea passMgr->addPass(VerifierPass()); passMgr->addPass(PipelineStateRecorder()); +#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 474768 + // Old version of the code switch (codegen::getFileType()) { case CGFT_AssemblyFile: passMgr->addPass(PrintModulePass(outStream)); @@ -169,6 +171,19 @@ static bool runPassPipeline(Pipeline &pipeline, Module &module, raw_pwrite_strea case CGFT_Null: break; } +#else + // New version of the code (also handles unknown version, which we treat as latest) + switch (codegen::getFileType()) { + case CodeGenFileType::AssemblyFile: + passMgr->addPass(PrintModulePass(outStream)); + break; + case CodeGenFileType::ObjectFile: + passMgr->addPass(BitcodeWriterPass(outStream)); + break; + case CodeGenFileType::Null: + break; + } +#endif passMgr->run(module); return true; @@ -184,7 +199,7 @@ int main(int argc, char **argv) { LgcContext::initialize(); LLVMContext context; - auto dialectContext = llvm_dialects::DialectContext::make(context); + auto dialectContext = llvm_dialects::DialectContext::make(context); // Set our category on options that we want to show in -help, and hide other options. auto opts = cl::getRegisteredOptions(); @@ -239,11 +254,23 @@ int main(int argc, char **argv) { assert(optIterator != cl::getRegisteredOptions().end()); cl::Option *opt = optIterator->second; if (opt->getNumOccurrences() == 0) +#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 474768 + // Old version of the code *static_cast *>(opt) = CGFT_AssemblyFile; +#else + // New version of the code (also handles unknown version, which we treat as latest) + *static_cast *>(opt) = CodeGenFileType::AssemblyFile; +#endif } // Create the LgcContext. 
+#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 474768 + // Old version of the code std::unique_ptr targetMachine(LgcContext::createTargetMachine(gpuName, CodeGenOpt::Level::Default)); +#else + // New version of the code (also handles unknown version, which we treat as latest) + std::unique_ptr targetMachine(LgcContext::createTargetMachine(gpuName, CodeGenOptLevel::Default)); +#endif if (!targetMachine) { errs() << progName << ": GPU type '" << gpuName << "' not recognized\n"; return 1; diff --git a/lgc/util/AddressExtender.cpp b/lgc/util/AddressExtender.cpp index 5069ba7d6a..1b66357f55 100644 --- a/lgc/util/AddressExtender.cpp +++ b/lgc/util/AddressExtender.cpp @@ -70,6 +70,20 @@ Instruction *AddressExtender::extend(Value *addr32, Value *highHalf, Type *ptrTy return cast(builder.CreateIntToPtr(ptr, ptrTy)); } +// ===================================================================================================================== +// Extend an i32 into a 64-bit pointer using the high 32 bits of the PC +// +// @param addr32 : Address as 32-bit value +// @param highHalf : Value to use for high half; The constant HighAddrPc to use PC +// @param ptrTy : Type to cast pointer to +// @param builder : IRBuilder to use, already set to the required insert point +// @returns : 64-bit pointer value +Instruction *AddressExtender::extendWithPc(Value *addr32, Type *ptrTy, IRBuilder<> &builder) { + Value *ptr = builder.CreateInsertElement(getPc(), addr32, uint64_t(0)); + ptr = builder.CreateBitCast(ptr, builder.getInt64Ty()); + return cast(builder.CreateIntToPtr(ptr, ptrTy)); +} + // ===================================================================================================================== // Get PC value as v2i32. The caller is only using the high half, so this only writes a single instance of the // code at the start of the function. 
diff --git a/lgc/util/CpsStackLowering.cpp b/lgc/util/CpsStackLowering.cpp index 6d2d8cacea..7daac65800 100644 --- a/lgc/util/CpsStackLowering.cpp +++ b/lgc/util/CpsStackLowering.cpp @@ -167,6 +167,8 @@ void CpsStackLowering::visitCpsAlloc(cps::AllocOp &alloc) { Value *vsp = builder.CreateAlignedLoad(builder.getPtrTy(getLoweredCpsStackAddrSpace()), m_cpsStackAlloca, Align(getLoweredCpsStackPointerSize(layout))); unsigned alignedSize = alignTo(cast(size)->getZExtValue(), continuationStackAlignment); + m_stackSizeInBytes += alignedSize; + // update stack pointer Value *ptr = builder.CreateConstGEP1_32(builder.getInt8Ty(), vsp, alignedSize); builder.CreateAlignedStore(ptr, m_cpsStackAlloca, Align(getLoweredCpsStackPointerSize(layout))); diff --git a/lgc/util/Internal.cpp b/lgc/util/Internal.cpp index bb17000f9d..edc85bf436 100644 --- a/lgc/util/Internal.cpp +++ b/lgc/util/Internal.cpp @@ -234,4 +234,16 @@ Type *getVgprTy(Type *ty) { return ty; } +Function *mutateFunctionArguments(Function &fn, Type *retTy, const ArrayRef argTys, AttributeList attributes) { + FunctionType *newFnTy = FunctionType::get(retTy, argTys, false); + auto *newFn = Function::Create(newFnTy, fn.getLinkage()); + newFn->copyAttributesFrom(&fn); + newFn->copyMetadata(&fn, 0); + newFn->takeName(&fn); + newFn->setAttributes(attributes); + newFn->splice(newFn->begin(), &fn); + fn.getParent()->getFunctionList().insertAfter(fn.getIterator(), newFn); + return newFn; +} + } // namespace lgc diff --git a/lgc/util/TypeLowering.cpp b/lgc/util/TypeLowering.cpp index c16607206b..7a0e050ed7 100644 --- a/lgc/util/TypeLowering.cpp +++ b/lgc/util/TypeLowering.cpp @@ -1,4 +1,5 @@ #include "lgc/util/TypeLowering.h" +#include "lgc/util/Internal.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Instructions.h" @@ -115,15 +116,7 @@ Function *TypeLowering::lowerFunctionArguments(Function &fn) { if (remappedArgs.empty()) return &fn; - FunctionType *newFnTy = FunctionType::get(fn.getReturnType(), newArgTys, false); - 
auto *newFn = Function::Create(newFnTy, fn.getLinkage()); - newFn->copyAttributesFrom(&fn); - newFn->copyMetadata(&fn, 0); - newFn->takeName(&fn); - newFn->setAttributes(fn.getAttributes()); - newFn->splice(newFn->begin(), &fn); - fn.getParent()->getFunctionList().insertAfter(fn.getIterator(), newFn); - + auto *newFn = mutateFunctionArguments(fn, fn.getReturnType(), newArgTys, fn.getAttributes()); fn.replaceAllUsesWith(newFn); for (unsigned argIdx : remappedArgs) recordValue(fn.getArg(argIdx), {newFn->getArg(argIdx)}); diff --git a/llpc/CMakeLists.txt b/llpc/CMakeLists.txt index 3e0e541e9c..4f720e14ff 100644 --- a/llpc/CMakeLists.txt +++ b/llpc/CMakeLists.txt @@ -46,9 +46,6 @@ if(ICD_BUILD_LLPC) # Add LGC and its dependencies as LLVM external projects. include("../cmake/lgc.cmake") add_lgc_projects() - # Add other LLPC dependencies as LLVM external projects. - list(APPEND LLVM_EXTERNAL_PROJECTS LgcRt) - set(LLVM_EXTERNAL_LGCRT_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../shared/lgcrt) # Set other LLVM settings. 
set(LLVM_TARGETS_TO_BUILD AMDGPU CACHE STRING Force) @@ -207,10 +204,8 @@ if(ICD_BUILD_LLPC) context/llpcContext.cpp context/llpcComputeContext.cpp context/llpcGraphicsContext.cpp - context/llpcShaderCache.cpp context/llpcPipelineContext.cpp context/llpcRayTracingContext.cpp - context/llpcShaderCacheManager.cpp ) # llpc/lower @@ -354,6 +349,8 @@ add_library(llpc_standalone_compiler tool/llpcInputUtils.cpp tool/llpcPipelineBuilder.cpp tool/llpcRayTracingPipelineBuilder.cpp + tool/llpcShaderCache.cpp + tool/llpcShaderCacheWrap.cpp ) add_dependencies(llpc_standalone_compiler llpc) diff --git a/llpc/context/llpcCompiler.cpp b/llpc/context/llpcCompiler.cpp index 643e4dcfb1..14885ecea0 100644 --- a/llpc/context/llpcCompiler.cpp +++ b/llpc/context/llpcCompiler.cpp @@ -128,10 +128,6 @@ opt PipelineDumpDir("pipeline-dump-dir", desc("Directory where pipe // -enable-pipeline-dump: enable pipeline info dump opt EnablePipelineDump("enable-pipeline-dump", desc("Enable pipeline info dump"), init(false)); -// -shader-cache-file-dir: root directory to store shader cache -opt ShaderCacheFileDir("shader-cache-file-dir", desc("Root directory to store shader cache"), - value_desc("dir"), init(".")); - // DEPRECATED: This option should be removed once XGL sets the corresponding pipeline option. // -use-relocatable-shader-elf: Gets LLVM to generate more generic elf files for each shader individually, and LLPC will // then link those ELF files to generate the compiled pipeline. @@ -150,25 +146,10 @@ opt RelocatableShaderElfLimit("relocatable-shader-elf-limit", "relocatable shader ELF. -1 means unlimited."), init(-1)); -// -shader-cache-mode: shader cache mode: -// 0 - Disable -// 1 - Runtime cache -// 2 - Cache to disk -// 3 - Use internal on-disk cache in read/write mode. -// 4 - Use internal on-disk cache in read-only mode. 
-opt ShaderCacheMode("shader-cache-mode", - desc("Shader cache mode, 0 - disable, 1 - runtime cache, 2 - cache to disk, 3 - " - "load on-disk cache for read/write, 4 - load on-disk cache for read only"), - init(0)); - // -cache-full-pipelines: Add full pipelines to the caches that are provided. opt CacheFullPipelines("cache-full-pipelines", desc("Add full pipelines to the caches that are provided."), init(true)); -// -executable-name: executable file name -static opt ExecutableName("executable-name", desc("Executable file name"), value_desc("filename"), - init("amdllpc")); - // -enable-per-stage-cache: Enable shader cache per shader stage opt EnablePerStageCache("enable-per-stage-cache", cl::desc("Enable shader cache per shader stage"), init(true)); @@ -186,6 +167,27 @@ opt EnablePartPipeline("enable-part-pipeline", cl::desc("Enable part pipel opt AddRtHelpers("add-rt-helpers", cl::desc("Add this number of helper threads for each RT pipeline compile"), init(0)); +#if LLPC_CLIENT_INTERFACE_MAJOR_VERSION < 66 +// -shader-cache-file-dir: root directory to store shader cache +opt ShaderCacheFileDir("shader-cache-file-dir", desc("Root directory to store shader cache"), + value_desc("dir"), init(".")); + +// -shader-cache-mode: shader cache mode: +// 0 - Disable +// 1 - Runtime cache +// 2 - Cache to disk +// 3 - Use internal on-disk cache in read/write mode. +// 4 - Use internal on-disk cache in read-only mode. 
+opt ShaderCacheMode("shader-cache-mode", + desc("Shader cache mode, 0 - disable, 1 - runtime cache, 2 - cache to disk, 3 - " + "load on-disk cache for read/write, 4 - load on-disk cache for read only"), + init(0)); + +// -executable-name: executable file name +opt ExecutableName("executable-name", desc("Executable file name"), value_desc("filename"), + init("amdllpc")); +#endif + extern opt EnableOuts; extern opt EnableErrs; @@ -252,10 +254,10 @@ static void fatalErrorHandler(void *userData, const char *reason, bool genCrashD // @param glueShaderIdentifier : The linker object for which the glue shaders are needed. // @param context : The context that contains the application caches. // @param compiler : The compiler object that contains the internal caches. -static CacheAccessor checkCacheForGlueShader(StringRef glueShaderIdentifier, Context *context, Compiler *compiler) { +static CacheAccessor checkCacheForGlueShader(StringRef glueShaderIdentifier, Compiler *compiler) { Hash glueShaderCacheHash = PipelineDumper::generateHashForGlueShader({glueShaderIdentifier.size(), glueShaderIdentifier.data()}); - return CacheAccessor(context, glueShaderCacheHash, compiler->getInternalCaches()); + return CacheAccessor(glueShaderCacheHash, compiler->getInternalCaches()); } // ===================================================================================================================== @@ -291,7 +293,7 @@ static void setGlueBinaryBlobsInLinker(ElfLinker *elfLinker, Context *context, C ArrayRef glueShaderIdentifiers = elfLinker->getGlueInfo(); for (unsigned i = 0; i < glueShaderIdentifiers.size(); ++i) { LLPC_OUTS("ID for glue shader" << i << ": " << llvm::toHex(glueShaderIdentifiers[i]) << "\n"); - CacheAccessor cacheAccessor = checkCacheForGlueShader(glueShaderIdentifiers[i], context, compiler); + CacheAccessor cacheAccessor = checkCacheForGlueShader(glueShaderIdentifiers[i], compiler); if (cacheAccessor.isInCache()) { LLPC_OUTS("Cache hit for glue shader " << i << "\n"); 
@@ -402,11 +404,6 @@ Result VKAPI_CALL ICompiler::Create(GfxIpVersion gfxIp, unsigned optionCount, co SOptionHash = optionHash; *ppCompiler = new Compiler(gfxIp, optionCount, options, SOptionHash, cache); assert(*ppCompiler); - - if (EnableOuts()) { - // LLPC_OUTS is enabled. Ensure it is enabled in LGC (the middle-end) too. - LgcContext::setLlpcOuts(&outs()); - } } else { *ppCompiler = nullptr; result = Result::ErrorInvalidValue; @@ -451,33 +448,6 @@ Compiler::Compiler(GfxIpVersion gfxIp, unsigned optionCount, const char *const * } } - // Initialize shader cache - ShaderCacheCreateInfo createInfo = {}; - ShaderCacheAuxCreateInfo auxCreateInfo = {}; - unsigned shaderCacheMode = cl::ShaderCacheMode; - auxCreateInfo.shaderCacheMode = static_cast(shaderCacheMode); - auxCreateInfo.gfxIp = m_gfxIp; - auxCreateInfo.hash = m_optionHash; - auxCreateInfo.executableName = cl::ExecutableName.c_str(); - - const char *shaderCachePath = cl::ShaderCacheFileDir.c_str(); - if (cl::ShaderCacheFileDir.empty()) { -#ifdef WIN_OS - shaderCachePath = getenv("LOCALAPPDATA"); - assert(shaderCachePath); -#else - llvm_unreachable("Should never be called!"); -#endif - } - - if (strlen(shaderCachePath) >= Llpc::MaxPathLen) { - LLPC_ERRS("The shader-cache-file-dir exceed the maximum length (" << Llpc::MaxPathLen << ")\n"); - llvm_unreachable("ShaderCacheFileDir is too long"); - } - auxCreateInfo.cacheFilePath = shaderCachePath; - - m_shaderCache = ShaderCacheManager::getShaderCacheManager()->getShaderCacheObject(&createInfo, &auxCreateInfo); - ++m_instanceCount; ++m_outRedirectCount; } @@ -516,8 +486,6 @@ Compiler::~Compiler() { --m_outRedirectCount; if (m_outRedirectCount == 0) redirectLogOutput(true, 0, nullptr); - - ShaderCacheManager::getShaderCacheManager()->releaseShaderCacheObject(m_shaderCache); } { @@ -529,7 +497,6 @@ Compiler::~Compiler() { } if (shutdown) { - ShaderCacheManager::shutdown(); remove_fatal_error_handler(); delete m_contextPool; m_contextPool = nullptr; @@ -732,12 
+699,12 @@ Result Compiler::buildGraphicsShaderStage(const GraphicsPipelineBuildInfo *pipel if (metaDataSize > 0) { pipelineOut->fsOutputMetaData = code + candidateElf.size(); + pipelineOut->fsOutputMetaDataSize = metaDataSize; FragmentOutputs *outputs = static_cast(pipelineOut->fsOutputMetaData); outputs->fsOutInfoCount = fsOuts.size(); outputs->discard = discardState; void *offsetData = static_cast(pipelineOut->fsOutputMetaData) + sizeof(FragmentOutputs); memcpy(offsetData, fsOuts.data(), sizeof(FsOutInfo) * fsOuts.size()); - outputs->fsOutInfos = static_cast(offsetData); } return result; } @@ -757,6 +724,11 @@ Result Compiler::BuildColorExportShader(const GraphicsPipelineBuildInfo *pipelin if (!pipelineInfo->pfnOutputAlloc) return Result::ErrorInvalidPointer; + if (pipelineInfo->iaState.enableMultiView) { + LLPC_OUTS("Relocatable shader doesn't support \"MultiView\""); + return Result::RequireFullPipeline; + } + if (!fsOutputMetaData) return Result::Success; @@ -772,8 +744,13 @@ Result Compiler::BuildColorExportShader(const GraphicsPipelineBuildInfo *pipelin SmallVector exports; const FragmentOutputs *fsOuts = static_cast(fsOutputMetaData); + + const uint8 *metaPtr = static_cast(fsOutputMetaData); + metaPtr = metaPtr + sizeof(FragmentOutputs); + const FsOutInfo *outInfos = reinterpret_cast(metaPtr); + for (unsigned idx = 0; idx < fsOuts->fsOutInfoCount; idx++) { - auto outInfo = fsOuts->fsOutInfos[idx]; + auto outInfo = outInfos[idx]; ColorExportInfo colorExportInfo; colorExportInfo.hwColorTarget = outInfo.hwColorTarget; colorExportInfo.location = outInfo.location; @@ -860,7 +837,7 @@ Result Compiler::buildGraphicsPipelineWithElf(const GraphicsPipelineBuildInfo *p if (!canUseRelocatableGraphicsShaderElf(shaderInfo, pipelineInfo)) { LLPC_OUTS("Relocatable shader compilation requested but not possible.\n"); - return Result::ErrorInvalidValue; + return Result::RequireFullPipeline; } MetroHash::Hash cacheHash = {}; @@ -870,7 +847,7 @@ Result 
Compiler::buildGraphicsPipelineWithElf(const GraphicsPipelineBuildInfo *p std::optional cacheAccessor; if (cl::CacheFullPipelines) { - cacheAccessor.emplace(pipelineInfo, cacheHash, getInternalCaches()); + cacheAccessor.emplace(cacheHash, getInternalCaches()); } Result result = Result::Success; @@ -880,8 +857,7 @@ Result Compiler::buildGraphicsPipelineWithElf(const GraphicsPipelineBuildInfo *p if (cacheAccessor && cacheAccessor->isInCache()) { LLPC_OUTS("Cache hit for graphics pipeline.\n"); elfBin = cacheAccessor->getElfFromCache(); - pipelineOut->pipelineCacheAccess = - cacheAccessor->hitInternalCache() ? CacheAccessInfo::InternalCacheHit : CacheAccessInfo::CacheHit; + pipelineOut->pipelineCacheAccess = CacheAccessInfo::InternalCacheHit; } else { LLPC_OUTS("Cache miss for graphics pipeline.\n"); if (cacheAccessor && pipelineOut->pipelineCacheAccess == CacheAccessInfo::CacheNotChecked) @@ -921,16 +897,8 @@ Result Compiler::buildGraphicsPipelineWithElf(const GraphicsPipelineBuildInfo *p bool hasError = false; context->setDiagnosticHandler(std::make_unique(&hasError)); hasError |= !linkRelocatableShaderElf(elf, &pipelineElf, context); - context->setDiagnosticHandler(nullptr); - if (hasError) { - for (unsigned stage = 0; stage < ShaderStageGfxCount; stage++) { - if (doesShaderStageExist(shaderInfo, static_cast(stage))) { - pipelineOut->stageCacheAccesses[stage] = CacheMiss; - } - } - return Result::ErrorInvalidShader; - } + assert(!hasError); elfBin.codeSize = pipelineElf.size(); elfBin.pCode = pipelineElf.data(); @@ -967,6 +935,15 @@ Result Compiler::buildUnlinkedShaderInternal(Context *context, ArrayRefpModuleData); + // If fragment use builtIn inputs, return RequireFullPipeline. 
+ const ShaderModuleData *moduleData = + static_cast(shaderInfo[ShaderStageFragment]->pModuleData); + if (moduleData->usage.useGenericBuiltIn) + return Result::RequireFullPipeline; + } + unsigned originalShaderStageMask = context->getPipelineContext()->getShaderStageMask(); const MetroHash::Hash originalCacheHash = context->getPipelineContext()->getCacheHashCodeWithoutCompact(); unsigned shaderStageMask = getShaderStageMaskForType(stage) & originalShaderStageMask; @@ -999,15 +976,14 @@ Result Compiler::buildUnlinkedShaderInternal(Context *context, ArrayRef(elfBin.pCode); elfPackage.assign(data, data + elfBin.codeSize); LLPC_OUTS("Cache hit for shader stage " << getUnlinkedShaderStageName(stage) << "\n"); for (ShaderStage gfxStage : shaderStages) - stageCacheAccesses[gfxStage] = - cacheAccessor.hitInternalCache() ? CacheAccessInfo::InternalCacheHit : CacheAccessInfo::CacheHit; + stageCacheAccesses[gfxStage] = CacheAccessInfo::InternalCacheHit; } else { LLPC_OUTS("Cache miss for shader stage " << getUnlinkedShaderStageName(stage) << "\n"); for (ShaderStage gfxStage : shaderStages) @@ -1416,12 +1392,11 @@ unsigned GraphicsShaderCacheChecker::check(const Module *module, const unsigned unsigned stagesLeftToCompile = stageMask; if (stageMask & getLgcShaderStageMask(ShaderStageFragment)) { - m_fragmentCacheAccessor.emplace(m_context, fragmentHash, m_compiler->getInternalCaches()); + m_fragmentCacheAccessor.emplace(fragmentHash, m_compiler->getInternalCaches()); if (m_fragmentCacheAccessor->isInCache()) { // Remove fragment shader stages. stagesLeftToCompile &= ~getLgcShaderStageMask(ShaderStageFragment); - stageCacheAccesses[ShaderStageFragment] = - m_fragmentCacheAccessor->hitInternalCache() ? 
CacheAccessInfo::InternalCacheHit : CacheAccessInfo::CacheHit; + stageCacheAccesses[ShaderStageFragment] = CacheAccessInfo::InternalCacheHit; } else { stageCacheAccesses[ShaderStageFragment] = CacheAccessInfo::CacheMiss; } @@ -1429,12 +1404,11 @@ unsigned GraphicsShaderCacheChecker::check(const Module *module, const unsigned if (stageMask & ~getLgcShaderStageMask(ShaderStageFragment)) { auto accessInfo = CacheAccessInfo::CacheNotChecked; - m_nonFragmentCacheAccessor.emplace(m_context, nonFragmentHash, m_compiler->getInternalCaches()); + m_nonFragmentCacheAccessor.emplace(nonFragmentHash, m_compiler->getInternalCaches()); if (m_nonFragmentCacheAccessor->isInCache()) { // Remove non-fragment shader stages. stagesLeftToCompile &= getLgcShaderStageMask(ShaderStageFragment); - accessInfo = m_nonFragmentCacheAccessor->hitInternalCache() ? CacheAccessInfo::InternalCacheHit - : CacheAccessInfo::CacheHit; + accessInfo = CacheAccessInfo::InternalCacheHit; } else { accessInfo = CacheAccessInfo::CacheMiss; } @@ -1648,14 +1622,13 @@ Result Compiler::buildGraphicsPipelineWithPartPipelines(Context *context, // Finalize the hash, and look it up in the cache. MetroHash::Hash partPipelineHash = {}; hasher.Finalize(partPipelineHash.bytes); - CacheAccessor cacheAccessor(context, partPipelineHash, getInternalCaches()); + CacheAccessor cacheAccessor(partPipelineHash, getInternalCaches()); if (cacheAccessor.isInCache()) { LLPC_OUTS("Cache hit for stage " << getPartPipelineStageName(partPipelineStage) << ".\n"); // Mark the applicable entries in stageCacheAccesses. for (ShaderStage shaderStage : maskToShaderStages(partStageMask)) { - stageCacheAccesses[shaderStage] = - cacheAccessor.hitInternalCache() ? CacheAccessInfo::InternalCacheHit : CacheAccessInfo::CacheHit; + stageCacheAccesses[shaderStage] = CacheAccessInfo::InternalCacheHit; } // Get the ELF from the cache. 
partPipelineElf = llvm::StringRef(static_cast(cacheAccessor.getElfFromCache().pCode), @@ -1785,7 +1758,7 @@ Result Compiler::BuildGraphicsPipeline(const GraphicsPipelineBuildInfo *pipeline std::optional cacheAccessor; if (cl::CacheFullPipelines) { - cacheAccessor.emplace(pipelineInfo, cacheHash, getInternalCaches()); + cacheAccessor.emplace(cacheHash, getInternalCaches()); } ElfPackage candidateElf; @@ -1807,8 +1780,7 @@ Result Compiler::BuildGraphicsPipeline(const GraphicsPipelineBuildInfo *pipeline LLPC_OUTS("Cache hit for graphics pipeline.\n"); elfBin = cacheAccessor->getElfFromCache(); if (cacheAccessor->isInCache()) { - pipelineOut->pipelineCacheAccess = - cacheAccessor->hitInternalCache() ? CacheAccessInfo::InternalCacheHit : CacheAccessInfo::CacheHit; + pipelineOut->pipelineCacheAccess = CacheAccessInfo::InternalCacheHit; } } @@ -1914,7 +1886,7 @@ Result Compiler::BuildComputePipeline(const ComputePipelineBuildInfo *pipelineIn std::optional cacheAccessor; if (cl::CacheFullPipelines) { - cacheAccessor.emplace(pipelineInfo, cacheHash, getInternalCaches()); + cacheAccessor.emplace(cacheHash, getInternalCaches()); } ElfPackage candidateElf; @@ -1933,8 +1905,7 @@ Result Compiler::BuildComputePipeline(const ComputePipelineBuildInfo *pipelineIn } else { LLPC_OUTS("Cache hit for compute pipeline.\n"); elfBin = cacheAccessor->getElfFromCache(); - pipelineOut->pipelineCacheAccess = - cacheAccessor->hitInternalCache() ? 
CacheAccessInfo::InternalCacheHit : CacheAccessInfo::CacheHit; + pipelineOut->pipelineCacheAccess = CacheAccessInfo::InternalCacheHit; } if (result == Result::Success) { @@ -1981,9 +1952,9 @@ std::unique_ptr Compiler::createGpurtShaderLibrary(Context *context) { shaderInfo.pEntryTarget = Vkgc::getEntryPointNameFromSpirvBinary(&rtState->gpurtShaderLibrary); shaderInfo.pModuleData = &moduleData; - // Disable fast math Contract when there is no hardware intersectRay + // Disable fast math contract on OpDot when there is no hardware intersectRay bool hwIntersectRay = rtState->bvhResDesc.dataSizeInDwords > 0; - shaderInfo.options.noContract = !hwIntersectRay; + shaderInfo.options.noContractOpDot = !hwIntersectRay; auto module = std::make_unique(RtName::TraceRayKHR, *context); context->setModuleTargetMachine(module.get()); @@ -2065,52 +2036,48 @@ Result Compiler::BuildRayTracingPipeline(const RayTracingPipelineBuildInfo *pipe PipelineDumper::DumpPipelineExtraInfo(reinterpret_cast(pipelineDumpFile), &extraInfo); } - ShaderEntryState cacheEntryState = ShaderEntryState::Compiling; - std::vector elfBinarys; std::vector shaderProps; - if (cacheEntryState == ShaderEntryState::Compiling) { - const PipelineShaderInfo *representativeShaderInfo = nullptr; - if (pipelineInfo->shaderCount > 0) - representativeShaderInfo = &pipelineInfo->pShaders[0]; + const PipelineShaderInfo *representativeShaderInfo = nullptr; + if (pipelineInfo->shaderCount > 0) + representativeShaderInfo = &pipelineInfo->pShaders[0]; - RayTracingContext rayTracingContext(m_gfxIp, pipelineInfo, representativeShaderInfo, &pipelineHash, &cacheHash, - pipelineInfo->indirectStageMask); + RayTracingContext rayTracingContext(m_gfxIp, pipelineInfo, representativeShaderInfo, &pipelineHash, &cacheHash, + pipelineInfo->indirectStageMask); - pipelineOut->hasTraceRay = false; - for (unsigned i = 0; i < pipelineInfo->shaderCount; ++i) { - const auto &shaderInfo = pipelineInfo->pShaders[i]; - const ShaderModuleData *moduleData 
= reinterpret_cast(shaderInfo.pModuleData); - if (moduleData->usage.hasTraceRay) { - pipelineOut->hasTraceRay = true; - break; - } + pipelineOut->hasTraceRay = false; + for (unsigned i = 0; i < pipelineInfo->shaderCount; ++i) { + const auto &shaderInfo = pipelineInfo->pShaders[i]; + const ShaderModuleData *moduleData = reinterpret_cast(shaderInfo.pModuleData); + if (moduleData->usage.hasTraceRay) { + pipelineOut->hasTraceRay = true; + break; } + } - std::vector rayTracingShaderInfo; - rayTracingShaderInfo.reserve(pipelineInfo->shaderCount + 1); - for (unsigned i = 0; i < pipelineInfo->shaderCount; ++i) { - rayTracingShaderInfo.push_back(&pipelineInfo->pShaders[i]); - auto &shaderInfo = rayTracingShaderInfo[i]; - const ShaderModuleData *moduleData = reinterpret_cast(shaderInfo->pModuleData); - if (shaderInfo->entryStage == ShaderStageRayTracingAnyHit || - shaderInfo->entryStage == ShaderStageRayTracingIntersect) { - if (moduleData->usage.enableRayQuery) { - rayTracingContext.setIndirectPipeline(); - } + std::vector rayTracingShaderInfo; + rayTracingShaderInfo.reserve(pipelineInfo->shaderCount + 1); + for (unsigned i = 0; i < pipelineInfo->shaderCount; ++i) { + rayTracingShaderInfo.push_back(&pipelineInfo->pShaders[i]); + auto &shaderInfo = rayTracingShaderInfo[i]; + const ShaderModuleData *moduleData = reinterpret_cast(shaderInfo->pModuleData); + if (shaderInfo->entryStage == ShaderStageRayTracingAnyHit || + shaderInfo->entryStage == ShaderStageRayTracingIntersect) { + if (moduleData->usage.enableRayQuery) { + rayTracingContext.setIndirectPipeline(); } } + } - // Add entry module - PipelineShaderInfo raygenMainShaderInfo = pipelineInfo->pShaders[0]; - raygenMainShaderInfo.entryStage = ShaderStageRayTracingRayGen; - raygenMainShaderInfo.pModuleData = nullptr; - rayTracingShaderInfo.push_back(&raygenMainShaderInfo); + // Add entry module + PipelineShaderInfo raygenMainShaderInfo = pipelineInfo->pShaders[0]; + raygenMainShaderInfo.entryStage = 
ShaderStageRayTracingRayGen; + raygenMainShaderInfo.pModuleData = nullptr; + rayTracingShaderInfo.push_back(&raygenMainShaderInfo); - result = buildRayTracingPipelineInternal(rayTracingContext, rayTracingShaderInfo, false, elfBinarys, shaderProps, - helperThreadProvider); - } + result = buildRayTracingPipelineInternal(rayTracingContext, rayTracingShaderInfo, false, elfBinarys, shaderProps, + helperThreadProvider); if (result == Result::Success) { void *allocBuf = nullptr; @@ -2182,17 +2149,6 @@ Result Compiler::BuildRayTracingPipeline(const RayTracingPipelineBuildInfo *pipe shaderHandles[i].intersectionId = getModuleIdByIndex(shaderGroup->intersectionShader); } } - - // By convention, we're in indirect mode if we produced more than one ELF. - if (pipelineOut->pipelineBinCount > 1) { - pipelineOut->shaderGroupHandle.shaderMapping = RayTracingShaderIdentifierMapping::ElfModuleGpuVa; - pipelineOut->shaderGroupHandle.anyHitMapping = RayTracingShaderIdentifierMapping::ElfModuleGpuVa; - pipelineOut->shaderGroupHandle.intersectionMapping = RayTracingShaderIdentifierMapping::ElfModuleGpuVa; - } else { - pipelineOut->shaderGroupHandle.shaderMapping = RayTracingShaderIdentifierMapping::None; - pipelineOut->shaderGroupHandle.anyHitMapping = RayTracingShaderIdentifierMapping::None; - pipelineOut->shaderGroupHandle.intersectionMapping = RayTracingShaderIdentifierMapping::None; - } } return result; @@ -2221,8 +2177,23 @@ Result Compiler::buildRayTracingPipelineElf(Context *context, std::unique_ptrgetOptions(); + MetroHash64 hasher; + MetroHash::Hash hash = {}; + hasher.Update(options.hash[1]); + hasher.Update(moduleIndex); + hasher.Finalize(hash.bytes); + options.hash[1] = MetroHash::compact64(&hash); + + if (static_cast(context->getPipelineContext())->getIndirectStageMask() == 0) + options.rtIndirectMode = lgc::RayTracingIndirectMode::NotIndirect; + + pipeline->setOptions(options); + generatePipeline(context, moduleIndex, std::move(module), pipelineElf, pipeline.get(), 
timerProfiler); if (moduleIndex > 0) @@ -2244,15 +2215,6 @@ Result Compiler::generatePipeline(Context *context, unsigned moduleIndex, std::u // Generate pipeline. std::unique_ptr pipelineModule; - auto options = pipeline->getOptions(); - MetroHash64 hasher; - MetroHash::Hash hash = {}; - hasher.Update(options.hash[1]); - hasher.Update(moduleIndex); - hasher.Finalize(hash.bytes); - options.hash[1] = MetroHash::compact64(&hash); - pipeline->setOptions(options); - pipelineModule.reset(pipeline->irLink(module.release(), context->getPipelineContext()->isUnlinked() ? PipelineLink::Unlinked : PipelineLink::WholePipeline)); @@ -2488,6 +2450,7 @@ Result Compiler::buildRayTracingPipelineInternal(RayTracingContext &rtContext, // Step 2: Link rayquery modules std::vector> newModules; + std::vector moduleUsesRayQuery; // Record which module calls TraceRay(), except the first one (For indirect mode, it is the entry function which will // never call TraceRay(). For inlined mode, we don't need to care). 
std::vector moduleCallsTraceRay; @@ -2509,6 +2472,7 @@ Result Compiler::buildRayTracingPipelineInternal(RayTracingContext &rtContext, shaderInfo = shaderInfo.drop_back(); newModules.push_back(std::move(entry)); + moduleUsesRayQuery.push_back(false); for (unsigned shaderIndex = 0; shaderIndex < pipelineInfo->shaderCount; ++shaderIndex) { const auto *shaderInfoEntry = shaderInfo[shaderIndex]; @@ -2523,6 +2487,7 @@ Result Compiler::buildRayTracingPipelineInternal(RayTracingContext &rtContext, newModules.push_back(std::move(shaderModule)); moduleCallsTraceRay.push_back(moduleData->usage.hasTraceRay); + moduleUsesRayQuery.push_back(moduleData->usage.enableRayQuery); } if (gpurtShaderLibrary) { @@ -2544,16 +2509,19 @@ Result Compiler::buildRayTracingPipelineInternal(RayTracingContext &rtContext, newModules.push_back(std::move(gpurtShaderLibrary)); moduleCallsTraceRay.push_back(false); + moduleUsesRayQuery.push_back(false); } assert(moduleCallsTraceRay.size() == (newModules.size() - 1)); + assert(moduleUsesRayQuery.size() == newModules.size()); - for (auto &module : newModules) { + for (unsigned i = 0; i < newModules.size(); i++) { + auto module = (newModules[i].get()); std::unique_ptr passMgr(lgc::PassManager::Create(builderContext)); SpirvLower::registerPasses(*passMgr); - SpirvLower::addPasses(mainContext, ShaderStageCompute, *passMgr, timerProfiler.getTimer(TimerLower), true, false, - false); - bool success = runPasses(&*passMgr, module.get()); + SpirvLower::addPasses(mainContext, ShaderStageCompute, *passMgr, timerProfiler.getTimer(TimerLower), true, + moduleUsesRayQuery[i], false); + bool success = runPasses(&*passMgr, module); if (!success) { LLPC_ERRS("Failed to translate SPIR-V or run per-shader passes\n"); return Result::ErrorInvalidShader; @@ -2722,13 +2690,10 @@ MetroHash::Hash Compiler::generateHashForCompileOptions(unsigned optionCount, co // Options which needn't affect compilation results static StringRef IgnoredOptions[] = {cl::PipelineDumpDir.ArgStr, 
cl::EnablePipelineDump.ArgStr, - cl::ShaderCacheFileDir.ArgStr, - cl::ShaderCacheMode.ArgStr, cl::EnableOuts.ArgStr, cl::EnableErrs.ArgStr, cl::LogFileDbgs.ArgStr, cl::LogFileOuts.ArgStr, - cl::ExecutableName.ArgStr, "unlinked", "o"}; @@ -2800,45 +2765,6 @@ Result Compiler::validatePipelineShaderInfo(const PipelineShaderInfo *shaderInfo return result; } -#if LLPC_ENABLE_SHADER_CACHE -// ===================================================================================================================== -// Creates shader cache object with the requested properties. -// @param : Shader cache create info. -// @param [out] : Shader cache object -// @returns : Result::Success if creation succeeds, error status otherwise. -Result Compiler::CreateShaderCache(const ShaderCacheCreateInfo *pCreateInfo, IShaderCache **ppShaderCache) { - Result result = Result::Success; - - ShaderCacheAuxCreateInfo auxCreateInfo = {}; - auxCreateInfo.shaderCacheMode = ShaderCacheMode::ShaderCacheEnableRuntime; - auxCreateInfo.gfxIp = m_gfxIp; - auxCreateInfo.hash = m_optionHash; - - ShaderCache *shaderCache = new ShaderCache(); - - if (shaderCache) { - result = shaderCache->init(pCreateInfo, &auxCreateInfo); - if (result != Result::Success) { - shaderCache->Destroy(); - delete shaderCache; - shaderCache = nullptr; - } - } else { - result = Result::ErrorOutOfMemory; - } - - *ppShaderCache = shaderCache; - - if ((result == Result::Success) && - ((cl::ShaderCacheMode == ShaderCacheEnableRuntime) || (cl::ShaderCacheMode == ShaderCacheEnableOnDisk)) && - (pCreateInfo->initialDataSize > 0)) { - result = m_shaderCache->Merge(1, const_cast(ppShaderCache)); - } - - return result; -} -#endif - // ===================================================================================================================== // Acquires a free context from context pool. 
Context *Compiler::acquireContext() const { @@ -2994,11 +2920,6 @@ bool Compiler::linkRelocatableShaderElf(ElfPackage *shaderElfs, ElfPackage *pipe } std::unique_ptr elfLinker(pipeline->createElfLinker(elfs)); - if (elfLinker->fragmentShaderUsesMappedBuiltInInputs()) { - LLPC_OUTS("Failed to link relocatable shaders because FS uses builtin inputs."); - return false; - } - setGlueBinaryBlobsInLinker(elfLinker.get(), context, this); // Do the link. raw_svector_ostream outStream(*pipelineElf); diff --git a/llpc/context/llpcCompiler.h b/llpc/context/llpcCompiler.h index a2ad53c4e1..7b2f6f9f0f 100644 --- a/llpc/context/llpcCompiler.h +++ b/llpc/context/llpcCompiler.h @@ -32,11 +32,13 @@ #include "llpc.h" #include "llpcCacheAccessor.h" -#include "llpcShaderCacheManager.h" #include "llpcShaderModuleHelper.h" +#include "llpcUtil.h" #include "vkgcElfReader.h" #include "vkgcMetroHash.h" #include "lgc/CommonDefs.h" +#include "llvm/Support/Mutex.h" +#include #include namespace llvm { @@ -154,15 +156,11 @@ class Compiler : public ICompiler { static MetroHash::Hash generateHashForCompileOptions(unsigned optionCount, const char *const *options); -#if LLPC_ENABLE_SHADER_CACHE - virtual Result CreateShaderCache(const ShaderCacheCreateInfo *pCreateInfo, IShaderCache **ppShaderCache); -#endif - static void buildShaderCacheHash(Context *context, unsigned stageMask, llvm::ArrayRef> stageHashes, MetroHash::Hash *fragmentHash, MetroHash::Hash *nonFragmentHash); - CachePair getInternalCaches() { return {m_cache, m_shaderCache.get()}; } + Vkgc::ICache *getInternalCaches() { return m_cache; } Context *acquireContext() const; void releaseContext(Context *context) const; @@ -206,7 +204,6 @@ class Compiler : public ICompiler { Vkgc::ICache *m_cache; // Point to ICache implemented in client static unsigned m_instanceCount; // The count of compiler instance static unsigned m_outRedirectCount; // The count of output redirect - ShaderCachePtr m_shaderCache; // Shader cache static llvm::sys::Mutex 
m_contextPoolMutex; // Mutex for context pool access static std::vector *m_contextPool; // Context pool unsigned m_relocatablePipelineCompilations; // The number of pipelines compiled using relocatable shader elf diff --git a/llpc/context/llpcContext.cpp b/llpc/context/llpcContext.cpp index 7b1730274e..57adabbf2d 100644 --- a/llpc/context/llpcContext.cpp +++ b/llpc/context/llpcContext.cpp @@ -35,8 +35,6 @@ #include "llpcCompiler.h" #include "llpcDebug.h" #include "llpcPipelineContext.h" -#include "llpcShaderCache.h" -#include "llpcShaderCacheManager.h" #include "vkgcMetroHash.h" #include "lgc/Builder.h" #include "lgc/GpurtDialect.h" @@ -62,6 +60,7 @@ using namespace lgc; using namespace lgc::rt; using namespace llvm; +using namespace lgc::cps; namespace Llpc { @@ -69,7 +68,7 @@ namespace Llpc { // // @param gfxIp : Graphics IP version info Context::Context(GfxIpVersion gfxIp) : LLVMContext(), m_gfxIp(gfxIp) { - m_dialectContext = llvm_dialects::DialectContext::make(*this); + m_dialectContext = llvm_dialects::DialectContext::make(*this); reset(); } @@ -90,20 +89,29 @@ LgcContext *Context::getLgcContext() { // Create the LgcContext on first execution or optimization level change. if (!m_builderContext || getLastOptimizationLevel() != getOptimizationLevel()) { std::string gpuName = LgcContext::getGpuNameString(m_gfxIp.major, m_gfxIp.minor, m_gfxIp.stepping); + // Pass the state of LLPC_OUTS on to LGC for the logging inside createTargetMachine. + LgcContext::setLlpcOuts(EnableOuts() ? &outs() : nullptr); m_targetMachine = LgcContext::createTargetMachine(gpuName, getOptimizationLevel()); + LgcContext::setLlpcOuts(nullptr); if (!m_targetMachine) report_fatal_error(Twine("Unknown target '") + Twine(gpuName) + Twine("'")); m_builderContext.reset(LgcContext::create(&*m_targetMachine, *this, PAL_CLIENT_INTERFACE_MAJOR_VERSION)); + + // Pass the state of LLPC_OUTS on to LGC. + LgcContext::setLlpcOuts(EnableOuts() ? 
&outs() : nullptr); } return &*m_builderContext; } +#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 474768 +// Old version of the code // ===================================================================================================================== // Get optimization level. Also resets what getLastOptimizationLevel() returns. // // @returns: the optimization level for the context. CodeGenOpt::Level Context::getOptimizationLevel() { - uint32_t optLevel = CodeGenOpt::Level::Default; + uint32_t optLevel = static_cast(CodeGenOpt::Level::Default); + optLevel = getPipelineContext()->getPipelineOptions()->optimizationLevel; if (optLevel > 3) optLevel = 3; @@ -119,6 +127,33 @@ CodeGenOpt::Level Context::getLastOptimizationLevel() const { return *m_lastOptLevel; } +#else +// New version of the code (also handles unknown version, which we treat as latest) + +// ===================================================================================================================== +// Get optimization level. Also resets what getLastOptimizationLevel() returns. +// +// @returns: the optimization level for the context. +CodeGenOptLevel Context::getOptimizationLevel() { + uint32_t optLevel = static_cast(CodeGenOptLevel::Default); + + optLevel = getPipelineContext()->getPipelineOptions()->optimizationLevel; + if (optLevel > 3) + optLevel = 3; + else if (optLevel == 0) // Workaround for noopt bugs in the AMDGPU backend in LLVM. + optLevel = 1; + m_lastOptLevel = CodeGenOptLevel(optLevel); + return *m_lastOptLevel; +} + +// ===================================================================================================================== +// Get the optimization level returned by the last getOptimizationLevel(). +CodeGenOptLevel Context::getLastOptimizationLevel() const { + return *m_lastOptLevel; +} + +#endif + // ===================================================================================================================== // Loads library from external LLVM library. 
// diff --git a/llpc/context/llpcContext.h b/llpc/context/llpcContext.h index b2f5f41901..54b77138dd 100644 --- a/llpc/context/llpcContext.h +++ b/llpc/context/llpcContext.h @@ -84,8 +84,15 @@ class Context : public llvm::LLVMContext { // Get (create if necessary) LgcContext lgc::LgcContext *getLgcContext(); +#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 474768 + // Old version of the code llvm::CodeGenOpt::Level getOptimizationLevel(); llvm::CodeGenOpt::Level getLastOptimizationLevel() const; +#else + // New version of the code (also handles unknown version, which we treat as latest) + llvm::CodeGenOptLevel getOptimizationLevel(); + llvm::CodeGenOptLevel getLastOptimizationLevel() const; +#endif std::unique_ptr loadLibrary(const BinaryData *lib); @@ -129,7 +136,14 @@ class Context : public llvm::LLVMContext { std::unique_ptr m_targetMachine; // Target machine for LGC context std::unique_ptr m_builderContext; // LGC context +#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 474768 + // Old version of the code std::optional m_lastOptLevel{}; // What getOptimizationLevel() last returned +#else + // New version of the code (also handles unknown version, which we treat as latest) + std::optional m_lastOptLevel{}; // What getOptimizationLevel() last returned +#endif + std::unique_ptr m_dialectContext; unsigned m_useCount = 0; // Number of times this context is used. 
diff --git a/llpc/context/llpcGraphicsContext.cpp b/llpc/context/llpcGraphicsContext.cpp index b5b3a9c871..cdc9c46383 100644 --- a/llpc/context/llpcGraphicsContext.cpp +++ b/llpc/context/llpcGraphicsContext.cpp @@ -309,8 +309,7 @@ void GraphicsContext::setColorExportState(Pipeline *pipeline, Util::MetroHash64 SmallVector formats; state.alphaToCoverageEnable = cbState.alphaToCoverageEnable; - state.dualSourceBlendEnable = - cbState.dualSourceBlendEnable || (pipelineInfo->cbState.dualSourceBlendDynamic && getUseDualSourceBlend()); + state.dualSourceBlendEnable = cbState.dualSourceBlendEnable; for (unsigned targetIndex = 0; targetIndex < MaxColorTargets; ++targetIndex) { if (cbState.target[targetIndex].format != VK_FORMAT_UNDEFINED) { @@ -493,6 +492,7 @@ void GraphicsContext::setGraphicsStateInPipeline(Pipeline *pipeline, Util::Metro rasterizerState.perSampleShading = inputRsState.perSampleShading; rasterizerState.numSamples = inputRsState.numSamples; rasterizerState.samplePatternIdx = inputRsState.samplePatternIdx; + rasterizerState.pixelShaderSamples = inputRsState.pixelShaderSamples; rasterizerState.dynamicSampleInfo = inputRsState.dynamicSampleInfo; } diff --git a/llpc/context/llpcPipelineContext.cpp b/llpc/context/llpcPipelineContext.cpp index 27428162ff..4ca1764923 100644 --- a/llpc/context/llpcPipelineContext.cpp +++ b/llpc/context/llpcPipelineContext.cpp @@ -283,7 +283,7 @@ void PipelineContext::setPipelineState(Pipeline *pipeline, Util::MetroHash64 *ha // @param [in/out] pipeline : Middle-end pipeline object; nullptr if only hashing // @param [in/out] hasher : Hasher object; nullptr if only setting LGC pipeline state Options PipelineContext::computePipelineOptions() const { - Options options; + Options options = {}; options.hash[0] = getPipelineHashCode(); options.hash[1] = get64BitCacheHashCode(); @@ -340,6 +340,7 @@ Options PipelineContext::computePipelineOptions() const { // Driver report full subgroup lanes for compute shader, here we just set 
fullSubgroups as default options options.fullSubgroups = true; options.internalRtShaders = getPipelineOptions()->internalRtShaders; + options.disableSampleMask = getPipelineOptions()->disableSampleMask; options.disableTruncCoordForGather = getPipelineOptions()->disableTruncCoordForGather; return options; @@ -369,222 +370,219 @@ void PipelineContext::setUserDataInPipeline(Pipeline *pipeline, Util::MetroHash6 if (!pipeline) return; // Only hashing - auto allocNodes = std::make_unique(resourceMapping->userDataNodeCount); - - for (unsigned idx = 0; idx < resourceMapping->userDataNodeCount; ++idx) - allocNodes[idx] = resourceMapping->pUserDataNodes[idx].node; - // Translate the resource nodes into the LGC format expected by Pipeline::SetUserDataNodes. - ArrayRef nodes(allocNodes.get(), resourceMapping->userDataNodeCount); - ArrayRef descriptorRangeValues(resourceMapping->pStaticDescriptorValues, - resourceMapping->staticDescriptorValueCount); + auto srcNodes = ArrayRef(resourceMapping->pUserDataNodes, resourceMapping->userDataNodeCount); + auto staticDescriptorValues = + ArrayRef(resourceMapping->pStaticDescriptorValues, resourceMapping->staticDescriptorValueCount); // First, create a map of immutable nodes. ImmutableNodesMap immutableNodesMap; - for (auto &rangeValue : descriptorRangeValues) + for (auto &rangeValue : staticDescriptorValues) immutableNodesMap[{rangeValue.set, rangeValue.binding}] = &rangeValue; // Count how many user data nodes we have, and allocate the buffer. - unsigned nodeCount = nodes.size(); - for (auto &node : nodes) { - if (node.type == ResourceMappingNodeType::DescriptorTableVaPtr) - nodeCount += node.tablePtr.nodeCount; + unsigned totalNodeCount = srcNodes.size(); + for (auto &node : srcNodes) { + if (node.node.type == ResourceMappingNodeType::DescriptorTableVaPtr) + totalNodeCount += node.node.tablePtr.nodeCount; } - auto allocUserDataNodes = std::make_unique(nodeCount); - // Copy nodes in. 
- ResourceNode *destTable = allocUserDataNodes.get(); - ResourceNode *destInnerTable = destTable + nodeCount; - auto userDataNodes = ArrayRef(destTable, nodes.size()); - setUserDataNodesTable(pipeline, nodes, immutableNodesMap, /*isRoot=*/true, destTable, destInnerTable); - assert(destInnerTable == destTable + nodes.size()); + std::vector allDstNodes; + allDstNodes.resize(totalNodeCount); + + auto dstNodes = MutableArrayRef(allDstNodes).take_front(srcNodes.size()); + auto dstInnerTable = MutableArrayRef(allDstNodes).drop_front(srcNodes.size()); + for (auto [dst, src] : llvm::zip(dstNodes, srcNodes)) { + unsigned visibility = src.visibility; + if (visibility & ShaderStageAllRayTracingBit) { + visibility &= ~ShaderStageAllRayTracingBit; + visibility |= ShaderStageComputeBit; + } + convertResourceNode(dst, src.node, visibility, immutableNodesMap, dstInnerTable); + } // Give the table to the LGC Pipeline interface. - pipeline->setUserDataNodes(userDataNodes); + pipeline->setUserDataNodes(dstNodes); } // ===================================================================================================================== -// Set one user data table, and its inner tables. Used by SetUserDataInPipeline above, and recursively calls -// itself for an inner table. This translates from a Vkgc ResourceMappingNode to an LGC ResourceNode. +// Convert one Vkgc::ResourceMappingNode into one lgc::ResourceNode, applying the given visibility. 
// -// @param context : LLVM context -// @param nodes : The resource mapping nodes -// @param immutableNodesMap : Map of immutable nodes -// @param isRoot : Whether this is the root table -// @param [out] destTable : Where to write nodes -// @param [in/out] destInnerTable : End of space available for inner tables -void PipelineContext::setUserDataNodesTable(Pipeline *pipeline, ArrayRef nodes, - const ImmutableNodesMap &immutableNodesMap, bool isRoot, - ResourceNode *destTable, ResourceNode *&destInnerTable) const { - for (unsigned idx = 0; idx != nodes.size(); ++idx) { - auto &node = nodes[idx]; - auto &destNode = destTable[idx]; - - destNode.sizeInDwords = node.sizeInDwords; - destNode.offsetInDwords = node.offsetInDwords; - destNode.abstractType = ResourceNodeType::Unknown; - destNode.visibility = 0; - - switch (node.type) { - case ResourceMappingNodeType::DescriptorTableVaPtr: { - // Process an inner table. - destNode.concreteType = ResourceNodeType::DescriptorTableVaPtr; - destNode.abstractType = ResourceNodeType::DescriptorTableVaPtr; - destInnerTable -= node.tablePtr.nodeCount; - destNode.innerTable = ArrayRef(destInnerTable, node.tablePtr.nodeCount); - setUserDataNodesTable(pipeline, ArrayRef(node.tablePtr.pNext, node.tablePtr.nodeCount), - immutableNodesMap, /*isRoot=*/false, destInnerTable, destInnerTable); - break; - } - case ResourceMappingNodeType::IndirectUserDataVaPtr: { - // Process an indirect pointer. - destNode.concreteType = ResourceNodeType::IndirectUserDataVaPtr; - destNode.abstractType = ResourceNodeType::IndirectUserDataVaPtr; - destNode.indirectSizeInDwords = node.userDataPtr.sizeInDwords; - break; - } - case ResourceMappingNodeType::StreamOutTableVaPtr: { - // Process an indirect pointer. - destNode.concreteType = ResourceNodeType::StreamOutTableVaPtr; - destNode.abstractType = ResourceNodeType::StreamOutTableVaPtr; - destNode.indirectSizeInDwords = node.userDataPtr.sizeInDwords; - break; - } - default: { - // Process an SRD. 
First check that a static_cast works to convert a Vkgc ResourceMappingNodeType - // to an LGC ResourceNodeType (with the exception of DescriptorCombinedBvhBuffer, whose value - // accidentally depends on LLPC version). - static_assert(ResourceNodeType::DescriptorResource == - static_cast(ResourceMappingNodeType::DescriptorResource), - "Mismatch"); - static_assert(ResourceNodeType::DescriptorSampler == - static_cast(ResourceMappingNodeType::DescriptorSampler), - "Mismatch"); - static_assert(ResourceNodeType::DescriptorCombinedTexture == - static_cast(ResourceMappingNodeType::DescriptorCombinedTexture), - "Mismatch"); - static_assert(ResourceNodeType::DescriptorTexelBuffer == - static_cast(ResourceMappingNodeType::DescriptorTexelBuffer), - "Mismatch"); - static_assert(ResourceNodeType::DescriptorFmask == - static_cast(ResourceMappingNodeType::DescriptorFmask), - "Mismatch"); - static_assert(ResourceNodeType::DescriptorBuffer == - static_cast(ResourceMappingNodeType::DescriptorBuffer), - "Mismatch"); - static_assert(ResourceNodeType::PushConst == static_cast(ResourceMappingNodeType::PushConst), - "Mismatch"); - static_assert(ResourceNodeType::DescriptorBufferCompact == - static_cast(ResourceMappingNodeType::DescriptorBufferCompact), - "Mismatch"); - - if (node.type == ResourceMappingNodeType::InlineBuffer) - destNode.concreteType = ResourceNodeType::InlineBuffer; - else if (node.type == ResourceMappingNodeType::DescriptorYCbCrSampler) - destNode.concreteType = ResourceNodeType::DescriptorResource; - else if (node.type == ResourceMappingNodeType::DescriptorImage) - destNode.concreteType = ResourceNodeType::DescriptorResource; - else if (node.type == ResourceMappingNodeType::DescriptorConstTexelBuffer) - destNode.concreteType = ResourceNodeType::DescriptorTexelBuffer; - else if (node.type == Vkgc::ResourceMappingNodeType::DescriptorConstBufferCompact) - destNode.concreteType = ResourceNodeType::DescriptorBufferCompact; - else if (node.type == 
Vkgc::ResourceMappingNodeType::DescriptorConstBuffer) - destNode.concreteType = ResourceNodeType::DescriptorBuffer; +// If the source node is a descriptor table, its children are recursively converted into the dstInnerTable, which is a +// reference to a sufficiently large pre-allocated array; the reference is updated to account for the consumed storage +// locations. +// +// @param dst : The destination resource node +// @param src : The source resource node +// @param visibility : A shader stage mask indicating visibility +// @param immutableNodesMap : Immutable nodes information (for immutable samplers) +// @param [in/out] dstInnerTable : Pre-allocated space for inner tables +void PipelineContext::convertResourceNode(ResourceNode &dst, const ResourceMappingNode &src, unsigned visibility, + const ImmutableNodesMap &immutableNodesMap, + MutableArrayRef &dstInnerTable) const { + dst.sizeInDwords = src.sizeInDwords; + dst.offsetInDwords = src.offsetInDwords; + dst.abstractType = ResourceNodeType::Unknown; + dst.visibility = visibility; + + switch (src.type) { + case ResourceMappingNodeType::DescriptorTableVaPtr: { + // Process an inner table. + dst.concreteType = ResourceNodeType::DescriptorTableVaPtr; + dst.abstractType = ResourceNodeType::DescriptorTableVaPtr; + auto innerTable = dstInnerTable.take_front(src.tablePtr.nodeCount); + dstInnerTable = dstInnerTable.drop_front(src.tablePtr.nodeCount); + dst.innerTable = innerTable; + + for (auto [childDst, childSrc] : llvm::zip(innerTable, ArrayRef(src.tablePtr.pNext, src.tablePtr.nodeCount))) + convertResourceNode(childDst, childSrc, visibility, immutableNodesMap, dstInnerTable); + break; + } + case ResourceMappingNodeType::IndirectUserDataVaPtr: { + // Process an indirect pointer. 
+ dst.concreteType = ResourceNodeType::IndirectUserDataVaPtr; + dst.abstractType = ResourceNodeType::IndirectUserDataVaPtr; + dst.indirectSizeInDwords = src.userDataPtr.sizeInDwords; + break; + } + case ResourceMappingNodeType::StreamOutTableVaPtr: { + // Process an indirect pointer. + dst.concreteType = ResourceNodeType::StreamOutTableVaPtr; + dst.abstractType = ResourceNodeType::StreamOutTableVaPtr; + dst.indirectSizeInDwords = src.userDataPtr.sizeInDwords; + break; + } + default: { + // Process an SRD. First check that a static_cast works to convert a Vkgc ResourceMappingNodeType + // to an LGC ResourceNodeType (with the exception of DescriptorCombinedBvhBuffer, whose value + // accidentally depends on LLPC version). + static_assert(ResourceNodeType::DescriptorResource == + static_cast(ResourceMappingNodeType::DescriptorResource), + "Mismatch"); + static_assert(ResourceNodeType::DescriptorSampler == + static_cast(ResourceMappingNodeType::DescriptorSampler), + "Mismatch"); + static_assert(ResourceNodeType::DescriptorCombinedTexture == + static_cast(ResourceMappingNodeType::DescriptorCombinedTexture), + "Mismatch"); + static_assert(ResourceNodeType::DescriptorTexelBuffer == + static_cast(ResourceMappingNodeType::DescriptorTexelBuffer), + "Mismatch"); + static_assert(ResourceNodeType::DescriptorFmask == + static_cast(ResourceMappingNodeType::DescriptorFmask), + "Mismatch"); + static_assert(ResourceNodeType::DescriptorBuffer == + static_cast(ResourceMappingNodeType::DescriptorBuffer), + "Mismatch"); + static_assert(ResourceNodeType::PushConst == static_cast(ResourceMappingNodeType::PushConst), + "Mismatch"); + static_assert(ResourceNodeType::DescriptorBufferCompact == + static_cast(ResourceMappingNodeType::DescriptorBufferCompact), + "Mismatch"); + + if (src.type == ResourceMappingNodeType::InlineBuffer) + dst.concreteType = ResourceNodeType::InlineBuffer; + else if (src.type == ResourceMappingNodeType::DescriptorYCbCrSampler) + dst.concreteType = 
ResourceNodeType::DescriptorResource; + else if (src.type == ResourceMappingNodeType::DescriptorImage) + dst.concreteType = ResourceNodeType::DescriptorResource; + else if (src.type == ResourceMappingNodeType::DescriptorConstTexelBuffer) + dst.concreteType = ResourceNodeType::DescriptorTexelBuffer; + else if (src.type == Vkgc::ResourceMappingNodeType::DescriptorConstBufferCompact) + dst.concreteType = ResourceNodeType::DescriptorBufferCompact; + else if (src.type == Vkgc::ResourceMappingNodeType::DescriptorConstBuffer) + dst.concreteType = ResourceNodeType::DescriptorBuffer; #if LLPC_CLIENT_INTERFACE_MAJOR_VERSION >= 63 - else if (node.type == Vkgc::ResourceMappingNodeType::DescriptorAtomicCounter) - destNode.concreteType = ResourceNodeType::DescriptorBuffer; + else if (src.type == Vkgc::ResourceMappingNodeType::DescriptorAtomicCounter) + dst.concreteType = ResourceNodeType::DescriptorBuffer; #endif #if LLPC_CLIENT_INTERFACE_MAJOR_VERSION >= 61 - else if (node.type == Vkgc::ResourceMappingNodeType::DescriptorMutable) - destNode.concreteType = ResourceNodeType::DescriptorMutable; + else if (src.type == Vkgc::ResourceMappingNodeType::DescriptorMutable) + dst.concreteType = ResourceNodeType::DescriptorMutable; #endif - else - destNode.concreteType = static_cast(node.type); - - if (getPipelineOptions()->replaceSetWithResourceType && node.srdRange.set == 0) { - // Special value InternalDescriptorSetId(-1) will be passed in for internal usage - destNode.set = getGlResourceNodeSetFromType(node.type); - } else { - destNode.set = node.srdRange.set; - } - destNode.binding = node.srdRange.binding; - destNode.abstractType = destNode.concreteType; - destNode.immutableValue = nullptr; - destNode.immutableSize = 0; + else + dst.concreteType = static_cast(src.type); + + if (getPipelineOptions()->replaceSetWithResourceType && src.srdRange.set == 0) { + // Special value InternalDescriptorSetId(-1) will be passed in for internal usage + dst.set = 
getGlResourceNodeSetFromType(src.type); + } else { + dst.set = src.srdRange.set; + } + dst.binding = src.srdRange.binding; + dst.abstractType = dst.concreteType; + dst.immutableValue = nullptr; + dst.immutableSize = 0; #if LLPC_CLIENT_INTERFACE_MAJOR_VERSION >= 61 - // Normally we know the stride of items in a descriptor array. However in specific circumstances - // the type is not known by llpc. This is the case with mutable descriptors where we need the - // stride to be explicitly specified. - if (node.srdRange.strideInDwords > 0) { - destNode.stride = node.srdRange.strideInDwords; - } else { + // Normally we know the stride of items in a descriptor array. However in specific circumstances + // the type is not known by llpc. This is the case with mutable descriptors where we need the + // stride to be explicitly specified. + if (src.srdRange.strideInDwords > 0) { + dst.stride = src.srdRange.strideInDwords; + } else { #endif - switch (node.type) { - case ResourceMappingNodeType::DescriptorImage: - case ResourceMappingNodeType::DescriptorResource: - case ResourceMappingNodeType::DescriptorFmask: - destNode.stride = DescriptorSizeResource / sizeof(uint32_t); - break; - case ResourceMappingNodeType::DescriptorSampler: - destNode.stride = DescriptorSizeSampler / sizeof(uint32_t); - break; - case ResourceMappingNodeType::DescriptorCombinedTexture: - destNode.stride = (DescriptorSizeResource + DescriptorSizeSampler) / sizeof(uint32_t); - break; - case ResourceMappingNodeType::InlineBuffer: - case ResourceMappingNodeType::DescriptorYCbCrSampler: - // Current node.sizeInDwords = resourceDescSizeInDwords * M * N (M means plane count, N means array count) - // TODO: Desired destNode.stride = resourceDescSizeInDwords * M - // - // Temporary set stride to be node.sizeInDwords, for that the stride varies from different plane - // counts, and we don't know the real plane count currently. 
- // Thus, set stride to sizeInDwords, and just divide array count when it is available in handling immutable - // sampler descriptor (For YCbCrSampler, immutable sampler is always accessible) - destNode.stride = node.sizeInDwords; - break; - case ResourceMappingNodeType::DescriptorBufferCompact: - case ResourceMappingNodeType::DescriptorConstBufferCompact: - destNode.stride = 2; - break; - default: - destNode.stride = DescriptorSizeBuffer / sizeof(uint32_t); - break; - } -#if LLPC_CLIENT_INTERFACE_MAJOR_VERSION >= 61 + switch (src.type) { + case ResourceMappingNodeType::DescriptorImage: + case ResourceMappingNodeType::DescriptorResource: + case ResourceMappingNodeType::DescriptorFmask: + dst.stride = DescriptorSizeResource / sizeof(uint32_t); + break; + case ResourceMappingNodeType::DescriptorSampler: + dst.stride = DescriptorSizeSampler / sizeof(uint32_t); + break; + case ResourceMappingNodeType::DescriptorCombinedTexture: + dst.stride = (DescriptorSizeResource + DescriptorSizeSampler) / sizeof(uint32_t); + break; + case ResourceMappingNodeType::InlineBuffer: + case ResourceMappingNodeType::DescriptorYCbCrSampler: + // Current src.sizeInDwords = resourceDescSizeInDwords * M * N (M means plane count, N means array count) + // TODO: Desired dst.stride = resourceDescSizeInDwords * M + // + // Temporary set stride to be src.sizeInDwords, for that the stride varies from different plane + // counts, and we don't know the real plane count currently. 
+ // Thus, set stride to sizeInDwords, and just divide array count when it is available in handling immutable + // sampler descriptor (For YCbCrSampler, immutable sampler is always accessible) + dst.stride = src.sizeInDwords; + break; + case ResourceMappingNodeType::DescriptorBufferCompact: + case ResourceMappingNodeType::DescriptorConstBufferCompact: + dst.stride = 2; + break; + default: + dst.stride = DescriptorSizeBuffer / sizeof(uint32_t); + break; } +#if LLPC_CLIENT_INTERFACE_MAJOR_VERSION >= 61 + } #endif - // Only check for an immutable value if the resource is or contains a sampler. This specifically excludes - // YCbCrSampler; that was handled in the SPIR-V reader. - if (node.type != ResourceMappingNodeType::DescriptorSampler && - node.type != ResourceMappingNodeType::DescriptorCombinedTexture && - node.type != ResourceMappingNodeType::DescriptorYCbCrSampler) - break; - - auto it = immutableNodesMap.find(std::pair(node.srdRange.set, node.srdRange.binding)); - if (it != immutableNodesMap.end()) { - // This set/binding is (or contains) an immutable value. The value can only be a sampler, so we - // can assume it is four dwords. - auto &immutableNode = *it->second; - - IRBuilder<> builder(pipeline->getContext()); - SmallVector values; - - if (immutableNode.arraySize != 0) { - if (node.type == ResourceMappingNodeType::DescriptorYCbCrSampler) { - // TODO: Remove the statement when destNode.stride is per array size - // Update destNode.stride = node.sizeInDwords / immutableNode.arraySize - destNode.stride /= immutableNode.arraySize; - } + // Only check for an immutable value if the resource is or contains a sampler. This specifically excludes + // YCbCrSampler; that was handled in the SPIR-V reader. 
+ if (src.type != ResourceMappingNodeType::DescriptorSampler && + src.type != ResourceMappingNodeType::DescriptorCombinedTexture && + src.type != ResourceMappingNodeType::DescriptorYCbCrSampler) + break; - destNode.immutableSize = immutableNode.arraySize; - destNode.immutableValue = immutableNode.pValue; + auto it = immutableNodesMap.find(std::pair(src.srdRange.set, src.srdRange.binding)); + if (it != immutableNodesMap.end()) { + // This set/binding is (or contains) an immutable value. The value can only be a sampler, so we + // can assume it is four dwords. + auto &immutableNode = *it->second; + + if (immutableNode.arraySize != 0) { + if (src.type == ResourceMappingNodeType::DescriptorYCbCrSampler) { + // TODO: Remove the statement when dst.stride is per array size + // Update dst.stride = node.sizeInDwords / immutableNode.arraySize + dst.stride /= immutableNode.arraySize; } + + dst.immutableSize = immutableNode.arraySize; + dst.immutableValue = immutableNode.pValue; } - break; - } } + break; + } } } @@ -985,6 +983,10 @@ std::pair PipelineContext::mapVkFormat(VkFormat for { \ format, { format, dfmt, nfmt, true, false } \ } +#define EXT_VERTEX_FORMAT_ENTRY(format, dfmt, nfmt) \ + { \ + format, { static_cast(format), dfmt, nfmt, true, true } \ + } #define COLOR_FORMAT_ENTRY_EXT(format, dfmt, nfmt) \ { \ format, { format, dfmt, nfmt, false, true } \ @@ -1002,6 +1004,10 @@ std::pair PipelineContext::mapVkFormat(VkFormat for { \ format, { dfmt, nfmt, true, false } \ } +#define EXT_VERTEX_FORMAT_ENTRY(format, dfmt, nfmt) \ + { \ + format, { dfmt, nfmt, true, true } \ + } #define COLOR_FORMAT_ENTRY_EXT(format, dfmt, nfmt) \ { \ format, { dfmt, nfmt, false, true } \ @@ -1013,6 +1019,28 @@ std::pair PipelineContext::mapVkFormat(VkFormat for #endif COLOR_FORMAT_ENTRY_EXT(VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT, BufDataFormat4_4_4_4, BufNumFormatUnorm), COLOR_FORMAT_ENTRY_EXT(VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT, BufDataFormat4_4_4_4, BufNumFormatUnorm), + /// Currently OGL-only 
: Internal spv ext vertex attribute format - begin + EXT_VERTEX_FORMAT_ENTRY(VK_FORMAT_EXT_R32_UNORM, BufDataFormat32, BufNumFormatUnorm), + EXT_VERTEX_FORMAT_ENTRY(VK_FORMAT_EXT_R32G32_UNORM, BufDataFormat32_32, BufNumFormatUnorm), + EXT_VERTEX_FORMAT_ENTRY(VK_FORMAT_EXT_R32G32B32_UNORM, BufDataFormat32_32_32, BufNumFormatUnorm), + EXT_VERTEX_FORMAT_ENTRY(VK_FORMAT_EXT_R32G32B32A32_UNORM, BufDataFormat32_32_32_32, BufNumFormatUnorm), + EXT_VERTEX_FORMAT_ENTRY(VK_FORMAT_EXT_R32_SNORM, BufDataFormat32, BufNumFormatSnorm), + EXT_VERTEX_FORMAT_ENTRY(VK_FORMAT_EXT_R32G32_SNORM, BufDataFormat32_32, BufNumFormatSnorm), + EXT_VERTEX_FORMAT_ENTRY(VK_FORMAT_EXT_R32G32B32_SNORM, BufDataFormat32_32_32, BufNumFormatSnorm), + EXT_VERTEX_FORMAT_ENTRY(VK_FORMAT_EXT_R32G32B32A32_SNORM, BufDataFormat32_32_32_32, BufNumFormatSnorm), + EXT_VERTEX_FORMAT_ENTRY(VK_FORMAT_EXT_R32_FIXED, BufDataFormat32, BufNumFormatFixed), + EXT_VERTEX_FORMAT_ENTRY(VK_FORMAT_EXT_R32G32_FIXED, BufDataFormat32_32, BufNumFormatFixed), + EXT_VERTEX_FORMAT_ENTRY(VK_FORMAT_EXT_R32G32B32_FIXED, BufDataFormat32_32_32, BufNumFormatFixed), + EXT_VERTEX_FORMAT_ENTRY(VK_FORMAT_EXT_R32G32B32A32_FIXED, BufDataFormat32_32_32_32, BufNumFormatFixed), + EXT_VERTEX_FORMAT_ENTRY(VK_FORMAT_EXT_R32_USCALED, BufDataFormat32, BufNumFormatUscaled), + EXT_VERTEX_FORMAT_ENTRY(VK_FORMAT_EXT_R32G32_USCALED, BufDataFormat32_32, BufNumFormatUscaled), + EXT_VERTEX_FORMAT_ENTRY(VK_FORMAT_EXT_R32G32B32_USCALED, BufDataFormat32_32_32, BufNumFormatUscaled), + EXT_VERTEX_FORMAT_ENTRY(VK_FORMAT_EXT_R32G32B32A32_USCALED, BufDataFormat32_32_32_32, BufNumFormatUscaled), + EXT_VERTEX_FORMAT_ENTRY(VK_FORMAT_EXT_R32_SSCALED, BufDataFormat32, BufNumFormatSscaled), + EXT_VERTEX_FORMAT_ENTRY(VK_FORMAT_EXT_R32G32_SSCALED, BufDataFormat32_32, BufNumFormatSscaled), + EXT_VERTEX_FORMAT_ENTRY(VK_FORMAT_EXT_R32G32B32_SSCALED, BufDataFormat32_32_32, BufNumFormatSscaled), + EXT_VERTEX_FORMAT_ENTRY(VK_FORMAT_EXT_R32G32B32A32_SSCALED, 
BufDataFormat32_32_32_32, BufNumFormatSscaled) + /// Currently OGL only : Internal spv ext vertex attribute format - end }; BufDataFormat dfmt = BufDataFormatInvalid; @@ -1071,7 +1099,7 @@ uint32_t PipelineContext::getGlResourceNodeSetFromType(Vkgc::ResourceMappingNode resourceSet = GlResourceMappingSet::DescriptorFmask; break; default: - assert("Not supported resource type."); + assert("Not supportted resource type."); break; } diff --git a/llpc/context/llpcPipelineContext.h b/llpc/context/llpcPipelineContext.h index f310dcf2b3..e5b2b794dc 100644 --- a/llpc/context/llpcPipelineContext.h +++ b/llpc/context/llpcPipelineContext.h @@ -272,9 +272,9 @@ class PipelineContext { // Give the user data nodes and descriptor range values to the middle-end, and/or hash them. void setUserDataInPipeline(lgc::Pipeline *pipeline, Util::MetroHash64 *hasher, unsigned stageMask) const; - void setUserDataNodesTable(lgc::Pipeline *pipeline, llvm::ArrayRef nodes, - const ImmutableNodesMap &immutableNodesMap, bool isRoot, lgc::ResourceNode *destTable, - lgc::ResourceNode *&destInnerTable) const; + void convertResourceNode(lgc::ResourceNode &dst, const ResourceMappingNode &src, unsigned visibility, + const ImmutableNodesMap &immutableNodesMap, + llvm::MutableArrayRef &dstInnerTable) const; ShaderFpMode m_shaderFpModes[ShaderStageCountInternal] = {}; bool m_unlinked = false; // Whether we are building an "unlinked" shader ELF diff --git a/llpc/context/llpcRayTracingContext.cpp b/llpc/context/llpcRayTracingContext.cpp index f360da8813..d78630b3cb 100644 --- a/llpc/context/llpcRayTracingContext.cpp +++ b/llpc/context/llpcRayTracingContext.cpp @@ -273,6 +273,11 @@ lgc::Options RayTracingContext::computePipelineOptions() const { lgc::Options options = PipelineContext::computePipelineOptions(); // NOTE: raytracing waveSize and subgroupSize can be different. options.fullSubgroups = false; + + // TODO: Add a mode in Vkgc::LlpcRaytracingMode to represent lgc::RayTracingIndirectMode::Continuations. 
+ if (m_pipelineInfo->mode == Vkgc::LlpcRaytracingMode::Continuations) + options.rtIndirectMode = lgc::RayTracingIndirectMode::ContinuationsContinufy; + return options; } diff --git a/llpc/context/llpcRayTracingContext.h b/llpc/context/llpcRayTracingContext.h index 7497817263..70275e018d 100644 --- a/llpc/context/llpcRayTracingContext.h +++ b/llpc/context/llpcRayTracingContext.h @@ -102,13 +102,17 @@ class RayTracingContext : public PipelineContext { static const unsigned TriangleHitGroup = static_cast(-2); llvm::Type *getPayloadType(lgc::Builder *builder); llvm::Type *getCallableDataType(lgc::Builder *builder); + unsigned getCallableDataSizeInBytes() { return m_callableDataMaxSize; } unsigned getAttributeDataSize(); + unsigned getAttributeDataSizeInBytes() { return m_attributeDataMaxSize; }; std::set> &getBuiltIns() { return m_builtIns; } bool getHitAttribute() { return m_attributeDataMaxSize > 0; } unsigned getPayloadSizeInDword() { return m_payloadMaxSize / 4; } + unsigned getPayloadSizeInBytes() { return m_payloadMaxSize; } bool hasPipelineLibrary() { return m_pipelineInfo->hasPipelineLibrary; } unsigned hasLibraryStage(unsigned stageMask) { return m_pipelineInfo->pipelineLibStageMask & stageMask; } bool isReplay() { return m_pipelineInfo->isReplay; } + Vkgc::LlpcRaytracingMode getRaytracingMode() { return m_pipelineInfo->mode; } protected: // Give the pipeline options to the middle-end, and/or hash them. diff --git a/llpc/context/llpcShaderCacheManager.cpp b/llpc/context/llpcShaderCacheManager.cpp deleted file mode 100644 index 6d58449d38..0000000000 --- a/llpc/context/llpcShaderCacheManager.cpp +++ /dev/null @@ -1,100 +0,0 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - **********************************************************************************************************************/ -/** -*********************************************************************************************************************** -@file llpcShaderCacheManager.cpp -@brief LLPC source file: contains implementation of class Llpc::ShaderCacheManager. 
-*********************************************************************************************************************** -*/ -#include "llpcShaderCacheManager.h" -#include "llpcError.h" - -#define DEBUG_TYPE "llpc-shader-cache-manager" - -using namespace llvm; - -namespace Llpc { - -// ===================================================================================================================== -// The global ShaderCacheManager object -ShaderCacheManager *ShaderCacheManager::m_manager = nullptr; - -// ===================================================================================================================== -// Destroy all objects -ShaderCacheManager::~ShaderCacheManager() { - for (auto cacheIt = m_shaderCaches.begin(), endIt = m_shaderCaches.end(); cacheIt != endIt; ++cacheIt) { - // Deletes managed object - (*cacheIt).reset(); - } - - m_shaderCaches.clear(); -} - -// ===================================================================================================================== -// Get ShaderCache instance with specified create info -// -// @param createInfo : Shader cache create info -// @param auxCreateInfo : Shader cache auxiliary info (static fields) -ShaderCachePtr ShaderCacheManager::getShaderCacheObject(const ShaderCacheCreateInfo *createInfo, - const ShaderCacheAuxCreateInfo *auxCreateInfo) { - ShaderCachePtr shaderCache; - auto cacheIt = m_shaderCaches.begin(); - auto endIt = m_shaderCaches.end(); - - for (; cacheIt != endIt; ++cacheIt) { - if ((*cacheIt)->isCompatible(createInfo, auxCreateInfo)) { - shaderCache = (*cacheIt); - break; - } - } - - // No compatible object is found, create a new one - if (cacheIt == endIt) { - shaderCache = std::make_shared(); - m_shaderCaches.push_back(shaderCache); - mustSucceed(shaderCache->init(createInfo, auxCreateInfo), "Failed to initialize shader cache"); - } - - return shaderCache; -} - -// 
===================================================================================================================== -// Release ShaderCache instance -// -// @param shaderCachePtr : ShaderCache instance to be released -void ShaderCacheManager::releaseShaderCacheObject(ShaderCachePtr &shaderCachePtr) { - auto cacheIt = m_shaderCaches.begin(); - auto endIt = m_shaderCaches.end(); - for (; cacheIt != endIt; ++cacheIt) { - if ((*cacheIt).get() == shaderCachePtr.get()) - break; - } - - assert(cacheIt != endIt); - - shaderCachePtr.reset(); -} - -} // namespace Llpc diff --git a/llpc/include/llpc.h b/llpc/include/llpc.h index 7eace6500a..8685d7f751 100644 --- a/llpc/include/llpc.h +++ b/llpc/include/llpc.h @@ -131,6 +131,7 @@ enum CacheAccessInfo : uint8_t { struct GraphicsPipelineBuildOut { BinaryData pipelineBin; ///< Output pipeline binary data void *fsOutputMetaData; ///< Fragment outputs meta data. Valid for fragment shader. + unsigned fsOutputMetaDataSize; ///< Meta data size CacheAccessInfo pipelineCacheAccess; ///< Pipeline cache access status i.e., hit, miss, or not checked CacheAccessInfo stageCacheAccesses[ShaderStageCount]; ///< Shader cache access status i.e., hit, miss, or not checked }; @@ -151,6 +152,7 @@ struct RayTracingPipelineBuildOut { bool hasTraceRay; ///< Output whether have traceray module }; +#if LLPC_CLIENT_INTERFACE_MAJOR_VERSION < 66 /// Defines callback function used to lookup shader cache info in an external cache typedef Result (*ShaderCacheGetValue)(const void *pClientData, uint64_t hash, void *pValue, size_t *pValueLen); @@ -211,6 +213,7 @@ class IShaderCache { /// @internal Destructor. Prevent use of delete operator on this interface. virtual ~IShaderCache() {} }; +#endif // Users of LLPC may implement this interface to allow the compiler to request additional threads. 
// @@ -308,7 +311,8 @@ class ICompiler { virtual Result buildGraphicsPipelineWithElf(const GraphicsPipelineBuildInfo *pipelineInfo, GraphicsPipelineBuildOut *pipelineOut, const BinaryData *elfPackage) = 0; - /// Explicitly build the color export shader. + /// Explicitly build the color export shader. GraphicsPipelineBuildInfo::enableColorExportShader must be true, + /// Color export shader depends on cbState. /// /// @param [in] pipelineInfo : Info to build this shader module /// @param [in] fsOutputMetaData : Info to fragment outputs @@ -347,16 +351,6 @@ class ICompiler { RayTracingPipelineBuildOut *pPipelineOut, void *pPipelineDumpFile = nullptr, IHelperThreadProvider *pHelperThreadProvider = nullptr) = 0; -#if LLPC_ENABLE_SHADER_CACHE - /// Creates a shader cache object with the requested properties. - /// - /// @param [in] pCreateInfo Create info of the shader cache. - /// @param [out] ppShaderCache : Constructed shader cache object. - /// - /// @returns : Success if the shader cache was successfully created. Otherwise, ErrorOutOfMemory is returned. 
- virtual Result CreateShaderCache(const ShaderCacheCreateInfo *pCreateInfo, IShaderCache **ppShaderCache) = 0; -#endif - protected: ICompiler() {} /// Destructor diff --git a/llpc/lower/LowerGLCompatibility.cpp b/llpc/lower/LowerGLCompatibility.cpp index cf073a73c9..580b1114eb 100644 --- a/llpc/lower/LowerGLCompatibility.cpp +++ b/llpc/lower/LowerGLCompatibility.cpp @@ -261,11 +261,11 @@ void LowerGLCompatibility::createClipDistance() { mdValues.push_back(mdElement); mdValues.push_back(ConstantInt::get(int64Type, inOutMd.U64All[0])); mdValues.push_back(ConstantInt::get(int64Type, inOutMd.U64All[1])); - auto *mdVariable = ConstantStruct::get(static_cast(mdTy), mdValues); + auto *mdVriable = ConstantStruct::get(static_cast(mdTy), mdValues); // Setup input/output metadata std::vector mDs; - mDs.push_back(ConstantAsMetadata::get(mdVariable)); + mDs.push_back(ConstantAsMetadata::get(mdVriable)); auto mdNode = MDNode::get(*m_context, mDs); m_clipDistance->addMetadata(gSPIRVMD::InOut, *mdNode); } diff --git a/llpc/lower/LowerGpuRt.cpp b/llpc/lower/LowerGpuRt.cpp index 9f978ead75..03ad59133d 100644 --- a/llpc/lower/LowerGpuRt.cpp +++ b/llpc/lower/LowerGpuRt.cpp @@ -256,11 +256,16 @@ void LowerGpuRt::visitLdsStackInit(GpurtLdsStackInitOp &inst) { Value *stackBaseAsInt = m_builder->CreatePtrToInt( m_builder->CreateGEP(m_stackTy, m_stack, {m_builder->getInt32(0), stackBasePerThread}), m_builder->getInt32Ty()); - // stack_addr[31:18] = stack_base[15:2] - // stack_addr[17:0] = stack_index[17:0] - // The low 18 bits of stackAddr contain stackIndex which we always initialize to 0. - // Note that this relies on stackAddr being a multiple of 4, so that bits 17 and 16 are 0. - Value *stackAddr = m_builder->CreateShl(stackBaseAsInt, 16); + Value *stackAddr; + { + // stack_addr[31:18] = stack_base[15:2] + // stack_addr[17:0] = stack_index[17:0] + // The low 18 bits of stackAddr contain stackIndex which we always initialize to 0. 
+ // Note that this relies on stackAddr being a multiple of 4, so that bits 17 and 16 are 0. + // stackAddrDw = (stackAddr >> 2) << 18. + stackAddr = m_builder->CreateShl(stackBaseAsInt, 16); + } + inst.replaceAllUsesWith(stackAddr); m_callsToLower.push_back(&inst); m_funcsToLower.insert(inst.getCalledFunction()); diff --git a/llpc/lower/PassRegistry.inc b/llpc/lower/PassRegistry.inc index 90b27da762..c85ef22d3c 100644 --- a/llpc/lower/PassRegistry.inc +++ b/llpc/lower/PassRegistry.inc @@ -42,6 +42,7 @@ LLPC_MODULE_PASS("llpc-spirv-lower-terminator", SpirvLowerTerminator) LLPC_MODULE_PASS("llpc-spirv-lower-translator", SpirvLowerTranslator) LLPC_MODULE_PASS("llpc-spirv-lower-global", SpirvLowerGlobal) LLPC_MODULE_PASS("llpc-spirv-lower-math-const-folding", SpirvLowerMathConstFolding) +LLPC_MODULE_PASS("llpc-spirv-lower-math-precision", SpirvLowerMathPrecision) LLPC_MODULE_PASS("llpc-spirv-lower-math-float-op", SpirvLowerMathFloatOp) LLPC_MODULE_PASS("llpc-spirv-lower-memory-op", SpirvLowerMemoryOp) LLPC_MODULE_PASS("llpc-spirv-lower-ray-query", SpirvLowerRayQuery) diff --git a/llpc/lower/llpcSpirvLower.cpp b/llpc/lower/llpcSpirvLower.cpp index 81a3708dd7..82ec9d50d5 100644 --- a/llpc/lower/llpcSpirvLower.cpp +++ b/llpc/lower/llpcSpirvLower.cpp @@ -201,7 +201,7 @@ void SpirvLower::addPasses(Context *context, ShaderStage stage, lgc::PassManager // Lower SPIR-V terminators passMgr.addPass(SpirvLowerTerminator()); - // Lower Glsl compatibility variables and operations + // Lower Glsl compatibility varaibles and operations passMgr.addPass(LowerGLCompatibility()); // Lower SPIR-V global variables, inputs, and outputs @@ -233,8 +233,15 @@ void SpirvLower::addPasses(Context *context, ShaderStage stage, lgc::PassManager // New version of the code (also handles unknown version, which we treat as latest) passMgr.addPass(createModuleToFunctionPassAdaptor(SROAPass(SROAOptions::ModifyCFG))); #endif + + // Lower SPIR-V precision / adjust fast math flags. 
+ // Must be done before instruction combining pass to prevent incorrect contractions. + // Should be after SROA to avoid having to track values through memory load/store. + passMgr.addPass(SpirvLowerMathPrecision()); + passMgr.addPass(GlobalOptPass()); passMgr.addPass(createModuleToFunctionPassAdaptor(ADCEPass())); + #if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 452298 // Old version of the code unsigned instCombineOpt = 2; @@ -252,8 +259,10 @@ void SpirvLower::addPasses(Context *context, ShaderStage stage, lgc::PassManager // Lower SPIR-V instruction metadata remove passMgr.addPass(SpirvLowerInstMetaRemove()); - if (rayTracing || rayQuery) + if (rayTracing || rayQuery) { passMgr.addPass(LowerGpuRt()); + passMgr.addPass(createModuleToFunctionPassAdaptor(InstCombinePass(instCombineOpt))); + } // Stop timer for lowering passes. if (lowerTimer) diff --git a/llpc/lower/llpcSpirvLowerConstImmediateStore.cpp b/llpc/lower/llpcSpirvLowerConstImmediateStore.cpp index a29d5567d3..94b269e720 100644 --- a/llpc/lower/llpcSpirvLowerConstImmediateStore.cpp +++ b/llpc/lower/llpcSpirvLowerConstImmediateStore.cpp @@ -31,6 +31,7 @@ #include "llpcSpirvLowerConstImmediateStore.h" #include "SPIRVInternal.h" #include "llpcContext.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Instructions.h" #include "llvm/Support/Debug.h" #include @@ -126,7 +127,7 @@ StoreInst *SpirvLowerConstImmediateStore::findSingleStore(AllocaInst *allocaInst storeInstFound = storeInst; } else if (auto getElemPtrInst = dyn_cast(user)) pointers.push_back(getElemPtrInst); - else if (!isa(user)) { + else if (!isa(user) && !isAssumeLikeIntrinsic(user)) { // Pointer escapes by being used in some way other than "load/store/getelementptr". 
return nullptr; } diff --git a/llpc/lower/llpcSpirvLowerGlobal.cpp b/llpc/lower/llpcSpirvLowerGlobal.cpp index 00f176b223..1057269b99 100644 --- a/llpc/lower/llpcSpirvLowerGlobal.cpp +++ b/llpc/lower/llpcSpirvLowerGlobal.cpp @@ -33,6 +33,7 @@ #include "lgcrt/LgcRtDialect.h" #include "llpcContext.h" #include "llpcDebug.h" +#include "llpcRayTracingContext.h" #include "llpcSpirvLowerUtil.h" #include "lgc/LgcDialect.h" #include "llvm-dialects/Dialect/Visitor.h" @@ -44,6 +45,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" #include #define DEBUG_TYPE "llpc-spirv-lower-global" @@ -53,6 +55,12 @@ using namespace SPIRV; using namespace Llpc; using namespace lgc::rt; +namespace RtName { +static const char *HitAttribute = "HitAttribute"; +static const char *IncomingRayPayLoad = "IncomingRayPayloadKHR"; +static const char *IncomingCallableData = "IncomingCallableDataKHR"; +} // namespace RtName + namespace Llpc { // The code here relies on the SPIR-V built-in kind being the same as the Builder built-in kind. 
@@ -208,6 +216,8 @@ bool SpirvLowerGlobal::runImpl(Module &module) { SpirvLower::init(&module); + changeRtFunctionSignature(); + // Map globals to proxy variables for (auto global = m_module->global_begin(), end = m_module->global_end(); global != end; ++global) { if (global->getType()->getAddressSpace() == SPIRAS_Private) @@ -271,9 +281,9 @@ bool SpirvLowerGlobal::runImpl(Module &module) { void SpirvLowerGlobal::lowerEdgeFlag() { const unsigned int edgeflagInputLocation = Vkgc::GlCompatibilityAttributeLocation::EdgeFlag; - Llpc::PipelineContext *pipelineContext = m_context->getPipelineContext(); + Llpc::PipelineContext *pipelineContex = m_context->getPipelineContext(); const Vkgc::GraphicsPipelineBuildInfo *pipelineInfo = - static_cast(pipelineContext->getPipelineBuildInfo()); + static_cast(pipelineContex->getPipelineBuildInfo()); const VkPipelineVertexInputStateCreateInfo *vertexInfo = pipelineInfo->pVertexInput; if (!vertexInfo) @@ -582,8 +592,18 @@ void SpirvLowerGlobal::mapGlobalVariableToProxy(GlobalVariable *globalVar) { m_builder->SetInsertPointPastAllocas(m_entryPoint); - auto proxy = m_builder->CreateAlloca(globalVarTy, dataLayout.getAllocaAddrSpace(), nullptr, - Twine(LlpcName::GlobalProxyPrefix) + globalVar->getName()); + Value *proxy = nullptr; + + // Handle special globals, regular allocas will be removed by SROA pass. 
+ if (globalVar->getName().startswith(RtName::HitAttribute)) + proxy = m_entryPoint->getArg(1); + else if (globalVar->getName().startswith(RtName::IncomingRayPayLoad)) + proxy = m_entryPoint->getArg(0); + else if (globalVar->getName().startswith(RtName::IncomingCallableData)) + proxy = m_entryPoint->getArg(0); + else + proxy = m_builder->CreateAlloca(globalVarTy, dataLayout.getAllocaAddrSpace(), nullptr, + Twine(LlpcName::GlobalProxyPrefix) + globalVar->getName()); if (globalVar->hasInitializer()) { auto initializer = globalVar->getInitializer(); @@ -598,10 +618,9 @@ void SpirvLowerGlobal::mapGlobalVariableToProxy(GlobalVariable *globalVar) { // // @param input : Input to be mapped void SpirvLowerGlobal::mapInputToProxy(GlobalVariable *input) { - // NOTE: For tessellation shader or mesh shader, we do not map inputs to real proxy variables. Instead, we directly + // NOTE: For tessellation shader, we do not map inputs to real proxy variables. Instead, we directly // replace "load" instructions with import calls in the lowering operation. - if (m_shaderStage == ShaderStageTessControl || m_shaderStage == ShaderStageTessEval || - m_shaderStage == ShaderStageMesh) { + if (m_shaderStage == ShaderStageTessControl || m_shaderStage == ShaderStageTessEval) { m_inputProxyMap[input] = nullptr; m_lowerInputInPlace = true; return; @@ -1196,6 +1215,27 @@ Value *SpirvLowerGlobal::addCallInstForInOutImport(Type *inOutTy, unsigned addrS // and "BuiltInSubgroupXXXMask" share the same numeric values. 
inOutValue = m_builder->CreateBitCast(inOutValue, FixedVectorType::get(inOutTy, 2)); inOutValue = m_builder->CreateExtractElement(inOutValue, uint64_t(0)); + } else if (builtIn == lgc::BuiltInFragCoord) { + auto buildInfo = static_cast(m_context->getPipelineBuildInfo()); + if (buildInfo->originUpperLeft != + static_cast(buildInfo->fs.pModuleData)->usage.originUpperLeft) { + unsigned offset = 0; + auto winSize = getUniformConstantEntryByLocation(m_context, m_shaderStage, + Vkgc::GlCompatibilityUniformLocation::FrameBufferSize); + if (winSize) { + offset = winSize->offset; + Value *bufferDesc = + m_builder->CreateLoadBufferDesc(Vkgc::InternalDescriptorSetId, Vkgc::ConstantBuffer0Binding, + m_builder->getInt32(0), lgc::Builder::BufferFlagNonConst); + // Layout is {width, height}, so the offset of height is added sizeof(float). + Value *winHeightPtr = + m_builder->CreateConstInBoundsGEP1_32(m_builder->getInt8Ty(), bufferDesc, offset + sizeof(float)); + auto winHeight = m_builder->CreateLoad(m_builder->getFloatTy(), winHeightPtr); + auto fragCoordY = m_builder->CreateExtractElement(inOutValue, 1); + fragCoordY = m_builder->CreateFSub(winHeight, fragCoordY); + inOutValue = m_builder->CreateInsertElement(inOutValue, fragCoordY, 1); + } + } } if (inOutValue->getType()->isIntegerTy(1)) { // Convert i1 to i32. 
@@ -1406,6 +1446,9 @@ void SpirvLowerGlobal::addCallInstForOutputExport(Value *outputValue, Constant * cast(locOffset)->getZExtValue(), outputInfo); } + if (m_context->getPipelineContext()->getUseDualSourceBlend()) { + outputInfo.setDualSourceBlendDynamic(true); + } m_builder->CreateWriteGenericOutput(outputValue, location, locOffset, elemIdx, maxLocOffset, outputInfo, vertexOrPrimitiveIdx); } @@ -2356,7 +2399,7 @@ void SpirvLowerGlobal::interpolateInputElement(unsigned interpLoc, Value *auxInt } // ===================================================================================================================== -// Fill the XFB info map from the Vkgc::ApiXfbOutData if XFB is specified by API interface +// Fill the XFB info map from the Vkgc::ApiXfbOutData if XFB is specified by API inerface void SpirvLowerGlobal::buildApiXfbMap() { auto pipelineBuildInfo = static_cast(m_context->getPipelineBuildInfo()); for (unsigned idx = 0; idx < pipelineBuildInfo->apiXfbOutData.numXfbOutInfo; ++idx) { @@ -2529,4 +2572,55 @@ void SpirvLowerGlobal::handleVolatileInput(GlobalVariable *input, Value *proxy) } } +// ===================================================================================================================== +// Changes function signature for RT shaders. Specifically, add payload / hit attribute / callable data pointers and +// metadata to function signature. 
+void SpirvLowerGlobal::changeRtFunctionSignature() { + if (!isRayTracingShaderStage(m_shaderStage)) + return; + + // Ray generation shader has no input payload or hit attributes + if (m_shaderStage == ShaderStageRayTracingRayGen) + return; + + auto rayTracingContext = static_cast(m_context->getPipelineContext()); + + ValueToValueMapTy VMap; + SmallVector argTys; + SmallVector retInsts; + Type *pointerTy = PointerType::get(*m_context, SPIRAS_Private); + switch (m_shaderStage) { + case ShaderStageRayTracingIntersect: + case ShaderStageRayTracingAnyHit: + case ShaderStageRayTracingClosestHit: + // Hit attribute + argTys.push_back(pointerTy); + setShaderHitAttributeSize(m_entryPoint, rayTracingContext->getAttributeDataSizeInBytes()); + LLVM_FALLTHROUGH; // Fall through: Handle payload + case ShaderStageRayTracingMiss: + // Payload + argTys.push_back(pointerTy); + setShaderPaq(m_entryPoint, getPaqFromSize(*m_context, rayTracingContext->getPayloadSizeInBytes())); + break; + case ShaderStageRayTracingCallable: + // Callable data + argTys.push_back(pointerTy); + setShaderArgSize(m_entryPoint, rayTracingContext->getCallableDataSizeInBytes()); + break; + default: + llvm_unreachable("Should never be called"); + } + + assert(m_entryPoint->arg_empty()); + + auto newFuncTy = FunctionType::get(m_entryPoint->getReturnType(), argTys, false); + auto newFunc = Function::Create(newFuncTy, m_entryPoint->getLinkage(), "", m_module); + newFunc->takeName(m_entryPoint); + + CloneFunctionInto(newFunc, m_entryPoint, VMap, CloneFunctionChangeType::LocalChangesOnly, retInsts); + assert(m_entryPoint->use_empty()); + m_entryPoint->eraseFromParent(); + m_entryPoint = newFunc; +} + } // namespace Llpc diff --git a/llpc/lower/llpcSpirvLowerGlobal.h b/llpc/lower/llpcSpirvLowerGlobal.h index 6a56371ef9..bbdd162b4b 100644 --- a/llpc/lower/llpcSpirvLowerGlobal.h +++ b/llpc/lower/llpcSpirvLowerGlobal.h @@ -83,6 +83,8 @@ class SpirvLowerGlobal : public SpirvLower, public 
llvm::PassInfoMixingetPipelineContext()->getPipelineOptions()->enableImplicitInvariantExports; } // ===================================================================================================================== @@ -123,7 +125,7 @@ bool SpirvLowerMath::isOperandNoContract(Value *operand) { // Disable fast math for all values related with the specified value // // @param value : Value to disable fast math for -void SpirvLowerMath::disableFastMath(Value *value) { +static void disableFastMath(Value *value) { std::set allValues; std::list workSet; if (isa(value)) { @@ -181,6 +183,9 @@ bool SpirvLowerMathConstFolding::runImpl(Module &module, SpirvLowerMath::init(module); + if (m_shaderStage == ShaderStageInvalid) + return false; + if (m_fp16DenormFlush || m_fp32DenormFlush || m_fp64DenormFlush) { // Do constant folding if we need flush denorm to zero. auto &targetLibInfo = getTargetLibraryInfo(); @@ -235,6 +240,72 @@ Function *SpirvLowerMathConstFolding::getEntryPoint() { } #undef DEBUG_TYPE // DEBUG_TYPE_CONST_FOLDING +#define DEBUG_TYPE DEBUG_TYPE_PRECISION + +// ===================================================================================================================== +// Run precision (fast math flag) adjustment SPIR-V lowering pass on the specified LLVM module. +// +// @param [in/out] module : LLVM module to be run on (empty on entry) +// @param [in/out] analysisManager : Analysis manager to use for this transformation +PreservedAnalyses SpirvLowerMathPrecision::run(Module &module, ModuleAnalysisManager &analysisManager) { + if (runImpl(module)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} + +// ===================================================================================================================== +// Run precision (fast math flag) adjustment SPIR-V lowering pass on the specified LLVM module. 
+// +// @param [in/out] module : LLVM module to be run on +bool SpirvLowerMathPrecision::runImpl(Module &module) { + LLVM_DEBUG(dbgs() << "Run the pass Spirv-Lower-Math-Precision\n"); + + SpirvLower::init(&module); + if (m_shaderStage == ShaderStageInvalid) + return false; + + bool enableImplicitInvariantExports = + m_context->getPipelineContext()->getPipelineOptions()->enableImplicitInvariantExports; + if (!enableImplicitInvariantExports) + return false; + + bool changed = false; + for (auto &func : module.functions()) { + // Disable fast math for gl_Position. + // TODO: This requires knowledge of the Builder implementation, which is not ideal. + // We need to find a neater way to do it. + auto funcName = func.getName(); + bool isExport; + if (funcName.startswith("lgc.output.export.builtin.")) + isExport = true; + else if (funcName.startswith("lgc.create.write.builtin")) + isExport = false; + else + continue; + + for (User *user : func.users()) { + CallInst *callInst = cast(user); + unsigned builtIn; + Value *valueWritten; + if (isExport) { + builtIn = cast(callInst->getOperand(0))->getZExtValue(); + valueWritten = callInst->getOperand(callInst->arg_size() - 1); + } else { + builtIn = cast(callInst->getOperand(1))->getZExtValue(); + valueWritten = callInst->getOperand(0); + } + + if (valueWritten && builtIn == lgc::BuiltInPosition && enableImplicitInvariantExports) { + disableFastMath(valueWritten); + changed = true; + } + } + } + + return changed; +} + +#undef DEBUG_TYPE // DEBUG_TYPE_PRECISION #define DEBUG_TYPE DEBUG_TYPE_FLOAT_OP // ===================================================================================================================== @@ -380,22 +451,6 @@ void SpirvLowerMathFloatOp::visitCallInst(CallInst &callInst) { if (callee->isIntrinsic() && callee->getIntrinsicID() == Intrinsic::fabs) { // NOTE: FABS will be optimized by backend compiler with sign bit removed via AND. 
flushDenormIfNeeded(&callInst); - } else { - // Disable fast math for gl_Position. - // TODO: Having this here is not good, as it requires us to know implementation details of Builder. - // We need to find a neater way to do it. - auto calleeName = callee->getName(); - unsigned builtIn = InvalidValue; - Value *valueWritten = nullptr; - if (calleeName.startswith("lgc.output.export.builtin.")) { - builtIn = cast(callInst.getOperand(0))->getZExtValue(); - valueWritten = callInst.getOperand(callInst.arg_size() - 1); - } else if (calleeName.startswith("lgc.create.write.builtin")) { - builtIn = cast(callInst.getOperand(1))->getZExtValue(); - valueWritten = callInst.getOperand(0); - } - if (builtIn == lgc::BuiltInPosition && m_enableImplicitInvariantExports) - disableFastMath(valueWritten); } } diff --git a/llpc/lower/llpcSpirvLowerMath.h b/llpc/lower/llpcSpirvLowerMath.h index 53138c819c..98661cf739 100644 --- a/llpc/lower/llpcSpirvLowerMath.h +++ b/llpc/lower/llpcSpirvLowerMath.h @@ -48,15 +48,12 @@ class SpirvLowerMath : public SpirvLower { void flushDenormIfNeeded(llvm::Instruction *inst); bool isOperandNoContract(llvm::Value *operand); - void disableFastMath(llvm::Value *value); - - bool m_changed; // Whether the module is changed - bool m_fp16DenormFlush; // Whether FP mode wants f16 denorms to be flushed to zero - bool m_fp32DenormFlush; // Whether FP mode wants f32 denorms to be flushed to zero - bool m_fp64DenormFlush; // Whether FP mode wants f64 denorms to be flushed to zero - bool m_fp16RoundToZero; // Whether FP mode wants f16 round-to-zero - bool m_enableImplicitInvariantExports; // Whether fast math should be disabled - // for gl_Position exports + + bool m_changed; // Whether the module is changed + bool m_fp16DenormFlush; // Whether FP mode wants f16 denorms to be flushed to zero + bool m_fp32DenormFlush; // Whether FP mode wants f32 denorms to be flushed to zero + bool m_fp64DenormFlush; // Whether FP mode wants f64 denorms to be flushed to zero + bool 
m_fp16RoundToZero; // Whether FP mode wants f16 round-to-zero }; // ===================================================================================================================== @@ -78,6 +75,17 @@ class SpirvLowerMathConstFolding : public SpirvLowerMath, public llvm::PassInfoM llvm::Function *getEntryPoint(); }; +// ===================================================================================================================== +// SPIR-V lowering operations to adjust fast math flags. +class SpirvLowerMathPrecision : public SpirvLower, public llvm::PassInfoMixin { + +public: + llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); + bool runImpl(llvm::Module &module); + + static llvm::StringRef name() { return "Lower SPIR-V for precision (fast math flags)"; } +}; + // ===================================================================================================================== // SPIR-V lowering operations for math floating point optimisation. 
class SpirvLowerMathFloatOp : public SpirvLowerMath, diff --git a/llpc/lower/llpcSpirvLowerRayQuery.cpp b/llpc/lower/llpcSpirvLowerRayQuery.cpp index 5f452fb8d0..5d25b52442 100644 --- a/llpc/lower/llpcSpirvLowerRayQuery.cpp +++ b/llpc/lower/llpcSpirvLowerRayQuery.cpp @@ -1228,7 +1228,7 @@ Value *SpirvLowerRayQuery::createTransformMatrix(unsigned builtInId, Value *acce // Bitcast instanceNodeOffsetAddr to i64 integer instanceNodeOffsetAddr = m_builder->CreateBitCast(instanceNodeOffsetAddr, m_builder->getInt64Ty()); - Type *gpuAddrAsPtrTy = Type::getInt8PtrTy(*m_context, SPIRAS_Global); + Type *gpuAddrAsPtrTy = PointerType::get(*m_context, SPIRAS_Global); auto instNodeOffsetAddrAsPtr = m_builder->CreateIntToPtr(instanceNodeOffsetAddr, gpuAddrAsPtrTy); Value *baseInstOffset = m_builder->CreateGEP(m_builder->getInt8Ty(), instNodeOffsetAddrAsPtr, zero); Type *baseInstOffsetTy = m_builder->getInt32Ty()->getPointerTo(SPIRAS_Global); @@ -1296,7 +1296,7 @@ bool SpirvLowerRayQuery::stageNotSupportLds(ShaderStage stage) { // @param instNodeAddr : 64-bit instance node address, in <2 x i32> Value *SpirvLowerRayQuery::createLoadInstanceIndex(Value *instNodeAddr) { Value *zero = m_builder->getInt32(0); - Type *gpuAddrAsPtrTy = Type::getInt8PtrTy(*m_context, SPIRAS_Global); + Type *gpuAddrAsPtrTy = PointerType::get(*m_context, SPIRAS_Global); auto int32x2Ty = FixedVectorType::get(m_builder->getInt32Ty(), 2); const unsigned instanceIndexOffset = offsetof(RayTracingInstanceNode, extra.instanceIndex); @@ -1310,7 +1310,7 @@ Value *SpirvLowerRayQuery::createLoadInstanceIndex(Value *instNodeAddr) { instanceIndexAddr = m_builder->CreateBitCast(instanceIndexAddr, m_builder->getInt64Ty()); auto instanceIndexAddrAsPtr = m_builder->CreateIntToPtr(instanceIndexAddr, gpuAddrAsPtrTy); auto loadValue = m_builder->CreateGEP(m_builder->getInt8Ty(), instanceIndexAddrAsPtr, zero); - loadValue = m_builder->CreateBitCast(loadValue, Type::getInt32PtrTy(*m_context, SPIRAS_Global)); + loadValue = 
m_builder->CreateBitCast(loadValue, PointerType::get(*m_context, SPIRAS_Global)); return m_builder->CreateLoad(m_builder->getInt32Ty(), loadValue); } @@ -1353,7 +1353,7 @@ Value *SpirvLowerRayQuery::createGetInstanceNodeAddr(Value *instNodePtr, Value * // @param instNodeAddr : 64-bit instance node address, in <2 x i32> Value *SpirvLowerRayQuery::createLoadInstanceId(Value *instNodeAddr) { Value *zero = m_builder->getInt32(0); - Type *gpuAddrAsPtrTy = Type::getInt8PtrTy(*m_context, SPIRAS_Global); + Type *gpuAddrAsPtrTy = PointerType::get(*m_context, SPIRAS_Global); auto int32x2Ty = FixedVectorType::get(m_builder->getInt32Ty(), 2); const unsigned instanceIdOffset = offsetof(RayTracingInstanceNode, desc.InstanceID_and_Mask); @@ -1367,7 +1367,7 @@ Value *SpirvLowerRayQuery::createLoadInstanceId(Value *instNodeAddr) { instanceIdAddr = m_builder->CreateBitCast(instanceIdAddr, m_builder->getInt64Ty()); auto instanceIdAddrAsPtr = m_builder->CreateIntToPtr(instanceIdAddr, gpuAddrAsPtrTy); auto loadValue = m_builder->CreateGEP(m_builder->getInt8Ty(), instanceIdAddrAsPtr, zero); - loadValue = m_builder->CreateBitCast(loadValue, Type::getInt32PtrTy(*m_context, SPIRAS_Global)); + loadValue = m_builder->CreateBitCast(loadValue, PointerType::get(*m_context, SPIRAS_Global)); loadValue = m_builder->CreateLoad(m_builder->getInt32Ty(), loadValue); // Mask out the instance ID in lower 24 bits @@ -1382,7 +1382,7 @@ Value *SpirvLowerRayQuery::createLoadInstanceId(Value *instNodeAddr) { // @param matrixAddr : Matrix address, which type is <2 x i32> Value *SpirvLowerRayQuery::createLoadMatrixFromAddr(Value *matrixAddr) { Value *zero = m_builder->getInt32(0); - Type *gpuAddrAsPtrTy = Type::getInt8PtrTy(*m_context, SPIRAS_Global); + Type *gpuAddrAsPtrTy = PointerType::get(*m_context, SPIRAS_Global); // Bitcast matrixAddr to i64 integer matrixAddr = m_builder->CreateBitCast(matrixAddr, m_builder->getInt64Ty()); diff --git a/llpc/lower/llpcSpirvLowerRayTracing.cpp 
b/llpc/lower/llpcSpirvLowerRayTracing.cpp index 6adce10afd..cfa06fd2ab 100644 --- a/llpc/lower/llpcSpirvLowerRayTracing.cpp +++ b/llpc/lower/llpcSpirvLowerRayTracing.cpp @@ -32,6 +32,7 @@ #include "llpcSpirvLowerRayTracing.h" #include "SPIRVInternal.h" #include "gpurt-compiler.h" +#include "lgccps/LgcCpsDialect.h" #include "lgcrt/LgcRtDialect.h" #include "llpcContext.h" #include "llpcRayTracingContext.h" @@ -68,12 +69,10 @@ namespace RtName { const char *TraceRayKHR = "_cs_"; const char *TraceRaySetTraceParams = "TraceRaySetTraceParams"; const char *ShaderTable = "ShaderTable"; -static const char *HitAttribute = "HitAttribute"; -static const char *IncomingPayLoad = "IncomingRayPayloadKHR"; -static const char *IncomingCallableData = "IncomingCallableDataKHR"; static const char *CallAnyHitShader = "AmdTraceRayCallAnyHitShader"; static const char *FetchTrianglePositionFromNodePointer = "FetchTrianglePositionFromNodePointer"; static const char *RemapCapturedVaToReplayVa = "AmdTraceRayRemapCapturedVaToReplayVa"; +static const char *ContinufyStageMeta = "continufy.stage"; } // namespace RtName namespace Llpc { @@ -161,7 +160,6 @@ void SpirvLowerRayTracing::processTraceRayCall(BaseTraceRayOp *inst) { args.push_back(func->getArg(i)); Value *parentRayId = func->arg_end() - 2; - generateTraceRayStaticId(); // RayGen shaders are non-recursive, initialize parent ray ID to -1 here. 
if (m_shaderStage == ShaderStageRayTracingRayGen) @@ -179,11 +177,18 @@ void SpirvLowerRayTracing::processTraceRayCall(BaseTraceRayOp *inst) { // Create the indirect function call result = m_builder->CreateCall(funcTy, funcPtr, args); result->setCallingConv(CallingConv::SPIR_FUNC); + + unsigned lgcRtStage = ~0u; + result->setMetadata(RtName::ContinufyStageMeta, + MDNode::get(*m_context, ConstantAsMetadata::get(m_builder->getInt32(lgcRtStage)))); } else { result = m_builder->CreateNamedCall(RtName::TraceRayKHR, funcTy->getReturnType(), args, {Attribute::AlwaysInline}); } + // Restore parent ray ID after call + m_builder->CreateStore(currentParentRayId, parentRayId); + // Save the return value to the input payloads for memcpy of type conversion m_builder->CreateStore(result, localPayload); m_builder->CreateMemCpy(payloadArg, align, localPayload, align, payloadArgSize); @@ -257,6 +262,11 @@ void SpirvLowerRayTracing::visitCallCallableShaderOp(CallCallableShaderOp &inst) auto funcPtr = m_builder->CreateIntToPtr(shaderIdentifier, funcPtrTy); CallInst *result = m_builder->CreateCall(funcTy, funcPtr, args); result->setCallingConv(CallingConv::SPIR_FUNC); + + unsigned lgcRtStage = static_cast(mapStageToLgcRtShaderStage(ShaderStageRayTracingCallable)); + result->setMetadata(RtName::ContinufyStageMeta, + MDNode::get(*m_context, ConstantAsMetadata::get(m_builder->getInt32(lgcRtStage)))); + m_builder->CreateStore(result, inputResult); m_builder->CreateBr(endBlock); } else { @@ -544,9 +554,13 @@ PreservedAnalyses SpirvLowerRayTracing::run(Module &module, ModuleAnalysisManage } // Process traceRays module if (m_shaderStage == ShaderStageCompute) { - CallInst *call = createTraceRay(); inlineTraceRay(call, analysisManager); + + unsigned lgcRtStage = ~0u; + m_entryPoint->setMetadata(RtName::ContinufyStageMeta, + MDNode::get(*m_context, ConstantAsMetadata::get(m_builder->getInt32(lgcRtStage)))); + static auto visitor = llvm_dialects::VisitorBuilder() 
.setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) .add(&SpirvLowerRayTracing::visitGetHitAttributes) @@ -571,6 +585,10 @@ PreservedAnalyses SpirvLowerRayTracing::run(Module &module, ModuleAnalysisManage createDispatchRaysInfoDesc(); m_spirvOpMetaKindId = m_context->getMDKindID(MetaNameSpirvOp); + unsigned lgcRtStage = static_cast(mapStageToLgcRtShaderStage(m_shaderStage)); + m_entryPoint->setMetadata(RtName::ContinufyStageMeta, + MDNode::get(*m_context, ConstantAsMetadata::get(m_builder->getInt32(lgcRtStage)))); + if (m_shaderStage == ShaderStageRayTracingAnyHit || m_shaderStage == ShaderStageRayTracingClosestHit || m_shaderStage == ShaderStageRayTracingIntersect) { m_worldToObjMatrix = nullptr; @@ -578,13 +596,6 @@ PreservedAnalyses SpirvLowerRayTracing::run(Module &module, ModuleAnalysisManage m_insertPosPastInit = insertPos; - static auto allocaVisitor = llvm_dialects::VisitorBuilder() - .setStrategy(llvm_dialects::VisitorStrategy::ByInstruction) - .add(&SpirvLowerRayTracing::visitAlloca) - .build(); - - allocaVisitor.visit(*this, *m_entryPoint); - static auto visitor = llvm_dialects::VisitorBuilder() .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) .add(&SpirvLowerRayTracing::visitAcceptHitAndEndSearchOp) @@ -844,6 +855,11 @@ void SpirvLowerRayTracing::createCallShader(Function *func, ShaderStage stage, u auto funcPtr = m_builder->CreateIntToPtr(shaderId, funcPtrTy); CallInst *result = m_builder->CreateCall(funcTy, funcPtr, args); + + unsigned lgcRtStage = static_cast(mapStageToLgcRtShaderStage(stage)); + result->setMetadata(RtName::ContinufyStageMeta, + MDNode::get(*m_context, ConstantAsMetadata::get(m_builder->getInt32(lgcRtStage)))); + result->setCallingConv(CallingConv::SPIR_FUNC); storeFunctionCallResult(stage, result, traceParamsIt); m_builder->CreateBr(endBlock); @@ -1117,7 +1133,7 @@ Value *SpirvLowerRayTracing::getShaderIdentifier(ShaderStage stage, Value *shade offsetVal = m_builder->CreateAdd(offsetVal, 
m_builder->getInt32(offset)); // DWord_Load(TableAddr, offset) - Type *gpuAddrAsPtrTy = Type::getInt8PtrTy(*m_context, SPIRAS_Global); + Type *gpuAddrAsPtrTy = PointerType::get(*m_context, SPIRAS_Global); auto shaderIdentifierAsPtr = m_builder->CreateIntToPtr(tableAddrVal, gpuAddrAsPtrTy); Value *shaderIdentifier = m_builder->CreateGEP(m_builder->getInt8Ty(), shaderIdentifierAsPtr, offsetVal); auto loadPtrTy = m_builder->getInt64Ty()->getPointerTo(SPIRAS_Global); @@ -1254,6 +1270,16 @@ void SpirvLowerRayTracing::createRayGenEntryFunc() { m_builder->SetInsertPoint(mainBlock); auto rayGenId = getShaderIdentifier(m_shaderStage, m_builder->getInt32(0), m_dispatchRaysInfoDesc); auto rayTracingContext = static_cast(m_context->getPipelineContext()); + + if (rayTracingContext->getRaytracingMode() == Vkgc::LlpcRaytracingMode::Continuations) { + // Setup continuation stack pointer + auto offset = offsetof(GpuRt::DispatchRaysConstantData, cpsBackendStackSize); + auto gep = m_builder->CreateConstGEP1_32(m_builder->getInt8Ty(), m_dispatchRaysInfoDesc, offset); + Value *stackPtr = m_builder->CreateLoad(m_builder->getInt32Ty(), gep); + stackPtr = m_builder->CreateIntToPtr(stackPtr, PointerType::get(*m_context, lgc::cps::stackAddrSpace)); + m_builder->create(stackPtr); + } + bool indirect = rayTracingContext->getIndirectStageMask() & shaderStageToMask(m_shaderStage); if (!indirect) { // Create Shader selection @@ -1270,6 +1296,11 @@ void SpirvLowerRayTracing::createRayGenEntryFunc() { auto funcPtr = m_builder->CreateIntToPtr(rayGenId, funcPtrTy); CallInst *call = m_builder->CreateCall(funcTy, funcPtr, {}); call->setCallingConv(CallingConv::SPIR_FUNC); + + unsigned lgcRtStage = static_cast(mapStageToLgcRtShaderStage(ShaderStageRayTracingRayGen)); + call->setMetadata(RtName::ContinufyStageMeta, + MDNode::get(*m_context, ConstantAsMetadata::get(m_builder->getInt32(lgcRtStage)))); + m_builder->CreateBr(endBlock); } // Construct end block @@ -1568,9 +1599,9 @@ CallInst 
*SpirvLowerRayTracing::createTraceRay() { } m_builder->CreateStore(arg, traceRaysArgs[TraceRayLibFuncParam::TMax]); - // Parent ray ID and static ID if logging feature is enabled + // Parent ray ID and static ID for logging feature arg = ++argIt; - // ParentRayId is ignored for now + m_builder->CreateStore(arg, m_traceParams[TraceParam::ParentRayId]); arg = ++argIt; m_builder->create(arg); @@ -1751,6 +1782,16 @@ Instruction *SpirvLowerRayTracing::createEntryFunc(Function *func) { Function *newFunc = Function::Create(newFuncTy, GlobalValue::ExternalLinkage, m_module->getName(), m_module); newFunc->setCallingConv(CallingConv::SPIR_FUNC); + createTraceParams(func); + func->getArg(0)->replaceAllUsesWith(m_traceParams[TraceParam::Payload]); + setShaderPaq(newFunc, getShaderPaq(func)); + if (m_shaderStage != ShaderStageRayTracingMiss) { + assert((m_shaderStage == ShaderStageRayTracingIntersect) || (m_shaderStage == ShaderStageRayTracingAnyHit) || + (m_shaderStage == ShaderStageRayTracingClosestHit)); + func->getArg(1)->replaceAllUsesWith(m_traceParams[TraceParam::HitAttributes]); + setShaderHitAttributeSize(newFunc, getShaderHitAttributeSize(func)); + } + // Transfer code from old entry function to the new entry function while (!func->empty()) { BasicBlock *block = &func->front(); @@ -1765,7 +1806,6 @@ Instruction *SpirvLowerRayTracing::createEntryFunc(Function *func) { m_entryPoint = newFunc; m_entryPoint->addFnAttr(Attribute::NoUnwind); m_entryPoint->addFnAttr(Attribute::AlwaysInline); - createTraceParams(newFunc); Instruction *insertPos = &*(newFunc->begin()->getFirstNonPHIOrDbgOrAlloca()); m_builder->SetInsertPoint(insertPos); @@ -1851,7 +1891,7 @@ FunctionType *SpirvLowerRayTracing::getTraceRayFuncTy() { m_builder->getFloatTy(), // Ray Tmax }; - // Add parent ray ID and static ID if logging feature is enabled. + // Add parent ray ID and static ID for logging feature. 
argsTys.push_back(m_builder->getInt32Ty()); argsTys.push_back(m_builder->getInt32Ty()); @@ -1872,6 +1912,11 @@ Instruction *SpirvLowerRayTracing::createCallableShaderEntryFunc(Function *func) Function *newFunc = Function::Create(newFuncTy, GlobalValue::ExternalLinkage, m_module->getName(), m_module); newFunc->setCallingConv(CallingConv::C); + m_builder->SetInsertPointPastAllocas(func); + m_callableData = m_builder->CreateAlloca(newFunc->getReturnType()); + func->getArg(0)->replaceAllUsesWith(m_callableData); + setShaderArgSize(newFunc, getShaderArgSize(func)); + // Transfer code from old entry function to the new entry function while (!func->empty()) { BasicBlock *block = &func->front(); @@ -1894,7 +1939,6 @@ Instruction *SpirvLowerRayTracing::createCallableShaderEntryFunc(Function *func) // Save the function input parameter value to the global callable // the global payload here are needed for the recursive traceray function of the shader stage Value *callableData = argIt++; - m_callableData = m_builder->CreateAlloca(newFunc->getReturnType()); m_builder->CreateStore(callableData, m_callableData); // Save the shader record index @@ -2827,7 +2871,7 @@ void SpirvLowerRayTracing::visitShaderIndexOp(lgc::rt::ShaderIndexOp &inst) { } // ===================================================================================================================== -// Visits "lgc.rt.get.shader.record.buffer.ptr" instructions +// Visits "lgc.rt.shader.record.buffer" instructions // // @param inst : The instruction void SpirvLowerRayTracing::visitShaderRecordBufferOp(lgc::rt::ShaderRecordBufferOp &inst) { @@ -2893,28 +2937,6 @@ void SpirvLowerRayTracing::visitShaderRecordBufferOp(lgc::rt::ShaderRecordBuffer m_funcsToLower.insert(inst.getCalledFunction()); } -// ===================================================================================================================== -// Visit alloca instructions -// -// @param alloca : the instruction -void 
SpirvLowerRayTracing::visitAlloca(AllocaInst &inst) { - // This alloca should be in the entry block of the entry function. - assert(inst.getParent() == &m_entryPoint->getEntryBlock()); - auto allocaName = inst.getName(); - if (allocaName.contains(RtName::HitAttribute)) { - inst.replaceAllUsesWith(m_traceParams[TraceParam::HitAttributes]); - } else if (allocaName.contains(RtName::IncomingPayLoad)) { - m_builder->SetInsertPoint(&inst); - inst.replaceAllUsesWith(m_traceParams[TraceParam::Payload]); - } else if (allocaName.contains(RtName::IncomingCallableData)) { - inst.replaceAllUsesWith(m_callableData); - } else { - return; - } - - m_callsToLower.push_back(&inst); -} - // ===================================================================================================================== // Creates instructions to load instance node address Value *SpirvLowerRayTracing::createLoadInstNodeAddr() { @@ -2950,4 +2972,9 @@ llvm::Function *SpirvLowerRayTracing::createImplFunc(CallInst &inst, ArrayRefgetFunction(mangledName); } +lgc::rt::RayTracingShaderStage SpirvLowerRayTracing::mapStageToLgcRtShaderStage(ShaderStage stage) { + assert((stage >= ShaderStageRayTracingRayGen) && (stage <= ShaderStageRayTracingCallable)); + return static_cast(stage - ShaderStageRayTracingRayGen); +} + } // namespace Llpc diff --git a/llpc/lower/llpcSpirvLowerRayTracing.h b/llpc/lower/llpcSpirvLowerRayTracing.h index 0bcdfcd93c..e01e74d244 100644 --- a/llpc/lower/llpcSpirvLowerRayTracing.h +++ b/llpc/lower/llpcSpirvLowerRayTracing.h @@ -61,6 +61,7 @@ class PrimitiveIndexOp; class InstanceInclusionMaskOp; class ShaderIndexOp; class ShaderRecordBufferOp; +enum class RayTracingShaderStage; } // namespace lgc::rt namespace lgc { @@ -262,10 +263,10 @@ class SpirvLowerRayTracing : public SpirvLowerRayQuery { void visitShaderIndexOp(lgc::rt::ShaderIndexOp &inst); void visitShaderRecordBufferOp(lgc::rt::ShaderRecordBufferOp &inst); - void visitAlloca(llvm::AllocaInst &inst); - llvm::Value 
*createLoadInstNodeAddr(); + lgc::rt::RayTracingShaderStage mapStageToLgcRtShaderStage(ShaderStage stage); + llvm::Value *m_traceParams[TraceParam::Count]; // Trace ray set parameters llvm::Value *m_worldToObjMatrix = nullptr; // World to Object matrix llvm::AllocaInst *m_callableData = nullptr; // Callable data variable for current callable shader diff --git a/llpc/lower/llpcSpirvLowerRayTracingIntrinsics.cpp b/llpc/lower/llpcSpirvLowerRayTracingIntrinsics.cpp deleted file mode 100644 index 46e32a14a8..0000000000 --- a/llpc/lower/llpcSpirvLowerRayTracingIntrinsics.cpp +++ /dev/null @@ -1,184 +0,0 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2020-2023 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file llpcSpirvLowerRayTracingIntrinsics.cpp - * @brief LLPC source file: contains implementation of class Llpc::SpirvLowerRayTracingIntrinsics. - *********************************************************************************************************************** - */ - -#include "llpcSpirvLowerRayTracingIntrinsics.h" -#include "SPIRVInternal.h" -#include "llpcContext.h" -#include "llpcSpirvLowerUtil.h" -#include "lgc/Builder.h" - -#define DEBUG_TYPE "llpc-spirv-lower-ray-tracing-intrinsics" - -using namespace llvm; -using namespace Llpc; - -namespace RtName { -const char *LoadDwordAtAddr = "AmdExtD3DShaderIntrinsics_LoadDwordAtAddr"; -const char *LoadDwordAtAddrx2 = "AmdExtD3DShaderIntrinsics_LoadDwordAtAddrx2"; -const char *LoadDwordAtAddrx4 = "AmdExtD3DShaderIntrinsics_LoadDwordAtAddrx4"; -const char *ConvertF32toF16NegInf = "AmdExtD3DShaderIntrinsics_ConvertF32toF16NegInf"; -const char *ConvertF32toF16PosInf = "AmdExtD3DShaderIntrinsics_ConvertF32toF16PosInf"; -} // namespace RtName - -namespace Llpc { - -// ===================================================================================================================== -// Executes this SPIR-V lowering pass on the specified LLVM module. 
-// -// @param [in/out] module : LLVM module to be run on -// @param [in/out] analysisManager : Analysis manager to use for this transformation -PreservedAnalyses SpirvLowerRayTracingIntrinsics::run(Module &module, ModuleAnalysisManager &analysisManager) { - if (runImpl(module)) - return PreservedAnalyses::none(); - return PreservedAnalyses::all(); -} - -// ===================================================================================================================== -// Executes this SPIR-V lowering pass on the specified LLVM module. -// -// @param [in,out] module : LLVM module to be run on -bool SpirvLowerRayTracingIntrinsics::runImpl(Module &module) { - LLVM_DEBUG(dbgs() << "Run the pass Spirv-Lower-Ray-Tracing-Intrinsics\n"); - - SpirvLower::init(&module); - - bool changed = false; - - for (auto funcIt = module.begin(), funcEnd = module.end(); funcIt != funcEnd;) { - Function *func = &*funcIt++; - changed |= processIntrinsicsFunction(func); - } - - return changed; -} - -// ===================================================================================================================== -// Process intrinsics function in the module -// -// @param func : The function to process -bool SpirvLowerRayTracingIntrinsics::processIntrinsicsFunction(Function *func) { - bool changed = false; - auto mangledName = func->getName(); - if (mangledName.equals(RtName::LoadDwordAtAddr)) { - createLoadDwordAtAddr(func, m_builder->getInt32Ty()); - changed = true; - } else if (mangledName.equals(RtName::LoadDwordAtAddrx2)) { - auto int32x2Ty = FixedVectorType::get(Type::getInt32Ty(*m_context), 2); - createLoadDwordAtAddr(func, int32x2Ty); - changed = true; - } else if (mangledName.equals(RtName::LoadDwordAtAddrx4)) { - auto int32x4Ty = FixedVectorType::get(Type::getInt32Ty(*m_context), 4); - createLoadDwordAtAddr(func, int32x4Ty); - changed = true; - } else if (mangledName.equals(RtName::ConvertF32toF16NegInf)) { - // RM = fp::rmDownward; - createConvertF32toF16(func, 2); - 
changed = true; - } else if (mangledName.equals(RtName::ConvertF32toF16PosInf)) { - // RM = fp::rmUpward; - createConvertF32toF16(func, 3); - changed = true; - } - - // TODO: Add support for other intrinsics function if needed. - - return changed; -} - -// ===================================================================================================================== -// Create AmdExtD3DShaderIntrinsics_LoadDwordAtAddr, LoadDwordAtAddrx2, LoadDwordAtAddrx4, -// -// @param func : Function to create -// @param loadTy : Base type of the load value -void SpirvLowerRayTracingIntrinsics::createLoadDwordAtAddr(Function *func, Type *loadTy) { - assert(func->size() == 1); - (*func->begin()).eraseFromParent(); - - Type *loadPtrTy = loadTy->getPointerTo(SPIRAS_Global); - - BasicBlock *entryBlock = BasicBlock::Create(m_builder->getContext(), "", func); - m_builder->SetInsertPoint(entryBlock); - auto argIt = func->arg_begin(); - - Value *gpuLowAddr = m_builder->CreateLoad(m_builder->getInt32Ty(), argIt++); - Value *gpuHighAddr = m_builder->CreateLoad(m_builder->getInt32Ty(), argIt++); - Value *offset = m_builder->CreateLoad(m_builder->getInt32Ty(), argIt++); - - // Use (gpuLowAddr, gpuHighAddr) to calculate i64 gpuAddr - gpuLowAddr = m_builder->CreateZExt(gpuLowAddr, m_builder->getInt64Ty()); - gpuHighAddr = m_builder->CreateZExt(gpuHighAddr, m_builder->getInt64Ty()); - gpuHighAddr = m_builder->CreateShl(gpuHighAddr, m_builder->getInt64(32)); - Value *gpuAddr = m_builder->CreateOr(gpuLowAddr, gpuHighAddr); - - Type *gpuAddrAsPtrTy = Type::getInt8PtrTy(m_builder->getContext(), SPIRAS_Global); - auto gpuAddrAsPtr = m_builder->CreateIntToPtr(gpuAddr, gpuAddrAsPtrTy); - - // Create GEP to get the byte address with byte offset - Value *loadValue = m_builder->CreateGEP(m_builder->getInt8Ty(), gpuAddrAsPtr, offset); - // Cast to the return type pointer - loadValue = m_builder->CreateBitCast(loadValue, loadPtrTy); - - loadValue = m_builder->CreateLoad(loadTy, loadValue); - 
m_builder->CreateRet(loadValue); -} - -// ===================================================================================================================== -// Create AmdExtD3DShaderIntrinsics_ConvertF32toF16NegInf, AmdExtD3DShaderIntrinsics_ConvertF32toF16PosInf -// -// @param func : Function to create -// @param roundingMode : Rounding mode for the conversion -void SpirvLowerRayTracingIntrinsics::createConvertF32toF16(Function *func, unsigned roundingMode) { - // uint3 AmdExtD3DShaderIntrinsics_ConvertF32toF16NegInf/PosInf(in float3 inVec) - // { - // return uint3(f32tof16NegInf/PosInf(inVec)); - // } - - assert(func->size() == 1); - (*func->begin()).eraseFromParent(); - - BasicBlock *entryBlock = BasicBlock::Create(m_builder->getContext(), "", func); - m_builder->SetInsertPoint(entryBlock); - auto argIt = func->arg_begin(); - - Type *convertInputType = FixedVectorType::get(m_builder->getFloatTy(), 3); - Value *inVec = m_builder->CreateLoad(convertInputType, argIt); - // TODO: Backend currently does not support rounding mode correctly. LGC is also treating all rounding mode other than - // RTE as RTZ. We need RTN and RTP here. LGC needs a change after backend confirm the support of rounding mode. 
- Value *result = m_builder->CreateFpTruncWithRounding(inVec, FixedVectorType::get(m_builder->getHalfTy(), 3), - static_cast(roundingMode)); - - result = m_builder->CreateBitCast(result, FixedVectorType::get(m_builder->getInt16Ty(), 3)); - result = m_builder->CreateZExt(result, FixedVectorType::get(m_builder->getInt32Ty(), 3)); - - m_builder->CreateRet(result); -} - -} // namespace Llpc diff --git a/llpc/lower/llpcSpirvLowerRayTracingIntrinsics.h b/llpc/lower/llpcSpirvLowerRayTracingIntrinsics.h deleted file mode 100644 index 171b3ff3b0..0000000000 --- a/llpc/lower/llpcSpirvLowerRayTracingIntrinsics.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2020-2023 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - **********************************************************************************************************************/ -/** - *********************************************************************************************************************** - * @file llpcSpirvLowerRayTracingIntrinsics.h - * @brief LLPC header file: contains declaration of Llpc::SpirvLowerRayTracingIntrinsics - *********************************************************************************************************************** - */ - -#pragma once - -#include "SPIRVInternal.h" -#include "llpcSpirvLower.h" -#include "llvm/IR/PassManager.h" - -namespace Llpc { - -// ===================================================================================================================== -// Represents the pass of SPIR-V lowering ray tracing intrinsics. -class SpirvLowerRayTracingIntrinsics : public SpirvLower, public llvm::PassInfoMixin { -public: - llvm::PreservedAnalyses run(llvm::Module &module, llvm::ModuleAnalysisManager &analysisManager); - virtual bool runImpl(llvm::Module &module); - - static llvm::StringRef name() { return "Lower SPIR-V RayTracing intrinsics"; } - -protected: - void createLoadDwordAtAddr(llvm::Function *func, llvm::Type *loadTy); - void createConvertF32toF16(llvm::Function *func, unsigned roundingMode); - -private: - bool processIntrinsicsFunction(llvm::Function *func); -}; - -} // namespace Llpc diff --git a/llpc/lower/llpcSpirvProcessGpuRtLibrary.cpp b/llpc/lower/llpcSpirvProcessGpuRtLibrary.cpp index b4dceaa5b3..1f0ebaedaf 100644 --- a/llpc/lower/llpcSpirvProcessGpuRtLibrary.cpp +++ b/llpc/lower/llpcSpirvProcessGpuRtLibrary.cpp @@ -377,7 +377,6 @@ void SpirvProcessGpuRtLibrary::createIntersectBvh(Function *func) { // } auto argIt = func->arg_begin(); - Value *address = m_builder->CreateLoad(FixedVectorType::get(m_builder->getInt32Ty(), 2), argIt); argIt++; diff --git a/llpc/test/shaderdb/core/OpExtInst_NMinNMaxNaNFlags_lit.spvasm 
b/llpc/test/shaderdb/core/OpExtInst_NMinNMaxNaNFlags_lit.spvasm new file mode 100644 index 0000000000..962a317d71 --- /dev/null +++ b/llpc/test/shaderdb/core/OpExtInst_NMinNMaxNaNFlags_lit.spvasm @@ -0,0 +1,57 @@ +; BEGIN_SHADERTEST +; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s + +; Test nnan flags are removed when NMin or NMax are used. + +; SHADERTEST-LABEL: {{^// LLPC.*}} final pipeline module info +; SHADERTEST-NOT: nnan +; SHADERTEST: ret void + +; SHADERTEST: AMDLLPC SUCCESS +; END_SHADERTEST + + +; SPIR-V +; Version: 1.3 +; Schema: 0 + OpCapability Shader + %1 = OpExtInstImport "GLSL.std.450" + OpMemoryModel Logical GLSL450 + OpEntryPoint Vertex %MainVS "MainVS" %in_var_Position %out_var_Position + OpName %in_var_Position "in.var.Position" + OpName %out_var_Position "out.var.Position" + OpName %MainVS "MainVS" + OpDecorate %in_var_Position Location 0 + OpDecorate %out_var_Position Location 0 + %void = OpTypeVoid + %float = OpTypeFloat 32 + %float_0 = OpConstant %float 0 + %v4float = OpTypeVector %float 4 +%_ptr_Input_v4float = OpTypePointer Input %v4float +%_ptr_Output_v4float = OpTypePointer Output %v4float +%in_var_Position = OpVariable %_ptr_Input_v4float Input +%out_var_Position = OpVariable %_ptr_Output_v4float Output +%float_1_10000002 = OpConstant %float 1.10000002 +%float_0_400000006 = OpConstant %float 0.400000006 + %main_type = OpTypeFunction %void + %MainVS = OpFunction %void None %main_type + %entry = OpLabel + %c0 = OpExtInst %float %1 NMin %float_0_400000006 %float_1_10000002 + %c1 = OpExtInst %float %1 NMax %float_0_400000006 %float_1_10000002 + %v0 = OpFDiv %float %c0 %c1 + %pos = OpLoad %v4float %in_var_Position + %p0 = OpCompositeExtract %float %pos 0 + %p1 = OpCompositeExtract %float %pos 1 + %p2 = OpCompositeExtract %float %pos 2 + %p3 = OpCompositeExtract %float %pos 3 + %t0 = OpFAdd %float %p0 %p1 + %v1 = OpExtInst %float %1 NMin %t0 %p0 + %v2 = OpExtInst %float %1 NMax %t0 %p1 + %v3 = OpFAdd %float %p2 %p3 + %r0 
= OpCompositeInsert %v4float %v0 %pos 0 + %r1 = OpCompositeInsert %v4float %v1 %r0 1 + %r2 = OpCompositeInsert %v4float %v2 %r1 2 + %r3 = OpCompositeInsert %v4float %v3 %r2 3 + OpStore %out_var_Position %r3 + OpReturn + OpFunctionEnd diff --git a/llpc/test/shaderdb/core/OpIsInf_TestDouble_lit.frag b/llpc/test/shaderdb/core/OpIsInf_TestDouble_lit.frag index 89bf38bc3a..f261819242 100644 --- a/llpc/test/shaderdb/core/OpIsInf_TestDouble_lit.frag +++ b/llpc/test/shaderdb/core/OpIsInf_TestDouble_lit.frag @@ -14,11 +14,13 @@ void main() // BEGIN_SHADERTEST /* ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +; REQUIRES: do-not-run-me ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST: = call i1 (...) @lgc.create.isinf.i1(double ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -; SHADERTEST: call i1 @llvm.is.fpclass.f64(double %{{[0-9]*}}, i32 516) +; SHADERTEST: %[[FABS:[0-9]+]] = call double @llvm.fabs.f64(double %{{[0-9]*}}) +; SHADERTEST: = fcmp oeq double %[[FABS]], 0x7FF0000000000000 ; SHADERTEST: AMDLLPC SUCCESS */ diff --git a/llpc/test/shaderdb/core/OpIsInf_TestFloat_lit.frag b/llpc/test/shaderdb/core/OpIsInf_TestFloat_lit.frag index f9964fb666..1200e9bea1 100644 --- a/llpc/test/shaderdb/core/OpIsInf_TestFloat_lit.frag +++ b/llpc/test/shaderdb/core/OpIsInf_TestFloat_lit.frag @@ -14,11 +14,13 @@ void main() // BEGIN_SHADERTEST /* ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s +; REQUIRES: do-not-run-me ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST: = call i1 (...) 
@lgc.create.isinf.i1(float ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -; SHADERTEST: call i1 @llvm.is.fpclass.f32(float %{{[0-9]*}}, i32 516) +; SHADERTEST: %[[FABS:[0-9]+]] = call float @llvm.fabs.f32(float %{{[0-9]*}}) +; SHADERTEST: = fcmp oeq float %[[FABS]], 0x7FF0000000000000 ; SHADERTEST: AMDLLPC SUCCESS */ diff --git a/llpc/test/shaderdb/core/TestReverseThreadGroup.comp b/llpc/test/shaderdb/core/TestReverseThreadGroup.comp index e550a108d1..ee55b32f17 100644 --- a/llpc/test/shaderdb/core/TestReverseThreadGroup.comp +++ b/llpc/test/shaderdb/core/TestReverseThreadGroup.comp @@ -15,10 +15,13 @@ void main() // BEGIN_REVERSETEST // RUN: amdllpc -v %gfxip %s --reverse-thread-group=1 | FileCheck -check-prefix=REVERSETEST %s // REVERSETEST-LABEL: {{^// LLPC}} pipeline before-patching results -// There should be a call to get the gl_NumWorkGroups -// REVERSETEST: %{{[0-9]+}} = call ptr addrspace(4) @lgc.special.user.data.Workgroup(i32 268435462) -// There should be a call to get the internal buffer descriptor -// REVERSETEST: %{{[0-9]+}} = call ptr addrspace(4) @lgc.descriptor.table.addr(i32 10, i32 10, i64 4294967295, i32 7, i32 -1) +// There should be a calls to: +// - get the descriptor table containing the buffer descriptor +// - get the gl_NumWorkGroups +// - get the internal descriptor table +// REVERSETEST-DAG: %{{[0-9]+}} = call i32 @lgc.load.user.data.i32(i32 0) +// REVERSETEST-DAG: %{{[0-9]+}} = call ptr addrspace(4) @lgc.special.user.data.Workgroup(i32 268435462) +// REVERSETEST-DAG: %{{[0-9]+}} = call i32 @lgc.load.user.data.i32(i32 4) // There should be a select between the reversed thread group ID and original thread group ID // REVERSETEST: %{{[0-9]+}} = select i1 %{{[0-9]+}}, <3 x i32> %{{[0-9]+}}, <3 x i32> %{{[0-9]+}} // REVERSETEST: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/extensions/PipelineVsFs_ViewIndexWithMultiViewDisabled.pipe b/llpc/test/shaderdb/extensions/PipelineVsFs_ViewIndexWithMultiViewDisabled.pipe index 
6b36d48f66..5366b21879 100644 --- a/llpc/test/shaderdb/extensions/PipelineVsFs_ViewIndexWithMultiViewDisabled.pipe +++ b/llpc/test/shaderdb/extensions/PipelineVsFs_ViewIndexWithMultiViewDisabled.pipe @@ -70,7 +70,6 @@ entryPoint = main [ResourceMapping] -userDataNode[0].visibility = 1 userDataNode[0].type = DescriptorConstBufferCompact userDataNode[0].offsetInDwords = 2 userDataNode[0].sizeInDwords = 2 diff --git a/llpc/test/shaderdb/general/PipelineCs_MultipleRootInlineBuffer.pipe b/llpc/test/shaderdb/general/PipelineCs_MultipleRootInlineBuffer.pipe index 8d4e6f7cbe..051de1103a 100644 --- a/llpc/test/shaderdb/general/PipelineCs_MultipleRootInlineBuffer.pipe +++ b/llpc/test/shaderdb/general/PipelineCs_MultipleRootInlineBuffer.pipe @@ -10,11 +10,8 @@ ; SHADERTEST-LABEL: {{^//}} LLPC pipeline before-patching results ; SHADERTEST: define dllexport spir_func void @lgc.shader.CS.main() -; Get a pointer to the spill table -; SHADERTEST: [[spill_table1:%[0-9]*]] = call ptr addrspace(4) @lgc.spill.table() #2 - -; Get a pointer to the second inline buffer in the spill table. Offset 40 comes from the user data nodes. -; SHADERTEST: [[buf_addr1:%[0-9]*]] = getelementptr i8, ptr addrspace(4) [[spill_table1]], i32 40 +; Get a pointer to the second inline buffer. Offset 40 comes from the user data nodes +; SHADERTEST: [[buf_addr1:%[0-9]*]] = call ptr addrspace(4) @lgc.user.data(i32 40) ; SHADERTEST: [[buf1:%[0-9]*]] = ptrtoint ptr addrspace(4) [[buf_addr1]] to i64 ; Build the descriptor. The first two elements comes from the address of the buffer. @@ -32,8 +29,8 @@ ; Get the "fat pointer" for the buffer ; SHADERTEST: call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> [[desc1_3]]) -; Get a pointer to the spill table -; SHADERTEST: [[spill_table0:%[0-9]*]] = call ptr addrspace(4) @lgc.spill.table() #2 +; Get a pointer to the first inline buffer. 
Offset 4 comes from the user data nodes +; SHADERTEST: [[buf_addr0:%[0-9]*]] = call ptr addrspace(4) @lgc.user.data(i32 4) ; SHADERTEST: ret void ; SHADERTEST-LABEL: {{^//}} LLPC pipeline patching results @@ -110,23 +107,19 @@ version = 46 entryPoint = main [ResourceMapping] -userDataNode[0].visibility = 1 userDataNode[0].type = StreamOutTableVaPtr userDataNode[0].offsetInDwords = 0 userDataNode[0].sizeInDwords = 1 -userDataNode[1].visibility = 32 userDataNode[1].type = InlineBuffer userDataNode[1].offsetInDwords = 1 userDataNode[1].sizeInDwords = 9 userDataNode[1].set = 0 userDataNode[1].binding = 0 -userDataNode[2].visibility = 32 userDataNode[2].type = InlineBuffer userDataNode[2].offsetInDwords = 10 userDataNode[2].sizeInDwords = 9 userDataNode[2].set = 0 userDataNode[2].binding = 1 -userDataNode[3].visibility = 32 userDataNode[3].type = DescriptorTableVaPtr userDataNode[3].offsetInDwords = 10 userDataNode[3].sizeInDwords = 1 diff --git a/llpc/test/shaderdb/general/PipelineCs_TestMultiEntryPoint_lit.pipe b/llpc/test/shaderdb/general/PipelineCs_TestMultiEntryPoint_lit.pipe index f366859b30..b2d8041943 100644 --- a/llpc/test/shaderdb/general/PipelineCs_TestMultiEntryPoint_lit.pipe +++ b/llpc/test/shaderdb/general/PipelineCs_TestMultiEntryPoint_lit.pipe @@ -6,7 +6,7 @@ ; SHADERTEST: !llpc.compute.mode = !{![[COMPUTEMODE:[0-9]+]]} ; SHADERTEST: ![[COMPUTEMODE]] = !{i32 1, i32 1, i32 1} ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -; SHADERTEST: define {{.*}} void @_amdgpu_cs_main(i32 inreg %globalTable, i32 inreg %descTable0, <3 x i32> inreg %WorkgroupId, i32 inreg %MultiDispatchInfo, <3 x i32> %LocalInvocationId) +; SHADERTEST: define {{.*}} void @_amdgpu_cs_main(i32 inreg %globalTable, i32 inreg %userdata0, <3 x i32> inreg %WorkgroupId, i32 inreg %MultiDispatchInfo, <3 x i32> %LocalInvocationId) ; SHADERTEST: AMDLLPC SUCCESS ; END_SHADERTEST diff --git a/llpc/test/shaderdb/general/PipelineTcsTes_TestLocMapLoadGenericOutput.pipe 
b/llpc/test/shaderdb/general/PipelineTcsTes_TestLocMapLoadGenericOutput.pipe index d3a95d6f75..96e06f575f 100644 --- a/llpc/test/shaderdb/general/PipelineTcsTes_TestLocMapLoadGenericOutput.pipe +++ b/llpc/test/shaderdb/general/PipelineTcsTes_TestLocMapLoadGenericOutput.pipe @@ -1,7 +1,34 @@ ; BEGIN_SHADERTEST ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s -; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results +; SHADERTEST-LABEL: LLPC location input/output mapping results (TES shader) + +; SHADERTEST: (TES) Input: loc = 2, comp = 0 => Mapped = 0, 0 + +; SHADERTEST: (TES) Input (per-patch): loc = 3 => Mapped = 0 +; SHADERTEST: (TES) Input (per-patch): loc = 4 => Mapped = 1 +; SHADERTEST: (TES) Input (per-patch): loc = 5 => Mapped = 2 + +; SHADERTEST-LABEL: LLPC location count results (after input/output matching) + +; SHADERTEST: (TES) Input: loc count = 1 +; SHADERTEST: (TES) Output: loc count = 0 +; SHADERTEST: (TES) Input (per-patch): loc count = 3 + +; SHADERTEST-LABEL: LLPC location input/output mapping results (TCS shader) + +; SHADERTEST: (TCS) Output: loc = 1, comp = 0 => Mapped = 1, 0 +; SHADERTEST: (TCS) Output: loc = 2, comp = 0 => Mapped = 0, 0 + +; SHADERTEST: (TCS) Output (per-patch): loc = 3 => Mapped = 0 +; SHADERTEST: (TCS) Output (per-patch): loc = 4 => Mapped = 1 +; SHADERTEST: (TCS) Output (per-patch): loc = 5 => Mapped = 2 + +; SHADERTEST-LABEL: LLPC location count results (after input/output matching) + +; SHADERTEST: (TCS) Input: loc count = 0 +; SHADERTEST: (TCS) Output: loc count = 2 +; SHADERTEST: (TCS) Output (per-patch): loc count = 3 ; SHADERTEST: AMDLLPC SUCCESS ; END_SHADERTEST @@ -10,20 +37,24 @@ layout(vertices = 3) out; -layout(location = 0) out vec4 outColor[]; -layout(location = 3) out vec4 outData1[]; -layout(location = 4) out vec4 outData2[]; +layout(location = 0) out vec4 unused[]; +layout(location = 1) out vec4 importOut[]; +layout(location = 2) out vec4 outColor[]; +layout(location = 3) patch 
out vec4 patchDynIdx[3]; void main (void) { outColor[gl_InvocationID] = gl_in[gl_InvocationID].gl_Position; - outData1[gl_InvocationID] = vec4(6.0); - outData2[gl_InvocationID][2] += 3.0; + unused[gl_InvocationID] = vec4(6.0); + importOut[gl_InvocationID][1] += 3.0; gl_TessLevelInner[1] = 1.0; gl_TessLevelOuter[1] = 2.0; + + for (int i = 0; i < 3; ++i) + patchDynIdx[i] = vec4(float(i)); } [TcsInfo] @@ -34,13 +65,15 @@ entryPoint = main layout(triangles) in; -layout(location = 0) in vec4 inColor[]; +layout(location = 2) in vec4 inColor[]; +layout(location = 3) patch in vec4 inPatch[3]; + layout(location = 0) out vec4 outColor; void main() { outColor += gl_in[1].gl_Position; - outColor = inColor[0] + inColor[1] + inColor[2]; + outColor = inColor[0] + inColor[1] + inColor[2] + inPatch[1]; } [TesInfo] diff --git a/llpc/test/shaderdb/general/PipelineVsFs_DisableFMA.pipe b/llpc/test/shaderdb/general/PipelineVsFs_DisableFMA.pipe index ff4e95982c..10a91d7161 100644 --- a/llpc/test/shaderdb/general/PipelineVsFs_DisableFMA.pipe +++ b/llpc/test/shaderdb/general/PipelineVsFs_DisableFMA.pipe @@ -44,29 +44,24 @@ void main() entryPoint = main [ResourceMapping] -userDataNode[0].visibility = 1 userDataNode[0].type = StreamOutTableVaPtr userDataNode[0].offsetInDwords = 0 userDataNode[0].sizeInDwords = 1 -userDataNode[1].visibility = 17 userDataNode[1].type = DescriptorBuffer userDataNode[1].offsetInDwords = 1 userDataNode[1].sizeInDwords = 4 userDataNode[1].set = 0 userDataNode[1].binding = 0 -userDataNode[2].visibility = 17 userDataNode[2].type = DescriptorBuffer userDataNode[2].offsetInDwords = 5 userDataNode[2].sizeInDwords = 4 userDataNode[2].set = 1 userDataNode[2].binding = 0 -userDataNode[3].visibility = 17 userDataNode[3].type = DescriptorBuffer userDataNode[3].offsetInDwords = 9 userDataNode[3].sizeInDwords = 4 userDataNode[3].set = 1 userDataNode[3].binding = 1 -userDataNode[4].visibility = 1 userDataNode[4].type = IndirectUserDataVaPtr userDataNode[4].offsetInDwords = 13 
userDataNode[4].sizeInDwords = 1 diff --git a/llpc/test/shaderdb/general/PipelineVsFs_GlPositionFMF.pipe b/llpc/test/shaderdb/general/PipelineVsFs_GlPositionFMF.pipe new file mode 100644 index 0000000000..ac195358cc --- /dev/null +++ b/llpc/test/shaderdb/general/PipelineVsFs_GlPositionFMF.pipe @@ -0,0 +1,55 @@ +// Ensure that fast math flags are removed early enough to prevent +// instruction combine removing subtraction for gl_Position computation. + +; BEGIN_SHADERTEST +; RUN: amdllpc --gfxip=10.3.0 -v %s | FileCheck -check-prefix=SHADERTEST %s +; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results +; SHADERTEST: fsub float 1.000000e+00, %__llpc_input_proxy_in_Pos.0.vec.extract +; SHADERTEST-LABEL: _amdgpu_vs_main: +; SHADERTEST: v_sub_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +; END_SHADERTEST + +[Version] +version = 46 + +[VsGlsl] +#version 450 + +layout(location = 0) in vec4 in_Pos; +layout(location = 1) in vec4 in_Col; + +void main() +{ + float t = (((1.0 - in_Pos.x) * 0.23529411852359771728515625) * (clamp(in_Pos.y * 0.125, 0.25, 1.0) * in_Pos.z)) + in_Pos.w; + gl_Position = vec4(t, 0, 0, 1.0); +} + +[VsInfo] +entryPoint = main + +[FsGlsl] +#version 450 +layout(early_fragment_tests) in; + +layout(location = 0, component = 0) out vec4 _out; + +void main() +{ + _out = vec4(0.0, 1.0, 0.0, 1.0); +} + +[FsInfo] +entryPoint = main + +[GraphicsPipelineState] +colorBuffer[0].format = VK_FORMAT_B8G8R8A8_UNORM +colorBuffer[0].channelWriteMask = 15 + +[VertexInputState] +binding[0].binding = 0 +binding[0].stride = 2 +binding[0].inputRate = VK_VERTEX_INPUT_RATE_VERTEX +attribute[0].location = 0 +attribute[0].binding = 0 +attribute[0].format = VK_FORMAT_R8G8_SNORM +attribute[0].offset = 0 diff --git a/llpc/test/shaderdb/general/PipelineVsFs_MultiTableDescSet.pipe b/llpc/test/shaderdb/general/PipelineVsFs_MultiTableDescSet.pipe index a6bd5f6c0c..cb5b9cbe58 100644 --- a/llpc/test/shaderdb/general/PipelineVsFs_MultiTableDescSet.pipe +++ 
b/llpc/test/shaderdb/general/PipelineVsFs_MultiTableDescSet.pipe @@ -22,17 +22,17 @@ ; descriptors. The high half of the descriptor load should come from the PC. ; The low half should come from the user data node at offset 1 and 2. ; SHADERTEST-LABEL: _amdgpu_ps_main: -; SHADERTEST: s_getpc_b64 s{{\[}}[[PS_PC_LO:[0-9]*]]:[[PS_PC_HI:[0-9]*]]] -; SHADERTEST: s_mov_b32 s[[T1_ADDR_LO:[0-9]*]], s[[table1:[0-9]*]] -; SHADERTEST: s_mov_b32 s[[T2_ADDR_HI:[0-9]*]], s[[PS_PC_HI]] -; SHADERTEST: s_mov_b32 s[[T1_ADDR_HI:[0-9]*]], s[[PS_PC_HI]] -; SHADERTEST: s_load_dwordx8 s{{\[}}[[T1_DESC:[0-9]*]]:{{[0-9]*}}], s{{\[}}[[T1_ADDR_LO]]:[[T1_ADDR_HI]]], 0x0 -; SHADERTEST: s_load_dwordx4 s{{\[}}[[T2_DESC:[0-9]*]]:{{[0-9]*}}], s{{\[}}[[table2:[0-9]*]]:[[T2_ADDR_HI]]], 0x0 +; SHADERTEST: s_getpc_b64 s{{\[}}[[VS_PC_LO:[0-9]*]]:[[VS_PC_HI:[0-9]*]]] +; SHADERTEST: s_mov_b32 s0, s1 +; SHADERTEST: s_mov_b32 s1, s[[VS_PC_HI]] +; SHADERTEST: s_mov_b32 s3, s[[VS_PC_HI]] +; SHADERTEST: s_load_dwordx8 s{{\[}}[[T1_DESC:[0-9]*]]:{{[0-9]*}}], s[0:1], 0x0 +; SHADERTEST: s_load_dwordx4 s{{\[}}[[T2_DESC:[0-9]*]]:{{[0-9]*}}], s[2:3], 0x0 ; SHADERTEST: image_sample v[{{[0-9]*:[0-9]*}}], v[{{[0-9]*:[0-9]*}}], s{{\[}}[[T1_DESC]]:{{[0-9]*}}], s{{\[}}[[T2_DESC]]:{{[0-9]*}}] ; SHADERTEST-LABEL: PalMetadata ; SHADERTEST-LABEL: .registers: -; SHADERTEST: SPI_SHADER_USER_DATA_PS_[[table1]] 0x0000000000000002 -; SHADERTEST: SPI_SHADER_USER_DATA_PS_[[table2]] 0x0000000000000003 +; SHADERTEST: SPI_SHADER_USER_DATA_PS_1 0x0000000000000002 +; SHADERTEST: SPI_SHADER_USER_DATA_PS_2 0x0000000000000003 ; SHADERTEST: SPI_SHADER_USER_DATA_VS_[[table0]] 0x0000000000000001 ; END_SHADERTEST diff --git a/llpc/test/shaderdb/general/PipelineVsFs_TestExpWithRGB_UINT_PACK32.pipe b/llpc/test/shaderdb/general/PipelineVsFs_TestExpWithRGB_UINT_PACK32.pipe index 9f8918eebc..9053867ed7 100644 --- a/llpc/test/shaderdb/general/PipelineVsFs_TestExpWithRGB_UINT_PACK32.pipe +++ 
b/llpc/test/shaderdb/general/PipelineVsFs_TestExpWithRGB_UINT_PACK32.pipe @@ -102,29 +102,24 @@ options.unrollHintThreshold = 0 options.dontUnrollHintThreshold = 0 [ResourceMapping] -userDataNode[0].visibility = 1 userDataNode[0].type = StreamOutTableVaPtr userDataNode[0].offsetInDwords = 0 userDataNode[0].sizeInDwords = 1 -userDataNode[1].visibility = 17 userDataNode[1].type = DescriptorBuffer userDataNode[1].offsetInDwords = 1 userDataNode[1].sizeInDwords = 4 userDataNode[1].set = 0 userDataNode[1].binding = 0 -userDataNode[2].visibility = 17 userDataNode[2].type = DescriptorBuffer userDataNode[2].offsetInDwords = 5 userDataNode[2].sizeInDwords = 4 userDataNode[2].set = 1 userDataNode[2].binding = 0 -userDataNode[3].visibility = 17 userDataNode[3].type = DescriptorBuffer userDataNode[3].offsetInDwords = 9 userDataNode[3].sizeInDwords = 4 userDataNode[3].set = 1 userDataNode[3].binding = 1 -userDataNode[4].visibility = 1 userDataNode[4].type = IndirectUserDataVaPtr userDataNode[4].offsetInDwords = 13 userDataNode[4].sizeInDwords = 1 diff --git a/llpc/test/shaderdb/general/PipelineVsFs_TestIndirectResourceLayout.pipe b/llpc/test/shaderdb/general/PipelineVsFs_TestIndirectResourceLayout.pipe index 6cd4f4c9b2..519d70313e 100644 --- a/llpc/test/shaderdb/general/PipelineVsFs_TestIndirectResourceLayout.pipe +++ b/llpc/test/shaderdb/general/PipelineVsFs_TestIndirectResourceLayout.pipe @@ -8,13 +8,16 @@ ; SHADERTEST: call void (...) 
@lgc.create.write.generic.output(<4 x float> [[Value]], i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline before-patching results -; SHADERTEST: [[Desc:%[0-9]*]] = call ptr addrspace(4) @lgc.descriptor.table.addr(i32 9, i32 9, i64 4294967295, i32 0, i32 -1) +; SHADERTEST: [[DescLo:%[0-9]*]] = call i32 @lgc.load.user.data.i32(i32 4) +; SHADERTEST: [[DescVec:%[0-9]*]] = insertelement <2 x i32> %{{[^,]*}}, i32 [[DescLo]], i64 0 +; SHADERTEST: [[Desc64:%[0-9]*]] = bitcast <2 x i32> [[DescVec]] to i64 +; SHADERTEST: [[Desc:%[0-9]*]] = inttoptr i64 [[Desc64]] to ptr addrspace(4) ; SHADERTEST: [[Value:%[0-9]*]] = load <4 x float>, ptr addrspace(4) [[Desc]], align 16 ; SHADERTEST: call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[Value]]) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -; SHADERTEST: define dllexport amdgpu_ps void @_amdgpu_ps_main(i32 inreg %{{[^,]*}}, i32 inreg %descTable1, -; SHADERTEST: [[Addr0:%[0-9]*]] = zext i32 %descTable1 to i64 +; SHADERTEST: define dllexport amdgpu_ps void @_amdgpu_ps_main(i32 inreg %{{[^,]*}}, i32 inreg %userdata1, +; SHADERTEST: [[Addr0:%[0-9]*]] = zext i32 %userdata1 to i64 ; SHADERTEST: [[Addr1:%[0-9]*]] = or i64 %{{[0-9]*}}, [[Addr0]] ; SHADERTEST: [[Addr2:%[0-9]*]] = inttoptr i64 [[Addr1]] to ptr addrspace(4) ; SHADERTEST: [[Value:%[0-9]*]] = load <4 x float>, ptr addrspace(4) [[Addr2]], align 16 diff --git a/llpc/test/shaderdb/general/PipelineVsFs_TestNullFs.pipe b/llpc/test/shaderdb/general/PipelineVsFs_TestNullFs.pipe index d12c4090d3..ea04355641 100644 --- a/llpc/test/shaderdb/general/PipelineVsFs_TestNullFs.pipe +++ b/llpc/test/shaderdb/general/PipelineVsFs_TestNullFs.pipe @@ -19,7 +19,6 @@ dualSourceBlendEnable = 0 colorBuffer[0].format = VK_FORMAT_B8G8R8A8_UNORM colorBuffer[0].blendEnable = 0 colorBuffer[0].blendSrcAlphaToColor = 0 - ; CHECK-LABEL: amdgpu_vs_main: ; CHECK: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: exp pos0 v0, v0, v0, v0 done @@ 
-32,71 +31,247 @@ colorBuffer[0].blendSrcAlphaToColor = 0 ; CHECK-NEXT: --- ; CHECK-NEXT: amdpal.pipelines: ; CHECK-NEXT: - .api: Vulkan +; CHECK-NEXT: .graphics_registers: +; CHECK-NEXT: .aa_coverage_to_shader_select: InputCoverage +; CHECK-NEXT: .cb_shader_mask: +; CHECK-NEXT: .output0_enable: 0 +; CHECK-NEXT: .output1_enable: 0 +; CHECK-NEXT: .output2_enable: 0 +; CHECK-NEXT: .output3_enable: 0 +; CHECK-NEXT: .output4_enable: 0 +; CHECK-NEXT: .output5_enable: 0 +; CHECK-NEXT: .output6_enable: 0 +; CHECK-NEXT: .output7_enable: 0 +; CHECK-NEXT: .db_shader_control: +; CHECK-NEXT: .alpha_to_mask_disable: true +; CHECK-NEXT: .conservative_z_export: 0 +; CHECK-NEXT: .depth_before_shader: 0 +; CHECK-NEXT: .exec_on_hier_fail: false +; CHECK-NEXT: .exec_on_noop: false +; CHECK-NEXT: .kill_enable: false +; CHECK-NEXT: .mask_export_enable: false +; CHECK-NEXT: .pre_shader_depth_coverage_enable: 0 +; CHECK-NEXT: .stencil_test_val_export_enable: 0 +; CHECK-NEXT: .z_export_enable: 0 +; CHECK-NEXT: .z_order: 0x1 +; CHECK-NEXT: .ia_multi_vgt_param_piped: +; CHECK-NEXT: .primgroup_size: 0x7f +; CHECK-NEXT: .pa_cl_clip_cntl: +; CHECK-NEXT: .dx_linear_attr_clip_ena: true +; CHECK-NEXT: .rasterization_kill: false +; CHECK-NEXT: .vte_vport_provoke_disable: false +; CHECK-NEXT: .pa_cl_vte_cntl: +; CHECK-NEXT: .vtx_w0_fmt: true +; CHECK-NEXT: .x_offset_ena: true +; CHECK-NEXT: .x_scale_ena: true +; CHECK-NEXT: .y_offset_ena: true +; CHECK-NEXT: .y_scale_ena: true +; CHECK-NEXT: .z_offset_ena: true +; CHECK-NEXT: .z_scale_ena: true +; CHECK-NEXT: .pa_sc_shader_control: +; CHECK-NEXT: .wave_break_region_size: 0 +; CHECK-NEXT: .pa_su_vtx_cntl: +; CHECK-NEXT: .pix_center: 0x1 +; CHECK-NEXT: .quant_mode: 0x5 +; CHECK-NEXT: .round_mode: 0x2 +; CHECK-NEXT: .ps_extra_lds_size: 0 +; CHECK-NEXT: .ps_iter_sample: false +; CHECK-NEXT: .spi_baryc_cntl: +; CHECK-NEXT: .front_face_all_bits: true +; CHECK-NEXT: .pos_float_location: 0 +; CHECK-NEXT: .spi_ps_in_control: +; CHECK-NEXT: .num_interps: 0 
+; CHECK-NEXT: .num_prim_interp: 0 +; CHECK-NEXT: .ps_w32_en: false +; CHECK-NEXT: .spi_ps_input_addr: +; CHECK-NEXT: .ancillary_ena: false +; CHECK-NEXT: .front_face_ena: false +; CHECK-NEXT: .line_stipple_tex_ena: false +; CHECK-NEXT: .linear_center_ena: false +; CHECK-NEXT: .linear_centroid_ena: false +; CHECK-NEXT: .linear_sample_ena: false +; CHECK-NEXT: .persp_center_ena: false +; CHECK-NEXT: .persp_centroid_ena: false +; CHECK-NEXT: .persp_pull_model_ena: false +; CHECK-NEXT: .persp_sample_ena: true +; CHECK-NEXT: .pos_fixed_pt_ena: false +; CHECK-NEXT: .pos_w_float_ena: false +; CHECK-NEXT: .pos_x_float_ena: false +; CHECK-NEXT: .pos_y_float_ena: false +; CHECK-NEXT: .pos_z_float_ena: false +; CHECK-NEXT: .sample_coverage_ena: false +; CHECK-NEXT: .spi_ps_input_cntl: +; CHECK-NEXT: - .attr0_valid: 0 +; CHECK-NEXT: .attr1_valid: 0 +; CHECK-NEXT: .flat_shade: false +; CHECK-NEXT: .fp16_interp_mode: false +; CHECK-NEXT: .offset: 0 +; CHECK-NEXT: .prim_attr: false +; CHECK-NEXT: .pt_sprite_tex: false +; CHECK-NEXT: .spi_ps_input_ena: +; CHECK-NEXT: .ancillary_ena: false +; CHECK-NEXT: .front_face_ena: false +; CHECK-NEXT: .line_stipple_tex_ena: false +; CHECK-NEXT: .linear_center_ena: false +; CHECK-NEXT: .linear_centroid_ena: false +; CHECK-NEXT: .linear_sample_ena: false +; CHECK-NEXT: .persp_center_ena: false +; CHECK-NEXT: .persp_centroid_ena: false +; CHECK-NEXT: .persp_pull_model_ena: false +; CHECK-NEXT: .persp_sample_ena: true +; CHECK-NEXT: .pos_fixed_pt_ena: false +; CHECK-NEXT: .pos_w_float_ena: false +; CHECK-NEXT: .pos_x_float_ena: false +; CHECK-NEXT: .pos_y_float_ena: false +; CHECK-NEXT: .pos_z_float_ena: false +; CHECK-NEXT: .sample_coverage_ena: false +; CHECK-NEXT: .spi_shader_col_format: +; CHECK-NEXT: .col_0_export_format: 0 +; CHECK-NEXT: .col_1_export_format: 0 +; CHECK-NEXT: .col_2_export_format: 0 +; CHECK-NEXT: .col_3_export_format: 0 +; CHECK-NEXT: .col_4_export_format: 0 +; CHECK-NEXT: .col_5_export_format: 0 +; CHECK-NEXT: 
.col_6_export_format: 0 +; CHECK-NEXT: .col_7_export_format: 0 +; CHECK-NEXT: .spi_shader_pos_format: +; CHECK-NEXT: - 0x4 +; CHECK-NEXT: - 0 +; CHECK-NEXT: - 0 +; CHECK-NEXT: - 0 +; CHECK-NEXT: - 0 +; CHECK-NEXT: .spi_shader_z_format: 0 +; CHECK-NEXT: .spi_vs_out_config: +; CHECK-NEXT: .no_pc_export: true +; CHECK-NEXT: .vgt_reuse_off: false +; CHECK-NEXT: .vgt_shader_stages_en: +; CHECK-NEXT: .max_primgroup_in_wave: 0x2 +; CHECK-NEXT: .vs_stage_en: 0 +; CHECK-NEXT: .vs_w32_en: true +; CHECK-NEXT: .vgt_strmout_buffer_config: +; CHECK-NEXT: .stream_0_buffer_en: 0 +; CHECK-NEXT: .stream_1_buffer_en: 0 +; CHECK-NEXT: .stream_2_buffer_en: 0 +; CHECK-NEXT: .stream_3_buffer_en: 0 +; CHECK-NEXT: .vgt_strmout_config: +; CHECK-NEXT: .streamout_0_en: false +; CHECK-NEXT: .streamout_1_en: false +; CHECK-NEXT: .streamout_2_en: false +; CHECK-NEXT: .streamout_3_en: false +; CHECK-NEXT: .vs_so_base0_en: false +; CHECK-NEXT: .vs_so_base1_en: false +; CHECK-NEXT: .vs_so_base2_en: false +; CHECK-NEXT: .vs_so_base3_en: false +; CHECK-NEXT: .vs_streamout_en: false ; CHECK-NEXT: .hardware_stages: ; CHECK-NEXT: .ps: +; CHECK-NEXT: .checksum_value: 0 +; CHECK-NEXT: .debug_mode: false ; CHECK-NEXT: .entry_point: _amdgpu_ps_main +; CHECK-NEXT: .float_mode: 0xc0 +; CHECK-NEXT: .ieee_mode: false +; CHECK-NEXT: .mem_ordered: true +; CHECK-NEXT: .scratch_en: false ; CHECK-NEXT: .scratch_memory_size: 0 ; CHECK-NEXT: .sgpr_count: 0x2 ; CHECK-NEXT: .sgpr_limit: 0x6a +; CHECK-NEXT: .trap_present: 0 +; CHECK-NEXT: .user_data_reg_map: +; CHECK-NEXT: - 0x10000000 +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 
0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: .user_sgprs: 0x1 +; CHECK-NEXT: .uses_uavs: false ; CHECK-NEXT: .vgpr_count: 0x2 ; CHECK-NEXT: .vgpr_limit: 0x100 ; CHECK-NEXT: .wavefront_size: 0x40 +; CHECK-NEXT: .wgp_mode: false +; CHECK-NEXT: .writes_depth: 0 +; CHECK-NEXT: .writes_uavs: false ; CHECK-NEXT: .vs: +; CHECK-NEXT: .checksum_value: 0xba71f629 +; CHECK-NEXT: .debug_mode: false ; CHECK-NEXT: .entry_point: _amdgpu_vs_main +; CHECK-NEXT: .float_mode: 0xc0 +; CHECK-NEXT: .ieee_mode: false +; CHECK-NEXT: .mem_ordered: true +; CHECK-NEXT: .scratch_en: false ; CHECK-NEXT: .scratch_memory_size: 0 ; CHECK-NEXT: .sgpr_count: 0x3 ; CHECK-NEXT: .sgpr_limit: 0x6a +; CHECK-NEXT: .trap_present: 0 +; CHECK-NEXT: .user_data_reg_map: +; CHECK-NEXT: - 0x10000000 +; CHECK-NEXT: - 0x10000003 +; CHECK-NEXT: - 0x10000004 +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 
0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: .user_sgprs: 0x3 ; CHECK-NEXT: .vgpr_count: 0x4 ; CHECK-NEXT: .vgpr_limit: 0x100 ; CHECK-NEXT: .wavefront_size: 0x20 +; CHECK-NEXT: .wgp_mode: false ; CHECK-NEXT: .internal_pipeline_hash: ; CHECK-NEXT: - 0x{{[0-9a-f]+}} ; CHECK-NEXT: - 0x{{[0-9a-f]+}} -; CHECK-NEXT: .registers: -; CHECK-NEXT: 0x2c01: 0 -; CHECK-NEXT: 0x2c06 (SPI_SHADER_PGM_CHKSUM_PS): {{.*}} -; CHECK-NEXT: 0x2c0a (SPI_SHADER_PGM_RSRC1_PS): 0x22c0000 -; CHECK-NEXT: 0x2c0b (SPI_SHADER_PGM_RSRC2_PS): 0x2 -; CHECK-NEXT: 0x2c0c (SPI_SHADER_USER_DATA_PS_0): 0x10000000 -; CHECK-NEXT: 0x2c45 (SPI_SHADER_PGM_CHKSUM_VS): {{.*}} -; CHECK-NEXT: 0x2c4a (SPI_SHADER_PGM_RSRC1_VS): 0x82c0000 -; CHECK-NEXT: 0x2c4b (SPI_SHADER_PGM_RSRC2_VS): 0x6 -; CHECK-NEXT: 0x2c4c (SPI_SHADER_USER_DATA_VS_0): 0x10000000 -; CHECK-NEXT: 0x2c4d (SPI_SHADER_USER_DATA_VS_1): 0x10000003 -; CHECK-NEXT: 0x2c4e (SPI_SHADER_USER_DATA_VS_2): 0x10000004 -; CHECK-NEXT: 0xa08f (CB_SHADER_MASK): 0 -; CHECK-NEXT: 0xa191 (SPI_PS_INPUT_CNTL_0): 0 -; CHECK-NEXT: 0xa1b1 (SPI_VS_OUT_CONFIG): 0x80 -; CHECK-NEXT: 0xa1b3 (SPI_PS_INPUT_ENA): 0x1 -; CHECK-NEXT: 0xa1b4 (SPI_PS_INPUT_ADDR): 0x1 -; CHECK-NEXT: 0xa1b5 (SPI_INTERP_CONTROL_0): 0 -; CHECK-NEXT: 0xa1b6 (SPI_PS_IN_CONTROL): 0 -; CHECK-NEXT: 0xa1b8 (SPI_BARYC_CNTL): 0x1000000 -; CHECK-NEXT: 0xa1c3 (SPI_SHADER_POS_FORMAT): 0x4 -; CHECK-NEXT: 0xa1c4 (SPI_SHADER_Z_FORMAT): 0 -; CHECK-NEXT: 0xa1c5 (SPI_SHADER_COL_FORMAT): 0 -; CHECK-NEXT: 0xa203 (DB_SHADER_CONTROL): 0x810 -; CHECK-NEXT: 0xa204 (PA_CL_CLIP_CNTL): 0x1000000 -; CHECK-NEXT: 0xa206 (PA_CL_VTE_CNTL): 0x43f -; CHECK-NEXT: 0xa207 (PA_CL_VS_OUT_CNTL): 0 -; CHECK-NEXT: 0xa210 (PA_STEREO_CNTL): 0x2 -; CHECK-NEXT: 0xa291 (VGT_GS_ONCHIP_CNTL): 0 -; CHECK-NEXT: 0xa293 (PA_SC_MODE_CNTL_1): 0x602018c -; CHECK-NEXT: 0xa2a1 (VGT_PRIMITIVEID_EN): 0 -; CHECK-NEXT: 0xa2ad (VGT_REUSE_OFF): 0 -; CHECK-NEXT: 0xa2b5 (VGT_STRMOUT_VTX_STRIDE_0): 0 -; CHECK-NEXT: 0xa2b9 (VGT_STRMOUT_VTX_STRIDE_1): 0 -; 
CHECK-NEXT: 0xa2bd (VGT_STRMOUT_VTX_STRIDE_2): 0 -; CHECK-NEXT: 0xa2c1 (VGT_STRMOUT_VTX_STRIDE_3): 0 -; CHECK-NEXT: 0xa2d5 (VGT_SHADER_STAGES_EN): 0x810000 -; CHECK-NEXT: 0xa2e5 (VGT_STRMOUT_CONFIG): 0 -; CHECK-NEXT: 0xa2e6 (VGT_STRMOUT_BUFFER_CONFIG): 0 -; CHECK-NEXT: 0xa2f8 (PA_SC_AA_CONFIG): 0 -; CHECK-NEXT: 0xa2f9 (PA_SU_VTX_CNTL): 0x2d -; CHECK-NEXT: 0xa310 (PA_SC_SHADER_CONTROL): 0 -; CHECK-NEXT: 0xc258 (IA_MULTI_VGT_PARAM_PIPED): 0x7f -; CHECK-NEXT: 0xc25f (GE_STEREO_CNTL): 0 -; CHECK-NEXT: 0xc262 (GE_USER_VGPR_EN): 0 +; CHECK-NEXT: .num_interpolants: 0x1 +; CHECK-NEXT: .registers: {} ; CHECK-NEXT: .shaders: ; CHECK-NEXT: .pixel: ; CHECK-NEXT: .api_shader_hash: @@ -110,15 +285,20 @@ colorBuffer[0].blendSrcAlphaToColor = 0 ; CHECK-NEXT: - 0 ; CHECK-NEXT: .hardware_mapping: ; CHECK-NEXT: - .vs -; CHECK-NEXT: .spill_threshold: 0xffffffff +; CHECK-NEXT: .spill_threshold: 0xffff +; CHECK-NEXT: .streamout_vertex_strides: +; CHECK-NEXT: - 0 +; CHECK-NEXT: - 0 +; CHECK-NEXT: - 0 +; CHECK-NEXT: - 0 ; CHECK-NEXT: .type: VsPs -; CHECK-NEXT: .user_data_limit: 0 +; CHECK-NEXT: .user_data_limit: 0x1 ; CHECK-NEXT: .xgl_cache_info: ; CHECK-NEXT: .128_bit_cache_hash: ; CHECK-NEXT: - 0x{{[0-9a-f]+}} ; CHECK-NEXT: - 0x{{[0-9a-f]+}} ; CHECK-NEXT: .llpc_version: {{.*}} ; CHECK-NEXT: amdpal.version: -; CHECK-NEXT: - 0x2 -; CHECK-NEXT: - 0x6 +; CHECK-NEXT: - 0x3 +; CHECK-NEXT: - 0 ; CHECK-NEXT: ... 
diff --git a/llpc/test/shaderdb/general/PipelineVsFs_TestUberShader.pipe b/llpc/test/shaderdb/general/PipelineVsFs_TestUberShader.pipe index 5463e0ddee..bcba847b60 100644 --- a/llpc/test/shaderdb/general/PipelineVsFs_TestUberShader.pipe +++ b/llpc/test/shaderdb/general/PipelineVsFs_TestUberShader.pipe @@ -5,8 +5,10 @@ ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results ; Load input descriptor -; SHADERTEST: [[DESCPTR:%[0-9]*]] = bitcast <2 x i32> %rootDesc2 to i64 -; SHADERTEST: [[INTDESCPTR:%[0-9]*]] = inttoptr i64 [[DESCPTR]] to ptr addrspace(4) +; SHADERTEST: [[PTR0:%[.a-z0-9]+]] = insertelement <2 x i32> poison, i32 %userdata2, i64 0 +; SHADERTEST: [[PTR1:%[.a-z0-9]+]] = insertelement <2 x i32> [[PTR0]], i32 %userdata3, i64 1 +; SHADERTEST: [[PTR2:%[.a-z0-9]+]] = bitcast <2 x i32> [[PTR1]] to i64 +; SHADERTEST: [[INTDESCPTR:%[0-9]+]] = inttoptr i64 [[PTR2]] to ptr addrspace(4) ; SHADERTEST: [[UBERINFO:%[0-9]*]] = load <4 x i32>, ptr addrspace(4) [[INTDESCPTR]], align 16 ; Load vertex diff --git a/llpc/test/shaderdb/general/PipelineVsFs_TestVertexFetchWithR8G8.pipe b/llpc/test/shaderdb/general/PipelineVsFs_TestVertexFetchWithR8G8.pipe index 9582a103a2..4fc69f2523 100644 --- a/llpc/test/shaderdb/general/PipelineVsFs_TestVertexFetchWithR8G8.pipe +++ b/llpc/test/shaderdb/general/PipelineVsFs_TestVertexFetchWithR8G8.pipe @@ -103,29 +103,24 @@ options.unrollHintThreshold = 0 options.dontUnrollHintThreshold = 0 [ResourceMapping] -userDataNode[0].visibility = 1 userDataNode[0].type = StreamOutTableVaPtr userDataNode[0].offsetInDwords = 0 userDataNode[0].sizeInDwords = 1 -userDataNode[1].visibility = 17 userDataNode[1].type = DescriptorBuffer userDataNode[1].offsetInDwords = 1 userDataNode[1].sizeInDwords = 4 userDataNode[1].set = 0 userDataNode[1].binding = 0 -userDataNode[2].visibility = 17 userDataNode[2].type = DescriptorBuffer userDataNode[2].offsetInDwords = 5 userDataNode[2].sizeInDwords = 4 userDataNode[2].set = 1 userDataNode[2].binding = 0 
-userDataNode[3].visibility = 17 userDataNode[3].type = DescriptorBuffer userDataNode[3].offsetInDwords = 9 userDataNode[3].sizeInDwords = 4 userDataNode[3].set = 1 userDataNode[3].binding = 1 -userDataNode[4].visibility = 1 userDataNode[4].type = IndirectUserDataVaPtr userDataNode[4].offsetInDwords = 13 userDataNode[4].sizeInDwords = 1 diff --git a/llpc/test/shaderdb/general/TestConstantImmStore_FunctionInline.frag b/llpc/test/shaderdb/general/TestConstantImmStore_FunctionInline.frag new file mode 100644 index 0000000000..d420285de8 --- /dev/null +++ b/llpc/test/shaderdb/general/TestConstantImmStore_FunctionInline.frag @@ -0,0 +1,69 @@ +#version 460 +#extension GL_EXT_samplerless_texture_functions : require + +// BEGIN_SHADERTEST + +// RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s + +// SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results + +// SHADERTEST: @{{.*}} = internal addrspace(4) constant [16 x <2 x float>] [<2 x float> , <2 x float> , <2 x float> , <2 x float> , <2 x float> , <2 x float> , <2 x float> , <2 x float> , <2 x float> , <2 x float> , <2 x float> , <2 x float> , <2 x float> , <2 x float> , <2 x float> , <2 x float> ] +// SHADERTEST: @{{.*}} = internal addrspace(4) constant [8 x <2 x float>] [<2 x float> , <2 x float> , <2 x float> , <2 x float> , <2 x float> , <2 x float> , <2 x float> , <2 x float> ] +// SHADERTEST: @{{.*}} = internal addrspace(4) constant [4 x <2 x float>] [<2 x float> , <2 x float> , <2 x float> , <2 x float> ] +// SHADERTEST: @{{.*}} = internal addrspace(4) constant [2 x <2 x float>] [<2 x float> , <2 x float> ] + +// SHADERTEST: AMDLLPC SUCCESS + +// END_SHADERTEST + + +layout(location = 1) in vec4 i0; +layout(location = 0) out vec4 o0; + +vec4 getSamplePosition(int count, inout int index) +{ + vec2 _171[2] = vec2[](vec2(0.25), vec2(-0.25)); + vec2 _188[4] = vec2[](vec2(-0.125, -0.375), vec2(0.375, -0.125), vec2(-0.375, 0.125), vec2(0.125, 0.375)); + vec2 _213[8] = vec2[](vec2(0.0625, -0.1875), 
vec2(-0.0625, 0.1875), vec2(0.3125, 0.0625), vec2(-0.1875, -0.3125), vec2(-0.3125, 0.3125), vec2(-0.4375, -0.0625), vec2(0.1875, 0.4375), vec2(0.4375, -0.4375)); + vec2 _240[16] = vec2[](vec2(0.0625), vec2(-0.0625, -0.1875), vec2(-0.1875, 0.125), vec2(0.25, -0.0625), vec2(-0.3125, -0.125), vec2(0.125, 0.3125), vec2(0.3125, 0.1875), vec2(0.1875, -0.3125), vec2(-0.125, 0.375), vec2(0.0, -0.4375), vec2(-0.25, -0.375), vec2(-0.375, 0.25), vec2(-0.5, 0.0), vec2(0.4375, -0.25), vec2(0.375, 0.4375), vec2(-0.4375, -0.5)); + index = clamp(index, 0, count - 1); + vec2 pos; + switch (count) + { + case 2: + { + pos = _171[index]; + break; + } + case 4: + { + pos = _188[index]; + break; + } + case 8: + { + pos = _213[index]; + break; + } + case 16: + { + pos = _240[index]; + break; + } + default: + { + pos = vec2(0.0); + break; + } + } + return vec4(pos, 0.0, 0.0); +} + +void main() +{ + int count = int(uint(i0.x)); + int index = int(uint(gl_SampleID)); + vec4 samplePosition = getSamplePosition(count, index); + o0.xy = samplePosition.xy; +} + diff --git a/llpc/test/shaderdb/general/TestWorkgroupIdOpt.comp b/llpc/test/shaderdb/general/TestWorkgroupIdOpt.comp index 663dfc8011..e5f6107908 100644 --- a/llpc/test/shaderdb/general/TestWorkgroupIdOpt.comp +++ b/llpc/test/shaderdb/general/TestWorkgroupIdOpt.comp @@ -16,11 +16,11 @@ void main() test = gl_WorkGroupID.x; } // CHECK-LABEL: define {{[^@]+}}@_amdgpu_cs_main -// CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], i32 inreg [[DESCTABLE0:%.*]], i32 inreg [[WORKGROUPID1:%.*]], i32 inreg [[MULTIDISPATCHINFO:%.*]], <3 x i32> [[LOCALINVOCATIONID:%.*]]) #[[ATTR0:[0-9]+]] !lgc.shaderstage !5 { +// CHECK-SAME: (i32 inreg [[GLOBALTABLE:%.*]], i32 inreg [[USERDATA0:%.*]], i32 inreg [[WORKGROUPID1:%.*]], i32 inreg [[MULTIDISPATCHINFO:%.*]], <3 x i32> [[LOCALINVOCATIONID:%.*]]) #[[ATTR0:[0-9]+]] !lgc.shaderstage !5 { // CHECK-NEXT: .entry: // CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() // CHECK-NEXT: [[TMP1:%.*]] = and i64 
[[TMP0]], -4294967296 -// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[DESCTABLE0]] to i64 +// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[USERDATA0]] to i64 // CHECK-NEXT: [[TMP3:%.*]] = or i64 [[TMP1]], [[TMP2]] // CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr addrspace(4) // CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP4]], align 16 @@ -30,6 +30,5 @@ void main() //. // CHECK: attributes #[[ATTR0]] = { nounwind memory(readwrite) "amdgpu-flat-work-group-size"="256,256" "amdgpu-memory-bound"="false" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "amdgpu-work-group-info-arg-no"="3" "denormal-fp-math-f32"="preserve-sign" "target-features"=",+wavefrontsize64,+cumode,+enable-flat-scratch" } // CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind willreturn memory(none) } -// CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } // CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(write) } //. 
diff --git a/llpc/test/shaderdb/gfx10/PipelineVsFs_TestSubgroupSizeUsageFragment.pipe b/llpc/test/shaderdb/gfx10/PipelineVsFs_TestSubgroupSizeUsageFragment.pipe index 3d29866a4b..16b7924816 100644 --- a/llpc/test/shaderdb/gfx10/PipelineVsFs_TestSubgroupSizeUsageFragment.pipe +++ b/llpc/test/shaderdb/gfx10/PipelineVsFs_TestSubgroupSizeUsageFragment.pipe @@ -3,7 +3,10 @@ ; RUN: amdllpc -enable-part-pipeline=0 -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; RUN: amdllpc -enable-part-pipeline=1 -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: LLPC final ELF info -; SHADERTEST: VGT_SHADER_STAGES_EN 0x0000000000010000 +; SHADERTEST: .vgt_shader_stages_en: { +; SHADERTEST: .max_primgroup_in_wave: 0x0000000000000002 +; SHADERTEST: .vs_stage_en: 0x0000000000000000 +; SHADERTEST: .vs_w32_en: 0 } ; SHADERTEST: AMDLLPC SUCCESS [Version] diff --git a/llpc/test/shaderdb/gfx10/PipelineVsFs_TestSubgroupSizeUsageVertex.pipe b/llpc/test/shaderdb/gfx10/PipelineVsFs_TestSubgroupSizeUsageVertex.pipe index 5d02b9a74e..ca28c358df 100644 --- a/llpc/test/shaderdb/gfx10/PipelineVsFs_TestSubgroupSizeUsageVertex.pipe +++ b/llpc/test/shaderdb/gfx10/PipelineVsFs_TestSubgroupSizeUsageVertex.pipe @@ -3,7 +3,10 @@ ; RUN: amdllpc -enable-part-pipeline=0 -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; RUN: amdllpc -enable-part-pipeline=1 -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: LLPC final ELF info -; SHADERTEST: VGT_SHADER_STAGES_EN 0x0000000000010000 +; SHADERTEST: .vgt_shader_stages_en: { +; SHADERTEST: .max_primgroup_in_wave: 0x0000000000000002 +; SHADERTEST: .vs_stage_en: 0x0000000000000000 +; SHADERTEST: .vs_w32_en: 0 } ; SHADERTEST: AMDLLPC SUCCESS [Version] diff --git a/llpc/test/shaderdb/gfx10/PipelineVsFs_TestVsOutMiscSideBusEna.pipe b/llpc/test/shaderdb/gfx10/PipelineVsFs_TestVsOutMiscSideBusEna.pipe index 91d77a5372..c8e7dde2c7 100644 --- a/llpc/test/shaderdb/gfx10/PipelineVsFs_TestVsOutMiscSideBusEna.pipe +++ 
b/llpc/test/shaderdb/gfx10/PipelineVsFs_TestVsOutMiscSideBusEna.pipe @@ -1,9 +1,7 @@ -; Test that VS_OUT_MISC_SIDE_BUS_ENA (0x1000000) is set correctly. +; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --check-pal-metadata +; Test that VS_OUT_MISC_SIDE_BUS_ENA is set true correctly. -; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s -; SHADERTEST-LABEL: LLPC final ELF info -; SHADERTEST: PA_CL_VS_OUT_CNTL 0x0000000001400101 -; SHADERTEST: AMDLLPC SUCCESS +; RUN: amdllpc -gfxip=10.3 -o - -filetype=asm %s | FileCheck -check-prefix=SHADERTEST %s [VsGlsl] #version 450 @@ -25,4 +23,306 @@ void main() { } [FsInfo] entryPoint = main - +; SHADERTEST-LABEL: amdgpu_vs_main: +; SHADERTEST: v_mov_b32_e32 v0, 1.0 +; SHADERTEST-NEXT: exp pos0 v0, v0, v0, v0 +; SHADERTEST-NEXT: exp pos1 v0, v0, v0, v0 done +; SHADERTEST-NEXT: s_endpgm +; +; SHADERTEST-LABEL: amdgpu_ps_main: +; SHADERTEST: s_endpgm +; +; SHADERTEST-LABEL: .amdgpu_pal_metadata +; SHADERTEST-NEXT: --- +; SHADERTEST-NEXT: amdpal.pipelines: +; SHADERTEST-NEXT: - .api: Vulkan +; SHADERTEST-NEXT: .graphics_registers: +; SHADERTEST-NEXT: .aa_coverage_to_shader_select: InputCoverage +; SHADERTEST-NEXT: .cb_shader_mask: +; SHADERTEST-NEXT: .output0_enable: 0 +; SHADERTEST-NEXT: .output1_enable: 0 +; SHADERTEST-NEXT: .output2_enable: 0 +; SHADERTEST-NEXT: .output3_enable: 0 +; SHADERTEST-NEXT: .output4_enable: 0 +; SHADERTEST-NEXT: .output5_enable: 0 +; SHADERTEST-NEXT: .output6_enable: 0 +; SHADERTEST-NEXT: .output7_enable: 0 +; SHADERTEST-NEXT: .db_shader_control: +; SHADERTEST-NEXT: .alpha_to_mask_disable: true +; SHADERTEST-NEXT: .conservative_z_export: 0 +; SHADERTEST-NEXT: .depth_before_shader: 0 +; SHADERTEST-NEXT: .exec_on_hier_fail: false +; SHADERTEST-NEXT: .exec_on_noop: false +; SHADERTEST-NEXT: .kill_enable: false +; SHADERTEST-NEXT: .mask_export_enable: false +; SHADERTEST-NEXT: .pre_shader_depth_coverage_enable: 0 +; SHADERTEST-NEXT: 
.stencil_test_val_export_enable: 0 +; SHADERTEST-NEXT: .z_export_enable: 0 +; SHADERTEST-NEXT: .z_order: 0x1 +; SHADERTEST-NEXT: .ia_multi_vgt_param_piped: +; SHADERTEST-NEXT: .primgroup_size: 0x7f +; SHADERTEST-NEXT: .pa_cl_clip_cntl: +; SHADERTEST-NEXT: .dx_linear_attr_clip_ena: true +; SHADERTEST-NEXT: .rasterization_kill: false +; SHADERTEST-NEXT: .vs_out_misc_side_bus_ena: true +; SHADERTEST-NEXT: .vte_vport_provoke_disable: false +; SHADERTEST-NEXT: .pa_cl_vs_out_cntl: +; SHADERTEST-NEXT: .clip_dist_ena_0: true +; SHADERTEST-NEXT: .clip_dist_ena_1: false +; SHADERTEST-NEXT: .clip_dist_ena_2: false +; SHADERTEST-NEXT: .clip_dist_ena_3: false +; SHADERTEST-NEXT: .clip_dist_ena_4: false +; SHADERTEST-NEXT: .clip_dist_ena_5: false +; SHADERTEST-NEXT: .clip_dist_ena_6: false +; SHADERTEST-NEXT: .clip_dist_ena_7: false +; SHADERTEST-NEXT: .cull_dist_ena_0: true +; SHADERTEST-NEXT: .cull_dist_ena_1: false +; SHADERTEST-NEXT: .cull_dist_ena_2: false +; SHADERTEST-NEXT: .cull_dist_ena_3: false +; SHADERTEST-NEXT: .cull_dist_ena_4: false +; SHADERTEST-NEXT: .cull_dist_ena_5: false +; SHADERTEST-NEXT: .cull_dist_ena_6: false +; SHADERTEST-NEXT: .cull_dist_ena_7: false +; SHADERTEST-NEXT: .vs_out_cc_dist0_vec_ena: true +; SHADERTEST-NEXT: .pa_cl_vte_cntl: +; SHADERTEST-NEXT: .vtx_w0_fmt: true +; SHADERTEST-NEXT: .x_offset_ena: true +; SHADERTEST-NEXT: .x_scale_ena: true +; SHADERTEST-NEXT: .y_offset_ena: true +; SHADERTEST-NEXT: .y_scale_ena: true +; SHADERTEST-NEXT: .z_offset_ena: true +; SHADERTEST-NEXT: .z_scale_ena: true +; SHADERTEST-NEXT: .pa_sc_shader_control: +; SHADERTEST-NEXT: .wave_break_region_size: 0 +; SHADERTEST-NEXT: .pa_su_vtx_cntl: +; SHADERTEST-NEXT: .pix_center: 0x1 +; SHADERTEST-NEXT: .quant_mode: 0x5 +; SHADERTEST-NEXT: .round_mode: 0x2 +; SHADERTEST-NEXT: .ps_extra_lds_size: 0 +; SHADERTEST-NEXT: .ps_iter_sample: false +; SHADERTEST-NEXT: .spi_baryc_cntl: +; SHADERTEST-NEXT: .front_face_all_bits: true +; SHADERTEST-NEXT: .pos_float_location: 0 +; 
SHADERTEST-NEXT: .spi_ps_in_control: +; SHADERTEST-NEXT: .num_interps: 0 +; SHADERTEST-NEXT: .num_prim_interp: 0 +; SHADERTEST-NEXT: .ps_w32_en: false +; SHADERTEST-NEXT: .spi_ps_input_addr: +; SHADERTEST-NEXT: .ancillary_ena: false +; SHADERTEST-NEXT: .front_face_ena: false +; SHADERTEST-NEXT: .line_stipple_tex_ena: false +; SHADERTEST-NEXT: .linear_center_ena: false +; SHADERTEST-NEXT: .linear_centroid_ena: false +; SHADERTEST-NEXT: .linear_sample_ena: false +; SHADERTEST-NEXT: .persp_center_ena: false +; SHADERTEST-NEXT: .persp_centroid_ena: false +; SHADERTEST-NEXT: .persp_pull_model_ena: false +; SHADERTEST-NEXT: .persp_sample_ena: true +; SHADERTEST-NEXT: .pos_fixed_pt_ena: false +; SHADERTEST-NEXT: .pos_w_float_ena: false +; SHADERTEST-NEXT: .pos_x_float_ena: false +; SHADERTEST-NEXT: .pos_y_float_ena: false +; SHADERTEST-NEXT: .pos_z_float_ena: false +; SHADERTEST-NEXT: .sample_coverage_ena: false +; SHADERTEST-NEXT: .spi_ps_input_cntl: +; SHADERTEST-NEXT: - .attr0_valid: 0 +; SHADERTEST-NEXT: .attr1_valid: 0 +; SHADERTEST-NEXT: .flat_shade: false +; SHADERTEST-NEXT: .fp16_interp_mode: false +; SHADERTEST-NEXT: .offset: 0 +; SHADERTEST-NEXT: .prim_attr: false +; SHADERTEST-NEXT: .pt_sprite_tex: false +; SHADERTEST-NEXT: .spi_ps_input_ena: +; SHADERTEST-NEXT: .ancillary_ena: false +; SHADERTEST-NEXT: .front_face_ena: false +; SHADERTEST-NEXT: .line_stipple_tex_ena: false +; SHADERTEST-NEXT: .linear_center_ena: false +; SHADERTEST-NEXT: .linear_centroid_ena: false +; SHADERTEST-NEXT: .linear_sample_ena: false +; SHADERTEST-NEXT: .persp_center_ena: false +; SHADERTEST-NEXT: .persp_centroid_ena: false +; SHADERTEST-NEXT: .persp_pull_model_ena: false +; SHADERTEST-NEXT: .persp_sample_ena: true +; SHADERTEST-NEXT: .pos_fixed_pt_ena: false +; SHADERTEST-NEXT: .pos_w_float_ena: false +; SHADERTEST-NEXT: .pos_x_float_ena: false +; SHADERTEST-NEXT: .pos_y_float_ena: false +; SHADERTEST-NEXT: .pos_z_float_ena: false +; SHADERTEST-NEXT: .sample_coverage_ena: false +; 
SHADERTEST-NEXT: .spi_shader_col_format: +; SHADERTEST-NEXT: .col_0_export_format: 0 +; SHADERTEST-NEXT: .col_1_export_format: 0 +; SHADERTEST-NEXT: .col_2_export_format: 0 +; SHADERTEST-NEXT: .col_3_export_format: 0 +; SHADERTEST-NEXT: .col_4_export_format: 0 +; SHADERTEST-NEXT: .col_5_export_format: 0 +; SHADERTEST-NEXT: .col_6_export_format: 0 +; SHADERTEST-NEXT: .col_7_export_format: 0 +; SHADERTEST-NEXT: .spi_shader_pos_format: +; SHADERTEST-NEXT: - 0x4 +; SHADERTEST-NEXT: - 0x4 +; SHADERTEST-NEXT: - 0 +; SHADERTEST-NEXT: - 0 +; SHADERTEST-NEXT: - 0 +; SHADERTEST-NEXT: .spi_shader_z_format: 0 +; SHADERTEST-NEXT: .spi_vs_out_config: +; SHADERTEST-NEXT: .no_pc_export: true +; SHADERTEST-NEXT: .vgt_reuse_off: false +; SHADERTEST-NEXT: .vgt_shader_stages_en: +; SHADERTEST-NEXT: .max_primgroup_in_wave: 0x2 +; SHADERTEST-NEXT: .vs_stage_en: 0 +; SHADERTEST-NEXT: .vs_w32_en: true +; SHADERTEST-NEXT: .vgt_strmout_buffer_config: +; SHADERTEST-NEXT: .stream_0_buffer_en: 0 +; SHADERTEST-NEXT: .stream_1_buffer_en: 0 +; SHADERTEST-NEXT: .stream_2_buffer_en: 0 +; SHADERTEST-NEXT: .stream_3_buffer_en: 0 +; SHADERTEST-NEXT: .vgt_strmout_config: +; SHADERTEST-NEXT: .streamout_0_en: false +; SHADERTEST-NEXT: .streamout_1_en: false +; SHADERTEST-NEXT: .streamout_2_en: false +; SHADERTEST-NEXT: .streamout_3_en: false +; SHADERTEST-NEXT: .vs_so_base0_en: false +; SHADERTEST-NEXT: .vs_so_base1_en: false +; SHADERTEST-NEXT: .vs_so_base2_en: false +; SHADERTEST-NEXT: .vs_so_base3_en: false +; SHADERTEST-NEXT: .vs_streamout_en: false +; SHADERTEST-NEXT: .hardware_stages: +; SHADERTEST-NEXT: .ps: +; SHADERTEST-NEXT: .checksum_value: 0x4658ef51 +; SHADERTEST-NEXT: .debug_mode: false +; SHADERTEST-NEXT: .entry_point: _amdgpu_ps_main +; SHADERTEST-NEXT: .float_mode: 0xc0 +; SHADERTEST-NEXT: .ieee_mode: false +; SHADERTEST-NEXT: .mem_ordered: true +; SHADERTEST-NEXT: .scratch_en: false +; SHADERTEST-NEXT: .scratch_memory_size: 0 +; SHADERTEST-NEXT: .sgpr_count: 0x2 +; SHADERTEST-NEXT: 
.sgpr_limit: 0x6a +; SHADERTEST-NEXT: .trap_present: 0 +; SHADERTEST-NEXT: .user_data_reg_map: +; SHADERTEST-NEXT: - 0x10000000 +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: .user_sgprs: 0x1 +; SHADERTEST-NEXT: .uses_uavs: false +; SHADERTEST-NEXT: .vgpr_count: 0x2 +; SHADERTEST-NEXT: .vgpr_limit: 0x100 +; SHADERTEST-NEXT: .wavefront_size: 0x40 +; SHADERTEST-NEXT: .wgp_mode: false +; SHADERTEST-NEXT: .writes_depth: 0 +; SHADERTEST-NEXT: .writes_uavs: false +; SHADERTEST-NEXT: .vs: +; SHADERTEST-NEXT: .checksum_value: 0xd2536693 +; SHADERTEST-NEXT: .debug_mode: false +; SHADERTEST-NEXT: .entry_point: _amdgpu_vs_main +; SHADERTEST-NEXT: .float_mode: 0xc0 +; SHADERTEST-NEXT: .ieee_mode: false +; SHADERTEST-NEXT: .mem_ordered: true +; SHADERTEST-NEXT: .scratch_en: false +; SHADERTEST-NEXT: .scratch_memory_size: 0 +; SHADERTEST-NEXT: .sgpr_count: 0x3 +; SHADERTEST-NEXT: .sgpr_limit: 0x6a +; SHADERTEST-NEXT: .trap_present: 0 +; SHADERTEST-NEXT: .user_data_reg_map: +; SHADERTEST-NEXT: - 
0x10000000 +; SHADERTEST-NEXT: - 0x10000003 +; SHADERTEST-NEXT: - 0x10000004 +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: - 0xffffffff +; SHADERTEST-NEXT: .user_sgprs: 0x3 +; SHADERTEST-NEXT: .vgpr_count: 0x4 +; SHADERTEST-NEXT: .vgpr_limit: 0x100 +; SHADERTEST-NEXT: .wavefront_size: 0x20 +; SHADERTEST-NEXT: .wgp_mode: false +; SHADERTEST-NEXT: .internal_pipeline_hash: +; SHADERTEST-NEXT: - 0x{{[0-9a-f]+}} +; SHADERTEST-NEXT: - 0x{{[0-9a-f]+}} +; SHADERTEST-NEXT: .num_interpolants: 0x1 +; SHADERTEST-NEXT: .registers: {} +; SHADERTEST-NEXT: .shaders: +; SHADERTEST-NEXT: .pixel: +; SHADERTEST-NEXT: .api_shader_hash: +; SHADERTEST-NEXT: - 0x{{[0-9a-f]+}} +; SHADERTEST-NEXT: - 0 +; SHADERTEST-NEXT: .hardware_mapping: +; SHADERTEST-NEXT: - .ps +; SHADERTEST-NEXT: .vertex: +; SHADERTEST-NEXT: .api_shader_hash: +; SHADERTEST-NEXT: - 0x{{[0-9a-f]+}} +; SHADERTEST-NEXT: - 0 +; SHADERTEST-NEXT: .hardware_mapping: +; SHADERTEST-NEXT: - .vs +; SHADERTEST-NEXT: .spill_threshold: 0xffff +; SHADERTEST-NEXT: .streamout_vertex_strides: +; SHADERTEST-NEXT: - 0 +; SHADERTEST-NEXT: - 0 +; 
SHADERTEST-NEXT: - 0 +; SHADERTEST-NEXT: - 0 +; SHADERTEST-NEXT: .type: VsPs +; SHADERTEST-NEXT: .user_data_limit: 0x1 +; SHADERTEST-NEXT: .xgl_cache_info: +; SHADERTEST-NEXT: .128_bit_cache_hash: +; SHADERTEST-NEXT: - 0x{{[0-9a-f]+}} +; SHADERTEST-NEXT: - 0x{{[0-9a-f]+}} +; SHADERTEST-NEXT: .llpc_version: {{.*}} +; SHADERTEST-NEXT: amdpal.version: +; SHADERTEST-NEXT: - 0x3 +; SHADERTEST-NEXT: - 0 +; SHADERTEST-NEXT: ... diff --git a/llpc/test/shaderdb/gfx11/SgprUserDataInit_Cs.pipe b/llpc/test/shaderdb/gfx11/SgprUserDataInit_Cs.pipe index 0fece56fe9..98726c68a2 100644 --- a/llpc/test/shaderdb/gfx11/SgprUserDataInit_Cs.pipe +++ b/llpc/test/shaderdb/gfx11/SgprUserDataInit_Cs.pipe @@ -227,7 +227,7 @@ options.threadGroupSwizzleMode = Default ; CHECK-NEXT: - 0 ; CHECK-NEXT: .hardware_mapping: ; CHECK-NEXT: - .cs -; CHECK-NEXT: .spill_threshold: 0xffffffff +; CHECK-NEXT: .spill_threshold: 0xffff ; CHECK-NEXT: .type: Cs ; CHECK-NEXT: .user_data_limit: 0x1 ; CHECK-NEXT: .xgl_cache_info: diff --git a/llpc/test/shaderdb/gfx11/SgprUserDataInit_Fs.pipe b/llpc/test/shaderdb/gfx11/SgprUserDataInit_Fs.pipe index 6d70f8e106..af463020ca 100644 --- a/llpc/test/shaderdb/gfx11/SgprUserDataInit_Fs.pipe +++ b/llpc/test/shaderdb/gfx11/SgprUserDataInit_Fs.pipe @@ -417,7 +417,7 @@ colorBuffer[0].blendSrcAlphaToColor = 0 ; CHECK-NEXT: - 0 ; CHECK-NEXT: .hardware_mapping: ; CHECK-NEXT: - .gs -; CHECK-NEXT: .spill_threshold: 0xffffffff +; CHECK-NEXT: .spill_threshold: 0xffff ; CHECK-NEXT: .type: Ngg ; CHECK-NEXT: .user_data_limit: 0x13 ; CHECK-NEXT: .xgl_cache_info: diff --git a/llpc/test/shaderdb/gfx9/PipelineVsFs_TestAlpha2Coverage.pipe b/llpc/test/shaderdb/gfx9/PipelineVsFs_TestAlpha2Coverage.pipe index d1e9e92033..d8b488c281 100644 --- a/llpc/test/shaderdb/gfx9/PipelineVsFs_TestAlpha2Coverage.pipe +++ b/llpc/test/shaderdb/gfx9/PipelineVsFs_TestAlpha2Coverage.pipe @@ -3,7 +3,17 @@ ; RUN: amdllpc -enable-part-pipeline=0 -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; RUN: amdllpc 
-enable-part-pipeline=1 -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: LLPC final ELF info -; SHADERTEST: DB_SHADER_CONTROL 0x0000000000000010 +; SHADERTEST: .db_shader_control: { +; SHADERTEST: .alpha_to_mask_disable: 0 +; SHADERTEST: .conservative_z_export: 0x0000000000000000 +; SHADERTEST: .depth_before_shader: 0x0000000000000000 +; SHADERTEST: .exec_on_hier_fail: 0 +; SHADERTEST: .exec_on_noop: 0 +; SHADERTEST: .kill_enable: 0 +; SHADERTEST: .mask_export_enable: 0 +; SHADERTEST: .stencil_test_val_export_enable: 0x0000000000000000 +; SHADERTEST: .z_export_enable: 0x0000000000000000 +; SHADERTEST: .z_order: 0x0000000000000001 } ; SHADERTEST: AMDLLPC SUCCESS [Version] diff --git a/llpc/test/shaderdb/gfx9/PipelineVsFs_TestFetchSingleInput.pipe b/llpc/test/shaderdb/gfx9/PipelineVsFs_TestFetchSingleInput.pipe index 49aaddd5be..642213640a 100644 --- a/llpc/test/shaderdb/gfx9/PipelineVsFs_TestFetchSingleInput.pipe +++ b/llpc/test/shaderdb/gfx9/PipelineVsFs_TestFetchSingleInput.pipe @@ -5,7 +5,7 @@ ; Skip to the patching results for the fetch shader ; SHADERTEST-LABEL: LLPC pipeline patching results ; Check the inputs to the vertex shader. This should be all of the regular inputs. There is one vertex attribute being passed in: The vector at the end. -; SHADERTEST: define dllexport amdgpu_vs void @_amdgpu_vs_main_fetchless(i32 inreg %globalTable, i32 inreg %descTable0, i32 inreg %vertexBufferTable, i32 inreg %baseVertex, i32 inreg %baseInstance, i32 %VertexId, i32 %RelVertexId, i32 %PrimitiveId, i32 %InstanceId, <4 x float> %vertex0.0) +; SHADERTEST: define dllexport amdgpu_vs void @_amdgpu_vs_main_fetchless(i32 inreg %globalTable, i32 inreg %userdata0, i32 inreg %vertexBufferTable, i32 inreg %baseVertex, i32 inreg %baseInstance, i32 %VertexId, i32 %RelVertexId, i32 %PrimitiveId, i32 %InstanceId, <4 x float> %vertex0.0) ; SHADERTEST-LABEL: LGC glue shader results ; Check the inputs to the fetch shader. 
This should match the vertex shader except: ; - there are extra inreg inputs because its determination of how many SGPR inputs diff --git a/llpc/test/shaderdb/relocatable_shaders/PipelineCs_PipelineCacheHit.pipe b/llpc/test/shaderdb/relocatable_shaders/PipelineCs_PipelineCacheHit.pipe index ccbb0a76f2..c54e135f63 100644 --- a/llpc/test/shaderdb/relocatable_shaders/PipelineCs_PipelineCacheHit.pipe +++ b/llpc/test/shaderdb/relocatable_shaders/PipelineCs_PipelineCacheHit.pipe @@ -1,6 +1,7 @@ ; Test that there is a pipeline cache hit when the same pipeline is compiled twice. ; BEGIN_SHADERTEST ; RUN: amdllpc -shader-cache-mode=1 \ +; RUN: -cache-full-pipelines=true \ ; RUN: -enable-relocatable-shader-elf \ ; RUN: -o %t.elf %gfxip %s %s -v | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST: Cache miss for compute pipeline. diff --git a/llpc/test/shaderdb/relocatable_shaders/PipelineCs_ShaderCache.pipe b/llpc/test/shaderdb/relocatable_shaders/PipelineCs_ShaderCache.pipe deleted file mode 100644 index 1f882276ec..0000000000 --- a/llpc/test/shaderdb/relocatable_shaders/PipelineCs_ShaderCache.pipe +++ /dev/null @@ -1,97 +0,0 @@ -; This test case checks shader cache creation in the standalone compilation mode. -; BEGIN_SHADERTEST -; RUN: rm -rf %t_dir && \ -; RUN: mkdir -p %t_dir && \ -; RUN: amdllpc %gfxip \ -; RUN: -shader-cache-mode=2 \ -; RUN: -shader-cache-filename=cache.bin -shader-cache-file-dir=%t_dir \ -; RUN: -enable-relocatable-shader-elf \ -; RUN: -cache-full-pipelines=false \ -; RUN: -o %t.elf %s -v | FileCheck -check-prefix=CREATE %s -; REQUIRES: llpc_enable_shader_cache -; CREATE: Building pipeline with relocatable shader elf. -; CREATE: Cache miss for shader stage compute -; CREATE: Updating the cache for unlinked shader stage compute -; CREATE: ===== AMDLLPC SUCCESS ===== -; END_SHADERTEST - -; Check that the cache file exists under the expected location and is not empty. 
-; BEGIN_SHADERTEST -; RUN: ls -s %t_dir/AMD/LlpcCache/cache.bin | FileCheck -check-prefix=SIZE %s -; REQUIRES: llpc_enable_shader_cache -; SIZE: {{[1-9][0-9]*}} {{.*}}cache.bin -; END_SHADERTEST - -; Now, attempt to load the shader cache from the previous run in the read-only mode. -; No new cache entries should be added. -; BEGIN_SHADERTEST -; RUN: amdllpc %gfxip \ -; RUN: -shader-cache-mode=4 \ -; RUN: -shader-cache-filename=cache.bin -shader-cache-file-dir=%t_dir \ -; RUN: -enable-relocatable-shader-elf \ -; RUN: -cache-full-pipelines=false \ -; RUN: -o %t.elf %s -v | FileCheck -check-prefix=LOAD %s -; REQUIRES: llpc_enable_shader_cache -; LOAD: Building pipeline with relocatable shader elf. -; LOAD: Cache hit for shader stage compute -; LOAD-NOT: Updating the cache for shader stage compute -; LOAD: ===== AMDLLPC SUCCESS ===== -; END_SHADERTEST - -; Lastly, attempt to load the shader cache from the previous run in the read/write mode and modify it. -; Add a dummy compilation flag to force a different shader hash and a cache miss. -; BEGIN_SHADERTEST -; RUN: amdllpc %gfxip \ -; RUN: -vgpr-limit=64 \ -; RUN: -shader-cache-mode=3 \ -; RUN: -shader-cache-filename=cache.bin -shader-cache-file-dir=%t_dir \ -; RUN: -enable-relocatable-shader-elf \ -; RUN: -cache-full-pipelines=false \ -; RUN: -o %t.elf %s -v | FileCheck -check-prefix=MODIFY %s -; REQUIRES: llpc_enable_shader_cache -; MODIFY: Building pipeline with relocatable shader elf. 
-; MODIFY: Cache miss for shader stage compute -; MODIFY: Updating the cache for unlinked shader stage compute -; MODIFY: ===== AMDLLPC SUCCESS ===== -; END_SHADERTEST - - -[CsGlsl] -#version 450 -#extension GL_ARB_separate_shader_objects : enable - -layout(binding = 0) uniform UniformBufferObject { - vec4 i; -} ubo; - -layout(set = 1, binding = 0, std430) buffer OUT -{ - vec4 o; -}; - -layout(local_size_x = 2, local_size_y = 3) in; -void main() { - o = ubo.i; -} - - -[CsInfo] -entryPoint = main -userDataNode[0].type = DescriptorTableVaPtr -userDataNode[0].offsetInDwords = 0 -userDataNode[0].sizeInDwords = 1 -userDataNode[0].set = 0 -userDataNode[0].next[0].type = DescriptorBuffer -userDataNode[0].next[0].offsetInDwords = 4 -userDataNode[0].next[0].sizeInDwords = 8 -userDataNode[0].next[0].set = 0 -userDataNode[0].next[0].binding = 0 -userDataNode[1].type = DescriptorTableVaPtr -userDataNode[1].offsetInDwords = 1 -userDataNode[1].sizeInDwords = 1 -userDataNode[1].set = 1 -userDataNode[1].next[0].type = DescriptorBuffer -userDataNode[1].next[0].offsetInDwords = 4 -userDataNode[1].next[0].sizeInDwords = 8 -userDataNode[1].next[0].set = 1 -userDataNode[1].next[0].binding = 0 diff --git a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_CheckFloatModeFlushToZero.pipe b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_CheckFloatModeFlushToZero.pipe index 908fed7bfd..4413aad6d4 100644 --- a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_CheckFloatModeFlushToZero.pipe +++ b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_CheckFloatModeFlushToZero.pipe @@ -4,7 +4,9 @@ ; RUN: amdllpc -o %t.elf %gfxip %s -v | FileCheck -check-prefix=SHADERTEST %s ; RUN: amdllpc -enable-relocatable-shader-elf -o %t.elf %gfxip %s -v | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: PalMetadata -; SHADERTEST: SPI_SHADER_PGM_RSRC1_PS 0x00000000002C +; SHADERTEST: .hardware_stages: { +; SHADERTEST: .ps: { +; SHADERTEST: .float_mode: 0x00000000000000C0 ; END_SHADERTEST 
[Version] diff --git a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_CheckFloatModePreserve.pipe b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_CheckFloatModePreserve.pipe index 1be9d480a9..dd01a246f3 100644 --- a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_CheckFloatModePreserve.pipe +++ b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_CheckFloatModePreserve.pipe @@ -4,7 +4,9 @@ ; RUN: amdllpc -o %t.elf %gfxip %s -v | FileCheck -check-prefix=SHADERTEST %s ; RUN: amdllpc -enable-relocatable-shader-elf -o %t.elf %gfxip %s -v | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: PalMetadata -; SHADERTEST: SPI_SHADER_PGM_RSRC1_PS 0x00000000002F +; SHADERTEST: .hardware_stages: { +; SHADERTEST: .ps: { +; SHADERTEST: .float_mode: 0x00000000000000F0 ; END_SHADERTEST [Version] diff --git a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_EnableColorExport.pipe b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_EnableColorExport.pipe index d410d45e6a..d74c34412b 100644 --- a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_EnableColorExport.pipe +++ b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_EnableColorExport.pipe @@ -36,7 +36,6 @@ void main() entryPoint = main [ResourceMapping] -userDataNode[0].visibility = 17 userDataNode[0].type = DescriptorTableVaPtr userDataNode[0].offsetInDwords = 11 userDataNode[0].sizeInDwords = 1 @@ -50,7 +49,6 @@ userDataNode[0].next[1].offsetInDwords = 0 userDataNode[0].next[1].sizeInDwords = 8 userDataNode[0].next[1].set = 0 userDataNode[0].next[1].binding = 1 -userDataNode[1].visibility = 2 userDataNode[1].type = IndirectUserDataVaPtr userDataNode[1].offsetInDwords = 12 userDataNode[1].sizeInDwords = 1 @@ -79,51 +77,51 @@ attribute[0].offset = 0 ; SHADERTEST-NEXT: [[PERSPINTERPCENTER_I1:%.*]] = extractelement <2 x float> [[PERSPINTERPCENTER:%.*]], i64 1 ; SHADERTEST-NEXT: [[PERSPINTERPCENTER_I0:%.*]] = extractelement <2 x float> [[PERSPINTERPCENTER]], i64 0 ; SHADERTEST-NEXT: [[TMP11:%.*]] = 
call i64 @llvm.amdgcn.s.getpc() +; SHADERTEST-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.interp.p1(float [[PERSPINTERPCENTER_I0]], i32 immarg 0, i32 immarg 0, i32 [[PRIMMASK:%.*]]) +; SHADERTEST-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.interp.p2(float [[TMP16]], float [[PERSPINTERPCENTER_I1]], i32 immarg 0, i32 immarg 0, i32 [[PRIMMASK]]) +; SHADERTEST-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.interp.p1(float [[PERSPINTERPCENTER_I0]], i32 immarg 1, i32 immarg 0, i32 [[PRIMMASK]]) +; SHADERTEST-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.interp.p2(float [[TMP18]], float [[PERSPINTERPCENTER_I1]], i32 immarg 1, i32 immarg 0, i32 [[PRIMMASK]]) +; SHADERTEST-NEXT: [[DOTI0:%.*]] = fptosi float [[TMP17]] to i32 +; SHADERTEST-NEXT: [[DOTI1:%.*]] = fptosi float [[TMP19]] to i32 ; SHADERTEST-NEXT: [[TMP12:%.*]] = and i64 [[TMP11]], -4294967296 ; SHADERTEST-NEXT: [[TMP13:%.*]] = zext i32 [[DESCTABLE0:%.*]] to i64 ; SHADERTEST-NEXT: [[TMP14:%.*]] = or i64 [[TMP12]], [[TMP13]] ; SHADERTEST-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr addrspace(4) -; SHADERTEST-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.interp.p1(float [[PERSPINTERPCENTER_I0]], i32 immarg 0, i32 immarg 0, i32 [[PRIMMASK:%.*]]) #[[ATTR2:[0-9]+]] -; SHADERTEST-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.interp.p2(float [[TMP16]], float [[PERSPINTERPCENTER_I1]], i32 immarg 0, i32 immarg 0, i32 [[PRIMMASK]]) #[[ATTR2]] -; SHADERTEST-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.interp.p1(float [[PERSPINTERPCENTER_I0]], i32 immarg 1, i32 immarg 0, i32 [[PRIMMASK]]) #[[ATTR2]] -; SHADERTEST-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.interp.p2(float [[TMP18]], float [[PERSPINTERPCENTER_I1]], i32 immarg 1, i32 immarg 0, i32 [[PRIMMASK]]) #[[ATTR2]] -; SHADERTEST-NEXT: [[DOTI0:%.*]] = fptosi float [[TMP17]] to i32 -; SHADERTEST-NEXT: [[DOTI1:%.*]] = fptosi float [[TMP19]] to i32 ; SHADERTEST-NEXT: [[TMP20:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP15]], align 32, !invariant.load !10 ; 
SHADERTEST-NEXT: [[TMP21:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP15]], align 16, !invariant.load !10 ; SHADERTEST-NEXT: [[DOTI01:%.*]] = sitofp i32 [[DOTI0]] to float ; SHADERTEST-NEXT: [[DOTI12:%.*]] = sitofp i32 [[DOTI1]] to float -; SHADERTEST-NEXT: [[TMP22:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float [[DOTI01]], float [[DOTI12]], <8 x i32> [[TMP20]], <4 x i32> [[TMP21]], i1 false, i32 0, i32 0) -; SHADERTEST-NEXT: [[TMP23:%.*]] = insertvalue { <4 x float> } poison, <4 x float> [[TMP22]], 0 -; SHADERTEST-NEXT: [[TMP24:%.*]] = zext i32 [[COLOREXPADDR:%.*]] to i64 -; SHADERTEST-NEXT: [[TMP25:%.*]] = or i64 [[TMP12]], [[TMP24]] -; SHADERTEST-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP25]] to ptr addrspace(4) -; SHADERTEST-NEXT: call amdgpu_gfx addrspace(4) void [[TMP26]]({ <4 x float> } [[TMP23]]) +; SHADERTEST-NEXT: [[TMP23:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float [[DOTI01]], float [[DOTI12]], <8 x i32> [[TMP20]], <4 x i32> [[TMP21]], i1 false, i32 0, i32 0) +; SHADERTEST-NEXT: [[TMP24:%.*]] = insertvalue { <4 x float> } poison, <4 x float> [[TMP23]], 0 +; SHADERTEST-NEXT: [[TMP25:%.*]] = zext i32 [[COLOREXPADDR:%.*]] to i64 +; SHADERTEST-NEXT: [[TMP26:%.*]] = or i64 [[TMP12]], [[TMP25]] +; SHADERTEST-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP26]] to ptr addrspace(4) +; SHADERTEST-NEXT: call amdgpu_gfx addrspace(4) void [[TMP27]]({ <4 x float> } [[TMP24]]) ; SHADERTEST-NEXT: unreachable ; ; ; SHADERTEST-LABEL: amdgpu_ps_main: -; SHADERTEST: s_getpc_b64 s[2:3] -; SHADERTEST-NEXT: s_mov_b32 s2, s0 +; SHADERTEST: s_getpc_b64 s[6:7] +; SHADERTEST-NEXT: s_mov_b32 s6, s0 ; SHADERTEST-NEXT: s_mov_b32 s32, 0 -; SHADERTEST-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; SHADERTEST-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 ; SHADERTEST-NEXT: s_waitcnt lgkmcnt(0) -; SHADERTEST-NEXT: s_and_b32 s3, s3, 0xffff -; SHADERTEST-NEXT: s_add_u32 s2, 
s2, s17 -; SHADERTEST-NEXT: s_addc_u32 s3, s3, 0 -; SHADERTEST-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; SHADERTEST-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; SHADERTEST-NEXT: s_and_b32 s7, s7, 0xffff +; SHADERTEST-NEXT: s_add_u32 s6, s6, s4 +; SHADERTEST-NEXT: s_addc_u32 s7, s7, 0 +; SHADERTEST-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; SHADERTEST-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 ; SHADERTEST-NEXT: s_wqm_b64 exec, exec -; SHADERTEST-NEXT: s_getpc_b64 s[12:13] -; SHADERTEST-NEXT: s_mov_b32 s8, s1 -; SHADERTEST-NEXT: s_mov_b32 m0, s16 -; SHADERTEST-NEXT: s_mov_b32 s9, s13 +; SHADERTEST-NEXT: s_getpc_b64 s[16:17] +; SHADERTEST-NEXT: s_mov_b32 s0, s1 +; SHADERTEST-NEXT: s_mov_b32 m0, s3 +; SHADERTEST-NEXT: s_mov_b32 s1, s17 ; SHADERTEST-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x ; SHADERTEST-NEXT: v_interp_p1_f32_e32 v0, v0, attr0.y ; SHADERTEST-NEXT: s_clause 0x1 -; SHADERTEST-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 -; SHADERTEST-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 -; SHADERTEST-NEXT: s_mov_b32 s15, s13 +; SHADERTEST-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x0 +; SHADERTEST-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x0 +; SHADERTEST-NEXT: s_mov_b32 s3, s17 ; SHADERTEST-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x ; SHADERTEST-NEXT: v_interp_p2_f32_e32 v0, v1, attr0.y ; SHADERTEST-NEXT: v_cvt_i32_f32_e32 v1, v2 @@ -131,18 +129,18 @@ attribute[0].offset = 0 ; SHADERTEST-NEXT: v_cvt_f32_i32_e32 v0, v1 ; SHADERTEST-NEXT: v_cvt_f32_i32_e32 v1, v2 ; SHADERTEST-NEXT: s_waitcnt lgkmcnt(0) -; SHADERTEST-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; SHADERTEST-NEXT: s_swappc_b64 s[30:31], s[14:15] +; SHADERTEST-NEXT: image_sample v[0:3], v[0:1], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D +; SHADERTEST-NEXT: s_swappc_b64 s[30:31], s[2:3] ; ; ; SHADERTEST-LABEL: @color_export_shader( -; SHADERTEST-NEXT: call void @llvm.amdgcn.s.waitcnt(i32 0) ; SHADERTEST-NEXT: [[TMP2:%.*]] = extractelement <4 x 
float> [[TMP0:%.*]], i64 0 ; SHADERTEST-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP0]], i64 1 ; SHADERTEST-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP0]], i64 2 ; SHADERTEST-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP0]], i64 3 -; SHADERTEST-NEXT: call void @llvm.amdgcn.exp.f32(i32 immarg 0, i32 immarg 15, float [[TMP2]], float [[TMP3]], float [[TMP4]], float [[TMP5]], i1 immarg true, i1 immarg true) #[[ATTR2]] -; SHADERTEST-NEXT: ret void +; SHADERTEST-NEXT: call void @llvm.amdgcn.exp.f32(i32 immarg 0, i32 immarg 15, float [[TMP2]], float [[TMP3]], float [[TMP4]], float [[TMP5]], i1 immarg true, i1 immarg true) +; SHADERTEST-NEXT: call void @llvm.amdgcn.endpgm() +; SHADERTEST-NEXT: unreachable ; ; SHADERTEST-LABEL: color_export_shader: ; SHADERTEST: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_FillPsInput.pipe b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_FillPsInput.pipe index 1a7259640a..dc6f071193 100644 --- a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_FillPsInput.pipe +++ b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_FillPsInput.pipe @@ -3,11 +3,36 @@ ; BEGIN_SHADERTEST ; RUN: amdllpc -enable-relocatable-shader-elf -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s -; SHADERTEST: PalMetadata -; SHADERTEST: SPI_PS_INPUT_CNTL_0 -; SHADERTEST-NEXT: SPI_PS_INPUT_CNTL_1 -; SHADERTEST-NEXT: SPI_PS_INPUT_CNTL_2 -; SHADERTEST-NEXT: SPI_PS_INPUT_CNTL_3 +; SHADERTEST-LABEL: PalMetadata +; SHADERTEST: .spi_ps_input_cntl: [ { +; SHADERTEST: .attr0_valid: 0x0000000000000000 +; SHADERTEST: .attr1_valid: 0x0000000000000000 +; SHADERTEST: .flat_shade: 0 +; SHADERTEST: .fp16_interp_mode: 0 +; SHADERTEST: .offset: 0x0000000000000000 +; SHADERTEST: .prim_attr: 0 +; SHADERTEST: .pt_sprite_tex: 0 }{ +; SHADERTEST: .attr0_valid: 0x0000000000000000 +; SHADERTEST: .attr1_valid: 0x0000000000000000 +; SHADERTEST: .flat_shade: 0 +; SHADERTEST: .fp16_interp_mode: 0 +; 
SHADERTEST: .offset: 0x0000000000000001 +; SHADERTEST: .prim_attr: 0 +; SHADERTEST: .pt_sprite_tex: 0 }{ +; SHADERTEST: .attr0_valid: 0x0000000000000000 +; SHADERTEST: .attr1_valid: 0x0000000000000000 +; SHADERTEST: .flat_shade: 0 +; SHADERTEST: .fp16_interp_mode: 0 +; SHADERTEST: .offset: 0x0000000000000002 +; SHADERTEST: .prim_attr: 0 +; SHADERTEST: .pt_sprite_tex: 0 }{ +; SHADERTEST: .attr0_valid: 0x0000000000000000 +; SHADERTEST: .attr1_valid: 0x0000000000000000 +; SHADERTEST: .flat_shade: 0 +; SHADERTEST: .fp16_interp_mode: 0 +; SHADERTEST: .offset: 0x0000000000000003 +; SHADERTEST: .prim_attr: 0 +; SHADERTEST: .pt_sprite_tex: 0 }] ; SHADERTEST: AMDLLPC SUCCESS ; END_SHADERTEST diff --git a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_GlueCache.pipe b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_GlueCache.pipe deleted file mode 100644 index e9e95db18a..0000000000 --- a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_GlueCache.pipe +++ /dev/null @@ -1,128 +0,0 @@ -; This test checks that all of the shaders in a graphics pipeline are correctly add to the shader cache and retrieved from it. -; We only check the internal shader cache because of the limitation on the testing infrastructure. - -; BEGIN_SHADERTEST -; RUN: rm -rf %t_dir && \ -; RUN: mkdir -p %t_dir && \ -; RUN: amdllpc %gfxip \ -; RUN: -shader-cache-mode=2 \ -; RUN: -shader-cache-filename=cache.bin -shader-cache-file-dir=%t_dir \ -; RUN: -enable-relocatable-shader-elf \ -; RUN: -cache-full-pipelines=false \ -; RUN: -o %t.elf %s -v | FileCheck -check-prefix=CREATE %s -; REQUIRES: llpc_enable_shader_cache -; CREATE: Building pipeline with relocatable shader elf. 
-; CREATE: Cache miss for shader stage vertex -; CREATE: Updating the cache for unlinked shader stage vertex -; CREATE: Cache miss for shader stage fragment -; CREATE: Updating the cache for unlinked shader stage fragment -; CREATE: ID for glue shader0: 00000000000000007632663332570000000100000002000000030000000E00000000000000030000000400000000000000000000000000000000000000000000000E000000070000000000000001 -; CREATE: Cache miss for glue shader 0 -; CREATE: Updating the cache for glue shader 0 -; CREATE: ID for glue shader1: 0000000000000000007634663332090000000000000000000000000000000000000000000000000000000000000000 -; CREATE: Cache miss for glue shader 1 -; CREATE: Updating the cache for glue shader 1 -; CREATE: ===== AMDLLPC SUCCESS ===== -; END_SHADERTEST - -; Check that the cache file exists under the expected location and is not empty. -; BEGIN_SHADERTEST -; RUN: ls -s %t_dir/AMD/LlpcCache/cache.bin | FileCheck -check-prefix=SIZE %s -; REQUIRES: llpc_enable_shader_cache -; SIZE: {{[1-9][0-9]*}} {{.*}}cache.bin -; END_SHADERTEST - -; Now, attempt to load the shader cache from the previous run in the read-only mode. -; No new cache entries should be added. -; BEGIN_SHADERTEST -; RUN: amdllpc %gfxip \ -; RUN: -shader-cache-mode=4 \ -; RUN: -shader-cache-filename=cache.bin -shader-cache-file-dir=%t_dir \ -; RUN: -enable-relocatable-shader-elf \ -; RUN: -cache-full-pipelines=false \ -; RUN: -o %t.elf %s -v | FileCheck -check-prefix=LOAD %s -; REQUIRES: llpc_enable_shader_cache -; LOAD: Building pipeline with relocatable shader elf. 
-; LOAD: Cache hit for shader stage vertex -; LOAD-NOT: Updating the cache for shader stage vertex -; LOAD: Cache hit for shader stage fragment -; LOAD-NOT: Updating the cache for shader stage fragment -; LOAD: ID for glue shader0: 00000000000000007632663332570000000100000002000000030000000E00000000000000030000000400000000000000000000000000000000000000000000000E000000070000000000000001 -; LOAD: Cache hit for glue shader 0 -; LOAD-NOT: Updating the cache for glue shader -; LOAD: ID for glue shader1: 0000000000000000007634663332090000000000000000000000000000000000000000000000000000000000000000 -; LOAD: Cache hit for glue shader 1 -; LOAD-NOT: Updating the cache for glue shader -; LOAD: ===== AMDLLPC SUCCESS ===== -; END_SHADERTEST - -[Version] -version = 38 - -[VsGlsl] -#version 450 - -layout(location = 0) in vec2 inPosition; -layout(location = 0) out vec2 outUV; - -void main() { - outUV = inPosition; -} - - -[VsInfo] -entryPoint = main - -[FsGlsl] -#version 450 core - -layout(set = 0, binding = 0) uniform sampler s; -layout(set = 0, binding = 1) uniform texture2D tex; -layout(location = 0) in vec2 inUV; -layout(location = 0) out vec4 oColor; - -void main() -{ - ivec2 iUV = ivec2(inUV); - oColor = texture(sampler2D(tex, s), iUV); -} - -[FsInfo] -entryPoint = main - -[ResourceMapping] -userDataNode[0].visibility = 17 -userDataNode[0].type = DescriptorTableVaPtr -userDataNode[0].offsetInDwords = 11 -userDataNode[0].sizeInDwords = 1 -userDataNode[0].next[0].type = DescriptorSampler -userDataNode[0].next[0].offsetInDwords = 0 -userDataNode[0].next[0].sizeInDwords = 4 -userDataNode[0].next[0].set = 0 -userDataNode[0].next[0].binding = 0 -userDataNode[0].next[1].type = DescriptorResource -userDataNode[0].next[1].offsetInDwords = 0 -userDataNode[0].next[1].sizeInDwords = 8 -userDataNode[0].next[1].set = 0 -userDataNode[0].next[1].binding = 1 -userDataNode[1].visibility = 2 -userDataNode[1].type = IndirectUserDataVaPtr -userDataNode[1].offsetInDwords = 12 
-userDataNode[1].sizeInDwords = 1 -userDataNode[1].indirectUserDataCount = 4 - -[GraphicsPipelineState] -topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP -colorBuffer[0].format = VK_FORMAT_R32G32B32A32_SFLOAT -colorBuffer[0].channelWriteMask = 15 -colorBuffer[0].blendEnable = 1 -colorBuffer[0].blendSrcAlphaToColor = 1 - -[VertexInputState] -binding[0].binding = 1 -binding[0].stride = 16 -binding[0].inputRate = VK_VERTEX_INPUT_RATE_VERTEX -attribute[0].location = 0 -attribute[0].binding = 0 -attribute[0].format = VK_FORMAT_R32G32B32A32_SFLOAT -attribute[0].offset = 0 diff --git a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_ImmutableSampler.pipe b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_ImmutableSampler.pipe index a5d5c9809c..899c08ff56 100644 --- a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_ImmutableSampler.pipe +++ b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_ImmutableSampler.pipe @@ -41,14 +41,14 @@ void main() entryPoint = main [ResourceMapping] -descriptorRangeValue[0].visibility = 17 +descriptorRangeValue[0].visibility = 66 descriptorRangeValue[0].type = DescriptorSampler descriptorRangeValue[0].set = 0 descriptorRangeValue[0].binding = 0 descriptorRangeValue[0].arraySize = 1 descriptorRangeValue[0].uintData = 2156034194, 184545280, 3371171840, 2147483648 -userDataNode[0].visibility = 17 +userDataNode[0].visibility = 66 userDataNode[0].type = DescriptorTableVaPtr userDataNode[0].offsetInDwords = 11 userDataNode[0].sizeInDwords = 1 diff --git a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_MultiDwordPushConst.pipe b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_MultiDwordPushConst.pipe index 78e338274c..bbdb465124 100644 --- a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_MultiDwordPushConst.pipe +++ b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_MultiDwordPushConst.pipe @@ -6,11 +6,9 @@ ; Check that the llvm-ir gets the push constant values from as a parameter. 
; SHADERTEST: // LLPC pipeline patching results -; SHADERTEST: define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_main({{.*}}, <2 x i32> inreg %pushConst2, {{.*}}) -; SHADERTEST: [[bc0:%[a-zA-Z0-9]+]] = bitcast <2 x i32> %pushConst2 to <2 x float> -; SHADERTEST: [[pushConst0:%[.a-zA-Z0-9]+]] = extractelement <2 x float> [[bc0]], i64 0 -; SHADERTEST: [[bc1:%[a-zA-Z0-9]+]] = bitcast <2 x i32> %pushConst2 to <2 x float> -; SHADERTEST: [[pushConst1:%[.a-zA-Z0-9]+]] = extractelement <2 x float> [[bc1]], i64 1 +; SHADERTEST: define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_main({{.*}}, i32 inreg %userdata3, i32 inreg %userdata4, {{.*}}) +; SHADERTEST: [[pushConst0:%[.a-zA-Z0-9]+]] = bitcast i32 %userdata3 to float +; SHADERTEST: [[pushConst1:%[.a-zA-Z0-9]+]] = bitcast i32 %userdata4 to float ; SHADERTEST: @llvm.amdgcn.image.gather4.lz.2d.sl_v4f32i32s.f32({{.*}}, float [[pushConst0]], float [[pushConst1]], {{.*}}) ; Check that those parameters are passed in as s2 and s3. @@ -22,8 +20,9 @@ ; SHADERTEST: image_gather4_lz v[0:4], v[5:6] ; Check that the PAL metadata will place the correct values in those registers. 
-; SHADERTEST: SPI_SHADER_USER_DATA_PS_1 0x0000000000000003 -; SHADERTEST: SPI_SHADER_USER_DATA_PS_2 0x0000000000000004 +; SHADERTEST: .hardware_stages: { +; SHADERTEST: .ps: { +; SHADERTEST: .user_data_reg_map: [ 0x0000000010000000 0x0000000000000003 0x0000000000000004 0x00000000FFFFFFFF 0x00000000FFFFFFFF 0x00000000FFFFFFFF 0x00000000FFFFFFFF 0x00000000FFFFFFFF 0x00000000FFFFFFFF 0x00000000FFFFFFFF 0x00000000FFFFFFFF 0x00000000FFFFFFFF 0x00000000FFFFFFFF 0x00000000FFFFFFFF 0x00000000FFFFFFFF 0x00000000FFFFFFFF 0x00000000FFFFFFFF 0x00000000FFFFFFFF 0x00000000FFFFFFFF 0x00000000FFFFFFFF 0x00000000FFFFFFFF 0x00000000FFFFFFFF 0x00000000FFFFFFFF 0x00000000FFFFFFFF 0x00000000FFFFFFFF 0x00000000FFFFFFFF 0x00000000FFFFFFFF 0x00000000FFFFFFFF 0x00000000FFFFFFFF 0x00000000FFFFFFFF 0x00000000FFFFFFFF 0x00000000FFFFFFFF ] ; END_SHADERTEST [Version] @@ -84,7 +83,6 @@ entryPoint = main entryPoint = main [ResourceMapping] -userDataNode[0].visibility = 17 userDataNode[0].type = PushConst userDataNode[0].offsetInDwords = 1 userDataNode[0].sizeInDwords = 4 diff --git a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_PipelineCacheHit.pipe b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_PipelineCacheHit.pipe index 152af6290a..e14ccddc98 100644 --- a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_PipelineCacheHit.pipe +++ b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_PipelineCacheHit.pipe @@ -2,6 +2,7 @@ ; BEGIN_SHADERTEST ; RUN: amdllpc \ ; RUN: -shader-cache-mode=1 \ +; RUN: -cache-full-pipelines=true \ ; RUN: -enable-relocatable-shader-elf \ ; RUN: -o %t.elf %gfxip %s %s -v | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST: Cache miss for graphics pipeline. 
diff --git a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_RelocCheckPsInControl.pipe b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_RelocCheckPsInControl.pipe index 2831699c34..a1b21a6831 100644 --- a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_RelocCheckPsInControl.pipe +++ b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_RelocCheckPsInControl.pipe @@ -2,7 +2,9 @@ ; BEGIN_SHADERTEST ; RUN: amdllpc -enable-relocatable-shader-elf -o %t.elf -gfxip=10.1.2 %s -v | FileCheck -check-prefix=SHADERTEST %s -; SHADERTEST: SPI_PS_IN_CONTROL 0x0000000000000000 +; SHADERTEST: .spi_ps_in_control: { +; SHADERTEST: .num_interps: 0x0000000000000000 +; SHADERTEST: .ps_w32_en: 0 } ; END_SHADERTEST [Version] diff --git a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_ShadowDescTable.pipe b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_ShadowDescTable.pipe index c6db261006..05fab4850f 100644 --- a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_ShadowDescTable.pipe +++ b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_ShadowDescTable.pipe @@ -22,25 +22,6 @@ void main() { [VsInfo] entryPoint = main -userDataNode[0].type = DescriptorTableVaPtr -userDataNode[0].offsetInDwords = 11 -userDataNode[0].sizeInDwords = 1 -userDataNode[0].set = 0 -userDataNode[0].next[0].type = DescriptorResource -userDataNode[0].next[0].offsetInDwords = 4 -userDataNode[0].next[0].sizeInDwords = 8 -userDataNode[0].next[0].set = 0 -userDataNode[0].next[0].binding = 0 -userDataNode[0].next[1].type = DescriptorFmask -userDataNode[0].next[1].offsetInDwords = 12 -userDataNode[0].next[1].sizeInDwords = 8 -userDataNode[0].next[1].set = 0 -userDataNode[0].next[1].binding = 0 -userDataNode[1].visibility = 2 -userDataNode[1].type = IndirectUserDataVaPtr -userDataNode[1].offsetInDwords = 12 -userDataNode[1].sizeInDwords = 1 -userDataNode[1].indirectUserDataCount = 4 [FsGlsl] #version 450 core @@ -64,6 +45,10 @@ colorBuffer[0].format = VK_FORMAT_R32G32B32A32_SFLOAT 
colorBuffer[0].channelWriteMask = 15 colorBuffer[0].blendEnable = 1 colorBuffer[0].blendSrcAlphaToColor = 1 +options.shadowDescriptorTableUsage = Enable +options.shadowDescriptorTablePtrHigh = 0xFFFF + +[ResourceMapping] userDataNode[0].type = DescriptorTableVaPtr userDataNode[0].offsetInDwords = 11 userDataNode[0].sizeInDwords = 1 @@ -78,13 +63,10 @@ userDataNode[0].next[1].offsetInDwords = 12 userDataNode[0].next[1].sizeInDwords = 8 userDataNode[0].next[1].set = 0 userDataNode[0].next[1].binding = 0 -userDataNode[1].visibility = 2 userDataNode[1].type = IndirectUserDataVaPtr userDataNode[1].offsetInDwords = 12 userDataNode[1].sizeInDwords = 1 userDataNode[1].indirectUserDataCount = 4 -options.shadowDescriptorTableUsage = Enable -options.shadowDescriptorTablePtrHigh = 0xFFFF [VertexInputState] binding[0].binding = 1 diff --git a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_ShadowDescTableMissingFmask.pipe b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_ShadowDescTableMissingFmask.pipe index 096fd962f9..b27e73a6da 100644 --- a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_ShadowDescTableMissingFmask.pipe +++ b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_ShadowDescTableMissingFmask.pipe @@ -22,20 +22,6 @@ void main() { [VsInfo] entryPoint = main -userDataNode[0].type = DescriptorTableVaPtr -userDataNode[0].offsetInDwords = 11 -userDataNode[0].sizeInDwords = 1 -userDataNode[0].set = 0 -userDataNode[0].next[0].type = DescriptorResource -userDataNode[0].next[0].offsetInDwords = 4 -userDataNode[0].next[0].sizeInDwords = 8 -userDataNode[0].next[0].set = 0 -userDataNode[0].next[0].binding = 0 -userDataNode[1].visibility = 2 -userDataNode[1].type = IndirectUserDataVaPtr -userDataNode[1].offsetInDwords = 12 -userDataNode[1].sizeInDwords = 1 -userDataNode[1].indirectUserDataCount = 4 [FsGlsl] @@ -60,6 +46,10 @@ colorBuffer[0].format = VK_FORMAT_R32G32B32A32_SFLOAT colorBuffer[0].channelWriteMask = 15 colorBuffer[0].blendEnable = 1 
colorBuffer[0].blendSrcAlphaToColor = 1 +options.shadowDescriptorTableUsage = Enable +options.shadowDescriptorTablePtrHigh = 0xFFFF + +[ResourceMapping] userDataNode[0].type = DescriptorTableVaPtr userDataNode[0].offsetInDwords = 11 userDataNode[0].sizeInDwords = 1 @@ -69,13 +59,10 @@ userDataNode[0].next[0].offsetInDwords = 4 userDataNode[0].next[0].sizeInDwords = 8 userDataNode[0].next[0].set = 0 userDataNode[0].next[0].binding = 0 -userDataNode[1].visibility = 2 userDataNode[1].type = IndirectUserDataVaPtr userDataNode[1].offsetInDwords = 12 userDataNode[1].sizeInDwords = 1 userDataNode[1].indirectUserDataCount = 4 -options.shadowDescriptorTableUsage = Enable -options.shadowDescriptorTablePtrHigh = 0xFFFF [VertexInputState] binding[0].binding = 1 diff --git a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_TestFsBuiltInInput.pipe b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_TestFsBuiltInInput.pipe deleted file mode 100644 index f7512c554b..0000000000 --- a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_TestFsBuiltInInput.pipe +++ /dev/null @@ -1,46 +0,0 @@ -; Test that the relocatable shaders cannot be linked, but that the compilation still succeeds. -; If this pipeline was compiled by linking relocatable shaders, the VS would write the clip distance to channel 1, but -; the FS would try to read it from channel 0. - -; BEGIN_SHADERTEST -; RUN: amdllpc -enable-relocatable-shader-elf -auto-layout-desc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s -; SHADERTEST: Failed to link relocatable shaders because FS uses builtin inputs. 
-; SHADERTEST: AMDLLPC SUCCESS -; END_SHADERTEST - - -[Version] -version = 38 - -[VsGlsl] -#version 450 - -out float gl_ClipDistance[1]; - -layout(location = 0) out vec3 _2; - -void main() -{ - gl_Position = vec4(0.5); - gl_ClipDistance[0] = 0.4; - _2 = vec3(0.5); -} - -[VsInfo] -entryPoint = main - -[FsGlsl] -#version 450 - -in float gl_ClipDistance[1]; - -layout(location = 0) out vec4 _3; - -void main() -{ - _3 = vec4(gl_ClipDistance[0u]); -} - - -[FsInfo] -entryPoint = main diff --git a/llpc/test/shaderdb/relocatable_shaders/PipelineVsPs_PsInput.pipe b/llpc/test/shaderdb/relocatable_shaders/PipelineVsPs_PsInput.pipe index 798e39dc99..d21db4589f 100644 --- a/llpc/test/shaderdb/relocatable_shaders/PipelineVsPs_PsInput.pipe +++ b/llpc/test/shaderdb/relocatable_shaders/PipelineVsPs_PsInput.pipe @@ -1,10 +1,52 @@ ; BEGIN_SHADERTEST ; RUN: amdllpc -enable-relocatable-shader-elf -o %t.elf %gfxip %s -v \ ; RUN: | FileCheck -check-prefix=SHADERTEST %s -; SHADERTEST: SPI_PS_INPUT_CNTL_0 0x0000000000000000 -; SHADERTEST: SPI_PS_INPUT_ENA 0x0000000000000002 -; SHADERTEST: SPI_PS_INPUT_ADDR 0x0000000000000002 -; SHADERTEST: SPI_PS_IN_CONTROL 0x0000000000000001 +; SHADERTEST-LABEL: PalMetadata +; SHADERTEST: .spi_ps_in_control: { +; SHADERTEST: .num_interps: 0x0000000000000001 +; SHADERTEST: .ps_w32_en: 0 } +; SHADERTEST: .spi_ps_input_addr: { +; SHADERTEST: .ancillary_ena: 0 +; SHADERTEST: .front_face_ena: 0 +; SHADERTEST: .line_stipple_tex_ena: 0 +; SHADERTEST: .linear_center_ena: 0 +; SHADERTEST: .linear_centroid_ena: 0 +; SHADERTEST: .linear_sample_ena: 0 +; SHADERTEST: .persp_center_ena: 1 +; SHADERTEST: .persp_centroid_ena: 0 +; SHADERTEST: .persp_pull_model_ena: 0 +; SHADERTEST: .persp_sample_ena: 0 +; SHADERTEST: .pos_fixed_pt_ena: 0 +; SHADERTEST: .pos_w_float_ena: 0 +; SHADERTEST: .pos_x_float_ena: 0 +; SHADERTEST: .pos_y_float_ena: 0 +; SHADERTEST: .pos_z_float_ena: 0 +; SHADERTEST: .sample_coverage_ena: 0 } +; SHADERTEST: .spi_ps_input_cntl: [ { +; SHADERTEST: 
.attr0_valid: 0x0000000000000000 +; SHADERTEST: .attr1_valid: 0x0000000000000000 +; SHADERTEST: .flat_shade: 0 +; SHADERTEST: .fp16_interp_mode: 0 +; SHADERTEST: .offset: 0x0000000000000000 +; SHADERTEST: .prim_attr: 0 +; SHADERTEST: .pt_sprite_tex: 0 }] +; SHADERTEST: .spi_ps_input_ena: { +; SHADERTEST: .ancillary_ena: 0 +; SHADERTEST: .front_face_ena: 0 +; SHADERTEST: .line_stipple_tex_ena: 0 +; SHADERTEST: .linear_center_ena: 0 +; SHADERTEST: .linear_centroid_ena: 0 +; SHADERTEST: .linear_sample_ena: 0 +; SHADERTEST: .persp_center_ena: 1 +; SHADERTEST: .persp_centroid_ena: 0 +; SHADERTEST: .persp_pull_model_ena: 0 +; SHADERTEST: .persp_sample_ena: 0 +; SHADERTEST: .pos_fixed_pt_ena: 0 +; SHADERTEST: .pos_w_float_ena: 0 +; SHADERTEST: .pos_x_float_ena: 0 +; SHADERTEST: .pos_y_float_ena: 0 +; SHADERTEST: .pos_z_float_ena: 0 +; SHADERTEST: .sample_coverage_ena: 0 } ; END_SHADERTEST [Version] diff --git a/llpc/tool/amdllpc.cpp b/llpc/tool/amdllpc.cpp index f0847a6196..a18dac846a 100644 --- a/llpc/tool/amdllpc.cpp +++ b/llpc/tool/amdllpc.cpp @@ -40,6 +40,7 @@ #include "llpcFile.h" #include "llpcInputUtils.h" #include "llpcPipelineBuilder.h" +#include "llpcShaderCacheWrap.h" #include "llpcThreading.h" #include "llpcUtil.h" #include "spvgen.h" @@ -303,12 +304,24 @@ cl::opt DumpDuplicatePipelines( // -llpc_opt: Override the optimization level passed in to LGC with the given one. This options is the same as the // `-opt` option in lgc. The reason for the second option is to be able to test the LLPC API. If both options are set // then `-opt` wins. 
+ +#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 474768 +// Old version of the code cl::opt LlpcOptLevel("llpc-opt", cl::desc("The optimization level for amdllpc to pass to LLPC:"), cl::init(CodeGenOpt::Default), values(clEnumValN(CodeGenOpt::None, "none", "no optimizations"), clEnumValN(CodeGenOpt::Less, "quick", "quick compilation time"), clEnumValN(CodeGenOpt::Default, "default", "default optimizations"), clEnumValN(CodeGenOpt::Aggressive, "fast", "fast execution time"))); +#else + // New version of the code (also handles unknown version, which we treat as latest) +cl::opt LlpcOptLevel("llpc-opt", cl::desc("The optimization level for amdllpc to pass to LLPC:"), + cl::init(CodeGenOptLevel::Default), + values(clEnumValN(CodeGenOptLevel::None, "none", "no optimizations"), + clEnumValN(CodeGenOptLevel::Less, "quick", "quick compilation time"), + clEnumValN(CodeGenOptLevel::Default, "default", "default optimizations"), + clEnumValN(CodeGenOptLevel::Aggressive, "fast", "fast execution time"))); +#endif // -resource-layout-scheme: specifies the layout scheme of the resource cl::opt LayoutScheme("resource-layout-scheme", cl::desc("The resource layout scheme:"), @@ -334,7 +347,6 @@ cl::opt GpuRtLibrary("gpurt-library", cl::desc("Use the GPURT shade cl::opt EnableColorExportShader("enable-color-export-shader", cl::desc("Enable color export shader, only compile each stage of the pipeline without linking"), cl::init(false)); - } // namespace // clang-format on namespace llvm { @@ -398,8 +410,9 @@ cl::opt> ExtPrinter{"ext", cl::desc("Di // @param argc : Count of arguments // @param argv : List of arguments // @param [out] compiler : Created LLPC compiler object +// @param [out] cache : Created LLPC cache object // @returns : Result::Success on success, other status codes on failure -static Result init(int argc, char *argv[], ICompiler *&compiler) { +static Result init(int argc, char *argv[], ICompiler *&compiler, ShaderCacheWrap *&cache) { // Before we get to LLVM 
command-line option parsing, we need to find the -gfxip option value. for (int i = 1; i != argc; ++i) { StringRef arg = argv[i]; @@ -491,7 +504,10 @@ static Result init(int argc, char *argv[], ICompiler *&compiler) { return Result::Unsupported; } - Result result = ICompiler::Create(ParsedGfxIp, argc, argv, &compiler); + // Create internal cache + cache = ShaderCacheWrap::Create(argc, argv); + + Result result = ICompiler::Create(ParsedGfxIp, argc, argv, &compiler, cache); if (result != Result::Success) return result; @@ -534,8 +550,16 @@ static void initCompileInfo(CompileInfo *compileInfo) { } // We want the default optimization level to be "Default" which is not 0. - compileInfo->gfxPipelineInfo.options.optimizationLevel = CodeGenOpt::Level::Default; - compileInfo->compPipelineInfo.options.optimizationLevel = CodeGenOpt::Level::Default; +#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 474768 + // Old version of the code + compileInfo->gfxPipelineInfo.options.optimizationLevel = static_cast(CodeGenOpt::Level::Default); + compileInfo->compPipelineInfo.options.optimizationLevel = static_cast(CodeGenOpt::Level::Default); +#else + // New version of the code (also handles unknown version, which we treat as latest) + compileInfo->gfxPipelineInfo.options.optimizationLevel = static_cast(CodeGenOptLevel::Default); + compileInfo->compPipelineInfo.options.optimizationLevel = static_cast(CodeGenOptLevel::Default); +#endif + compileInfo->gfxPipelineInfo.options.resourceLayoutScheme = LayoutScheme; compileInfo->compPipelineInfo.options.forceCsThreadIdSwizzling = ForceCsThreadIdSwizzling; compileInfo->compPipelineInfo.options.overrideThreadGroupSizeX = OverrideThreadGroupSizeX; @@ -758,7 +782,8 @@ int main(int argc, char *argv[]) { #endif ICompiler *compiler = nullptr; - Result result = init(argc, argv, compiler); + ShaderCacheWrap *cache = nullptr; + Result result = init(argc, argv, compiler, cache); #ifdef WIN_OS if (AssertToMsgBox) { @@ -767,12 +792,15 @@ int main(int argc, 
char *argv[]) { #endif // Cleanup code that gets run automatically before returning. - auto onExit = make_scope_exit([compiler, &result] { + auto onExit = make_scope_exit([compiler, cache, &result] { FinalizeSpvgen(); if (compiler) compiler->Destroy(); + if (cache) + cache->Destroy(); + if (result == Result::Success) LLPC_OUTS("\n===== AMDLLPC SUCCESS =====\n"); else diff --git a/llpc/tool/llpcCompilationUtils.h b/llpc/tool/llpcCompilationUtils.h index 456da7c854..dc22245d57 100644 --- a/llpc/tool/llpcCompilationUtils.h +++ b/llpc/tool/llpcCompilationUtils.h @@ -103,8 +103,14 @@ struct CompileInfo { bool scratchAccessBoundsChecks; // Whether to enable scratch access bounds checks bool enableImplicitInvariantExports; // Whether to enable implicit marking of position exports as invariant VfxPipelineType pipelineType; // Pipeline type +#if LLVM_MAIN_REVISION && LLVM_MAIN_REVISION < 474768 + // Old version of the code std::optional optimizationLevel; // The optimization level to pass the compiler - bool internalRtShaders; // Whether to enable intrinsics for internal RT shaders +#else + // New version of the code (also handles unknown version, which we treat as latest) + std::optional optimizationLevel; // The optimization level to pass the compiler +#endif + bool internalRtShaders; // Whether to enable intrinsics for internal RT shaders bool enableColorExportShader; // Enable color export shader, only compile each stage of the pipeline without linking }; diff --git a/llpc/tool/llpcComputePipelineBuilder.cpp b/llpc/tool/llpcComputePipelineBuilder.cpp index e81db59926..0c416bb1c3 100644 --- a/llpc/tool/llpcComputePipelineBuilder.cpp +++ b/llpc/tool/llpcComputePipelineBuilder.cpp @@ -138,7 +138,7 @@ Expected ComputePipelineBuilder::buildComputePipeline() { pipelineInfo->options.overrideThreadGroupSizeY = compileInfo.compPipelineInfo.options.overrideThreadGroupSizeY; pipelineInfo->options.overrideThreadGroupSizeZ = 
compileInfo.compPipelineInfo.options.overrideThreadGroupSizeZ; if (compileInfo.optimizationLevel.has_value()) { - pipelineInfo->options.optimizationLevel = compileInfo.optimizationLevel.value(); + pipelineInfo->options.optimizationLevel = static_cast(compileInfo.optimizationLevel.value()); } pipelineInfo->options.threadGroupSwizzleMode = compileInfo.compPipelineInfo.options.threadGroupSwizzleMode; pipelineInfo->options.reverseThreadGroup = compileInfo.compPipelineInfo.options.reverseThreadGroup; diff --git a/llpc/tool/llpcGraphicsPipelineBuilder.cpp b/llpc/tool/llpcGraphicsPipelineBuilder.cpp index 778a0da096..cdbe175080 100644 --- a/llpc/tool/llpcGraphicsPipelineBuilder.cpp +++ b/llpc/tool/llpcGraphicsPipelineBuilder.cpp @@ -154,7 +154,7 @@ Expected GraphicsPipelineBuilder::buildGraphicsPipeline() { pipelineInfo->options.enableScratchAccessBoundsChecks = compileInfo.scratchAccessBoundsChecks; pipelineInfo->options.enableImplicitInvariantExports = compileInfo.enableImplicitInvariantExports; if (compileInfo.optimizationLevel.has_value()) { - pipelineInfo->options.optimizationLevel = compileInfo.optimizationLevel.value(); + pipelineInfo->options.optimizationLevel = static_cast(compileInfo.optimizationLevel.value()); } pipelineInfo->options.internalRtShaders = compileInfo.internalRtShaders; pipelineInfo->enableColorExportShader |= compileInfo.enableColorExportShader; diff --git a/llpc/context/llpcShaderCache.cpp b/llpc/tool/llpcShaderCache.cpp similarity index 98% rename from llpc/context/llpcShaderCache.cpp rename to llpc/tool/llpcShaderCache.cpp index 1661786599..10b46e4259 100644 --- a/llpc/context/llpcShaderCache.cpp +++ b/llpc/tool/llpcShaderCache.cpp @@ -876,4 +876,20 @@ bool ShaderCache::isCompatible(const ShaderCacheCreateInfo *createInfo, const Sh return isCompatible && m_gfxIp == auxCreateInfo->gfxIp; } +// ===================================================================================================================== +// Wait compiling finish for 
specified cache entry +// @param hEntry: Shader cache entry handle +Result ShaderCache::waitForEntry(CacheEntryHandle hEntry) { + ShaderIndex *index = reinterpret_cast(hEntry); + if (index->state == ShaderEntryState::Compiling) { + CacheMapLock lock = makeCacheLock(true); + m_conditionVariable.wait(lock, [index] { + // The lock must have been acquired by the time we enter this lambda. + return index->state != ShaderEntryState::Compiling; + }); + assert(index->state != ShaderEntryState::Compiling); + } + + return Result::Success; +} } // namespace Llpc diff --git a/llpc/context/llpcShaderCache.h b/llpc/tool/llpcShaderCache.h similarity index 72% rename from llpc/context/llpcShaderCache.h rename to llpc/tool/llpcShaderCache.h index 549bab9a89..7f0f7facfb 100644 --- a/llpc/context/llpcShaderCache.h +++ b/llpc/tool/llpcShaderCache.h @@ -114,6 +114,69 @@ struct ShaderCacheSerializedHeader { typedef void *CacheEntryHandle; +#if LLPC_CLIENT_INTERFACE_MAJOR_VERSION >= 66 +/// Defines callback function used to lookup shader cache info in an external cache +typedef Result (*ShaderCacheGetValue)(const void *pClientData, uint64_t hash, void *pValue, size_t *pValueLen); + +/// Defines callback function used to store shader cache info in an external cache +typedef Result (*ShaderCacheStoreValue)(const void *pClientData, uint64_t hash, const void *pValue, size_t valueLen); + +/// Specifies all information necessary to create a shader cache object. +struct ShaderCacheCreateInfo { + const void *pInitialData; ///< Pointer to a data buffer whose contents should be used to seed the shader + /// cache. This may be null if no initial data is present. + size_t initialDataSize; ///< Size of the initial data buffer, in bytes. + + // NOTE: The following parameters are all optional, and are only used when the IShaderCache will be used in + // tandem with an external cache which serves as a backing store for the cached shader data. 
+ + // [optional] Private client-opaque data which will be passed to the pClientData parameters of the Get and + // Store callback functions. + const void *pClientData; + ShaderCacheGetValue pfnGetValueFunc; ///< [Optional] Function to lookup shader cache data in an external cache + ShaderCacheStoreValue pfnStoreValueFunc; ///< [Optional] Function to store shader cache data in an external cache +}; + +// ===================================================================================================================== +/// Represents the interface of a cache for compiled shaders. The shader cache is designed to be optionally passed in at +/// pipeline create time. The compiled binary for the shaders is stored in the cache object to avoid compiling the same +/// shader multiple times. The shader cache also provides a method to serialize its data to be stored to disk. +class IShaderCache { +public: + /// Serializes the shader cache data or queries the size required for serialization. + /// + /// @param [in] pBlob System memory pointer where the serialized data should be placed. This parameter can + /// be null when querying the size of the serialized data. When non-null (and the size is + /// correct/sufficient) then the contents of the shader cache will be placed in this + /// location. The data is an opaque blob which is not intended to be parsed by clients. + /// @param [in,out] pSize Size of the memory pointed to by pBlob. If the value stored in pSize is zero then no + /// data will be copied and instead the size required for serialization will be returned + /// in pSize. + /// + /// @returns : Success if data was serialized successfully, Unknown if fail to do serialize. + virtual Result Serialize(void *pBlob, size_t *pSize) = 0; + + /// Merges the provided source shader caches' content into this shader cache. + /// + /// @param [in] srcCacheCount Count of source shader caches to be merged. 
+ /// @param [in] ppSrcCaches Pointer to an array of pointers to shader cache objects. + /// + /// @returns : Success if data of source shader caches was merged successfully, OutOfMemory if the internal allocator + /// memory cannot be allocated. + virtual Result Merge(unsigned srcCacheCount, const IShaderCache **ppSrcCaches) = 0; + + /// Frees all resources associated with this object. + virtual void Destroy() = 0; + +protected: + /// @internal Constructor. Prevent use of new operator on this interface. + IShaderCache() {} + + /// @internal Destructor. Prevent use of delete operator on this interface. + virtual ~IShaderCache() {} +}; +#endif + // ===================================================================================================================== // This class implements a cache for compiled shaders. The shader cache persists in memory at runtime and can be // serialized to disk by the client/application for persistence between runs. @@ -140,6 +203,8 @@ class ShaderCache : public IShaderCache { LLPC_NODISCARD bool isCompatible(const ShaderCacheCreateInfo *createInfo, const ShaderCacheAuxCreateInfo *auxCreateInfo); + LLPC_NODISCARD Result waitForEntry(CacheEntryHandle hEntry); + private: ShaderCache(const ShaderCache &) = delete; ShaderCache &operator=(const ShaderCache &) = delete; diff --git a/llpc/tool/llpcShaderCacheWrap.cpp b/llpc/tool/llpcShaderCacheWrap.cpp new file mode 100644 index 0000000000..ae977561a5 --- /dev/null +++ b/llpc/tool/llpcShaderCacheWrap.cpp @@ -0,0 +1,190 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** +*********************************************************************************************************************** +@file llpcShaderCacheWrap.cpp +@brief LLPC source file: contains implementation of class Llpc::ShaderCacheWrap. 
+*********************************************************************************************************************** +*/ +#include "llpcShaderCacheWrap.h" +#include "llpcError.h" +#include "llvm/Support/CommandLine.h" + +#define DEBUG_TYPE "llpc-shader-cache-wrap" + +using namespace llvm; + +// clang-format off +namespace llvm { +namespace cl { +#if LLPC_CLIENT_INTERFACE_MAJOR_VERSION < 66 +extern opt ShaderCacheFileDir; +extern opt ShaderCacheMode; +extern opt ExecutableName; +#else +// -shader-cache-mode: shader cache mode: +// 0 - Disable +// 1 - Runtime cache +// 2 - Cache to disk +// 3 - Use internal on-disk cache in read/write mode. +// 4 - Use internal on-disk cache in read-only mode. +opt ShaderCacheMode("shader-cache-mode", + desc("Shader cache mode, 0 - disable, 1 - runtime cache, 2 - cache to disk, 3 - " + "load on-disk cache for read/write, 4 - load on-disk cache for read only"), + init(0)); + +// -shader-cache-file-dir: root directory to store shader cache +opt ShaderCacheFileDir("shader-cache-file-dir", desc("Root directory to store shader cache"), + value_desc("dir"), init(".")); + +// -executable-name: executable file name +opt ExecutableName("executable-name", desc("Executable file name"), value_desc("filename"), + init("amdllpc")); +#endif +} // namespace cl +} // namespace llvm +// clang-format on + +namespace Llpc { + +// ===================================================================================================================== +ShaderCacheWrap *ShaderCacheWrap::Create(unsigned optionCount, const char *const *options) { + bool createDummyCompiler = false; + // Build effecting options + for (unsigned i = 1; i < optionCount; ++i) { + if (options[i][0] != '-') { + // Ignore input file names. 
+ continue; + } + + StringRef option = options[i] + 1; // Skip '-' in options + + if (option.startswith(cl::ShaderCacheMode.ArgStr) || option.startswith(cl::ShaderCacheFileDir.ArgStr) || + option.startswith(cl::ExecutableName.ArgStr)) { + createDummyCompiler = true; + break; + } + } + + if (createDummyCompiler) { + GfxIpVersion gfxip = {10, 3, 0}; + ICompiler *pCompiler = nullptr; + ICompiler::Create(gfxip, optionCount, options, &pCompiler); + pCompiler->Destroy(); + } + // Initialize shader cache + ShaderCacheCreateInfo createInfo = {}; + ShaderCacheAuxCreateInfo auxCreateInfo = {}; + unsigned shaderCacheMode = cl::ShaderCacheMode; + auxCreateInfo.shaderCacheMode = static_cast(shaderCacheMode); + + if (auxCreateInfo.shaderCacheMode == ShaderCacheDisable) { + return nullptr; + } + + auxCreateInfo.executableName = cl::ExecutableName.c_str(); + + const char *shaderCachePath = cl::ShaderCacheFileDir.c_str(); + if (cl::ShaderCacheFileDir.empty()) { +#ifdef WIN_OS + shaderCachePath = getenv("LOCALAPPDATA"); + assert(shaderCachePath); +#else + llvm_unreachable("Should never be called!"); +#endif + } + + auxCreateInfo.cacheFilePath = shaderCachePath; + ShaderCacheWrap *pCache = nullptr; + ShaderCache *pShaderCache = new ShaderCache(); + if (pShaderCache != nullptr) { + Result result = pShaderCache->init(&createInfo, &auxCreateInfo); + if (result == Result::Success) { + pCache = new ShaderCacheWrap(pShaderCache); + } else { + pShaderCache->Destroy(); + delete pShaderCache; + } + } + return pCache; +} + +// ===================================================================================================================== +void ShaderCacheWrap::Destroy() { + if (m_pShaderCache != nullptr) { + m_pShaderCache->Destroy(); + delete m_pShaderCache; + m_pShaderCache = nullptr; + } + + delete this; +} + +// ===================================================================================================================== +Result ShaderCacheWrap::GetEntry(Vkgc::HashId hash, bool 
allocateOnMiss, Vkgc::EntryHandle *pHandle) { + MetroHash::Hash metroHash = {}; + metroHash.qwords[0] = hash.qwords[0]; + metroHash.qwords[1] = hash.qwords[1]; + CacheEntryHandle hEntry = {}; + Result result = Result::Success; + ShaderEntryState entryState = m_pShaderCache->findShader(metroHash, allocateOnMiss, &hEntry); + *pHandle = Vkgc::EntryHandle(this, hEntry, entryState == ShaderEntryState::Compiling); + + if (entryState == ShaderEntryState::Compiling) { + result = Result::NotFound; + } else if (entryState == ShaderEntryState::Unavailable) { + result = Result::ErrorUnavailable; + } + return result; +} + +// ===================================================================================================================== +void ShaderCacheWrap::ReleaseEntry(Vkgc::RawEntryHandle rawHandle) { + return; +} + +// ===================================================================================================================== +Result ShaderCacheWrap::WaitForEntry(Vkgc::RawEntryHandle rawHandle) { + return m_pShaderCache->waitForEntry(rawHandle); +} + +// ===================================================================================================================== +Result ShaderCacheWrap::GetValue(Vkgc::RawEntryHandle rawHandle, void *pData, size_t *pDataLen) { + assert(0); + return Result::ErrorUnavailable; +} + +// ===================================================================================================================== +Result ShaderCacheWrap::GetValueZeroCopy(Vkgc::RawEntryHandle rawHandle, const void **ppData, size_t *pDataLen) { + return m_pShaderCache->retrieveShader(rawHandle, ppData, pDataLen); +} + +// ===================================================================================================================== +Result ShaderCacheWrap::SetValue(Vkgc::RawEntryHandle rawHandle, bool success, const void *pData, size_t dataLen) { + m_pShaderCache->insertShader(rawHandle, pData, dataLen); + return Result::Success; +} + +} // 
namespace Llpc diff --git a/llpc/context/llpcShaderCacheManager.h b/llpc/tool/llpcShaderCacheWrap.h similarity index 66% rename from llpc/context/llpcShaderCacheManager.h rename to llpc/tool/llpcShaderCacheWrap.h index 45cf563e47..390665a597 100644 --- a/llpc/context/llpcShaderCacheManager.h +++ b/llpc/tool/llpcShaderCacheWrap.h @@ -24,8 +24,8 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - @file llpcShaderCacheManager.h - @brief LLPC header file: contains declaration of class Llpc::ShaderCacheManager. + @file llpcShaderCacheWrap.h + @brief LLPC header file: contains declaration of class Llpc::ShaderCacheWrap. *********************************************************************************************************************** */ #pragma once @@ -36,38 +36,33 @@ namespace Llpc { -typedef std::shared_ptr ShaderCachePtr; - // ===================================================================================================================== -// This class manages shader cache instances for different GFXIP -class ShaderCacheManager { +// Helper class to wrap shader cache with ICache interface +class ShaderCacheWrap : public Vkgc::ICache { public: // Constructor - ShaderCacheManager() {} + ShaderCacheWrap(ShaderCache *pShaderCache) : m_pShaderCache(pShaderCache) {} - ~ShaderCacheManager(); + virtual ~ShaderCacheWrap() { assert(m_pShaderCache == nullptr); }; - // Get the global ShaderCacheManager object - static ShaderCacheManager *getShaderCacheManager() { - if (!m_manager) - m_manager = new ShaderCacheManager(); - return m_manager; - } + static ShaderCacheWrap *Create(unsigned optionCount, const char *const *options); - static void shutdown() { - delete m_manager; - m_manager = nullptr; - } + void Destroy(); - ShaderCachePtr getShaderCacheObject(const 
ShaderCacheCreateInfo *createInfo, - const ShaderCacheAuxCreateInfo *auxCreateInfo); + LLPC_NODISCARD Result GetEntry(Vkgc::HashId hash, bool allocateOnMiss, Vkgc::EntryHandle *pHandle); - void releaseShaderCacheObject(ShaderCachePtr &shaderCachePtr); + LLPC_NODISCARD void ReleaseEntry(Vkgc::RawEntryHandle rawHandle); -private: - std::list m_shaderCaches; // ShaderCache instances for all GFXIP + LLPC_NODISCARD Result WaitForEntry(Vkgc::RawEntryHandle rawHandle); + + LLPC_NODISCARD Result GetValue(Vkgc::RawEntryHandle rawHandle, void *pData, size_t *pDataLen); - static ShaderCacheManager *m_manager; // Static manager + LLPC_NODISCARD Result GetValueZeroCopy(Vkgc::RawEntryHandle rawHandle, const void **ppData, size_t *pDataLen); + + LLPC_NODISCARD Result SetValue(Vkgc::RawEntryHandle rawHandle, bool success, const void *pData, size_t dataLen); + +private: + ShaderCache *m_pShaderCache; // ShaderCache object }; } // namespace Llpc diff --git a/llpc/translator/lib/SPIRV/SPIRVReader.cpp b/llpc/translator/lib/SPIRV/SPIRVReader.cpp index 3dfa77341b..6609cf8b6a 100644 --- a/llpc/translator/lib/SPIRV/SPIRVReader.cpp +++ b/llpc/translator/lib/SPIRV/SPIRVReader.cpp @@ -617,7 +617,7 @@ Type *SPIRVToLLVM::transTypeWithOpcode(SPIRVType *const spvType, return samplerPtrTy; return StructType::get(*m_context, {imagePtrTy, samplerPtrTy}); } else { - // Uniform constant variable outside of a block use std430 layout. + // Uniform contant variable outside of a block use std430 layout. pointeeLayout = isAccelerationStructureType(spvElementType) ? LayoutMode::Explicit : LayoutMode::Std430; // From now on (GPURT major version >= 34), AS header may start at a non-zero offset, GPURT now request base // offset of the resource, and it will calculate the actual GPUVA, instead of compiler providing one loaded from @@ -3473,7 +3473,10 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIRVValue *co // // @param spvValue : A SPIR-V value. 
template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIRVValue *const spvValue) { - return getBuilder()->CreateSubgroupElect(); + Value *result = nullptr; + + result = getBuilder()->CreateSubgroupElect(); + return result; } // ===================================================================================================================== @@ -3483,12 +3486,16 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(SPI template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIRVValue *const spvValue) { SPIRVInstruction *const spvInst = static_cast(spvValue); std::vector spvOperands = spvInst->getOperands(); - assert(static_cast(spvOperands[0])->getZExtIntValue() == ScopeSubgroup); - BasicBlock *const block = getBuilder()->GetInsertBlock(); Function *const func = getBuilder()->GetInsertBlock()->getParent(); Value *const predicate = transValue(spvOperands[1], func, block); - return getBuilder()->CreateSubgroupAll(predicate); + Value *result = nullptr; + + { + assert(static_cast(spvOperands[0])->getZExtIntValue() == ScopeSubgroup); + result = getBuilder()->CreateSubgroupAll(predicate); + } + return result; } // ===================================================================================================================== @@ -3498,12 +3505,17 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIRV template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIRVValue *const spvValue) { SPIRVInstruction *const spvInst = static_cast(spvValue); std::vector spvOperands = spvInst->getOperands(); - assert(static_cast(spvOperands[0])->getZExtIntValue() == ScopeSubgroup); BasicBlock *const block = getBuilder()->GetInsertBlock(); Function *const func = getBuilder()->GetInsertBlock()->getParent(); Value *const predicate = transValue(spvOperands[1], func, block); - return getBuilder()->CreateSubgroupAny(predicate); + Value *result = nullptr; + + { + assert(static_cast(spvOperands[0])->getZExtIntValue() == ScopeSubgroup); + result = 
getBuilder()->CreateSubgroupAny(predicate); + } + return result; } // ===================================================================================================================== @@ -3513,12 +3525,17 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIRV template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIRVValue *const spvValue) { SPIRVInstruction *const spvInst = static_cast(spvValue); std::vector spvOperands = spvInst->getOperands(); - assert(static_cast(spvOperands[0])->getZExtIntValue() == ScopeSubgroup); BasicBlock *const block = getBuilder()->GetInsertBlock(); Function *const func = getBuilder()->GetInsertBlock()->getParent(); Value *const value = transValue(spvOperands[1], func, block); - return getBuilder()->CreateSubgroupAllEqual(value); + Value *result = nullptr; + + { + assert(static_cast(spvOperands[0])->getZExtIntValue() == ScopeSubgroup); + result = getBuilder()->CreateSubgroupAllEqual(value); + } + return result; } // ===================================================================================================================== @@ -3528,13 +3545,18 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode( template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIRVValue *const spvValue) { SPIRVInstruction *const spvInst = static_cast(spvValue); std::vector spvOperands = spvInst->getOperands(); - assert(static_cast(spvOperands[0])->getZExtIntValue() == ScopeSubgroup); BasicBlock *const block = getBuilder()->GetInsertBlock(); Function *const func = getBuilder()->GetInsertBlock()->getParent(); Value *const value = transValue(spvOperands[1], func, block); Value *const index = transValue(spvOperands[2], func, block); - return getBuilder()->CreateSubgroupBroadcastWaterfall(value, index); + Value *result = nullptr; + + { + assert(static_cast(spvOperands[0])->getZExtIntValue() == ScopeSubgroup); + result = getBuilder()->CreateSubgroupBroadcastWaterfall(value, index); + } + return result; } // 
===================================================================================================================== @@ -3544,12 +3566,17 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIRVValue *const spvValue) { SPIRVInstruction *const spvInst = static_cast(spvValue); std::vector spvOperands = spvInst->getOperands(); - assert(static_cast(spvOperands[0])->getZExtIntValue() == ScopeSubgroup); BasicBlock *const block = getBuilder()->GetInsertBlock(); Function *const func = getBuilder()->GetInsertBlock()->getParent(); Value *const value = transValue(spvOperands[1], func, block); - return getBuilder()->CreateSubgroupBroadcastFirst(value); + Value *result = nullptr; + + { + assert(static_cast(spvOperands[0])->getZExtIntValue() == ScopeSubgroup); + result = getBuilder()->CreateSubgroupBroadcastFirst(value); + } + return result; } // ===================================================================================================================== @@ -3559,12 +3586,17 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode Value *SPIRVToLLVM::transValueWithOpcode(SPIRVValue *const spvValue) { SPIRVInstruction *const spvInst = static_cast(spvValue); std::vector spvOperands = spvInst->getOperands(); - assert(static_cast(spvOperands[0])->getZExtIntValue() == ScopeSubgroup); BasicBlock *const block = getBuilder()->GetInsertBlock(); Function *const func = getBuilder()->GetInsertBlock()->getParent(); Value *const predicate = transValue(spvOperands[1], func, block); - return getBuilder()->CreateSubgroupBallot(predicate); + Value *result = nullptr; + + { + assert(static_cast(spvOperands[0])->getZExtIntValue() == ScopeSubgroup); + result = getBuilder()->CreateSubgroupBallot(predicate); + } + return result; } // ===================================================================================================================== @@ -3574,12 +3606,17 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(SP template 
<> Value *SPIRVToLLVM::transValueWithOpcode(SPIRVValue *const spvValue) { SPIRVInstruction *const spvInst = static_cast(spvValue); std::vector spvOperands = spvInst->getOperands(); - assert(static_cast(spvOperands[0])->getZExtIntValue() == ScopeSubgroup); BasicBlock *const block = getBuilder()->GetInsertBlock(); Function *const func = getBuilder()->GetInsertBlock()->getParent(); Value *const value = transValue(spvOperands[1], func, block); - return getBuilder()->CreateSubgroupInverseBallot(value); + Value *result = nullptr; + + { + assert(static_cast(spvOperands[0])->getZExtIntValue() == ScopeSubgroup); + result = getBuilder()->CreateSubgroupInverseBallot(value); + } + return result; } // ===================================================================================================================== @@ -3589,13 +3626,18 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode Value *SPIRVToLLVM::transValueWithOpcode(SPIRVValue *const spvValue) { SPIRVInstruction *const spvInst = static_cast(spvValue); std::vector spvOperands = spvInst->getOperands(); - assert(static_cast(spvOperands[0])->getZExtIntValue() == ScopeSubgroup); BasicBlock *const block = getBuilder()->GetInsertBlock(); Function *const func = getBuilder()->GetInsertBlock()->getParent(); Value *const value = transValue(spvOperands[1], func, block); Value *const index = transValue(spvOperands[2], func, block); - return getBuilder()->CreateSubgroupBallotBitExtract(value, index); + Value *result = nullptr; + + { + assert(static_cast(spvOperands[0])->getZExtIntValue() == ScopeSubgroup); + result = getBuilder()->CreateSubgroupBallotBitExtract(value, index); + } + return result; } // ===================================================================================================================== @@ -3605,22 +3647,24 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode Value *SPIRVToLLVM::transValueWithOpcode(SPIRVValue *const spvValue) { SPIRVInstruction *const spvInst = static_cast(spvValue); 
std::vector spvOperands = spvInst->getOperands(); - assert(static_cast(spvOperands[0])->getZExtIntValue() == ScopeSubgroup); BasicBlock *const block = getBuilder()->GetInsertBlock(); Function *const func = getBuilder()->GetInsertBlock()->getParent(); Value *const value = transValue(spvOperands[2], func, block); - switch (static_cast(spvOperands[1])->getZExtIntValue()) { - case GroupOperationReduce: - return getBuilder()->CreateSubgroupBallotBitCount(value); - case GroupOperationInclusiveScan: - return getBuilder()->CreateSubgroupBallotInclusiveBitCount(value); - case GroupOperationExclusiveScan: - return getBuilder()->CreateSubgroupBallotExclusiveBitCount(value); - default: - llvm_unreachable("Should never be called!"); - return nullptr; + { + assert(static_cast(spvOperands[0])->getZExtIntValue() == ScopeSubgroup); + switch (static_cast(spvOperands[1])->getZExtIntValue()) { + case GroupOperationReduce: + return getBuilder()->CreateSubgroupBallotBitCount(value); + case GroupOperationInclusiveScan: + return getBuilder()->CreateSubgroupBallotInclusiveBitCount(value); + case GroupOperationExclusiveScan: + return getBuilder()->CreateSubgroupBallotExclusiveBitCount(value); + default: + llvm_unreachable("Should never be called!"); + return nullptr; + } } } @@ -3631,12 +3675,17 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode Value *SPIRVToLLVM::transValueWithOpcode(SPIRVValue *const spvValue) { SPIRVInstruction *const spvInst = static_cast(spvValue); std::vector spvOperands = spvInst->getOperands(); - assert(static_cast(spvOperands[0])->getZExtIntValue() == ScopeSubgroup); BasicBlock *const block = getBuilder()->GetInsertBlock(); Function *const func = getBuilder()->GetInsertBlock()->getParent(); Value *const value = transValue(spvOperands[1], func, block); - return getBuilder()->CreateSubgroupBallotFindLsb(value); + Value *result = nullptr; + + { + assert(static_cast(spvOperands[0])->getZExtIntValue() == ScopeSubgroup); + result = 
getBuilder()->CreateSubgroupBallotFindLsb(value); + } + return result; } // ===================================================================================================================== @@ -3646,12 +3695,17 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode Value *SPIRVToLLVM::transValueWithOpcode(SPIRVValue *const spvValue) { SPIRVInstruction *const spvInst = static_cast(spvValue); std::vector spvOperands = spvInst->getOperands(); - assert(static_cast(spvOperands[0])->getZExtIntValue() == ScopeSubgroup); BasicBlock *const block = getBuilder()->GetInsertBlock(); Function *const func = getBuilder()->GetInsertBlock()->getParent(); Value *const value = transValue(spvOperands[1], func, block); - return getBuilder()->CreateSubgroupBallotFindMsb(value); + Value *result = nullptr; + + { + assert(static_cast(spvOperands[0])->getZExtIntValue() == ScopeSubgroup); + result = getBuilder()->CreateSubgroupBallotFindMsb(value); + } + return result; } // ===================================================================================================================== @@ -3661,13 +3715,18 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode Value *SPIRVToLLVM::transValueWithOpcode(SPIRVValue *const spvValue) { SPIRVInstruction *const spvInst = static_cast(spvValue); std::vector spvOperands = spvInst->getOperands(); - assert(static_cast(spvOperands[0])->getZExtIntValue() == ScopeSubgroup); BasicBlock *const block = getBuilder()->GetInsertBlock(); Function *const func = getBuilder()->GetInsertBlock()->getParent(); Value *const value = transValue(spvOperands[1], func, block); Value *const index = transValue(spvOperands[2], func, block); - return getBuilder()->CreateSubgroupShuffle(value, index); + Value *result = nullptr; + + { + assert(static_cast(spvOperands[0])->getZExtIntValue() == ScopeSubgroup); + result = getBuilder()->CreateSubgroupShuffle(value, index); + } + return result; } // 
===================================================================================================================== @@ -3677,13 +3736,18 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(S template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIRVValue *const spvValue) { SPIRVInstruction *const spvInst = static_cast(spvValue); std::vector spvOperands = spvInst->getOperands(); - assert(static_cast(spvOperands[0])->getZExtIntValue() == ScopeSubgroup); BasicBlock *const block = getBuilder()->GetInsertBlock(); Function *const func = getBuilder()->GetInsertBlock()->getParent(); Value *const value = transValue(spvOperands[1], func, block); Value *const mask = transValue(spvOperands[2], func, block); - return getBuilder()->CreateSubgroupShuffleXor(value, mask); + Value *result = nullptr; + + { + assert(static_cast(spvOperands[0])->getZExtIntValue() == ScopeSubgroup); + result = getBuilder()->CreateSubgroupShuffleXor(value, mask); + } + return result; } // ===================================================================================================================== @@ -3693,13 +3757,18 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode Value *SPIRVToLLVM::transValueWithOpcode(SPIRVValue *const spvValue) { SPIRVInstruction *const spvInst = static_cast(spvValue); std::vector spvOperands = spvInst->getOperands(); - assert(static_cast(spvOperands[0])->getZExtIntValue() == ScopeSubgroup); BasicBlock *const block = getBuilder()->GetInsertBlock(); Function *const func = getBuilder()->GetInsertBlock()->getParent(); Value *const value = transValue(spvOperands[1], func, block); Value *const delta = transValue(spvOperands[2], func, block); - return getBuilder()->CreateSubgroupShuffleUp(value, delta); + Value *result = nullptr; + + { + assert(static_cast(spvOperands[0])->getZExtIntValue() == ScopeSubgroup); + result = getBuilder()->CreateSubgroupShuffleUp(value, delta); + } + return result; } // 
===================================================================================================================== @@ -3709,13 +3778,18 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIRVValue *const spvValue) { SPIRVInstruction *const spvInst = static_cast(spvValue); std::vector spvOperands = spvInst->getOperands(); - assert(static_cast(spvOperands[0])->getZExtIntValue() == ScopeSubgroup); BasicBlock *const block = getBuilder()->GetInsertBlock(); Function *const func = getBuilder()->GetInsertBlock()->getParent(); Value *const value = transValue(spvOperands[1], func, block); Value *const delta = transValue(spvOperands[2], func, block); - return getBuilder()->CreateSubgroupShuffleDown(value, delta); + Value *result = nullptr; + + { + assert(static_cast(spvOperands[0])->getZExtIntValue() == ScopeSubgroup); + result = getBuilder()->CreateSubgroupShuffleDown(value, delta); + } + return result; } // ===================================================================================================================== @@ -3865,7 +3939,7 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIRVV Function *const func = getBuilder()->GetInsertBlock()->getParent(); Value *const vertexCount = transValue(spvOperands[0], func, block); Value *const primitiveCount = transValue(spvOperands[1], func, block); - return getBuilder()->CreateSetMeshOutputs(vertexCount, primitiveCount); + return getBuilder()->create(vertexCount, primitiveCount); } // ===================================================================================================================== @@ -4688,7 +4762,7 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIRVValue // // @param spvValue : A SPIR-V value. 
template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIRVValue *const spvValue) { - if (m_shaderOptions->noContract) { + if (m_shaderOptions->noContractOpDot) { auto fmf = getBuilder()->getFastMathFlags(); fmf.setAllowContract(false); getBuilder()->setFastMathFlags(fmf); @@ -7249,6 +7323,7 @@ bool SPIRVToLLVM::translate(ExecutionModel entryExecModel, const char *entryName if (m_execModule >= ExecutionModelVertex && m_execModule <= ExecutionModelGeometry) hasXfbOuts = m_entryTarget->getExecutionMode(ExecutionModeXfb) != nullptr; + } else { createLibraryEntryFunc(); } @@ -7870,7 +7945,8 @@ bool SPIRVToLLVM::transShaderDecoration(SPIRVValue *bv, Value *v) { } // If dual source blend is dynamically set, need to confirm whether the fragment shader actually uses - // dual-source blending by checking if there is an output at Location 0, Index 1 + // dual-source blending by checking if there is an output at Location 0. If there isn't any output in + // index, the poison value will be exported. 
Llpc::Context *llpcContext = static_cast(m_context); if ((llpcContext->getPipelineType() == PipelineType::Graphics) && (m_execModule == spv::ExecutionModelFragment)) { auto *buildInfo = static_cast(llpcContext->getPipelineBuildInfo()); diff --git a/llpc/translator/lib/SPIRV/SPIRVToLLVMDbgTran.cpp b/llpc/translator/lib/SPIRV/SPIRVToLLVMDbgTran.cpp index 586bc52ae9..e6d4713528 100644 --- a/llpc/translator/lib/SPIRV/SPIRVToLLVMDbgTran.cpp +++ b/llpc/translator/lib/SPIRV/SPIRVToLLVMDbgTran.cpp @@ -82,7 +82,7 @@ void SPIRVToLLVMDbgTran::createCompilationUnit() { } DIFile *SPIRVToLLVMDbgTran::getDIFile(const string &FileName) { - return getOrInsert(FileMap, FileName, [=]() { + return getOrInsert(FileMap, FileName, [=, this]() { SplitFileName Split(FileName); return Builder.createFile(Split.BaseName, Split.Path); }); diff --git a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVInstruction.h b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVInstruction.h index 9cebae32fd..11fd9fb6c5 100644 --- a/llpc/translator/lib/SPIRV/libSPIRV/SPIRVInstruction.h +++ b/llpc/translator/lib/SPIRV/libSPIRV/SPIRVInstruction.h @@ -918,7 +918,7 @@ class SPIRVPhi : public SPIRVInstruction { assert(WordCount == Pairs.size() + FixedWordCount); assert(OpCode == OC); assert(Pairs.size() % 2 == 0); - foreachPair([=](SPIRVValue *IncomingV, SPIRVBasicBlock *IncomingBB) { + foreachPair([=, this](SPIRVValue *IncomingV, SPIRVBasicBlock *IncomingBB) { assert(IncomingV->isForward() || IncomingV->getType() == Type); assert(IncomingBB->isBasicBlock() || IncomingBB->isForward()); }); diff --git a/llpc/util/llpcCacheAccessor.cpp b/llpc/util/llpcCacheAccessor.cpp index 782ab216e8..2d0361c5bd 100644 --- a/llpc/util/llpcCacheAccessor.cpp +++ b/llpc/util/llpcCacheAccessor.cpp @@ -37,41 +37,14 @@ using namespace llvm; -namespace llvm { -namespace cl { -extern opt ShaderCacheMode; -} // namespace cl -} // namespace llvm - namespace Llpc { - -// 
===================================================================================================================== -// Access the given caches using the hash. -// -// @param context : The context that will give the caches from the application. -// @param hash : The hash for the entry to access. -// @param internalCaches : The internal caches to check. -CacheAccessor::CacheAccessor(Context *context, MetroHash::Hash &cacheHash, CachePair internalCaches) { - assert(context); - if (context->getPipelineType() == PipelineType::Graphics) { - const auto *pipelineInfo = reinterpret_cast(context->getPipelineBuildInfo()); - initializeUsingBuildInfo(pipelineInfo, cacheHash, internalCaches); - } else { - const auto *pipelineInfo = reinterpret_cast(context->getPipelineBuildInfo()); - initializeUsingBuildInfo(pipelineInfo, cacheHash, internalCaches); - } -} - // ===================================================================================================================== // Initializes the cache accessor to check the given caches. The caches can be nullptr. // // @param userCache : The ICache supplied by the application. nullptr if no cache is provided. -// @param userShaderCache : The shader cache supplied by the application. nullptr if no cache is provided. // @param internalCaches : The internal caches to check. 
-void CacheAccessor::initialize(Vkgc::ICache *userCache, IShaderCache *userShaderCache, CachePair internalCaches) { - m_internalCaches = internalCaches; - m_applicationCaches = {userCache, userShaderCache}; - resetShaderCacheTrackingData(); +void CacheAccessor::initialize(Vkgc::ICache *internalCache) { + m_internalCache = internalCache; m_cacheResult = Result::ErrorUnknown; m_cacheEntry = Vkgc::EntryHandle(); m_elf = {0, nullptr}; @@ -87,12 +60,8 @@ void CacheAccessor::lookUpInCaches(const MetroHash::Hash &hash) { Result cacheResult = Result::Unsupported; if (getInternalCache()) { - cacheResult = lookUpInCache(getInternalCache(), !getApplicationCache(), hashId); - if (cacheResult == Result::Success) - m_internalCacheHit = true; + cacheResult = lookUpInCache(getInternalCache(), true, hashId); } - if (getApplicationCache() && cacheResult != Result::Success) - cacheResult = lookUpInCache(getApplicationCache(), true, hashId); m_cacheResult = cacheResult; } @@ -119,73 +88,12 @@ Result CacheAccessor::lookUpInCache(Vkgc::ICache *cache, bool allocateOnMiss, co return cacheResult; } -// ===================================================================================================================== -// Looks for the given hash in the shader caches and sets the cache accessor state with the results. -// -// @param hash : The hash to look up. 
-void CacheAccessor::lookUpInShaderCaches(MetroHash::Hash &hash) { - ShaderCache *applicationCache = static_cast(getApplicationShaderCache()); - ShaderCache *internalCache = static_cast(getInternalShaderCache()); - bool usingApplicationCache = applicationCache && cl::ShaderCacheMode != ShaderCacheForceInternalCacheOnDisk; - if (internalCache) { - if (lookUpInShaderCache(hash, !usingApplicationCache, internalCache)) - return; - } - if (usingApplicationCache) { - if (lookUpInShaderCache(hash, true, applicationCache)) - return; - } - resetShaderCacheTrackingData(); -} - -// ===================================================================================================================== -// Set to entry tracking the shader cache to indicate that it is not tracking any shader cache entry. -void CacheAccessor::resetShaderCacheTrackingData() { - m_shaderCache = nullptr; - m_shaderCacheEntry = nullptr; - m_shaderCacheEntryState = ShaderEntryState::New; -} - -// ===================================================================================================================== -// Looks for the given hash in the given shader cache and sets the cache accessor state with the results. A new entry -// will be allocated if there is a cache miss and allocateOnMiss is true. -// -// @param hash : The hash to look up. -// @param allocateOnMiss : Will add an entry to the cache on a miss if true. -// @param cache : The cache in with to look. 
-bool CacheAccessor::lookUpInShaderCache(const MetroHash::Hash &hash, bool allocateOnMiss, ShaderCache *cache) { - CacheEntryHandle currentEntry; - ShaderEntryState cacheEntryState = cache->findShader(hash, allocateOnMiss, ¤tEntry); - if (cacheEntryState == ShaderEntryState::Ready) { - Result result = cache->retrieveShader(currentEntry, &m_elf.pCode, &m_elf.codeSize); - if (result == Result::Success) { - m_shaderCacheEntryState = ShaderEntryState::Ready; - return true; - } - } else if (cacheEntryState == ShaderEntryState::Compiling) { - m_shaderCache = cache; - m_shaderCacheEntry = currentEntry; - m_shaderCacheEntryState = ShaderEntryState::Compiling; - return true; - } - return false; -} - // ===================================================================================================================== // Sets the ELF entry for the hash on a cache miss. Does nothing if there was a cache hit or the ELF has already been // set. // // @param elf : The binary encoding of the elf to place in the cache. void CacheAccessor::setElfInCache(BinaryData elf) { - if (m_shaderCacheEntryState == ShaderEntryState::Compiling && m_shaderCacheEntry) { - updateShaderCache(elf); - if (m_shaderCache->retrieveShader(m_shaderCacheEntry, &m_elf.pCode, &m_elf.codeSize) == Result::Success) { - m_shaderCacheEntryState = ShaderEntryState::Ready; - } else { - return; - } - } - if (!m_cacheEntry.IsEmpty()) { m_cacheResult = Result::ErrorUnknown; if (elf.pCode) { @@ -197,23 +105,4 @@ void CacheAccessor::setElfInCache(BinaryData elf) { } } -// ===================================================================================================================== -// Updates the entry in the shader cache, if there is one, for this access. -// -// @param elf : The binary encoding of the elf to place in the cache. 
-void CacheAccessor::updateShaderCache(BinaryData &elf) { - ShaderCache *shaderCache = m_shaderCache; - if (!m_shaderCacheEntry) - return; - - if (!shaderCache) - shaderCache = static_cast(getInternalShaderCache()); - - if (elf.pCode) { - assert(elf.codeSize > 0); - shaderCache->insertShader(m_shaderCacheEntry, elf.pCode, elf.codeSize); - } else - shaderCache->resetShader(m_shaderCacheEntry); -} - } // namespace Llpc diff --git a/llpc/util/llpcCacheAccessor.h b/llpc/util/llpcCacheAccessor.h index 4f9c9b4da1..afa770db68 100644 --- a/llpc/util/llpcCacheAccessor.h +++ b/llpc/util/llpcCacheAccessor.h @@ -32,7 +32,6 @@ #pragma once #include "llpc.h" -#include "llpcShaderCache.h" #include "vkgcMetroHash.h" #include "llvm/Support/CommandLine.h" @@ -40,11 +39,6 @@ namespace Llpc { class Context; -struct CachePair { - Vkgc::ICache *cache = nullptr; - IShaderCache *shaderCache = nullptr; -}; - class CacheAccessor { public: // Checks the caches in the build info and the internal caches for an entry with the given hash. @@ -52,115 +46,62 @@ class CacheAccessor { // @param buildInfo : The build information that will give the caches from the application. // @param hash : The hash for the entry to access. // @param internalCaches : The internal caches to check. 
- template CacheAccessor(BuildInfo *buildInfo, MetroHash::Hash &cacheHash, CachePair internalCaches) { - initializeUsingBuildInfo(buildInfo, cacheHash, internalCaches); + CacheAccessor(MetroHash::Hash &cacheHash, Vkgc::ICache *internalCache) { + initializeUsingBuildInfo(cacheHash, internalCache); } CacheAccessor(CacheAccessor &&ca) { *this = std::move(ca); } CacheAccessor &operator=(CacheAccessor &&ca) { - m_applicationCaches = ca.m_applicationCaches; - m_internalCaches = ca.m_internalCaches; - m_shaderCacheEntryState = ca.m_shaderCacheEntryState; - m_shaderCacheEntry = ca.m_shaderCacheEntry; - m_shaderCache = ca.m_shaderCache; + m_internalCache = ca.m_internalCache; m_cacheResult = ca.m_cacheResult; m_cacheEntry = std::move(ca.m_cacheEntry); m_elf = ca.m_elf; // Reinitialize ca with not caches. It needs to be in an appropriate state for the destructor. - ca.initialize(nullptr, nullptr, {nullptr, nullptr}); + ca.initialize(nullptr); return *this; } - CacheAccessor(Context *context, MetroHash::Hash &cacheHash, CachePair internalCaches); - // Finalizes the cache access by releasing any handles that need to be released. ~CacheAccessor() { setElfInCache({0, nullptr}); } // Returns true of the entry was in at least on of the caches or has been added to the cache. - bool isInCache() const { - return m_cacheResult == Result::Success || m_shaderCacheEntryState == ShaderEntryState::Ready; - } + bool isInCache() const { return m_cacheResult == Result::Success; } // Returns the ELF that was found in the cache. BinaryData getElfFromCache() const { return m_elf; } void setElfInCache(BinaryData elf); - // Returns true if there was a cache hit in an internal cache. 
- bool hitInternalCache() const { - if (!isInCache()) - return false; - if (m_cacheResult == Result::Success) { - return m_internalCacheHit; - } - return getApplicationShaderCache() == m_shaderCache; - } - private: CacheAccessor() = delete; CacheAccessor(const CacheAccessor &) = delete; CacheAccessor &operator=(const CacheAccessor &) = delete; - const Vkgc::ICache *getApplicationCache() const { return m_applicationCaches.cache; } - const IShaderCache *getApplicationShaderCache() const { return m_applicationCaches.shaderCache; } - const Vkgc::ICache *getInternalCache() const { return m_internalCaches.cache; } - const IShaderCache *getInternalShaderCache() const { return m_internalCaches.shaderCache; } - Vkgc::ICache *getApplicationCache() { return m_applicationCaches.cache; } - IShaderCache *getApplicationShaderCache() { return m_applicationCaches.shaderCache; } - Vkgc::ICache *getInternalCache() { return m_internalCaches.cache; } - IShaderCache *getInternalShaderCache() { return m_internalCaches.shaderCache; } + const Vkgc::ICache *getInternalCache() const { return m_internalCache; } + Vkgc::ICache *getInternalCache() { return m_internalCache; } // Access the given caches using the hash. // // @param buildInfo : The build info object that the caches from the application. // @param hash : The hash for the entry to access. // @param internalCaches : The internal caches to check. 
- template - void initializeUsingBuildInfo(const BuildInfo *buildInfo, MetroHash::Hash &hash, CachePair internalCaches) { - assert(buildInfo); - Vkgc::ICache *userCache = buildInfo->cache; - - IShaderCache *userShaderCache = nullptr; -#if LLPC_ENABLE_SHADER_CACHE - userShaderCache = reinterpret_cast(buildInfo->pShaderCache); -#endif - - initialize(userCache, userShaderCache, internalCaches); + void initializeUsingBuildInfo(MetroHash::Hash &hash, Vkgc::ICache *internalCache) { + initialize(internalCache); lookUpInCaches(hash); - if (m_cacheResult != Result::Success) - lookUpInShaderCaches(hash); } - void initialize(Vkgc::ICache *userCache, IShaderCache *userShaderCache, CachePair internalCaches); + void initialize(Vkgc::ICache *internalCache); void lookUpInCaches(const MetroHash::Hash &hash); Result lookUpInCache(Vkgc::ICache *cache, bool allocateOnMiss, const Vkgc::HashId &hashId); - void lookUpInShaderCaches(MetroHash::Hash &hash); - bool lookUpInShaderCache(const MetroHash::Hash &hash, bool allocateOnMiss, ShaderCache *cache); - void updateShaderCache(BinaryData &elf); - void resetShaderCacheTrackingData(); - - CachePair m_applicationCaches; - CachePair m_internalCaches; - - // The state of the shader cache look up. - ShaderEntryState m_shaderCacheEntryState = ShaderEntryState::New; - - // The handle to the entry in the shader cache. - CacheEntryHandle m_shaderCacheEntry = nullptr; - - // The shader cache that the entry refers to. - ShaderCache *m_shaderCache = nullptr; + Vkgc::ICache *m_internalCache; // The result of checking the ICache. Result m_cacheResult = Result::ErrorUnknown; - // Whether the cache hit came from the internal cache. - bool m_internalCacheHit = false; - // The handle to the entry in the cache. 
Vkgc::EntryHandle m_cacheEntry; diff --git a/llpc/util/llpcShaderModuleHelper.cpp b/llpc/util/llpcShaderModuleHelper.cpp index e8257b2567..0808926af3 100644 --- a/llpc/util/llpcShaderModuleHelper.cpp +++ b/llpc/util/llpcShaderModuleHelper.cpp @@ -84,8 +84,16 @@ ShaderModuleUsage ShaderModuleHelper::getShaderModuleUsageInfo(const BinaryData } case OpExtInst: { auto extInst = static_cast(codePos[4]); - if (extInst == GLSLstd450InterpolateAtSample) { + switch (extInst) { + case GLSLstd450InterpolateAtSample: shaderModuleUsage.useSampleInfo = true; + break; + case GLSLstd450NMin: + case GLSLstd450NMax: + shaderModuleUsage.useIsNan = true; + break; + default: + break; } break; } @@ -96,6 +104,21 @@ ShaderModuleUsage ShaderModuleHelper::getShaderModuleUsageInfo(const BinaryData } break; } + case OpExecutionMode: { + auto execMode = static_cast(codePos[2]); + switch (execMode) { + case ExecutionModeOriginUpperLeft: + shaderModuleUsage.originUpperLeft = true; + break; + case ExecutionModePixelCenterInteger: + shaderModuleUsage.pixelCenterInteger = true; + break; + default: { + break; + } + } + break; + } case OpDecorate: case OpMemberDecorate: { auto decoration = @@ -119,6 +142,18 @@ ShaderModuleUsage ShaderModuleHelper::getShaderModuleUsageInfo(const BinaryData shaderModuleUsage.useSampleInfo = true; break; } + case BuiltInFragCoord: { + shaderModuleUsage.useFragCoord = true; + break; + } + case BuiltInPointCoord: + case BuiltInPrimitiveId: + case BuiltInLayer: + case BuiltInClipDistance: + case BuiltInCullDistance: { + shaderModuleUsage.useGenericBuiltIn = true; + break; + } default: { break; } diff --git a/llpc/util/llpcUtil.h b/llpc/util/llpcUtil.h index a614509a98..dea84ac134 100644 --- a/llpc/util/llpcUtil.h +++ b/llpc/util/llpcUtil.h @@ -132,7 +132,7 @@ const char *getUnlinkedShaderStageName(Vkgc::UnlinkedShaderStage type); const char *getPartPipelineStageName(Vkgc::PartPipelineStage type); // Returns the uniform constant map entry of the given location. 
-Vkgc::UniformConstantMapEntry *getUniformConstantEntryByLocation(const Llpc::Context *context, Vkgc::ShaderStage stage, +Vkgc::UniformConstantMapEntry *getUniformConstantEntryByLocation(const Llpc::Context *contex, Vkgc::ShaderStage stage, unsigned loc); inline bool doesShaderStageExist(llvm::ArrayRef shaderInfo, ShaderStage stage) { diff --git a/shared/README.md b/shared/README.md index 67432edfad..7afd8a804c 100644 --- a/shared/README.md +++ b/shared/README.md @@ -3,4 +3,5 @@ This repository contains code that is shared between different graphics compilers and drivers. - [Continuations](./continuations) contains a collection of passes to convert shaders to coroutines. +- [CompilerUtils](./compilerutils) is a library of helpers for frontendy functionality shared by different compiler stacks. - [LgcRt](./lgcrt) contains the lgc.rt dialect definition and helpers. diff --git a/shared/continuations/CMakeLists.txt b/shared/continuations/CMakeLists.txt index 37bc709d66..e615b7e627 100644 --- a/shared/continuations/CMakeLists.txt +++ b/shared/continuations/CMakeLists.txt @@ -12,7 +12,6 @@ function(set_compiler_options PROJECT_NAME) endfunction() add_llvm_library(LLVMContinuations - lib/AddTypesMetadata.cpp lib/CleanupContinuations.cpp lib/ContinuationsDialect.cpp lib/ContinuationsUtil.cpp @@ -23,6 +22,7 @@ add_llvm_library(LLVMContinuations lib/DXILContPreCoroutine.cpp lib/DXILMetadata.cpp lib/DXILSupport.cpp + lib/LegacyCleanupContinuations.cpp lib/LowerAwait.cpp lib/LowerRaytracingPipeline.cpp lib/PassRegistry.inc @@ -38,6 +38,7 @@ add_llvm_library(LLVMContinuations LINK_COMPONENTS Analysis Core + Coroutines IPO Scalar Support @@ -50,11 +51,11 @@ target_include_directories(LLVMContinuations PUBLIC $ ) -target_link_libraries(LLVMContinuations PUBLIC llvm_dialects PRIVATE LLVMLgcRt) +target_link_libraries(LLVMContinuations PUBLIC llvm_dialects PRIVATE LLVMLgcRt LLVMLgcCps) set_compiler_options(LLVMContinuations) # TableGen for continuations dialect 
-set(CONTINUATIONS_TABLEGEN_EXE llvm-dialects-tblgen) +set(CONTINUATIONS_TABLEGEN_EXE $) set(CONTINUATIONS_TABLEGEN_TARGET llvm-dialects-tblgen) set(LLVM_TARGET_DEFINITIONS include/continuations/ContinuationsDialect.td) set(LLVM_TARGET_DEPENDS continuations) @@ -69,7 +70,7 @@ tablegen(CONTINUATIONS ContinuationsDialect.cpp.inc -gen-dialect-defs --dialect EXTRA_INCLUDES ${CMAKE_CURRENT_SOURCE_DIR}/../../imported/llvm-dialects/include) add_public_tablegen_target(ContinuationsDialectTableGen) -add_dependencies(LLVMContinuations ContinuationsDialectTableGen LgcRtDialectTableGen) +add_dependencies(LLVMContinuations ContinuationsDialectTableGen LgcRtDialectTableGen LgcCpsDialectTableGen) target_compile_features(LLVMContinuations PUBLIC cxx_std_17) set_target_properties(LLVMContinuations PROPERTIES CXX_EXTENSIONS OFF) diff --git a/shared/continuations/include/continuations/Continuations.h b/shared/continuations/include/continuations/Continuations.h index 19ad2a24b4..d8cfbad577 100644 --- a/shared/continuations/include/continuations/Continuations.h +++ b/shared/continuations/include/continuations/Continuations.h @@ -257,14 +257,14 @@ class DialectContextAnalysis bool NeedDialectContext; }; -class CleanupContinuationsPass - : public llvm::PassInfoMixin { +class LegacyCleanupContinuationsPass + : public llvm::PassInfoMixin { public: - CleanupContinuationsPass(); + LegacyCleanupContinuationsPass(); llvm::PreservedAnalyses run(llvm::Module &Module, llvm::ModuleAnalysisManager &AnalysisManager); - static llvm::StringRef name() { return "continuation cleanup"; } + static llvm::StringRef name() { return "legacy continuation cleanup"; } private: struct ContinuationData { @@ -306,6 +306,46 @@ class CleanupContinuationsPass uint32_t MaxContStateBytes; }; +class CleanupContinuationsPass + : public llvm::PassInfoMixin { +public: + CleanupContinuationsPass(); + llvm::PreservedAnalyses run(llvm::Module &Module, + llvm::ModuleAnalysisManager &AnalysisManager); + + static llvm::StringRef 
name() { return "continuation cleanup"; } + +private: + struct ContinuationData { + /// All functions belonging to this continuation, the entry function is the + /// first one + SmallVector Functions; + /// Size of the continuation state in byte + uint32_t ContStateBytes = 0; + CallInst *MallocCall = nullptr; + MDNode *MD = nullptr; + SmallVector NewFunctions; + }; + + void removeContFreeCall(Function *F, Function *ContFree); + Value *getContinuationFramePtr(Function *F, bool IsStart, + const ContinuationData &ContinuationInfo, + SmallVector &InstsToRemove); + void freeCpsStack(Function *F, ContinuationData &CpsInfo); + void updateCpsStack(Function *F, Function *NewFunc, bool IsStart, + ContinuationData &CpsInfo); + void analyzeContinuation(Function &F, MDNode *MD); + void processContinuations(); + void handleContinue(ContinuationData &Data, Instruction *Ret); + void handleSingleContinue(ContinuationData &Data, CallInst *Call, + Value *ResumeFun); + + llvm_dialects::Builder *Builder; + Function *ContMalloc; + Function *ContFree; + MapVector ToProcess; + uint32_t MaxContStateBytes; +}; class LowerRaytracingPipelinePass : public llvm::PassInfoMixin { public: @@ -501,13 +541,21 @@ class DXILCoroSplitPass : public CoroSplitPass { } }; -// Pass to add !types metadata to function definitions and declarations -class AddTypesMetadataPass : public llvm::PassInfoMixin { +// Rematerializable callback specific to LgcCps - mainly used to extend what's +// considered rematerializable for continuations +bool LgcMaterializable(Instruction &I); + +// Define a wrapper pass that is used for testing using opt (lgc-coro-split vs +// coro-split) +class LgcCoroSplitPass : public CoroSplitPass { public: - llvm::PreservedAnalyses run(llvm::Module &Module, - llvm::ModuleAnalysisManager &AnalysisManager); + LgcCoroSplitPass() + : CoroSplitPass(std::function(&LgcMaterializable), + true) {} - static llvm::StringRef name() { return "Add types metadata"; } + static llvm::StringRef name() { + 
return "Lgc continuations coro split pass wrapper"; + } }; // Pass to remove !types metadata from function definitions and declarations @@ -552,6 +600,9 @@ class DXILContLgcRtOpConverterPass void applyPayloadMetadataTypesOnShaders(); }; +/// Add necessary continuation transform passes for LGC. +void addLgcContinuationTransform(ModulePassManager &MPM); + /// LLVM parser callback which adds !types metadata during DXIL parsing void DXILValueTypeMetadataCallback(Value *V, unsigned TypeID, GetTypeByIDTy GetTypeByID, diff --git a/shared/continuations/include/continuations/ContinuationsUtil.h b/shared/continuations/include/continuations/ContinuationsUtil.h index 1561131604..c733c8313a 100644 --- a/shared/continuations/include/continuations/ContinuationsUtil.h +++ b/shared/continuations/include/continuations/ContinuationsUtil.h @@ -97,8 +97,15 @@ const unsigned FirstPayloadHitAttributeStorageRegister = 1; /// = D3D12_RAYTRACING_MAX_ATTRIBUTE_SIZE_IN_BYTES /// Smaller limits may be specified in metadata. const unsigned GlobalMaxHitAttributeBytes = 32; -/// The minimum size for the pre-allocated continuation state is the size of a -/// pointer. +/// We tell the LLVM coroutine passes the size of a preallocated buffer +/// for the continuation state that can be used without dynamic allocations. +/// If the continuation state is larger, coroutine passes will use a special +/// malloc call that will be replaced later. If we find the malloc, we know +/// the exact continuation state size. If we don't find a malloc, but there +/// are usages of the frame pointer, we need to pessimistically assume +/// that the full size is required. +/// TODO: Figure out whether we can pass a fixed size of 0, eliminating +/// this pessimism. 
const unsigned MinimumContinuationStateBytes = 8; struct DxRayIntrinsic { @@ -197,7 +204,6 @@ class DXILContFuncTy { DXILContArgTy ReturnTy; SmallVector ArgTys; - static DXILContFuncTy get(const FunctionType *FuncTy); static DXILContFuncTy get(const Function *F); static DXILContFuncTy get(const Metadata *MD, LLVMContext &Context); @@ -438,6 +444,24 @@ class DXILContHelper { return extractZExtI32Constant(F.getMetadata(MDMaxPayloadBytesName)); } + static void setStackSize(Function *F, uint32_t StackSize) { + F->setMetadata(MDStackSizeName, + getI32MDConstant(F->getContext(), StackSize)); + } + + // If the function already has stacksize metadata, add the given value. + // Otherwise, assume an existing value of zero, and set the pass value. + static void addStackSize(Function *F, uint32_t AddedStackSize) { + auto ExistingSize = tryGetStackSize(F).value_or(0); + F->setMetadata( + MDStackSizeName, + getI32MDConstant(F->getContext(), ExistingSize + AddedStackSize)); + } + + static std::optional tryGetStackSize(const Function *F) { + return extractZExtI32Constant(F->getMetadata(MDStackSizeName)); + } + // If there is module-level metadata specifying the stack addrspace, // return that value. Otherwise, return std::nullopt. static std::optional @@ -470,15 +494,6 @@ class DXILContHelper { return extractZExtI32Constant(F.getMetadata(MDStateName)); } - static Function *getAliasedFunction(Module &M, StringRef Name) { - llvm::Constant *FuncOrAlias = M.getNamedValue(Name); - if (!FuncOrAlias) - return nullptr; - while (auto *Alias = dyn_cast(FuncOrAlias)) - FuncOrAlias = Alias->getAliasee(); - return dyn_cast(FuncOrAlias); - } - static bool isTraversal(Function &F) { // TODO: Make this more robust somehow, restricting to library functions. 
return F.getName().contains("Traversal"); diff --git a/shared/continuations/include/continuations/LowerRaytracingPipeline.h b/shared/continuations/include/continuations/LowerRaytracingPipeline.h index c1f1163ba1..4fd24ecb83 100644 --- a/shared/continuations/include/continuations/LowerRaytracingPipeline.h +++ b/shared/continuations/include/continuations/LowerRaytracingPipeline.h @@ -288,6 +288,11 @@ class LowerRaytracingPipelinePassImpl final { SmallVector Awaits; SmallVector RestoreSystemDatas; SmallVector EntriesWithPayloadTypeMetadata; + + // We specialize certain intrinsics that lead to suspend-points (TraceRay, + // CallShader, ReportHit) based on the payload or hit attribute type. + // We store these types (either payload or hit attribute) here for later use. + DenseMap PayloadOrAttrTypesForSpecializedFunctions; }; } // namespace llvm diff --git a/shared/continuations/lib/AddTypesMetadata.cpp b/shared/continuations/lib/AddTypesMetadata.cpp deleted file mode 100644 index 6f1ae03e49..0000000000 --- a/shared/continuations/lib/AddTypesMetadata.cpp +++ /dev/null @@ -1,65 +0,0 @@ -/* - *********************************************************************************************************************** - * - * Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - *deal in the Software without restriction, including without limitation the - *rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - *sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - *all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - *FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - *IN THE SOFTWARE. - * - **********************************************************************************************************************/ - -//===- AddTypesMetadata.cpp - Build !types metadata -----------------------===// -// -// A pass that adds !types metadata to functions representing their argument -// types. -// This provides for transitioning IR to opaque pointers by embedding the -// required pointer typing information in metadata. -// -//===----------------------------------------------------------------------===// - -#include "continuations/Continuations.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Type.h" -#include "llvm/InitializePasses.h" -#include - -using namespace llvm; - -#define DEBUG_TYPE "add-types-metadata" - -llvm::PreservedAnalyses -AddTypesMetadataPass::run(llvm::Module &M, - llvm::ModuleAnalysisManager &AnalysisManager) { - LLVM_DEBUG(dbgs() << "Run add-types-metadata pass\n"); - - bool Changed = false; - for (Function &F : M) { - // Skip functions which have already been annotated - if (F.hasMetadata("types")) - continue; - DXILContFuncTy::get(F.getFunctionType()).writeMetadata(&F); - Changed = true; - } - - if (Changed) - return PreservedAnalyses::none(); - return PreservedAnalyses::all(); -} diff --git a/shared/continuations/lib/CleanupContinuations.cpp b/shared/continuations/lib/CleanupContinuations.cpp index 5cb5803b1d..a7c136d59c 100644 --- 
a/shared/continuations/lib/CleanupContinuations.cpp +++ b/shared/continuations/lib/CleanupContinuations.cpp @@ -47,20 +47,22 @@ // // Convert the result from the coroutine passes to something more suitable for // the compiler backend. -// -// Instead of return values, use continue, waitContinue and complete intrinsics. -// Add arguments to resume functions, which are the return values of the called -// continuation. -// -// Add a global register buffer to store the continuation state. -// +// 1. Replace returning handle with lgc.cps.jump() with the right continuation +// reference. +// 2. Replace @continuation.return with simple `ret`, which means thread +// termination. +// 3. Edit function signatures, like removing coroutine frame pointer argument, +// adding needed ones (state, rcr, returned_values) for resume function. +// 4. Allocating/freeing cps stack space as needed. //===----------------------------------------------------------------------===// #include "continuations/Continuations.h" #include "continuations/ContinuationsDialect.h" +#include "lgccps/LgcCpsDialect.h" +#include "llvm-dialects/Dialect/Visitor.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" @@ -69,6 +71,7 @@ #include using namespace llvm; +using namespace lgc; #define DEBUG_TYPE "cleanup-continuations" @@ -145,17 +148,6 @@ findTokenOrigin(BasicBlock *BB, Value *V, return Result; } -/// Create a memcopy of an array, which the translator understands -static void createCopy(IRBuilder<> &B, Value *Dst, Value *Src, Type *Ty) { - assert(Ty->isArrayTy() && "Can only copy arrays"); - for (unsigned I = 0; I < Ty->getArrayNumElements(); I++) { - auto *SrcGep = B.CreateConstInBoundsGEP2_32(Ty, Src, 0, I); - auto *DstGep = B.CreateConstInBoundsGEP2_32(Ty, Dst, 0, I); - auto *Load = B.CreateLoad(Ty->getArrayElementType(), SrcGep); - 
B.CreateStore(Load, DstGep); - } -} - void CleanupContinuationsPass::analyzeContinuation(Function &F, MDNode *MD) { // Only analyze main continuation auto *MDTup = cast(MD); @@ -191,24 +183,236 @@ void CleanupContinuationsPass::analyzeContinuation(Function &F, MDNode *MD) { MaxContStateBytes = Data.ContStateBytes; } -void CleanupContinuationsPass::processContinuations() { - auto &Context = M->getContext(); - auto *Void = Type::getVoidTy(Context); +void CleanupContinuationsPass::updateCpsStack(Function *F, Function *NewFunc, + bool IsStart, + ContinuationData &CpsInfo) { + + Builder->SetInsertPoint( + &*NewFunc->getEntryBlock().getFirstNonPHIOrDbgOrAlloca()); + Value *CpsStack = nullptr; + if (IsStart) { + CpsStack = Builder->create( + Builder->getInt32(CpsInfo.ContStateBytes)); + } else { + CpsStack = + Builder->create(Builder->getInt32(CpsInfo.ContStateBytes)); + } + + SmallVector ToBeRemoved; + Value *OldBase = getContinuationFramePtr(F, IsStart, CpsInfo, ToBeRemoved); + + // Traversal through the users and setup the addrspace for the cps stack + // pointers. + // TODO: Investigate whether we can do this through mutateType(). + SmallVector Worklist(OldBase->users()); + DenseMap Replaced; + Replaced.insert(std::pair(OldBase, CpsStack)); + + while (!Worklist.empty()) { + Value *Ptr = Worklist.pop_back_val(); + + Instruction *Inst = cast(Ptr); + switch (Inst->getOpcode()) { + default: + LLVM_DEBUG(Inst->dump()); + llvm_unreachable("Unhandled instruction\n"); + break; + case Instruction::Call: { + if (Inst->isLifetimeStartOrEnd()) { + // The lifetime marker is not useful anymore. + Inst->eraseFromParent(); + } else { + LLVM_DEBUG(Inst->dump()); + llvm_unreachable("Unhandled call instruction\n"); + } + // No further processing needed for the users. + continue; + } + case Instruction::Load: + case Instruction::Store: + Ptr = getLoadStorePointerOperand(Inst); + Inst->replaceUsesOfWith(Ptr, Replaced.at(Ptr)); + // No further processing needed for the users. 
+ continue; + case Instruction::And: + case Instruction::Add: + break; + case Instruction::AddrSpaceCast: + Replaced.insert(std::pair(Inst, Replaced.at(Inst))); + ToBeRemoved.push_back(Inst); + break; + case Instruction::PtrToInt: { + Builder->SetInsertPoint(Inst); + auto *NewInst = Builder->CreatePtrToInt(Replaced.at(Inst->getOperand(0)), + Inst->getType()); + Replaced.insert(std::pair(Inst, NewInst)); + ToBeRemoved.push_back(Inst); + break; + } + case Instruction::IntToPtr: { + Builder->SetInsertPoint(Inst); + auto *NewInst = Builder->CreateIntToPtr( + Inst->getOperand(0), Builder->getPtrTy(lgc::cps::stackAddrSpace)); + Replaced.insert(std::pair(Inst, NewInst)); + ToBeRemoved.push_back(Inst); + break; + } + case Instruction::GetElementPtr: { + GetElementPtrInst *GEP = cast(Ptr); + Builder->SetInsertPoint(GEP); + SmallVector Indexes(GEP->idx_begin(), GEP->idx_end()); + + Value *NewGEP = nullptr; + auto *NewPtr = Replaced.at(GEP->getPointerOperand()); + auto *ElemTy = GEP->getSourceElementType(); + if (GEP->isInBounds()) + NewGEP = Builder->CreateInBoundsGEP(ElemTy, NewPtr, Indexes); + else + NewGEP = Builder->CreateGEP(ElemTy, NewPtr, Indexes); + Replaced.insert(std::pair(GEP, NewGEP)); + cast(NewGEP)->copyMetadata(*GEP); + ToBeRemoved.push_back(GEP); + break; + } + } + + Worklist.append(Ptr->users().begin(), Ptr->users().end()); + } + for (auto *I : reverse(ToBeRemoved)) + I->eraseFromParent(); +} + +static void updateCpsFunctionArgs(Function *OldFunc, Function *NewFunc, + const SmallVector &AllArgValues) { + // Set arg names for new function + for (unsigned Idx = 0; Idx != NewFunc->getFunctionType()->params().size(); + ++Idx) { + Argument *Arg = NewFunc->getArg(Idx); + Value *OldVal = AllArgValues[Idx]; + if (OldVal) { + Arg->setName(OldVal->getName()); + OldVal->replaceAllUsesWith(Arg); + } + } +} + +static void buildCpsArgInfos(Function *F, bool IsStart, + SmallVector &AllArgTypes, + SmallVector &AllArgValues, + SmallVector &ParamAttrs, + SmallVector 
&InstsToRemove) { + + auto &Context = F->getContext(); + AttributeList FAttrs = F->getAttributes(); + if (IsStart) { + unsigned ArgNo = 0; + assert(F->arg_size() >= 1 && "Entry function has at least one argument"); + // Use all arguments except the last (pre-allocated buffer for the + // coroutine passes) for the continuation start + for (auto Arg = F->arg_begin(), ArgEnd = F->arg_end() - 1; Arg != ArgEnd; + Arg++) { + AllArgTypes.push_back(Arg->getType()); + AllArgValues.push_back(Arg); + ParamAttrs.push_back(FAttrs.getParamAttrs(ArgNo)); + ArgNo++; + } + } else { + // Add extra arguments ({} %state, i32 %rcr) for resume part. But for now, + // we always use continuation stack to pass continuation state. + AllArgTypes.push_back(StructType::get(Context, {})); + AllArgValues.push_back(nullptr); + AllArgTypes.push_back(IntegerType::get(Context, 32)); + AllArgValues.push_back(nullptr); + + // Find arguments from continuation.returnvalue calls + for (auto &I : F->getEntryBlock()) { + if (auto *Intr = dyn_cast(&I)) { + AllArgTypes.push_back(Intr->getType()); + AllArgValues.push_back(Intr); + InstsToRemove.push_back(Intr); + } + } + } +} + +/// Find the continuation state pointer, either returned by the malloc or +/// given as an argument +Value *CleanupContinuationsPass::getContinuationFramePtr( + Function *F, bool IsStart, const ContinuationData &ContinuationInfo, + SmallVector &InstsToRemove) { + if (!ContinuationInfo.MallocCall) + return IsStart ? F->getArg(F->arg_size() - 1) : F->getArg(0); + + if (IsStart) { + InstsToRemove.push_back(ContinuationInfo.MallocCall); + + auto *BufferArg = F->getArg(F->arg_size() - 1); + auto *Store = cast(BufferArg->getUniqueUndroppableUser()); + // Erase immediately to make later continuation stack setup easy. 
+ Store->eraseFromParent(); + return ContinuationInfo.MallocCall; + } + // Look for the load of the allocated pointer + Instruction *Load = + cast(F->getArg(0)->getUniqueUndroppableUser()); + InstsToRemove.push_back(Load); // Load needs to be eliminated + return Load; +} + +/// Remove call to continuation.free() in F, ContFree is the pointer to +/// declaration of continuation.free(). +void CleanupContinuationsPass::removeContFreeCall(Function *F, + Function *ContFree) { + for (auto *User : make_early_inc_range(ContFree->users())) { + if (auto *Call = dyn_cast(User)) { + if (Call->getFunction() == F) { + Call->eraseFromParent(); + break; + } + } + } +} + +/// Insert cps.free() before the original function exits. +/// Note: we skip the cps.free() insertion before calls to @continuation.return. +/// Because this is not useful any more as it means the thread termination. +void CleanupContinuationsPass::freeCpsStack(Function *F, + ContinuationData &CpsInfo) { + struct VisitState { + ContinuationData &CpsInfo; + llvm_dialects::Builder *Builder; + Function *F; + }; + VisitState State = {CpsInfo, Builder, F}; + static const auto Visitor = + llvm_dialects::VisitorBuilder() + .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) + .add([](auto &State, auto &Jump) { + if (Jump.getFunction() == State.F) { + State.Builder->SetInsertPoint(&Jump); + State.Builder->template create( + State.Builder->getInt32(State.CpsInfo.ContStateBytes)); + } + }) + .build(); + Visitor.visit(State, *F); +} +void CleanupContinuationsPass::processContinuations() { + // Summarize of what to do here: + // 1. Continuation Stack + // a.) cps.alloc() in start, and cps.peek() cps.free() in resume. + // b.) change the address space for cps stack to 32. + // 2. prepare arguments passed to cps.jump and insert the call at the exit of + // start part. + // 3. Edit resume signature to add the state/rcr/returnvalues. 
for (auto &FuncData : ToProcess) { LLVM_DEBUG(dbgs() << "Processing function: " << FuncData.first->getName() << "\n"); - bool IsEntry = FuncData.first->hasMetadata(DXILContHelper::MDEntryName); - for (auto *F : FuncData.second.Functions) { - if (F != FuncData.first) { - // Entry marker should only be on the start and not on resume functions - F->eraseMetadata(Context.getMDKindID(DXILContHelper::MDEntryName)); - // Same for stacksize - F->eraseMetadata(Context.getMDKindID(DXILContHelper::MDStackSizeName)); - // Set same linkage as for start function + // Set same linkage as for start function + if (F != FuncData.first) F->setLinkage(FuncData.first->getLinkage()); - } // Ignore the stub created for the coroutine passes if (F->empty()) @@ -217,89 +421,30 @@ void CleanupContinuationsPass::processContinuations() { LLVM_DEBUG(dbgs() << "Processing function part: " << F->getName() << "\n"); - bool IsStart = F == FuncData.first; // If this is the continuation start + // If this is the continuation start + bool IsStart = F == FuncData.first; + // We don't need to touch resume part of non-cps function, this usually + // should be entry-point compute kernel. The resume part will be erased + // at the end. 
+ if (!IsStart && !cps::isCpsFunction(*F)) + continue; + SmallVector AllArgTypes; SmallVector AllArgValues; - SmallVector InstsToRemove; - AttributeList FAttrs = F->getAttributes(); SmallVector ParamAttrs; + SmallVector InstsToRemove; - // Use all arguments except the last (pre-allocated buffer for the - // coroutine passes) for the continuation start - if (IsStart) { - unsigned ArgNo = 0; - assert(F->arg_size() >= 1 && - "Entry function has at least one argument"); - for (auto Arg = F->arg_begin(), ArgEnd = F->arg_end() - 1; - Arg != ArgEnd; Arg++) { - AllArgTypes.push_back(Arg->getType()); - AllArgValues.push_back(Arg); - ParamAttrs.push_back(FAttrs.getParamAttrs(ArgNo)); - ArgNo++; - } - } else { - IRBuilder<> B(&*F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca()); - AllArgTypes.push_back( - getContinuationStackOffsetType(Context)); // continuation stack ptr - AllArgValues.push_back(nullptr); - - // Find arguments from continuation.returnvalue calls - for (auto &I : F->getEntryBlock()) { - if (auto *Intr = dyn_cast(&I)) { - AllArgTypes.push_back(Intr->getType()); - AllArgValues.push_back(Intr); - InstsToRemove.push_back(Intr); - } - } - } + buildCpsArgInfos(F, IsStart, AllArgTypes, AllArgValues, ParamAttrs, + InstsToRemove); - // Find the free call if there is one - if (ContFree) { - for (auto *User : ContFree->users()) { - if (auto *Call = dyn_cast(User)) { - if (Call->getFunction() == F) { - InstsToRemove.push_back(Call); - break; - } - } - } - } - - // Find the continuation state pointer, either returned by the malloc or - // given as an argument - Value *ContFrame = nullptr; - if (FuncData.second.MallocCall) { - if (IsStart) { - ContFrame = FuncData.second.MallocCall; - InstsToRemove.push_back(FuncData.second.MallocCall); - - auto *BufferArg = F->getArg(F->arg_size() - 1); - auto *Cast = cast(BufferArg->getUniqueUndroppableUser()); - auto *CastUser = Cast->getUniqueUndroppableUser(); - auto *Store = cast(CastUser); - InstsToRemove.push_back(Store); // Store 
needs to be eliminated first - InstsToRemove.push_back(Cast); - } else { - // Look for the load of the allocated pointer - auto *User = F->getArg(0)->getUniqueUndroppableUser(); - auto *Cast = cast(User); - - auto *CastUser = Cast->getUniqueUndroppableUser(); - auto *Load = cast(CastUser); - InstsToRemove.push_back(Load); // Load needs to be eliminated first - InstsToRemove.push_back(Cast); - ContFrame = Load; - } - } else { - if (IsStart) - ContFrame = F->getArg(F->arg_size() - 1); - else - ContFrame = F->getArg(0); - } + if (ContFree) + removeContFreeCall(F, ContFree); // Create new empty function F->eraseMetadata(FuncData.second.MD->getMetadataID()); - auto *NewFuncTy = FunctionType::get(Void, AllArgTypes, false); + auto &Context = F->getContext(); + auto *NewFuncTy = + FunctionType::get(Type::getVoidTy(Context), AllArgTypes, false); Function *NewFunc = cloneFunctionHeader(*F, NewFuncTy, ParamAttrs); NewFunc->takeName(F); FuncData.second.NewFunctions.push_back(NewFunc); @@ -307,46 +452,31 @@ void CleanupContinuationsPass::processContinuations() { // Transfer code from old function to new function llvm::moveFunctionBody(*F, *NewFunc); - // Set arg names for new function - for (unsigned Idx = 0; Idx != NewFunc->getFunctionType()->params().size(); - ++Idx) { - Argument *Arg = NewFunc->getArg(Idx); - Value *OldVal = AllArgValues[Idx]; - if (OldVal) { - Arg->setName(OldVal->getName()); - OldVal->replaceAllUsesWith(Arg); - } - if (IsStart) { - Argument *OldArg = F->getArg(Idx); - if (OldArg->hasInRegAttr()) - Arg->addAttr(Attribute::InReg); - else - Arg->removeAttr(Attribute::AttrKind::InReg); - } - } - - // Handle the function entry - IRBuilder<> B(&*NewFunc->getEntryBlock().getFirstNonPHIOrDbgOrAlloca()); - if (IsStart) - FuncData.second.NewStart = NewFunc; - handleFunctionEntry(B, FuncData.second, NewFunc, IsEntry); + auto &CpsInfo = FuncData.second; + if (CpsInfo.ContStateBytes) + updateCpsStack(F, NewFunc, IsStart, CpsInfo); - // Handle the function body - // Use 
the global continuation state - ContFrame->replaceAllUsesWith(B.CreateBitOrPointerCast( - FuncData.second.NewContState, ContFrame->getType())); + updateCpsFunctionArgs(F, NewFunc, AllArgValues); + freeCpsStack(NewFunc, CpsInfo); // Handle the function returns for (auto &BB : make_early_inc_range(*NewFunc)) { auto *I = BB.getTerminator(); - if (I->getOpcode() == Instruction::Ret) { - handleContinue(B, FuncData.second, I); + if (isa(I)) { + handleContinue(FuncData.second, I); } else if (I->getOpcode() == Instruction::Unreachable) { - if (auto *Call = dyn_cast(--I->getIterator())) { - if (auto *Called = Call->getCalledFunction()) { - if (Called->getName() == "continuation.return") - handleReturn(B, FuncData.second, Call); - } + // We should only possibly have 'continuation.return' or + // 'lgc.cps.jump' call before unreachable. + auto *Call = cast(--I->getIterator()); + auto *Called = Call->getCalledFunction(); + if (Called->getName() == "continuation.return") { + assert(Call->arg_empty() && "Should have no argument\n"); + Builder->SetInsertPoint(Call); + Builder->CreateRetVoid(); + Call->eraseFromParent(); + I->eraseFromParent(); + } else { + assert(isa(*Call)); } } } @@ -354,136 +484,38 @@ void CleanupContinuationsPass::processContinuations() { for (auto *I : InstsToRemove) I->eraseFromParent(); - // Remove the old function - F->replaceAllUsesWith(ConstantExpr::getBitCast(NewFunc, F->getType())); + // Replace the old function with the new one. 
+ F->replaceAllUsesWith(NewFunc); } } - // Remove the old functions and update metadata + // Remove the old functions for (auto &FuncData : ToProcess) { if (FuncData.second.Functions.size() > 1) { // Only for functions that were split for (auto *F : FuncData.second.Functions) F->eraseFromParent(); - - MDTuple *ContMDTuple = MDTuple::get( - Context, {ValueAsMetadata::get(FuncData.second.NewStart)}); - for (auto *F : FuncData.second.NewFunctions) { - F->setMetadata(DXILContHelper::MDContinuationName, ContMDTuple); - if (F != FuncData.second.NewStart) { - // For non-start functions, set (incoming) continuation registercount - // metadata by looking at the continue calls that reference this - // function. These continue calls both specify the number of their - // outgoing registers, and the number of incoming payload registers - // coming back into the resume function (i.e. us). - SmallVector Worklist(F->users()); - std::optional RegCount; - while (!Worklist.empty()) { - auto *U = Worklist.pop_back_val(); - if (auto *Const = dyn_cast(U)) { - Worklist.append(Const->user_begin(), Const->user_end()); - continue; - } - assert(isa(U) && - "User of a resume function should be a call to continue"); - auto *Inst = cast(U); - if (auto Count = - DXILContHelper::tryGetReturnedRegisterCount(Inst)) { - assert((!RegCount || *RegCount == *Count) && - "Got different returned registercounts in continues to " - "the same resume function"); - RegCount = *Count; -#ifdef NDEBUG - break; -#endif - } else { - LLVM_DEBUG(Inst->dump()); - report_fatal_error( - "Found a continue call without " - "continuation returned registercount metadata"); - } - } - - // Add metadata - DXILContHelper::setIncomingRegisterCount(F, RegCount.value()); - } - } - } - } - - fixupDxilMetadata(*M); -} - -void CleanupContinuationsPass::handleFunctionEntry(IRBuilder<> &B, - ContinuationData &Data, - Function *F, bool IsEntry) { - auto &Context = F->getContext(); - bool IsStart = F == Data.NewStart; - - // Create 
alloca to keep the continuation state - uint64_t ContStateNumI32s = divideCeil(Data.ContStateBytes, RegisterBytes); - auto *ContStateTy = ArrayType::get(I32, ContStateNumI32s); - Data.NewContState = B.CreateAlloca(ContStateTy, nullptr, "cont.state"); - - uint64_t NeededStackSize = computeNeededStackSizeForRegisterBuffer( - ContStateNumI32s, ContinuationStateRegisterCount); - if (IsStart) { - // Add function metadata that stores how big the continuation state is in - // bytes - DXILContHelper::setContinuationStateByteCount(*F, Data.ContStateBytes); - - // Add intrinsic call to save the previous continuation state - if (!IsEntry && Data.ContStateBytes) - B.CreateCall(SaveContState); - - if (NeededStackSize) { - // Add to continuation stack size metadata - uint64_t CurStackSize = 0; - if (auto *StackSizeMD = F->getMetadata(DXILContHelper::MDStackSizeName)) - CurStackSize = mdconst::extract(StackSizeMD->getOperand(0)) - ->getZExtValue(); - F->setMetadata( - DXILContHelper::MDStackSizeName, - MDTuple::get(Context, {ConstantAsMetadata::get(ConstantInt::get( - I32, NeededStackSize + CurStackSize))})); - } - } else { - // Read continuation state from global into local variable - createCopy( - B, Data.NewContState, - B.CreateBitOrPointerCast( - ContState, ContStateTy->getPointerTo(ContState->getAddressSpace())), - ContStateTy); - - // Deallocate continuation stack space if necessary - if (NeededStackSize) { - // Add barrier so that the csp is only decremented after the continuation - // state is read - auto *Csp = B.CreateCall( - getContinuationStackOffset(*B.GetInsertPoint()->getModule())); - B.CreateCall(RegisterBufferSetPointerBarrier, {ContState, Csp}); - - moveContinuationStackOffset(B, -NeededStackSize); } } } /// Transform -/// %tok = call %continuation.token* @foo() !continuation.registercount !0 -/// %0 = insertvalue { i8*, %continuation.token* } { i8* bitcast ({ i8*, -/// %continuation.token* } (i8*, i1)* @fun.resume.0 to i8*), -/// %continuation.token* undef }, 
%continuation.token* %tok, 1 -/// ret { i8*, %continuation.token* } %0 -/// to -/// %resume_addr = ptrtoint i8* ... @fun.resume.0 to i64 -/// %foo = ptrtoint %continuation.token* () @foo to i64 -/// call void @continuation.continue(i64 %foo, i8 addrspace(21)* %csp, i64 -/// %resume_addr, ) !continuation.registercount !0 -/// unreachable +/// %cr = call i32 @lgc.cps.as.continuation.reference(ptr @callee) +/// %2 = inttoptr i32 %cr to ptr +/// %3 = call i32 %2(i32 %cr, i32 2, ...) +/// %4 = insertvalue { ptr, i32 } undef, ptr @test.resume.0, 0 +/// %5 = insertvalue { ptr, i32 } %4, i32 %3, 1 +/// ret { ptr, i32 } %5 +/// +/// To: +/// %cr = call i32 @lgc.cps.as.continuation.reference(ptr @callee) +/// %cr2 = call i32 (...) @lgc.cps.as.continuation.reference(ptr +/// @test.resume.0) +/// call void (...) @lgc.cps.jump(i32 %cr, i32 2, {} poison, +/// i32 %cr2, ...) /// /// Also handles cases where the token and resume function are behind a phi. -void CleanupContinuationsPass::handleContinue(IRBuilder<> &B, - ContinuationData &Data, +void CleanupContinuationsPass::handleContinue(ContinuationData &Data, Instruction *Ret) { // Find the function call that generates the token LLVM_DEBUG(dbgs() << "Converting ret to continue: " << *Ret @@ -502,7 +534,7 @@ void CleanupContinuationsPass::handleContinue(IRBuilder<> &B, << "\n"); auto *Call = Entry.second.first; auto *ResumeFun = Entry.second.second; - handleSingleContinue(B, Data, Call, ResumeFun); + handleSingleContinue(Data, Call, ResumeFun); } if (BB->empty()) { @@ -512,114 +544,54 @@ void CleanupContinuationsPass::handleContinue(IRBuilder<> &B, } } -void CleanupContinuationsPass::handleSingleContinue(IRBuilder<> &B, - ContinuationData &Data, +void CleanupContinuationsPass::handleSingleContinue(ContinuationData &Data, CallInst *Call, Value *ResumeFun) { - // Pass resume address as argument - B.SetInsertPoint(Call); - auto *ReturnAddrInt = B.CreatePtrToInt(ResumeFun, I64); - - auto *CpsType = 
getContinuationStackOffsetType(Call->getContext()); - auto *CspFun = getContinuationStackOffset(*Call->getModule()); - - // Write local continuation state to stack and registers - uint64_t ContStateNumI32s = divideCeil(Data.ContStateBytes, RegisterBytes); - uint64_t NeededStackSize = computeNeededStackSizeForRegisterBuffer( - ContStateNumI32s, ContinuationStateRegisterCount); - - if (NeededStackSize) { - // Allocate continuation stack space if necessary - moveContinuationStackOffset(B, NeededStackSize); - // Add barrier so that the csp is only incremented before the continuation - // state is written - auto *Csp = B.CreateCall(CspFun); - B.CreateCall(RegisterBufferSetPointerBarrier, {ContState, Csp}); - } + Builder->SetInsertPoint(Call); - // Copy continuation state from local variable into global - auto *ContStateTy = Data.NewContState->getAllocatedType(); - createCopy( - B, - B.CreateBitOrPointerCast( - ContState, ContStateTy->getPointerTo(ContState->getAddressSpace())), - Data.NewContState, ContStateTy); - - auto *Csp = B.CreateLoad(CpsType, B.CreateCall(CspFun)); - - // Replace this instruction with a call to continuation.continue - SmallVector Args; - Args.push_back(B.CreatePointerCast(Call->getCalledOperand(), I64)); - Args.push_back(Csp); - Args.push_back(ReturnAddrInt); - Args.append(Call->arg_begin(), Call->arg_end()); - auto *ContinueCall = B.CreateCall(Continue, Args); - ContinueCall->copyMetadata(*Call); - assert(DXILContHelper::tryGetOutgoingRegisterCount(ContinueCall) && - "Missing registercount metadata!"); + SmallVector TailArgs; + // %rcr (aka. return continuation reference) for the callee. + if (cps::isCpsFunction(*cast(ResumeFun))) { + auto *ResumeCR = Builder->create(ResumeFun); + TailArgs.push_back(ResumeCR); + } else { + // For entry-point compute kernel, pass a poison %rcr. + TailArgs.push_back(PoisonValue::get(Builder->getInt32Ty())); + } + // Skip continuation.reference and levels. 
+ TailArgs.append(SmallVector(drop_begin(Call->args(), 2))); + auto *CR = Call->getArgOperand(0); + Value *Level = Call->getArgOperand(1); + unsigned LevelImm = cast(Level)->getZExtValue(); + // TODO: Continuation state are passed through stack for now. + auto *State = PoisonValue::get(StructType::get(Builder->getContext(), {})); + auto *JumpCall = Builder->create(CR, LevelImm, State, TailArgs); + // Replace this instruction with a call to cps.jump. + JumpCall->copyMetadata(*Call); // Remove instructions at the end of the block - auto *Unreachable = B.CreateUnreachable(); - for (auto &I : make_early_inc_range(reverse(*ContinueCall->getParent()))) { + Builder->SetInsertPoint(Call); + auto *Unreachable = Builder->CreateUnreachable(); + for (auto &I : make_early_inc_range(reverse(*JumpCall->getParent()))) { if (&I == Unreachable) break; I.eraseFromParent(); } } -/// Transform -/// call void (i64, ...) @continuation.return(i64 %returnaddr, ) -/// unreachable -/// to -/// -/// call void @continuation.restore.continuation_state() -/// call void @continuation.continue(i64 %returnaddr, i8 addrspace(21)* %csp, -/// ) -/// unreachable -void CleanupContinuationsPass::handleReturn(IRBuilder<> &B, - ContinuationData &Data, - CallInst *ContRet) { - LLVM_DEBUG(dbgs() << "Converting return to continue: " << *ContRet << "\n"); - bool IsEntry = isa(ContRet->getArgOperand(0)); - B.SetInsertPoint(ContRet); - if (IsEntry) { - assert(ContRet->arg_size() == 1 && - "Entry functions ignore the return value"); - B.CreateCall(Complete); - } else { - // Add intrinsic call to restore the previous continuation state - if (Data.ContStateBytes) - B.CreateCall(RestoreContState); - - SmallVector Args(ContRet->args()); - auto *CspType = getContinuationStackOffsetType(ContRet->getContext()); - auto *CspFun = getContinuationStackOffset(*ContRet->getModule()); - auto *Csp = B.CreateLoad(CspType, B.CreateCall(CspFun)); - Args.insert(Args.begin() + 1, Csp); - - auto *ContinueCall = B.CreateCall(Continue, 
Args); - Data.NewReturnContinues.push_back(ContinueCall); - - ContinueCall->copyMetadata(*ContRet); - assert(DXILContHelper::tryGetOutgoingRegisterCount(ContinueCall) && - "Missing registercount metadata!"); - } - - ContRet->eraseFromParent(); -} - llvm::PreservedAnalyses CleanupContinuationsPass::run(llvm::Module &Mod, llvm::ModuleAnalysisManager &AnalysisManager) { - LLVM_DEBUG(dbgs() << "Run the cleanup-continuations pass\n"); + LLVM_DEBUG(dbgs() << "Run the lgc-cleanup-continuations pass\n"); AnalysisManager.getResult(Mod); - M = &Mod; ToProcess.clear(); MaxContStateBytes = 0; ContMalloc = Mod.getFunction("continuation.malloc"); ContFree = Mod.getFunction("continuation.free"); + llvm_dialects::Builder B(Mod.getContext()); + Builder = &B; // Map the entry function of a continuation to the analysis result for (auto &F : Mod.functions()) { if (F.empty()) @@ -632,8 +604,8 @@ CleanupContinuationsPass::run(llvm::Module &Mod, for (auto &FuncData : ToProcess) { if (!FuncData.second.MallocCall) { for (auto *F : FuncData.second.Functions) { - bool IsStart = - (F == FuncData.first); // If this is the continuation start + // If this is the continuation start part. 
+ bool IsStart = (F == FuncData.first); Value *ContFrame; if (IsStart) ContFrame = F->getArg(F->arg_size() - 1); @@ -649,46 +621,8 @@ CleanupContinuationsPass::run(llvm::Module &Mod, } if (!ToProcess.empty()) { - auto &Context = Mod.getContext(); - I32 = Type::getInt32Ty(Context); - I64 = Type::getInt64Ty(Context); - SaveContState = getContinuationSaveContinuationState(Mod); - RestoreContState = getContinuationRestoreContinuationState(Mod); - Continue = getContinuationContinue(Mod); - Complete = getContinuationComplete(Mod); - - // Add global - // Size is the maximum of all continuations, but at least the register size - uint32_t ContStateSize = std::max( - MaxContStateBytes, ContinuationStateRegisterCount * RegisterBytes); - auto *ContStateTy = - ArrayType::get(I32, divideCeil(ContStateSize, RegisterBytes)); - ContState = cast(Mod.getOrInsertGlobal( - DXILContHelper::GlobalContStateName, ContStateTy, [&] { - return new GlobalVariable(Mod, ContStateTy, false, - GlobalVariable::ExternalLinkage, nullptr, - DXILContHelper::GlobalContStateName, - nullptr, GlobalVariable::NotThreadLocal); - })); - - RegisterBufferSetPointerBarrier = getRegisterBufferSetPointerBarrier(Mod); - - // Add registerbuffer metadata to split accesses at into i32s and spill to - // memory if necessary - std::optional StackAddrspace = - DXILContHelper::tryGetStackAddrspace(*M); - if (!StackAddrspace) - report_fatal_error("Missing stack addrspace metadata!"); - RegisterBufferMD RMD; - RMD.RegisterCount = ContinuationStateRegisterCount; - RMD.Addrspace = static_cast(*StackAddrspace); - auto *MD = createRegisterBufferMetadata(Context, RMD); - ContState->addMetadata("registerbuffer", *MD); - processContinuations(); - } - - if (!ToProcess.empty()) return PreservedAnalyses::none(); + } return PreservedAnalyses::all(); } diff --git a/shared/continuations/lib/DXILCont.cpp b/shared/continuations/lib/DXILCont.cpp index cc285b8e3e..ae00b4a003 100644 --- a/shared/continuations/lib/DXILCont.cpp +++ 
b/shared/continuations/lib/DXILCont.cpp @@ -33,6 +33,7 @@ #include "continuations/ContinuationsDialect.h" #include "continuations/ContinuationsUtil.h" #include "continuations/LowerRaytracingPipeline.h" +#include "lgccps/LgcCpsDialect.h" #include "lgcrt/LgcRtDialect.h" #include "llvm-dialects/Dialect/Builder.h" #include "llvm-dialects/Dialect/Dialect.h" @@ -208,7 +209,7 @@ void DXILContHelper::addContinuationPasses(ModulePassManager &MPM) { MPM.addPass(createModuleToFunctionPassAdaptor(CoroElidePass())); MPM.addPass(CoroCleanupPass()); - MPM.addPass(CleanupContinuationsPass()); + MPM.addPass(LegacyCleanupContinuationsPass()); MPM.addPass(RegisterBufferPass()); MPM.addPass(SaveContinuationStatePass()); MPM.addPass(DXILContPostProcessPass()); @@ -258,7 +259,8 @@ DialectContextAnalysis::run(llvm::Module &M, if (NeedDialectContext) { Context = llvm_dialects::DialectContext::make( + lgc::rt::LgcRtDialect, + lgc::cps::LgcCpsDialect>( M.getContext()); } return DialectContextAnalysis::Result(); @@ -595,7 +597,7 @@ CallInst *llvm::replaceIntrinsicCall(IRBuilder<> &B, Type *SystemDataTy, return nullptr; std::string Name = ("_cont_" + IntrImplEntry->Name).str(); - auto *IntrImpl = DXILContHelper::getAliasedFunction(M, Name); + auto *IntrImpl = M.getFunction(Name); if (!IntrImpl) cantFail(make_error(Twine("Intrinsic implementation '") + Name + "' not found", @@ -697,13 +699,78 @@ Type *llvm::getFuncArgPtrElementType(const Function *F, const Argument *Arg) { if (!ArgTy->isPointerTy()) return nullptr; - // NOTE: fast path code to be removed later - if (!ArgTy->isOpaquePointerTy()) - return ArgTy->getNonOpaquePointerElementType(); - return DXILContArgTy::get(F, Arg).getPointerElementType(); } Type *llvm::getFuncArgPtrElementType(const Function *F, int ArgNo) { return getFuncArgPtrElementType(F, F->getArg(ArgNo)); } + +namespace llvm { +namespace coro { +bool defaultMaterializable(Instruction &V); +} // End namespace coro +} // End namespace llvm + +bool 
llvm::LgcMaterializable(Instruction &OrigI) { + Instruction *V = &OrigI; + + // extract instructions are rematerializable, but increases the size of the + // continuation state, so as a heuristic only rematerialize this if the source + // can be rematerialized as well. + while (true) { + Instruction *NewInst = nullptr; + if (auto *Val = dyn_cast(V)) + NewInst = dyn_cast(Val->getVectorOperand()); + else if (auto *Val = dyn_cast(V)) + NewInst = dyn_cast(Val->getAggregateOperand()); + + if (NewInst) + V = NewInst; + else + break; + } + + if (coro::defaultMaterializable(*V)) + return true; + + if (auto *LI = dyn_cast(V)) { + // load from constant address space + if (LI->getPointerAddressSpace() == 4) + return true; + } + + if (auto *CInst = dyn_cast(V)) { + if (auto *CalledFunc = CInst->getCalledFunction()) { + // Before rematerialization happens, lgc.rt dialect operations that cannot + // be rematerialized are replaced by their implementation, so that the + // necessary values can be put into the coroutine frame. Therefore, we + // can assume all left-over intrinsics can be rematerialized. + if (isRematerializableLgcRtOp(*CInst)) + return true; + + auto CalledName = CalledFunc->getName(); + // FIXME: switch to dialectOp check. 
+ if (CalledName.startswith("lgc.user.data") || + CalledName.startswith("lgc.load.user.data")) + return true; + } + } + + return false; +} + +namespace llvm { +void addLgcContinuationTransform(ModulePassManager &MPM) { + MPM.addPass(LowerAwaitPass()); + + MPM.addPass(CoroEarlyPass()); + CGSCCPassManager CGPM; + CGPM.addPass(LgcCoroSplitPass()); + MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM))); + MPM.addPass(createModuleToFunctionPassAdaptor(CoroElidePass())); + MPM.addPass(CoroCleanupPass()); + + MPM.addPass(CleanupContinuationsPass()); +} +} // End namespace llvm diff --git a/shared/continuations/lib/DXILContPostProcess.cpp b/shared/continuations/lib/DXILContPostProcess.cpp index 211bc7ca44..cdde7ea7e9 100644 --- a/shared/continuations/lib/DXILContPostProcess.cpp +++ b/shared/continuations/lib/DXILContPostProcess.cpp @@ -351,43 +351,6 @@ static void replaceGlobal(const DataLayout &DL, GlobalVariable *Registers, Registers->getValueType(), Registers, Indices); auto *Repl = ConstantExpr::getBitCast(Gep, G->getType()); - // TODO Can remove i64 handling - if (G->getValueType()->isIntegerTy(64)) { - auto *I32 = Type::getInt32Ty(G->getContext()); - auto *I32Ptr = ConstantExpr::getBitCast( - Gep, I32->getPointerTo(Registers->getAddressSpace())); - // Special case the return address: Convert i64 loads and stores to i32 - // ones for the translator - for (auto *U : make_early_inc_range(G->users())) { - if (auto *Load = dyn_cast(U)) { - if (Load->getPointerOperand() != G) - continue; - IRBuilder<> B(Load); - auto *Part0 = B.CreateLoad(I32, I32Ptr); - auto *Part1 = B.CreateLoad(I32, ConstantExpr::getInBoundsGetElementPtr( - I32, I32Ptr, B.getInt64(1))); - auto *Vec = B.CreateInsertElement(FixedVectorType::get(I32, 2), Part0, - static_cast(0)); - Vec = B.CreateInsertElement(Vec, Part1, 1); - auto *Loaded = B.CreateBitCast(Vec, I64); - Load->replaceAllUsesWith(Loaded); - Load->eraseFromParent(); - } else if (auto *Store = dyn_cast(U)) { - if 
(Store->getPointerOperand() != G) - continue; - IRBuilder<> B(Store); - auto *Vec = B.CreateBitCast(Store->getValueOperand(), - FixedVectorType::get(I32, 2)); - auto *Part0 = B.CreateExtractElement(Vec, static_cast(0)); - auto *Part1 = B.CreateExtractElement(Vec, 1); - B.CreateStore(Part0, I32Ptr); - B.CreateStore(Part1, ConstantExpr::getInBoundsGetElementPtr( - I32, I32Ptr, B.getInt64(1))); - Store->eraseFromParent(); - } - } - } - G->replaceAllUsesWith(Repl); G->eraseFromParent(); } @@ -849,17 +812,7 @@ void DXILContPostProcessPass::handleContStackAlloc(FunctionAnalysisManager &FAM, CInst->eraseFromParent(); // Add allocation to the stack size of this function - uint64_t CurStackSize = 0; - if (auto *StackSizeMD = - Func->getMetadata(DXILContHelper::MDStackSizeName)) - CurStackSize = - mdconst::extract(StackSizeMD->getOperand(0)) - ->getZExtValue(); - Func->setMetadata( - DXILContHelper::MDStackSizeName, - MDTuple::get(Func->getContext(), - {ConstantAsMetadata::get(ConstantInt::get( - B.getInt32Ty(), Size + CurStackSize))})); + DXILContHelper::addStackSize(Func, Size); } } } @@ -876,8 +829,7 @@ DXILContPostProcessPass::run(llvm::Module &M, ToProcess.clear(); MapVector ShaderKinds; analyzeShaderKinds(M, ShaderKinds); - auto *SetupRayGen = - DXILContHelper::getAliasedFunction(M, "_cont_SetupRayGen"); + auto *SetupRayGen = M.getFunction("_cont_SetupRayGen"); for (auto &Entry : ShaderKinds) { switch (Entry.second) { case DXILShaderKind::RayGeneration: diff --git a/shared/continuations/lib/DXILMetadata.cpp b/shared/continuations/lib/DXILMetadata.cpp index 774601a792..1652e8c4dc 100644 --- a/shared/continuations/lib/DXILMetadata.cpp +++ b/shared/continuations/lib/DXILMetadata.cpp @@ -33,21 +33,11 @@ namespace llvm { -// -enforce-pointer-metadata: Always use metadata to get pointer type, missing -// metadata is an error. 
-static cl::opt EnforcePointerMetadata( - "enforce-pointer-metadata", - cl::desc("Pointer types are only retrieved by metadata"), cl::init(false)); - DXILContArgTy::DXILContArgTy(Type *Arg) { - if (auto *PT = dyn_cast(Arg)) { - assert(!PT->isOpaque() && "opaque pointers are not supported"); - ArgTy = Arg; - ElemTy = PT->getNonOpaquePointerElementType(); - } else { - ArgTy = Arg; - ElemTy = nullptr; - } + assert(!Arg->isPointerTy() && + "pointers are not supported by this constructor"); + ArgTy = Arg; + ElemTy = nullptr; } DXILContArgTy DXILContArgTy::get(const Function *F, const Argument *Arg) { @@ -66,18 +56,10 @@ DXILContArgTy DXILContArgTy::get(const Function *F, const Argument *Arg) { DXILContArgTy Result = get(&*TypesMD->getOperand(ArgNo), F->getContext()); - assert(ArgTy->isOpaquePointerTy() || - Result == - DXILContArgTy(ArgTy, ArgTy->getNonOpaquePointerElementType())); - return Result; } - if (EnforcePointerMetadata) - report_fatal_error("Missing metadata for pointer type!"); - - assert(!ArgTy->isOpaquePointerTy() && "opaque pointers are not supported"); - return DXILContArgTy(ArgTy, ArgTy->getNonOpaquePointerElementType()); + report_fatal_error("Missing metadata for pointer type!"); } DXILContArgTy DXILContArgTy::get(const Function *F, const unsigned ArgNo) { @@ -146,29 +128,9 @@ Metadata *DXILContArgTy::getTypeMetadata(LLVMContext &Context) { return MDTuple::get(Context, MD); } -DXILContFuncTy DXILContFuncTy::get(const FunctionType *FuncTy) { - DXILContFuncTy FT; - FT.ReturnTy = DXILContArgTy(FuncTy->getReturnType()); - for (Type *Param : FuncTy->params()) - FT.ArgTys.emplace_back(Param); - return FT; -} - DXILContFuncTy DXILContFuncTy::get(const Function *F) { auto *TypesMD = F->getMetadata(DXILContHelper::MDTypesName); - - // NOTE: fallback path code to be removed later - if (!TypesMD) { - assert(!F->getReturnType()->isOpaquePointerTy() && - !llvm::any_of(F->args(), [](const Argument &Arg) { - return Arg.getType()->isOpaquePointerTy(); - })); - 
DXILContFuncTy FuncTy; - for (auto &Arg : F->args()) - FuncTy.ArgTys.push_back(DXILContArgTy(Arg.getType())); - FuncTy.ReturnTy = DXILContArgTy(F->getReturnType()); - return FuncTy; - } + assert(TypesMD); return get(TypesMD, F->getContext()); } diff --git a/shared/continuations/lib/LegacyCleanupContinuations.cpp b/shared/continuations/lib/LegacyCleanupContinuations.cpp new file mode 100644 index 0000000000..b004efd169 --- /dev/null +++ b/shared/continuations/lib/LegacyCleanupContinuations.cpp @@ -0,0 +1,676 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + *deal in the Software without restriction, including without limitation the + *rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + *sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + *FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + *IN THE SOFTWARE. 
+ * + **********************************************************************************************************************/ + +//= LegacyCleanupContinuations.cpp - Post-process output of coroutine passes =// +// +// Convert the result from the coroutine passes to something more suitable for +// the compiler backend. +// +// Instead of return values, use continue, waitContinue and complete intrinsics. +// Add arguments to resume functions, which are the return values of the called +// continuation. +// +// Add a global register buffer to store the continuation state. +// +//===----------------------------------------------------------------------===// + +#include "continuations/Continuations.h" +#include "continuations/ContinuationsDialect.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/MathExtras.h" +#include + +using namespace llvm; + +#define DEBUG_TYPE "legacy-cleanup-continuations" + +LegacyCleanupContinuationsPass::LegacyCleanupContinuationsPass() {} + +/// Find the original call that created the continuation token and the matching +/// resume function for a return value. +/// +/// Returns a map (origin BB, (call that created the continuation token, resume +/// function)). 
+static DenseMap> +findTokenOrigin(BasicBlock *BB, Value *V, + SmallVectorImpl &ToRemove) { + DenseMap> Result; + Value *Call = nullptr; + Value *ResumeFun = nullptr; + while (auto *Insert = dyn_cast(V)) { + LLVM_DEBUG(dbgs() << "Insert: " << *Insert << "\n"); + assert(Insert->getNumIndices() == 1 && "Expected a flat struct"); + if (*Insert->idx_begin() == 0) + ResumeFun = Insert->getInsertedValueOperand(); + else if (*Insert->idx_begin() == 1) + Call = Insert->getInsertedValueOperand(); + V = Insert->getAggregateOperand(); + ToRemove.push_back(Insert); + } + + if (!ResumeFun) { + if (auto *Const = dyn_cast(V)) + ResumeFun = Const->getOperand(0); + } + + assert(Call && "Did not find call that creates the token"); + assert(ResumeFun && "Did not find resume function"); + + // Strip bitcast + if (auto *Cast = dyn_cast(ResumeFun)) { + ResumeFun = Cast->getOperand(0); + ToRemove.push_back(Cast); + } + if (auto *Const = dyn_cast(ResumeFun)) { + if (Const->isCast()) + ResumeFun = Const->getOperand(0); + } + + auto RegisterTokenOrigin = [&Result](BasicBlock *TheBB, Value *Token, + Value *TheResumeFun) { + assert(isa(TheResumeFun) && + "Resume function should be a constant function"); + // Strip away bitcasts -- this can happen with multiple token types + if (auto *TokenBitcast = dyn_cast(Token)) + Token = TokenBitcast->getOperand(0); + assert(isa(Token) && "Call should be a CallInst"); + auto *CallI = cast(Token); + Result.insert(std::make_pair(TheBB, std::make_pair(CallI, TheResumeFun))); + }; + + // Walk through phis + if (auto *CallPhi = dyn_cast(Call)) { + assert(isa(ResumeFun) && "Resume fun should also be a phi node"); + auto *ResumeFunPhi = cast(ResumeFun); + ToRemove.push_back(CallPhi); + ToRemove.push_back(ResumeFunPhi); + + for (auto CallEntry : + llvm::zip(CallPhi->blocks(), CallPhi->incoming_values())) { + auto *PhiBB = std::get<0>(CallEntry); + auto *ResumeFunEntry = ResumeFunPhi->getIncomingValueForBlock(PhiBB); + assert(ResumeFunEntry && "Need a resume fun 
for each call"); + RegisterTokenOrigin(PhiBB, std::get<1>(CallEntry), ResumeFunEntry); + } + } else { + RegisterTokenOrigin(BB, Call, ResumeFun); + } + return Result; +} + +/// Create a memcopy of an array, which the translator understands +static void createCopy(IRBuilder<> &B, Value *Dst, Value *Src, Type *Ty) { + assert(Ty->isArrayTy() && "Can only copy arrays"); + for (unsigned I = 0; I < Ty->getArrayNumElements(); I++) { + auto *SrcGep = B.CreateConstInBoundsGEP2_32(Ty, Src, 0, I); + auto *DstGep = B.CreateConstInBoundsGEP2_32(Ty, Dst, 0, I); + auto *Load = B.CreateLoad(Ty->getArrayElementType(), SrcGep); + B.CreateStore(Load, DstGep); + } +} + +void LegacyCleanupContinuationsPass::analyzeContinuation(Function &F, + MDNode *MD) { + // Only analyze main continuation + auto *MDTup = cast(MD); + auto *EntryF = mdconst::extract(MDTup->getOperand(0)); + + auto &Data = ToProcess[EntryF]; + + if (&F != EntryF) { + Data.Functions.push_back(&F); + return; + } + Data.Functions.insert(Data.Functions.begin(), &F); + Data.MD = MD; + + // Search the malloc call to find the size of the continuation state + if (ContMalloc) { + for (auto *User : ContMalloc->users()) { + if (auto *Call = dyn_cast(User)) { + if (Call->getFunction() == &F) { + Data.MallocCall = Call; + break; + } + } + } + } + + // Without malloc call, we check later if the continuation state is used + if (Data.MallocCall) { + Data.ContStateBytes = + cast(Data.MallocCall->getArgOperand(0))->getSExtValue(); + } + if (Data.ContStateBytes > MaxContStateBytes) + MaxContStateBytes = Data.ContStateBytes; +} + +void LegacyCleanupContinuationsPass::processContinuations() { + auto &Context = M->getContext(); + auto *Void = Type::getVoidTy(Context); + + for (auto &FuncData : ToProcess) { + LLVM_DEBUG(dbgs() << "Processing function: " << FuncData.first->getName() + << "\n"); + bool IsEntry = FuncData.first->hasMetadata(DXILContHelper::MDEntryName); + + for (auto *F : FuncData.second.Functions) { + if (F != FuncData.first) { 
+ // Entry marker should only be on the start and not on resume functions + F->eraseMetadata(Context.getMDKindID(DXILContHelper::MDEntryName)); + // Same for stacksize + F->eraseMetadata(Context.getMDKindID(DXILContHelper::MDStackSizeName)); + // Set same linkage as for start function + F->setLinkage(FuncData.first->getLinkage()); + } + + // Ignore the stub created for the coroutine passes + if (F->empty()) + continue; + + LLVM_DEBUG(dbgs() << "Processing function part: " << F->getName() + << "\n"); + + bool IsStart = F == FuncData.first; // If this is the continuation start + SmallVector AllArgTypes; + SmallVector AllArgValues; + SmallVector InstsToRemove; + AttributeList FAttrs = F->getAttributes(); + SmallVector ParamAttrs; + + // Use all arguments except the last (pre-allocated buffer for the + // coroutine passes) for the continuation start + if (IsStart) { + unsigned ArgNo = 0; + assert(F->arg_size() >= 1 && + "Entry function has at least one argument"); + for (auto Arg = F->arg_begin(), ArgEnd = F->arg_end() - 1; + Arg != ArgEnd; Arg++) { + AllArgTypes.push_back(Arg->getType()); + AllArgValues.push_back(Arg); + ParamAttrs.push_back(FAttrs.getParamAttrs(ArgNo)); + ArgNo++; + } + } else { + IRBuilder<> B(&*F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca()); + AllArgTypes.push_back( + getContinuationStackOffsetType(Context)); // continuation stack ptr + AllArgValues.push_back(nullptr); + + // Find arguments from continuation.returnvalue calls + for (auto &I : F->getEntryBlock()) { + if (auto *Intr = dyn_cast(&I)) { + AllArgTypes.push_back(Intr->getType()); + AllArgValues.push_back(Intr); + InstsToRemove.push_back(Intr); + } + } + } + + // Find the free call if there is one + if (ContFree) { + for (auto *User : ContFree->users()) { + if (auto *Call = dyn_cast(User)) { + if (Call->getFunction() == F) { + InstsToRemove.push_back(Call); + break; + } + } + } + } + + // Find the continuation state pointer, either returned by the malloc or + // given as an argument + 
Value *ContFrame = nullptr; + if (FuncData.second.MallocCall) { + if (IsStart) { + ContFrame = FuncData.second.MallocCall; + InstsToRemove.push_back(FuncData.second.MallocCall); + + auto *BufferArg = F->getArg(F->arg_size() - 1); + auto *User = BufferArg->getUniqueUndroppableUser(); + auto *Cast = dyn_cast(User); + if (Cast) + User = Cast->getUniqueUndroppableUser(); + auto *Store = cast(User); + InstsToRemove.push_back(Store); // Store needs to be eliminated first + if (Cast) + InstsToRemove.push_back(Cast); + } else { + // Look for the load of the allocated pointer + auto *User = F->getArg(0)->getUniqueUndroppableUser(); + auto *Cast = dyn_cast(User); + if (Cast) + User = Cast->getUniqueUndroppableUser(); + auto *Load = cast(User); + InstsToRemove.push_back(Load); // Load needs to be eliminated first + if (Cast) + InstsToRemove.push_back(Cast); + ContFrame = Load; + } + } else { + if (IsStart) + ContFrame = F->getArg(F->arg_size() - 1); + else + ContFrame = F->getArg(0); + } + + // Create new empty function + F->eraseMetadata(FuncData.second.MD->getMetadataID()); + auto *NewFuncTy = FunctionType::get(Void, AllArgTypes, false); + Function *NewFunc = cloneFunctionHeader(*F, NewFuncTy, ParamAttrs); + NewFunc->takeName(F); + FuncData.second.NewFunctions.push_back(NewFunc); + + // Transfer code from old function to new function + llvm::moveFunctionBody(*F, *NewFunc); + + // Set arg names for new function + for (unsigned Idx = 0; Idx != NewFunc->getFunctionType()->params().size(); + ++Idx) { + Argument *Arg = NewFunc->getArg(Idx); + Value *OldVal = AllArgValues[Idx]; + if (OldVal) { + Arg->setName(OldVal->getName()); + OldVal->replaceAllUsesWith(Arg); + } + if (IsStart) { + Argument *OldArg = F->getArg(Idx); + if (OldArg->hasInRegAttr()) + Arg->addAttr(Attribute::InReg); + else + Arg->removeAttr(Attribute::AttrKind::InReg); + } + } + + // Handle the function entry + IRBuilder<> B(&*NewFunc->getEntryBlock().getFirstNonPHIOrDbgOrAlloca()); + if (IsStart) + 
FuncData.second.NewStart = NewFunc; + handleFunctionEntry(B, FuncData.second, NewFunc, IsEntry); + + // Handle the function body + // Use the global continuation state + ContFrame->replaceAllUsesWith(B.CreateBitOrPointerCast( + FuncData.second.NewContState, ContFrame->getType())); + + // Handle the function returns + for (auto &BB : make_early_inc_range(*NewFunc)) { + auto *I = BB.getTerminator(); + if (I->getOpcode() == Instruction::Ret) { + handleContinue(B, FuncData.second, I); + } else if (I->getOpcode() == Instruction::Unreachable) { + if (auto *Call = dyn_cast(--I->getIterator())) { + if (auto *Called = Call->getCalledFunction()) { + if (Called->getName() == "continuation.return") + handleReturn(B, FuncData.second, Call); + } + } + } + } + + for (auto *I : InstsToRemove) + I->eraseFromParent(); + + // Remove the old function + F->replaceAllUsesWith(ConstantExpr::getBitCast(NewFunc, F->getType())); + } + } + + // Remove the old functions and update metadata + for (auto &FuncData : ToProcess) { + if (FuncData.second.Functions.size() > 1) { + // Only for functions that were split + for (auto *F : FuncData.second.Functions) + F->eraseFromParent(); + + MDTuple *ContMDTuple = MDTuple::get( + Context, {ValueAsMetadata::get(FuncData.second.NewStart)}); + for (auto *F : FuncData.second.NewFunctions) { + F->setMetadata(DXILContHelper::MDContinuationName, ContMDTuple); + if (F != FuncData.second.NewStart) { + // For non-start functions, set (incoming) continuation registercount + // metadata by looking at the continue calls that reference this + // function. These continue calls both specify the number of their + // outgoing registers, and the number of incoming payload registers + // coming back into the resume function (i.e. us). 
+ SmallVector Worklist(F->users()); + std::optional RegCount; + while (!Worklist.empty()) { + auto *U = Worklist.pop_back_val(); + if (auto *Const = dyn_cast(U)) { + Worklist.append(Const->user_begin(), Const->user_end()); + continue; + } + assert(isa(U) && + "User of a resume function should be a call to continue"); + auto *Inst = cast(U); + if (auto Count = + DXILContHelper::tryGetReturnedRegisterCount(Inst)) { + assert((!RegCount || *RegCount == *Count) && + "Got different returned registercounts in continues to " + "the same resume function"); + RegCount = *Count; +#ifdef NDEBUG + break; +#endif + } else { + LLVM_DEBUG(Inst->dump()); + report_fatal_error( + "Found a continue call without " + "continuation returned registercount metadata"); + } + } + + // Add metadata + DXILContHelper::setIncomingRegisterCount(F, RegCount.value()); + } + } + } + } + + fixupDxilMetadata(*M); +} + +void LegacyCleanupContinuationsPass::handleFunctionEntry(IRBuilder<> &B, + ContinuationData &Data, + Function *F, + bool IsEntry) { + bool IsStart = F == Data.NewStart; + + // Create alloca to keep the continuation state + uint64_t ContStateNumI32s = divideCeil(Data.ContStateBytes, RegisterBytes); + auto *ContStateTy = ArrayType::get(I32, ContStateNumI32s); + Data.NewContState = B.CreateAlloca(ContStateTy, nullptr, "cont.state"); + + uint64_t NeededStackSize = computeNeededStackSizeForRegisterBuffer( + ContStateNumI32s, ContinuationStateRegisterCount); + if (IsStart) { + // Add function metadata that stores how big the continuation state is in + // bytes + DXILContHelper::setContinuationStateByteCount(*F, Data.ContStateBytes); + + // Add intrinsic call to save the previous continuation state + if (!IsEntry && Data.ContStateBytes) + B.CreateCall(SaveContState); + + if (NeededStackSize) { + // Add to continuation stack size metadata + DXILContHelper::addStackSize(F, NeededStackSize); + } + } else { + // Read continuation state from global into local variable + createCopy( + B, 
Data.NewContState, + B.CreateBitOrPointerCast( + ContState, ContStateTy->getPointerTo(ContState->getAddressSpace())), + ContStateTy); + + // Deallocate continuation stack space if necessary + if (NeededStackSize) { + // Add barrier so that the csp is only decremented after the continuation + // state is read + auto *Csp = B.CreateCall( + getContinuationStackOffset(*B.GetInsertPoint()->getModule())); + B.CreateCall(RegisterBufferSetPointerBarrier, {ContState, Csp}); + + moveContinuationStackOffset(B, -NeededStackSize); + } + } +} + +/// Transform +/// %tok = call %continuation.token* @foo() !continuation.registercount !0 +/// %0 = insertvalue { i8*, %continuation.token* } { i8* bitcast ({ i8*, +/// %continuation.token* } (i8*, i1)* @fun.resume.0 to i8*), +/// %continuation.token* undef }, %continuation.token* %tok, 1 +/// ret { i8*, %continuation.token* } %0 +/// to +/// %resume_addr = ptrtoint i8* ... @fun.resume.0 to i64 +/// %foo = ptrtoint %continuation.token* () @foo to i64 +/// call void @continuation.continue(i64 %foo, i8 addrspace(21)* %csp, i64 +/// %resume_addr, ) !continuation.registercount !0 +/// unreachable +/// +/// Also handles cases where the token and resume function are behind a phi. 
+void LegacyCleanupContinuationsPass::handleContinue(IRBuilder<> &B, + ContinuationData &Data, + Instruction *Ret) { + // Find the function call that generates the token + LLVM_DEBUG(dbgs() << "Converting ret to continue: " << *Ret + << "\nArgument: " << *Ret->getOperand(0) << "\n"); + auto *BB = Ret->getParent(); + SmallVector ToRemove; + ToRemove.push_back(Ret); + auto Calls = findTokenOrigin(Ret->getParent(), Ret->getOperand(0), ToRemove); + + for (auto *I : ToRemove) + I->eraseFromParent(); + + for (auto &Entry : Calls) { + LLVM_DEBUG(dbgs() << "Handling call: " << *Entry.second.first + << " with resume function " << Entry.second.second + << "\n"); + auto *Call = Entry.second.first; + auto *ResumeFun = Entry.second.second; + handleSingleContinue(B, Data, Call, ResumeFun); + } + + if (BB->empty()) { + assert(BB->hasNPredecessorsOrMore(0) && + "Handled all continues but the block still has predecessors left"); + BB->eraseFromParent(); + } +} + +void LegacyCleanupContinuationsPass::handleSingleContinue( + IRBuilder<> &B, ContinuationData &Data, CallInst *Call, Value *ResumeFun) { + // Pass resume address as argument + B.SetInsertPoint(Call); + auto *ReturnAddrInt = B.CreatePtrToInt(ResumeFun, I64); + + auto *CpsType = getContinuationStackOffsetType(Call->getContext()); + auto *CspFun = getContinuationStackOffset(*Call->getModule()); + + // Write local continuation state to stack and registers + uint64_t ContStateNumI32s = divideCeil(Data.ContStateBytes, RegisterBytes); + uint64_t NeededStackSize = computeNeededStackSizeForRegisterBuffer( + ContStateNumI32s, ContinuationStateRegisterCount); + + if (NeededStackSize) { + // Allocate continuation stack space if necessary + moveContinuationStackOffset(B, NeededStackSize); + // Add barrier so that the csp is only incremented before the continuation + // state is written + auto *Csp = B.CreateCall(CspFun); + B.CreateCall(RegisterBufferSetPointerBarrier, {ContState, Csp}); + } + + // Copy continuation state from local 
variable into global + auto *ContStateTy = Data.NewContState->getAllocatedType(); + createCopy( + B, + B.CreateBitOrPointerCast( + ContState, ContStateTy->getPointerTo(ContState->getAddressSpace())), + Data.NewContState, ContStateTy); + + auto *Csp = B.CreateLoad(CpsType, B.CreateCall(CspFun)); + + // Replace this instruction with a call to continuation.continue + SmallVector Args; + Args.push_back(B.CreatePointerCast(Call->getCalledOperand(), I64)); + Args.push_back(Csp); + Args.push_back(ReturnAddrInt); + Args.append(Call->arg_begin(), Call->arg_end()); + auto *ContinueCall = B.CreateCall(Continue, Args); + ContinueCall->copyMetadata(*Call); + assert(DXILContHelper::tryGetOutgoingRegisterCount(ContinueCall) && + "Missing registercount metadata!"); + + // Remove instructions at the end of the block + auto *Unreachable = B.CreateUnreachable(); + for (auto &I : make_early_inc_range(reverse(*ContinueCall->getParent()))) { + if (&I == Unreachable) + break; + I.eraseFromParent(); + } +} + +/// Transform +/// call void (i64, ...) 
@continuation.return(i64 %returnaddr, ) +/// unreachable +/// to +/// +/// call void @continuation.restore.continuation_state() +/// call void @continuation.continue(i64 %returnaddr, i8 addrspace(21)* %csp, +/// ) +/// unreachable +void LegacyCleanupContinuationsPass::handleReturn(IRBuilder<> &B, + ContinuationData &Data, + CallInst *ContRet) { + LLVM_DEBUG(dbgs() << "Converting return to continue: " << *ContRet << "\n"); + bool IsEntry = isa(ContRet->getArgOperand(0)); + B.SetInsertPoint(ContRet); + if (IsEntry) { + assert(ContRet->arg_size() == 1 && + "Entry functions ignore the return value"); + B.CreateCall(Complete); + } else { + // Add intrinsic call to restore the previous continuation state + if (Data.ContStateBytes) + B.CreateCall(RestoreContState); + + SmallVector Args(ContRet->args()); + auto *CspType = getContinuationStackOffsetType(ContRet->getContext()); + auto *CspFun = getContinuationStackOffset(*ContRet->getModule()); + auto *Csp = B.CreateLoad(CspType, B.CreateCall(CspFun)); + Args.insert(Args.begin() + 1, Csp); + + auto *ContinueCall = B.CreateCall(Continue, Args); + Data.NewReturnContinues.push_back(ContinueCall); + + ContinueCall->copyMetadata(*ContRet); + assert(DXILContHelper::tryGetOutgoingRegisterCount(ContinueCall) && + "Missing registercount metadata!"); + } + + ContRet->eraseFromParent(); +} + +llvm::PreservedAnalyses LegacyCleanupContinuationsPass::run( + llvm::Module &Mod, llvm::ModuleAnalysisManager &AnalysisManager) { + LLVM_DEBUG(dbgs() << "Run the cleanup-continuations pass\n"); + AnalysisManager.getResult(Mod); + + M = &Mod; + ToProcess.clear(); + MaxContStateBytes = 0; + ContMalloc = Mod.getFunction("continuation.malloc"); + ContFree = Mod.getFunction("continuation.free"); + + // Map the entry function of a continuation to the analysis result + for (auto &F : Mod.functions()) { + if (F.empty()) + continue; + if (auto *MD = F.getMetadata(DXILContHelper::MDContinuationName)) + analyzeContinuation(F, MD); + } + + // Check if the 
continuation state is used in any function part + for (auto &FuncData : ToProcess) { + if (!FuncData.second.MallocCall) { + for (auto *F : FuncData.second.Functions) { + bool IsStart = + (F == FuncData.first); // If this is the continuation start + Value *ContFrame; + if (IsStart) + ContFrame = F->getArg(F->arg_size() - 1); + else + ContFrame = F->getArg(0); + // If there are uses, we need to assume a size of + // MinimumContinuationStateBytes, because for all sizes up to this size + // coroutine passes will not emit a malloc that we can use to determine + // the exact size. If however the frame pointer is not used in any of + // the continuation functions, it's safe to assume an empty continuation + // state. + if (!ContFrame->user_empty()) { + FuncData.second.ContStateBytes = MinimumContinuationStateBytes; + if (MinimumContinuationStateBytes > MaxContStateBytes) + MaxContStateBytes = MinimumContinuationStateBytes; + } + } + } + } + + if (!ToProcess.empty()) { + auto &Context = Mod.getContext(); + I32 = Type::getInt32Ty(Context); + I64 = Type::getInt64Ty(Context); + SaveContState = getContinuationSaveContinuationState(Mod); + RestoreContState = getContinuationRestoreContinuationState(Mod); + Continue = getContinuationContinue(Mod); + Complete = getContinuationComplete(Mod); + + // Add global + // Size is the maximum of all continuations, but at least the register size + uint32_t ContStateSize = std::max( + MaxContStateBytes, ContinuationStateRegisterCount * RegisterBytes); + auto *ContStateTy = + ArrayType::get(I32, divideCeil(ContStateSize, RegisterBytes)); + ContState = cast(Mod.getOrInsertGlobal( + DXILContHelper::GlobalContStateName, ContStateTy, [&] { + return new GlobalVariable(Mod, ContStateTy, false, + GlobalVariable::ExternalLinkage, nullptr, + DXILContHelper::GlobalContStateName, + nullptr, GlobalVariable::NotThreadLocal); + })); + + RegisterBufferSetPointerBarrier = getRegisterBufferSetPointerBarrier(Mod); + + // Add registerbuffer metadata to split 
accesses at into i32s and spill to + // memory if necessary + std::optional StackAddrspace = + DXILContHelper::tryGetStackAddrspace(*M); + if (!StackAddrspace) + report_fatal_error("Missing stack addrspace metadata!"); + RegisterBufferMD RMD; + RMD.RegisterCount = ContinuationStateRegisterCount; + RMD.Addrspace = static_cast(*StackAddrspace); + auto *MD = createRegisterBufferMetadata(Context, RMD); + ContState->addMetadata("registerbuffer", *MD); + + processContinuations(); + } + + if (!ToProcess.empty()) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} diff --git a/shared/continuations/lib/LowerAwait.cpp b/shared/continuations/lib/LowerAwait.cpp index 7094497d5a..60590aa111 100644 --- a/shared/continuations/lib/LowerAwait.cpp +++ b/shared/continuations/lib/LowerAwait.cpp @@ -36,8 +36,10 @@ #include "continuations/Continuations.h" #include "continuations/ContinuationsDialect.h" +#include "lgccps/LgcCpsDialect.h" #include "llvm-dialects/Dialect/Builder.h" #include "llvm-dialects/Dialect/Dialect.h" +#include "llvm-dialects/Dialect/Visitor.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/IRBuilder.h" @@ -147,8 +149,7 @@ static Function *getContinuationReturn(Module &M) { return F; auto &C = M.getContext(); auto *Void = Type::getVoidTy(C); - auto *I64 = Type::getInt64Ty(C); - auto *FuncTy = FunctionType::get(Void, I64, true); + auto *FuncTy = FunctionType::get(Void, {}, true); AttributeList AL = AttributeList::get(C, AttributeList::FunctionIndex, {Attribute::NoReturn}); return cast(M.getOrInsertFunction(Name, FuncTy, AL).getCallee()); @@ -157,8 +158,8 @@ static Function *getContinuationReturn(Module &M) { LowerAwaitPass::LowerAwaitPass() {} static void processContinuations( - Module &M, - const MapVector> &ToProcess) { + Module &M, const MapVector> &ToProcess, + bool LowerLgcAwait) { // We definitely have a call that requires continuation in this function // // If this is the first time we've done this for 
this function @@ -174,17 +175,8 @@ static void processContinuations( auto *I32 = Type::getInt32Ty(Context); auto *I64 = Type::getInt64Ty(Context); - // Get continuation.token type from an await call - Type *TokenTy = nullptr; - for (const auto &FuncData : ToProcess) { - if (!FuncData.second.empty()) { - TokenTy = FuncData.second.front()->getFunctionType()->getParamType(0); - break; - } - } - - if (TokenTy == nullptr) - TokenTy = StructType::create(Context, "continuation.token")->getPointerTo(); + Type *TokenTy = + StructType::create(Context, "continuation.token")->getPointerTo(); SmallVector ReturnTypes; ReturnTypes.push_back(I8Ptr); // Continue function pointer @@ -194,14 +186,18 @@ static void processContinuations( for (auto &FuncData : ToProcess) { Function *F = FuncData.first; - bool IsEntry = F->hasMetadata(DXILContHelper::MDEntryName); + LLVM_DEBUG(dbgs() << "Processing function: " << F->getName() << "\n"); // Change the return type and arguments SmallVector AllArgTypes; - // Add continuation stack pointer and passed return address - if (!IsEntry) { + // Lgc.cps dialect will handle stack pointer and return address in other + // places. + bool IsLegacyNonEntry = + !F->hasMetadata(DXILContHelper::MDEntryName) && !LowerLgcAwait; + // Add continuation stack pointer and passed return address. + if (IsLegacyNonEntry) { AllArgTypes.push_back(getContinuationStackOffsetType(Context)); AllArgTypes.push_back(I64); } @@ -222,13 +218,13 @@ static void processContinuations( llvm::moveFunctionBody(*F, *NewFunc); // Set arg names for new function - if (!IsEntry) { + if (IsLegacyNonEntry) { NewFunc->getArg(0)->setName("cspInit"); NewFunc->getArg(1)->setName("returnAddr"); } for (unsigned Idx = 0; Idx != F->getFunctionType()->params().size(); ++Idx) { - Argument *Arg = NewFunc->getArg(Idx + (IsEntry ? 0 : 2)); + Argument *Arg = NewFunc->getArg(Idx + (IsLegacyNonEntry ? 
2 : 0)); Argument *OldArg = F->getArg(Idx); Arg->setName(OldArg->getName()); OldArg->replaceAllUsesWith(Arg); @@ -278,8 +274,9 @@ static void processContinuations( llvm_dialects::Builder B( &*NewFunc->getEntryBlock().getFirstNonPHIOrDbgOrAlloca()); // Claim that the buffer has the minimum required size of a pointer - Value *BufSize = ConstantInt::get(I32, 8); + Value *BufSize = ConstantInt::get(I32, MinimumContinuationStateBytes); Value *BufAlign = ConstantInt::get(I32, 4); + Value *const CoroId = B.CreateIntrinsic(Intrinsic::coro_id_retcon, {}, {BufSize, BufAlign, StorageArg, ContProtoFuncPtr, @@ -287,18 +284,28 @@ static void processContinuations( auto *CPN = ConstantPointerNull::get(I8Ptr); B.CreateIntrinsic(Intrinsic::coro_begin, {}, {CoroId, CPN}); - // Save the return address at the start of the function - Value *SavedRetAddr = nullptr; - if (IsEntry) - SavedRetAddr = UndefValue::get(B.getInt64Ty()); - else - SavedRetAddr = NewFunc->getArg(1); // Return addr - // Replace await calls with suspend points for (auto *CI : FuncData.second) { B.SetInsertPoint(CI); + Value *SuspendRetconArg = nullptr; + if (LowerLgcAwait) { + SmallVector Args; + SmallVector ArgTys; + for (Value *Arg : CI->args()) { + Args.push_back(Arg); + ArgTys.push_back(Arg->getType()); + } + + // Insert a dummy call to remember the arguments to lgc.cps.await. + auto *ShaderTy = FunctionType::get(TokenTy, ArgTys, false); + auto *ShaderFun = + B.CreateIntToPtr(CI->getArgOperand(0), ShaderTy->getPointerTo()); + SuspendRetconArg = B.CreateCall(ShaderTy, ShaderFun, Args); + } else { + SuspendRetconArg = CI->getArgOperand(0); + } B.CreateIntrinsic(Intrinsic::coro_suspend_retcon, {B.getInt1Ty()}, - {CI->getArgOperand(0)}); + SuspendRetconArg); auto *RetTy = CI->getType(); if (!RetTy->isVoidTy()) { auto *RetVal = B.create(RetTy); @@ -307,6 +314,16 @@ static void processContinuations( CI->eraseFromParent(); } + // Save the return address at the start of the function for legacy path. 
+ // For lgc.cps, we don't need to save any value, so just not passing any + // argument. + Value *SavedRetAddr = nullptr; + if (!LowerLgcAwait) { + if (IsLegacyNonEntry) + SavedRetAddr = NewFunc->getArg(1); // Return addr + else + SavedRetAddr = UndefValue::get(I64); + } // Convert returns to continuation.return calls auto *ContRet = getContinuationReturn(M); for (auto &BB : *NewFunc) { @@ -314,10 +331,13 @@ static void processContinuations( if (I->getOpcode() == Instruction::Ret) { // Replace this instruction with a call to continuation.return B.SetInsertPoint(I); - SmallVector RetVals{SavedRetAddr}; + SmallVector RetVals; - if (I->getNumOperands() != 0) - RetVals.push_back(I->getOperand(0)); + if (!LowerLgcAwait) { + RetVals.push_back(SavedRetAddr); + if (I->getNumOperands() != 0) + RetVals.push_back(I->getOperand(0)); + } auto *ContRetCall = B.CreateCall(ContRet, RetVals); // DXILCont passes use annotations on the ret to pass information // on the shader exit to later passes. Copy such metadata to the ContRet @@ -338,22 +358,35 @@ LowerAwaitPass::run(llvm::Module &M, AnalysisManager.getResult(M); MapVector> ToProcess; - for (auto &F : M.functions()) { - if (!F.getName().startswith("await.")) { - // Force processing annotated functions, even if they don't have await - // calls - if (F.hasMetadata(DXILContHelper::MDContinuationName)) - ToProcess[&F].size(); - continue; - } - for (auto *U : F.users()) { - if (auto *Inst = dyn_cast(U)) - ToProcess[Inst->getFunction()].push_back(Inst); + static auto Visitor = + llvm_dialects::VisitorBuilder< + MapVector>>() + .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) + .add([](auto &ToProcess, auto &Op) { + ToProcess[Op.getFunction()].push_back(&Op); + }) + .build(); + Visitor.visit(ToProcess, M); + + bool LowerLgcAwait = !ToProcess.empty(); + if (!LowerLgcAwait) { + for (auto &F : M.functions()) { + if (!F.getName().startswith("await.")) { + // Force processing annotated functions, even if they don't have 
await + // calls + if (F.hasMetadata(DXILContHelper::MDContinuationName)) + ToProcess[&F].size(); + continue; + } + for (auto *U : F.users()) { + if (auto *Inst = dyn_cast(U)) + ToProcess[Inst->getFunction()].push_back(Inst); + } } } if (!ToProcess.empty()) { - processContinuations(M, ToProcess); + processContinuations(M, ToProcess, LowerLgcAwait); return PreservedAnalyses::none(); } return PreservedAnalyses::all(); diff --git a/shared/continuations/lib/LowerRaytracingPipeline.cpp b/shared/continuations/lib/LowerRaytracingPipeline.cpp index abac2df351..3db113f2be 100644 --- a/shared/continuations/lib/LowerRaytracingPipeline.cpp +++ b/shared/continuations/lib/LowerRaytracingPipeline.cpp @@ -111,7 +111,7 @@ struct PayloadCopyHelper { return; } - copyField(Node, It->second.IndexIntervals); + copyField(Node->Ty, It->second.IndexIntervals); // Register node as copied if (CopiedNodes) @@ -120,13 +120,49 @@ struct PayloadCopyHelper { // Perform copy for each index interval (i.e, for each contiguous range of // storage memory) - void copyField(const PAQNode *Node, const PAQIndexIntervals &Intervals) { + void copyField(Type *FieldTy, const PAQIndexIntervals &Intervals) { auto *I32 = Type::getInt32Ty(M.getContext()); // Pointer to the node field in the local payload auto *LocalFieldPtr = B.CreateInBoundsGEP(&PayloadTy, LocalPayload, PayloadIdxList); assert(cast(LocalFieldPtr->getType()) - ->isOpaqueOrPointeeTypeMatches(Node->Ty)); + ->isOpaqueOrPointeeTypeMatches(FieldTy)); + + // If the field is serialized in multiple intervals in the global, + // we perform a manual bytewise copy using i32 and i8. + // However, if the field is serialized using a single, contiguous interval + // and does not have stricter alignment requirements than i32, + // then we can just load/store the field type from/to the global storage. 
+ // + // We currently restrict this mechanism to single-DWord fields to avoid + // issues with the RegisterBuffer pass which struggles with loads and stores + // of large vector types, leading to bad IR with additional allocas. + // TODO: Remove this restriction once we have moved to LLPC-style + // continuations without the RegisterBuffer pass. + const DataLayout &DL = M.getDataLayout(); + if (Intervals.size() == 1 && + DL.getABITypeAlign(FieldTy) <= DL.getABITypeAlign(I32) && + Intervals[0].size() == 1) { + + // Do a single load+store + Value *Src = LocalFieldPtr; + + auto *GlobalIntervalI32Ptr = B.CreateInBoundsGEP( + Layout->SerializationTy, Serialization, + {B.getInt32(0), B.getInt32(0), B.getInt32(Intervals[0].Begin)}); + Value *Dst = B.CreateBitCast( + GlobalIntervalI32Ptr, + FieldTy->getPointerTo( + GlobalIntervalI32Ptr->getType()->getPointerAddressSpace())); + + if (GlobalAccessKind != PAQAccessKind::Write) + std::swap(Src, Dst); + + auto *Val = B.CreateLoad(FieldTy, Src); + B.CreateStore(Val, Dst); + return; + } + // I32 pointer to start of field in local payload Value *FieldI32Ptr = B.CreateBitCast( LocalFieldPtr, @@ -135,7 +171,7 @@ struct PayloadCopyHelper { // Counts how many bytes have already been copied unsigned FieldByteOffset = 0; unsigned FieldNumBytes = - M.getDataLayout().getTypeStoreSize(Node->Ty).getFixedValue(); + M.getDataLayout().getTypeStoreSize(FieldTy).getFixedValue(); for (unsigned IntervalIdx = 0; IntervalIdx < Intervals.size(); ++IntervalIdx) { const PAQIndexInterval &Interval = Intervals[IntervalIdx]; @@ -227,17 +263,9 @@ static void setStacksizeMetadata(Function &F, uint64_t PayloadI32s, uint64_t NeededStackSize = computeNeededStackSizeForRegisterBuffer( PayloadI32s, PayloadRegisterCount); if (NeededStackSize) { - auto &Context = F.getContext(); - uint64_t CurStackSize = 0; - if (auto *StackSizeMD = F.getMetadata(DXILContHelper::MDStackSizeName)) - CurStackSize = mdconst::extract(StackSizeMD->getOperand(0)) - ->getZExtValue(); + 
uint64_t CurStackSize = DXILContHelper::tryGetStackSize(&F).value_or(0); if (NeededStackSize > CurStackSize) - F.setMetadata( - DXILContHelper::MDStackSizeName, - MDTuple::get(Context, - {ConstantAsMetadata::get(ConstantInt::get( - Type::getInt32Ty(Context), NeededStackSize))})); + DXILContHelper::setStackSize(&F, NeededStackSize); } } @@ -364,28 +392,30 @@ CallInst *LowerRaytracingPipelinePassImpl::replaceCall( AwaitData.CallType = CallType; AwaitData.FuncConfig = Data.FuncConfig; if (!SpecializedFunction) { - // Copy function - // Construct new function type via DXILContFuncTy: This way, we also get - // type metadata - DXILContFuncTy SpecializedContFuncTy; - DXILContFuncTy OldContFuncTy = DXILContFuncTy::get(Func); + // Copy function, modify argument types + SmallVector ArgTys; + ArgTys.reserve(Func->getFunctionType()->params().size() + 1); // Add system data argument - SpecializedContFuncTy.ArgTys.push_back( - DXILContArgTy(SystemDataTy->getPointerTo(), SystemDataTy)); + ArgTys.push_back(SystemDataTy->getPointerTo()); // Skip intrinsic id argument - SpecializedContFuncTy.ArgTys.append(OldContFuncTy.ArgTys.begin() + 1, - OldContFuncTy.ArgTys.end()); + ArgTys.append(Func->getFunctionType()->params().begin() + 1, + Func->getFunctionType()->params().end()); + // Add payload argument - SpecializedContFuncTy.ArgTys.push_back( - Call->getArgOperand(Call->arg_size() - 2)->getType()); - SpecializedContFuncTy.ReturnTy = DXILContArgTy(Func->getReturnType()); + ArgTys.push_back(Call->getArgOperand(Call->arg_size() - 2)->getType()); SpecializedFunction = cloneFunctionHeader( - *Func, SpecializedContFuncTy.asFunctionType(Mod->getContext()), {}); + *Func, FunctionType::get(Func->getReturnType(), ArgTys, false), {}); SpecializedFunction->setName(NewName); - SpecializedContFuncTy.writeMetadata(SpecializedFunction); + + assert(PayloadOrAttrTypesForSpecializedFunctions.count( + SpecializedFunction) == 0); + // Store payload or hit attribute type for later. 
Despite the name, payload + // metadata also gives hit attribute types for ReportHit. + PayloadOrAttrTypesForSpecializedFunctions[SpecializedFunction] = + DXILContHelper::getPayloadTypeFromMetadata(*Call); assert(!AwaitsToProcess.count(SpecializedFunction) && "Unexpected existing await data entry!"); @@ -410,6 +440,10 @@ CallInst *LowerRaytracingPipelinePassImpl::replaceCall( CloneFunctionInto(SpecializedFunction, Func, VMap, CloneFunctionChangeType::LocalChangesOnly, Returns); + // Do not propagate type metadata to the cloned function. It would be + // incorrect, because arguments differ, and we should no longer need it. + SpecializedFunction->setMetadata(DXILContHelper::MDTypesName, nullptr); + if (CallType == ContinuationCallType::AnyHit) handleReportHit(Data, *SpecializedFunction); } else { @@ -542,7 +576,7 @@ void LowerRaytracingPipelinePassImpl::replaceContinuationCall( if (CallType != ContinuationCallType::AnyHit) { // Payload is unchanged by Intersection and passed implicitly PassedPayload = F->getArg(F->arg_size() - 1); - PayloadTy = getFuncArgPtrElementType(F, PassedPayload); + PayloadTy = PayloadOrAttrTypesForSpecializedFunctions.at(F); } const PAQSerializationLayout *OutgoingSerializationLayout = nullptr; @@ -662,7 +696,7 @@ void LowerRaytracingPipelinePassImpl::replaceContinuationCall( SystemDataTy = TraversalDataTy; // Add hit attributes to arguments auto *HitAttrsArg = F->getArg(F->arg_size() - 1); - auto *HitAttrsTy = getFuncArgPtrElementType(F, HitAttrsArg); + auto *HitAttrsTy = PayloadOrAttrTypesForSpecializedFunctions.at(F); ArgTys.push_back(HitAttrsTy); auto *HitAttrs = B.CreateLoad(HitAttrsTy, HitAttrsArg); Args.push_back(HitAttrs); @@ -726,8 +760,8 @@ void LowerRaytracingPipelinePassImpl::handleReportHit(FunctionData &Data, if (Call->getCalledFunction()->getName() == "_AmdAcceptHitAttributes") { // Commit hit attributes B.SetInsertPoint(Call); - auto *SystemDataTy = getFuncArgPtrElementType(&F, 0); - copyHitAttributes(B, Data, 
Call->getArgOperand(0), SystemDataTy, + assert(TraversalDataTy != 0 && "Missing traversal system data!"); + copyHitAttributes(B, Data, Call->getArgOperand(0), TraversalDataTy, HitAttrsArg, false, nullptr); // Make sure that we store the hit attributes into the correct system // data (just in case dxc copied them around). @@ -1750,10 +1784,10 @@ bool LowerRaytracingPipelinePassImpl::run() { Data->second.CallShaderCalls.push_back(&CallCallableShader); }) .add([](VisitorState &State, auto &ReportHitOp) { - auto *HitAttributesArg = - ReportHitOp.getArgOperand(ReportHitOp.arg_size() - 2); - auto *HitAttributesTy = DXILContArgTy(HitAttributesArg->getType()) - .getPointerElementType(); + // The converter uses payload type metadata also to indicate hit + // attribute types + auto HitAttributesTy = + DXILContHelper::getPayloadTypeFromMetadata(ReportHitOp); auto Data = State.Processables.find(ReportHitOp.getFunction()); if (Data == State.Processables.end()) return; diff --git a/shared/continuations/lib/PassRegistry.inc b/shared/continuations/lib/PassRegistry.inc index 15fb843480..0b5f106695 100644 --- a/shared/continuations/lib/PassRegistry.inc +++ b/shared/continuations/lib/PassRegistry.inc @@ -47,7 +47,7 @@ CONT_MODULE_ANALYSIS("dialect-context-analysis", DialectContextAnalysis(NeedDialectContext)) -CONT_MODULE_PASS("add-types-metadata", AddTypesMetadataPass()) +CONT_MODULE_PASS("legacy-cleanup-continuations", LegacyCleanupContinuationsPass()) CONT_MODULE_PASS("cleanup-continuations", CleanupContinuationsPass()) CONT_MODULE_PASS("lower-raytracing-pipeline", LowerRaytracingPipelinePass()) CONT_MODULE_PASS("dxil-cont-intrinsic-prepare", DXILContIntrinsicPreparePass()) @@ -62,6 +62,7 @@ CONT_MODULE_PASS("remove-types-metadata", RemoveTypesMetadataPass()) CONT_MODULE_PASS("save-continuation-state", SaveContinuationStatePass()) CONT_CGSCC_PASS("dxil-coro-split", DXILCoroSplitPass()) +CONT_CGSCC_PASS("lgc-coro-split", LgcCoroSplitPass()) #undef CONT_PASS #undef CONT_LOOP_PASS 
diff --git a/shared/continuations/lib/PayloadAccessQualifiers.cpp b/shared/continuations/lib/PayloadAccessQualifiers.cpp index 47f17de991..4a6ea3ceb3 100644 --- a/shared/continuations/lib/PayloadAccessQualifiers.cpp +++ b/shared/continuations/lib/PayloadAccessQualifiers.cpp @@ -1655,13 +1655,16 @@ PAQTraceRaySerializationInfo::create(Module &M, // Some serialization structs include storage for committed hit attributes. // Because we do not know whether intersection shaders are part of the // pipeline or not, let alone the maximum size of occurring attribute types, - // we need to be pessimistic and assume the largest size allowed by the API. - // SystemData provides some storage for attributes (currently 2 registers), - // which leaves 6 registers in the payload storage. - // A whole-pipeline analysis should allow to eliminate these registers, - // e.g. in case no intersection shaders are present. + // we need to be pessimistic and assume the maximum possible hit attribute + // size as specified by the app, obtained from + // PAQConfig.MaxHitAttributeByteCount. SystemData provides some storage for + // attributes (currently 2 registers), which leaves 6 registers in the payload + // storage. A whole-pipeline analysis should allow to eliminate these + // registers, e.g. in case no intersection shaders are present. 
assert(PAQConfig.MaxHitAttributeByteCount <= GlobalMaxHitAttributeBytes); - const uint64_t InlineHitAttrBytes = getInlineHitAttrsBytes(M); + const uint32_t MaxInlineHitAttrBytes = getInlineHitAttrsBytes(M); + const uint32_t InlineHitAttrBytes = + std::min(MaxInlineHitAttrBytes, PAQConfig.MaxHitAttributeByteCount); const uint64_t PayloadHitAttrI32s = divideCeil( PAQConfig.MaxHitAttributeByteCount - InlineHitAttrBytes, RegisterBytes); diff --git a/shared/continuations/lib/SaveContinuationState.cpp b/shared/continuations/lib/SaveContinuationState.cpp index f1c5e6a00e..622def5160 100644 --- a/shared/continuations/lib/SaveContinuationState.cpp +++ b/shared/continuations/lib/SaveContinuationState.cpp @@ -108,14 +108,7 @@ bool SaveContinuationStatePass::lowerCalls(Function *Intr, bool IsSave) { if (IsSave) { // Add to continuation stack size metadata - uint64_t CurStackSize = 0; - if (auto *StackSizeMD = F->getMetadata(DXILContHelper::MDStackSizeName)) - CurStackSize = mdconst::extract(StackSizeMD->getOperand(0)) - ->getZExtValue(); - F->setMetadata(DXILContHelper::MDStackSizeName, - MDTuple::get(F->getContext(), - {ConstantAsMetadata::get(ConstantInt::get( - I32, NeededStackSize + CurStackSize))})); + DXILContHelper::addStackSize(F, NeededStackSize); } Call->eraseFromParent(); diff --git a/shared/continuations/plugin/CMakeLists.txt b/shared/continuations/plugin/CMakeLists.txt index 0106b13d5d..954d83df46 100644 --- a/shared/continuations/plugin/CMakeLists.txt +++ b/shared/continuations/plugin/CMakeLists.txt @@ -2,6 +2,9 @@ set(LLVM_CONTINUATIONSPLUGIN_LINK_INTO_TOOLS ON CACHE BOOL "Link plugin into too add_llvm_pass_plugin(ContinuationsPlugin Plugin.cpp + + LINK_COMPONENTS + Support ) target_link_libraries(ContinuationsPlugin PRIVATE LLVMContinuations) diff --git a/shared/continuations/test/add-types-metadata.ll b/shared/continuations/test/add-types-metadata.ll deleted file mode 100644 index 017b8c304d..0000000000 --- a/shared/continuations/test/add-types-metadata.ll +++ 
/dev/null @@ -1,538 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals --version 2 -; RUN: opt --opaque-pointers=0 --enforce-pointer-metadata=1 --verify-each -passes='add-types-metadata' -S %s 2>%t.stderr | FileCheck -check-prefix=METADATA %s -; RUN: count 0 < %t.stderr - -target datalayout = "e-m:e-p:64:32-p20:32:32-p21:32:32-i1:32-i8:8-i16:32-i32:32-i64:32-f16:32-f32:32-f64:32-v16:32-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-n8:16:32" - -%struct.HitData = type { <3 x float>, <3 x float>, float, i32 } -%struct.DispatchSystemData = type { <3 x i32> } -%struct.SystemData = type { %struct.DispatchSystemData } -%struct.TraversalData = type { %struct.SystemData, %struct.HitData, <3 x float>, <3 x float>, float, i64 } -%struct.AnyHitTraversalData = type { %struct.TraversalData, %struct.HitData } - -declare i32 @_cont_GetContinuationStackAddr() #4 -declare %struct.DispatchSystemData @_cont_SetupRayGen() #4 -declare %struct.DispatchSystemData @_AmdAwaitTraversal(i64, %struct.TraversalData) #4 -declare %struct.DispatchSystemData @_AmdAwaitShader(i64, %struct.DispatchSystemData) #4 -declare %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64, %struct.AnyHitTraversalData, float, i32) #4 -declare %struct.HitData @_cont_GetCandidateState(%struct.AnyHitTraversalData*) #4 -declare %struct.HitData @_cont_GetCommittedState(%struct.SystemData*) #4 -declare %struct.BuiltInTriangleIntersectionAttributes @_cont_GetTriangleHitAttributes(%struct.SystemData*) #4 -declare void @_cont_SetTriangleHitAttributes(%struct.SystemData*, %struct.BuiltInTriangleIntersectionAttributes) #4 -declare i32 @_cont_GetLocalRootIndex(%struct.DispatchSystemData*) -declare i1 @_cont_IsEndSearch(%struct.TraversalData*) #4 -declare i32 @_cont_HitKind(%struct.SystemData*) #4 -declare i64 @_AmdGetResumePointAddr() #1 -declare void 
@_AmdRestoreSystemData(%struct.DispatchSystemData* %data) #1 -declare void @_AmdRestoreSystemDataAnyHit(%struct.AnyHitTraversalData* %data) #1 -declare void @_cont_AcceptHitAndEndSearch(%struct.DispatchSystemData* nocapture readnone %data) #1 -declare void @_cont_AcceptHit(%struct.AnyHitTraversalData* nocapture readnone %data) #1 -declare void @_cont_IgnoreHit(%struct.DispatchSystemData* nocapture readnone %data) #1 -declare void @_AmdAcceptHitAttributes(%struct.AnyHitTraversalData* nocapture readnone %data) #1 - -define void @_cont_TraceRay(%struct.DispatchSystemData* %data, i64, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float) #4 { -; METADATA-LABEL: define void @_cont_TraceRay -; METADATA-SAME: (%struct.DispatchSystemData* [[DATA:%.*]], i64 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]], i32 [[TMP3:%.*]], i32 [[TMP4:%.*]], i32 [[TMP5:%.*]], float [[TMP6:%.*]], float [[TMP7:%.*]], float [[TMP8:%.*]], float [[TMP9:%.*]], float [[TMP10:%.*]], float [[TMP11:%.*]], float [[TMP12:%.*]], float [[TMP13:%.*]]) #[[ATTR0:[0-9]+]] !types !44 { -; METADATA-NEXT: [[DIS_DATA:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], %struct.DispatchSystemData* [[DATA]], align 4 -; METADATA-NEXT: [[SYS_DATA:%.*]] = insertvalue [[STRUCT_SYSTEMDATA:%.*]] undef, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA]], 0 -; METADATA-NEXT: [[TRAV_DATA:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA:%.*]] undef, [[STRUCT_SYSTEMDATA]] [[SYS_DATA]], 0 -; METADATA-NEXT: [[ADDR:%.*]] = call i64 @_AmdGetResumePointAddr() #[[ATTR3:[0-9]+]] -; METADATA-NEXT: [[TRAV_DATA2:%.*]] = insertvalue [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA]], i64 [[ADDR]], 5 -; METADATA-NEXT: [[NEWDATA:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] @_AmdAwaitTraversal(i64 4, [[STRUCT_TRAVERSALDATA]] [[TRAV_DATA2]]) -; METADATA-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[NEWDATA]], %struct.DispatchSystemData* [[DATA]], align 4 -; METADATA-NEXT: call void @_AmdRestoreSystemData(%struct.DispatchSystemData* [[DATA]]) -; 
METADATA-NEXT: ret void -; - %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data - %sys_data = insertvalue %struct.SystemData undef, %struct.DispatchSystemData %dis_data, 0 - %trav_data = insertvalue %struct.TraversalData undef, %struct.SystemData %sys_data, 0 - %addr = call i64 @_AmdGetResumePointAddr() #2 - %trav_data2 = insertvalue %struct.TraversalData %trav_data, i64 %addr, 5 - %newdata = call %struct.DispatchSystemData @_AmdAwaitTraversal(i64 4, %struct.TraversalData %trav_data2) - store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data - call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) - ret void -} - -define void @_cont_CallShader(%struct.DispatchSystemData* %data, i32) #4 { -; METADATA-LABEL: define void @_cont_CallShader -; METADATA-SAME: (%struct.DispatchSystemData* [[DATA:%.*]], i32 [[TMP0:%.*]]) #[[ATTR0]] !types !45 { -; METADATA-NEXT: [[DIS_DATA:%.*]] = load [[STRUCT_DISPATCHSYSTEMDATA:%.*]], %struct.DispatchSystemData* [[DATA]], align 4 -; METADATA-NEXT: [[NEWDATA:%.*]] = call [[STRUCT_DISPATCHSYSTEMDATA]] @_AmdAwaitShader(i64 2, [[STRUCT_DISPATCHSYSTEMDATA]] [[DIS_DATA]]) -; METADATA-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[NEWDATA]], %struct.DispatchSystemData* [[DATA]], align 4 -; METADATA-NEXT: call void @_AmdRestoreSystemData(%struct.DispatchSystemData* [[DATA]]) -; METADATA-NEXT: ret void -; - %dis_data = load %struct.DispatchSystemData, %struct.DispatchSystemData* %data - %newdata = call %struct.DispatchSystemData @_AmdAwaitShader(i64 2, %struct.DispatchSystemData %dis_data) - store %struct.DispatchSystemData %newdata, %struct.DispatchSystemData* %data - call void @_AmdRestoreSystemData(%struct.DispatchSystemData* %data) - ret void -} - -define i1 @_cont_ReportHit(%struct.AnyHitTraversalData* %data, float %t, i32 %hitKind) #4 { -; METADATA-LABEL: define i1 @_cont_ReportHit -; METADATA-SAME: (%struct.AnyHitTraversalData* [[DATA:%.*]], float [[T:%.*]], i32 [[HITKIND:%.*]]) 
#[[ATTR0]] !types !46 { -; METADATA-NEXT: [[ORIGTPTR:%.*]] = getelementptr inbounds [[STRUCT_ANYHITTRAVERSALDATA:%.*]], %struct.AnyHitTraversalData* [[DATA]], i32 0, i32 0, i32 4 -; METADATA-NEXT: [[ORIGT:%.*]] = load float, float* [[ORIGTPTR]], align 4 -; METADATA-NEXT: [[ISNOHIT:%.*]] = fcmp fast uge float [[T]], [[ORIGT]] -; METADATA-NEXT: br i1 [[ISNOHIT]], label [[ISEND:%.*]], label [[CALLAHIT:%.*]] -; METADATA: callAHit: -; METADATA-NEXT: [[TRAV_DATA:%.*]] = load [[STRUCT_ANYHITTRAVERSALDATA]], %struct.AnyHitTraversalData* [[DATA]], align 4 -; METADATA-NEXT: [[NEWDATA:%.*]] = call [[STRUCT_ANYHITTRAVERSALDATA]] @_AmdAwaitAnyHit(i64 3, [[STRUCT_ANYHITTRAVERSALDATA]] [[TRAV_DATA]], float [[T]], i32 [[HITKIND]]) -; METADATA-NEXT: store [[STRUCT_ANYHITTRAVERSALDATA]] [[NEWDATA]], %struct.AnyHitTraversalData* [[DATA]], align 4 -; METADATA-NEXT: call void @_AmdRestoreSystemDataAnyHit(%struct.AnyHitTraversalData* [[DATA]]) -; METADATA-NEXT: ret i1 true -; METADATA: isEnd: -; METADATA-NEXT: call void @_AmdAcceptHitAttributes(%struct.AnyHitTraversalData* [[DATA]]) -; METADATA-NEXT: ret i1 false -; - %origTPtr = getelementptr inbounds %struct.AnyHitTraversalData, %struct.AnyHitTraversalData* %data, i32 0, i32 0, i32 4 - %origT = load float, float* %origTPtr, align 4 - %isNoHit = fcmp fast uge float %t, %origT - br i1 %isNoHit, label %isEnd, label %callAHit - -callAHit: - %trav_data = load %struct.AnyHitTraversalData, %struct.AnyHitTraversalData* %data - %newdata = call %struct.AnyHitTraversalData @_AmdAwaitAnyHit(i64 3, %struct.AnyHitTraversalData %trav_data, float %t, i32 %hitKind) - store %struct.AnyHitTraversalData %newdata, %struct.AnyHitTraversalData* %data - call void @_AmdRestoreSystemDataAnyHit(%struct.AnyHitTraversalData* %data) - ret i1 1 - -isEnd: - ; Call AcceptHitAttributes, just to simulate it - call void @_AmdAcceptHitAttributes(%struct.AnyHitTraversalData* %data) - ret i1 0 -} - -define i32 @_cont_DispatchRaysIndex(%struct.DispatchSystemData* %data, i32 
%i) { -; METADATA-LABEL: define i32 @_cont_DispatchRaysIndex -; METADATA-SAME: (%struct.DispatchSystemData* [[DATA:%.*]], i32 [[I:%.*]]) !types !47 { -; METADATA-NEXT: [[RESPTR:%.*]] = getelementptr [[STRUCT_DISPATCHSYSTEMDATA:%.*]], %struct.DispatchSystemData* [[DATA]], i32 0, i32 0, i32 [[I]] -; METADATA-NEXT: [[RES:%.*]] = load i32, i32* [[RESPTR]], align 4 -; METADATA-NEXT: ret i32 [[RES]] -; - %resPtr = getelementptr %struct.DispatchSystemData, %struct.DispatchSystemData* %data, i32 0, i32 0, i32 %i - %res = load i32, i32* %resPtr - ret i32 %res -} - -define float @_cont_ObjectRayOrigin(%struct.DispatchSystemData* nocapture readnone %data, %struct.HitData* %hitData, i32 %i) { -; METADATA-LABEL: define float @_cont_ObjectRayOrigin -; METADATA-SAME: (%struct.DispatchSystemData* nocapture readnone [[DATA:%.*]], %struct.HitData* [[HITDATA:%.*]], i32 [[I:%.*]]) !types !48 { -; METADATA-NEXT: [[RESPTR:%.*]] = getelementptr [[STRUCT_HITDATA:%.*]], %struct.HitData* [[HITDATA]], i32 0, i32 0, i32 [[I]] -; METADATA-NEXT: [[RES:%.*]] = load float, float* [[RESPTR]], align 4 -; METADATA-NEXT: ret float [[RES]] -; - %resPtr = getelementptr %struct.HitData, %struct.HitData* %hitData, i32 0, i32 0, i32 %i - %res = load float, float* %resPtr - ret float %res -} -define float @_cont_ObjectRayDirection(%struct.DispatchSystemData* nocapture readnone %data, %struct.HitData* %hitData, i32 %i) { -; METADATA-LABEL: define float @_cont_ObjectRayDirection -; METADATA-SAME: (%struct.DispatchSystemData* nocapture readnone [[DATA:%.*]], %struct.HitData* [[HITDATA:%.*]], i32 [[I:%.*]]) !types !48 { -; METADATA-NEXT: [[RESPTR:%.*]] = getelementptr [[STRUCT_HITDATA:%.*]], %struct.HitData* [[HITDATA]], i32 0, i32 1, i32 [[I]] -; METADATA-NEXT: [[RES:%.*]] = load float, float* [[RESPTR]], align 4 -; METADATA-NEXT: ret float [[RES]] -; - %resPtr = getelementptr %struct.HitData, %struct.HitData* %hitData, i32 0, i32 1, i32 %i - %res = load float, float* %resPtr - ret float %res -} - -define 
float @_cont_RayTCurrent(%struct.DispatchSystemData* nocapture readnone %data, %struct.HitData* %hitData) { -; METADATA-LABEL: define float @_cont_RayTCurrent -; METADATA-SAME: (%struct.DispatchSystemData* nocapture readnone [[DATA:%.*]], %struct.HitData* [[HITDATA:%.*]]) !types !50 { -; METADATA-NEXT: [[RESPTR:%.*]] = getelementptr [[STRUCT_HITDATA:%.*]], %struct.HitData* [[HITDATA]], i32 0, i32 2 -; METADATA-NEXT: [[RES:%.*]] = load float, float* [[RESPTR]], align 4 -; METADATA-NEXT: ret float [[RES]] -; - %resPtr = getelementptr %struct.HitData, %struct.HitData* %hitData, i32 0, i32 2 - %res = load float, float* %resPtr - ret float %res -} - -%dx.types.Handle = type { i8* } -%struct.RayPayload = type { <4 x float> } -%dx.types.ResourceProperties = type { i32, i32 } -%struct.BuiltInTriangleIntersectionAttributes = type { <2 x float> } -%struct.RaytracingAccelerationStructure = type { i32 } -%"class.RWTexture2D >" = type { <4 x float> } - -@"\01?Scene@@3URaytracingAccelerationStructure@@A" = external constant %dx.types.Handle, align 4 -@"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A" = external constant %dx.types.Handle, align 4 - -; Function Attrs: nounwind -define void @MyRayGen() #0 { -; METADATA: Function Attrs: nounwind -; METADATA-LABEL: define void @MyRayGen -; METADATA-SAME: () #[[ATTR2:[0-9]+]] { -; METADATA-NEXT: [[TMP1:%.*]] = load [[DX_TYPES_HANDLE:%.*]], %dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 -; METADATA-NEXT: [[TMP2:%.*]] = load [[DX_TYPES_HANDLE]], %dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 -; METADATA-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_RAYPAYLOAD:%.*]], align 4 -; METADATA-NEXT: [[TMP4:%.*]] = bitcast %struct.RayPayload* [[TMP3]] to i8* -; METADATA-NEXT: call void @llvm.lifetime.start.p0i8(i64 16, i8* [[TMP4]]) #[[ATTR1:[0-9]+]] -; METADATA-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD]], %struct.RayPayload* [[TMP3]], i32 0, i32 0 -; 
METADATA-NEXT: store <4 x float> zeroinitializer, <4 x float>* [[TMP5]], align 4, !tbaa [[TBAA51:![0-9]+]] -; METADATA-NEXT: [[TMP6:%.*]] = call [[DX_TYPES_HANDLE]] @dx.op.createHandleForLib.dx.types.Handle(i32 160, [[DX_TYPES_HANDLE]] [[TMP1]]) -; METADATA-NEXT: [[TMP7:%.*]] = call [[DX_TYPES_HANDLE]] @dx.op.annotateHandle(i32 216, [[DX_TYPES_HANDLE]] [[TMP6]], [[DX_TYPES_RESOURCEPROPERTIES:%.*]] { i32 16, i32 0 }) -; METADATA-NEXT: call void @dx.op.traceRay.struct.RayPayload(i32 157, [[DX_TYPES_HANDLE]] [[TMP7]], i32 16, i32 -1, i32 0, i32 1, i32 0, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0x3F50624DE0000000, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+04, %struct.RayPayload* nonnull [[TMP3]]) -; METADATA-NEXT: [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4, !tbaa [[TBAA51]] -; METADATA-NEXT: [[TMP9:%.*]] = call i32 @dx.op.dispatchRaysIndex.i32(i32 145, i8 0) -; METADATA-NEXT: [[TMP10:%.*]] = call i32 @dx.op.dispatchRaysIndex.i32(i32 145, i8 1) -; METADATA-NEXT: [[TMP11:%.*]] = call [[DX_TYPES_HANDLE]] @dx.op.createHandleForLib.dx.types.Handle(i32 160, [[DX_TYPES_HANDLE]] [[TMP2]]) -; METADATA-NEXT: [[TMP12:%.*]] = call [[DX_TYPES_HANDLE]] @dx.op.annotateHandle(i32 216, [[DX_TYPES_HANDLE]] [[TMP11]], [[DX_TYPES_RESOURCEPROPERTIES]] { i32 4098, i32 1033 }) -; METADATA-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP8]], i64 0 -; METADATA-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP8]], i64 1 -; METADATA-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP8]], i64 2 -; METADATA-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP8]], i64 3 -; METADATA-NEXT: call void @dx.op.textureStore.f32(i32 67, [[DX_TYPES_HANDLE]] [[TMP12]], i32 [[TMP9]], i32 [[TMP10]], i32 undef, float [[TMP13]], float [[TMP14]], float [[TMP15]], float [[TMP16]], i8 15) -; METADATA-NEXT: call void @llvm.lifetime.end.p0i8(i64 16, i8* [[TMP4]]) #[[ATTR1]] -; METADATA-NEXT: ret void -; - %1 = load 
%dx.types.Handle, %dx.types.Handle* @"\01?Scene@@3URaytracingAccelerationStructure@@A", align 4 - %2 = load %dx.types.Handle, %dx.types.Handle* @"\01?RenderTarget@@3V?$RWTexture2D@V?$vector@M$03@@@@A", align 4 - %3 = alloca %struct.RayPayload, align 4 - %4 = bitcast %struct.RayPayload* %3 to i8* - call void @llvm.lifetime.start(i64 16, i8* %4) #1 - %5 = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %3, i32 0, i32 0 - store <4 x float> zeroinitializer, <4 x float>* %5, align 4, !tbaa !31 - %6 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1) ; CreateHandleForLib(Resource) - %7 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %6, %dx.types.ResourceProperties { i32 16, i32 0 }) ; AnnotateHandle(res,props) resource: RTAccelerationStructure - call void @dx.op.traceRay.struct.RayPayload(i32 157, %dx.types.Handle %7, i32 16, i32 -1, i32 0, i32 1, i32 0, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0x3F50624DE0000000, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+04, %struct.RayPayload* nonnull %3) ; TraceRay(AccelerationStructure,RayFlags,InstanceInclusionMask,RayContributionToHitGroupIndex,MultiplierForGeometryContributionToShaderIndex,MissShaderIndex,Origin_X,Origin_Y,Origin_Z,TMin,Direction_X,Direction_Y,Direction_Z,TMax,payload) - %8 = load <4 x float>, <4 x float>* %5, align 4, !tbaa !31 - %9 = call i32 @dx.op.dispatchRaysIndex.i32(i32 145, i8 0) ; DispatchRaysIndex(col) - %10 = call i32 @dx.op.dispatchRaysIndex.i32(i32 145, i8 1) ; DispatchRaysIndex(col) - %11 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %2) ; CreateHandleForLib(Resource) - %12 = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %11, %dx.types.ResourceProperties { i32 4098, i32 1033 }) ; AnnotateHandle(res,props) resource: RWTexture2D<4xF32> - %13 = extractelement <4 x float> %8, i64 0 - %14 = 
extractelement <4 x float> %8, i64 1 - %15 = extractelement <4 x float> %8, i64 2 - %16 = extractelement <4 x float> %8, i64 3 - call void @dx.op.textureStore.f32(i32 67, %dx.types.Handle %12, i32 %9, i32 %10, i32 undef, float %13, float %14, float %15, float %16, i8 15) ; TextureStore(srv,coord0,coord1,coord2,value0,value1,value2,value3,mask) - call void @llvm.lifetime.end(i64 16, i8* %4) #1 - ret void -} - -; Function Attrs: nounwind -declare void @llvm.lifetime.start(i64, i8* nocapture) #1 - -; Function Attrs: nounwind -declare void @llvm.lifetime.end(i64, i8* nocapture) #1 - -; Function Attrs: nounwind -define void @MyClosestHitShader(%struct.RayPayload* noalias nocapture %payload, %struct.BuiltInTriangleIntersectionAttributes* nocapture readonly %attr) #0 { -; METADATA: Function Attrs: nounwind -; METADATA-LABEL: define void @MyClosestHitShader -; METADATA-SAME: (%struct.RayPayload* noalias nocapture [[PAYLOAD:%.*]], %struct.BuiltInTriangleIntersectionAttributes* nocapture readonly [[ATTR:%.*]]) #[[ATTR2]] !types !54 { -; METADATA-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_BUILTINTRIANGLEINTERSECTIONATTRIBUTES:%.*]], %struct.BuiltInTriangleIntersectionAttributes* [[ATTR]], i32 0, i32 0 -; METADATA-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 -; METADATA-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 -; METADATA-NEXT: [[TMP4:%.*]] = fsub fast float 1.000000e+00, [[TMP3]] -; METADATA-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 -; METADATA-NEXT: [[TMP6:%.*]] = fsub fast float [[TMP4]], [[TMP5]] -; METADATA-NEXT: [[TMP7:%.*]] = insertelement <4 x float> undef, float [[TMP6]], i64 0 -; METADATA-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP3]], i64 1 -; METADATA-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP5]], i64 2 -; METADATA-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float 1.000000e+00, i64 3 -; METADATA-NEXT: [[TMP11:%.*]] = 
getelementptr inbounds [[STRUCT_RAYPAYLOAD:%.*]], %struct.RayPayload* [[PAYLOAD]], i32 0, i32 0 -; METADATA-NEXT: store <4 x float> [[TMP10]], <4 x float>* [[TMP11]], align 4 -; METADATA-NEXT: ret void -; - %1 = getelementptr inbounds %struct.BuiltInTriangleIntersectionAttributes, %struct.BuiltInTriangleIntersectionAttributes* %attr, i32 0, i32 0 - %2 = load <2 x float>, <2 x float>* %1, align 4 - %3 = extractelement <2 x float> %2, i32 0 - %4 = fsub fast float 1.000000e+00, %3 - %5 = extractelement <2 x float> %2, i32 1 - %6 = fsub fast float %4, %5 - %7 = insertelement <4 x float> undef, float %6, i64 0 - %8 = insertelement <4 x float> %7, float %3, i64 1 - %9 = insertelement <4 x float> %8, float %5, i64 2 - %10 = insertelement <4 x float> %9, float 1.000000e+00, i64 3 - %11 = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %payload, i32 0, i32 0 - store <4 x float> %10, <4 x float>* %11, align 4 - ret void -} - -; Function Attrs: nounwind -define void @MyAnyHitShader(%struct.RayPayload* noalias nocapture %payload, %struct.BuiltInTriangleIntersectionAttributes* nocapture readnone %attr) #0 { -; METADATA: Function Attrs: nounwind -; METADATA-LABEL: define void @MyAnyHitShader -; METADATA-SAME: (%struct.RayPayload* noalias nocapture [[PAYLOAD:%.*]], %struct.BuiltInTriangleIntersectionAttributes* nocapture readnone [[ATTR:%.*]]) #[[ATTR2]] !types !54 { -; METADATA-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_RAYPAYLOAD:%.*]], %struct.RayPayload* [[PAYLOAD]], i32 0, i32 0 -; METADATA-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 -; METADATA-NEXT: [[TMP3:%.*]] = call float @dx.op.objectRayOrigin.f32(i32 149, i8 0) -; METADATA-NEXT: [[TMP4:%.*]] = call float @dx.op.objectRayDirection.f32(i32 150, i8 0) -; METADATA-NEXT: [[TMP5:%.*]] = call float @dx.op.rayTCurrent.f32(i32 154) -; METADATA-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP5]], [[TMP4]] -; METADATA-NEXT: [[TMP7:%.*]] = fadd fast float [[TMP6]], [[TMP3]] -; 
METADATA-NEXT: [[TMP8:%.*]] = fcmp fast ogt float [[TMP7]], 0.000000e+00 -; METADATA-NEXT: [[TMP9:%.*]] = fcmp fast ogt float [[TMP7]], 1.000000e+00 -; METADATA-NEXT: [[TMP10:%.*]] = fcmp fast ogt float [[TMP7]], -1.000000e+00 -; METADATA-NEXT: br i1 [[TMP8]], label [[TMP11:%.*]], label [[TMP14:%.*]] -; METADATA: 11: -; METADATA-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP1]], align 4 -; METADATA-NEXT: br i1 [[TMP9]], label [[TMP12:%.*]], label [[TMP13:%.*]] -; METADATA: 12: -; METADATA-NEXT: call void @dx.op.acceptHitAndEndSearch(i32 156) -; METADATA-NEXT: unreachable -; METADATA: 13: -; METADATA-NEXT: call void @dx.op.acceptHitAndEndSearch(i32 156) -; METADATA-NEXT: ret void -; METADATA: 14: -; METADATA-NEXT: br i1 [[TMP10]], label [[TMP15:%.*]], label [[TMP18:%.*]] -; METADATA: 15: -; METADATA-NEXT: br i1 [[TMP9]], label [[TMP16:%.*]], label [[TMP17:%.*]] -; METADATA: 16: -; METADATA-NEXT: call void @dx.op.ignoreHit(i32 155) -; METADATA-NEXT: unreachable -; METADATA: 17: -; METADATA-NEXT: call void @dx.op.ignoreHit(i32 155) -; METADATA-NEXT: ret void -; METADATA: 18: -; METADATA-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP1]], align 4 -; METADATA-NEXT: ret void -; - %1 = getelementptr inbounds %struct.RayPayload, %struct.RayPayload* %payload, i32 0, i32 0 - %2 = load <4 x float>, <4 x float>* %1, align 4 - %3 = call float @dx.op.objectRayOrigin.f32(i32 149, i8 0) ; ObjectRayOrigin(col) - %4 = call float @dx.op.objectRayDirection.f32(i32 150, i8 0) ; ObjectRayDirection(col) - %5 = call float @dx.op.rayTCurrent.f32(i32 154) ; RayTCurrent() - %6 = fmul fast float %5, %4 - %7 = fadd fast float %6, %3 - %8 = fcmp fast ogt float %7, 0.000000e+00 - %9 = fcmp fast ogt float %7, 1.000000e+00 - %10 = fcmp fast ogt float %7, -1.000000e+00 - br i1 %8, label %11, label %14 - -;