diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a8364b4..2a83e3b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,9 +1,11 @@ variables: - SampleName: FfxParallelSort + SampleName: FFX_ParallelSort GIT_SUBMODULE_STRATEGY: normal + stages: - build - deploy + build_dx12: tags: - windows @@ -15,6 +17,19 @@ build_dx12: artifacts: paths: - sample/bin/ + +build_vk: + tags: + - windows + - amd64 + stage: build + script: + - 'cmake -S sample -B sample/build/VK -G "Visual Studio 16 2019" -A x64 -DGFX_API=VK -DBUILD_INSTALLER=ON' + - 'cmake --build sample/build/VK --config Release' + artifacts: + paths: + - sample/bin/ + package_sample: tags: - windows @@ -22,10 +37,14 @@ package_sample: stage: deploy dependencies: - build_dx12 + - build_vk script: - echo "Packaging build" + - copy %VULKAN_SDK%\Bin\glslc.exe .\sample\bin - echo cd .\sample\bin\ > %SampleName%_DX12.bat - echo start %SampleName%_DX12.exe >> %SampleName%_DX12.bat + - echo cd .\sample\bin\ > %SampleName%_VK.bat + - echo start %SampleName%_VK.exe >> %SampleName%_VK.bat artifacts: name: "%SampleName%-%CI_COMMIT_TAG%-%CI_COMMIT_REF_NAME%-%CI_COMMIT_SHORT_SHA%" paths: @@ -35,3 +54,4 @@ package_sample: - "readme.md" - "license.txt" - "%SampleName%_DX12.bat" + - "%SampleName%_VK.bat" diff --git a/LICENSE.txt b/LICENSE.txt index f20b91b..cc2870e 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,4 +1,4 @@ -Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -16,4 +16,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. \ No newline at end of file +THE SOFTWARE. diff --git a/README.md b/README.md index 4b21bab..ab3e6ca 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,37 @@ # FidelityFX Parallel Sort +Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + +## Parallel Sort + The AMD FidelityFX Parallel Sort provides an open source header implementation to easily integrate a highly optimized compute-based radix sort into your game. 
Features of the implementation: -- Direct and Indirect execution support +- Direct and indirect execution support - RDNA+ optimized algorithm -- Support for DirectX 12 +- Support for the Vulkan and Direct3D 12 APIs - Shaders written in HLSL utilizing SM 6.0 wave-level operations -- DirectX 12 sample +- A sample application is provided for both Direct3D 12 and Vulkan -Resources: +## Resources -Introduction to GPU Radix Sort - http://www.heterogeneouscompute.org/wordpress/wp-content/uploads/2011/06/RadixSort.pdf \ No newline at end of file +[Introduction to GPU Radix Sort](http://www.heterogeneouscompute.org/wordpress/wp-content/uploads/2011/06/RadixSort.pdf) diff --git a/ffx-parallelsort/FFX_ParallelSort.h b/ffx-parallelsort/FFX_ParallelSort.h index 38b34ae..d910e91 100644 --- a/ffx-parallelsort/FFX_ParallelSort.h +++ b/ffx-parallelsort/FFX_ParallelSort.h @@ -97,12 +97,12 @@ uint NumScanValues; }; - groupshared uint gs_Histogram[FFX_PARALLELSORT_THREADGROUP_SIZE * FFX_PARALLELSORT_SORT_BIN_COUNT]; + groupshared uint gs_FFX_PARALLELSORT_Histogram[FFX_PARALLELSORT_THREADGROUP_SIZE * FFX_PARALLELSORT_SORT_BIN_COUNT]; void FFX_ParallelSort_Count_uint(uint localID, uint groupID, FFX_ParallelSortCB CBuffer, uint ShiftBit, RWStructuredBuffer SrcBuffer, RWStructuredBuffer SumTable) { // Start by clearing our local counts in LDS for (int i = 0; i < FFX_PARALLELSORT_SORT_BIN_COUNT; i++) - gs_Histogram[(i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID] = 0; + gs_FFX_PARALLELSORT_Histogram[(i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID] = 0; // Wait for everyone to catch up GroupMemoryBarrierWithGroupSync(); @@ -141,7 +141,7 @@ if (DataIndex < CBuffer.NumKeys) { uint localKey = (srcKeys[i] >> ShiftBit) & 0xf; - InterlockedAdd(gs_Histogram[(localKey * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID], 1); + InterlockedAdd(gs_FFX_PARALLELSORT_Histogram[(localKey * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID], 1); DataIndex += FFX_PARALLELSORT_THREADGROUP_SIZE; } } @@ -156,13 
+156,13 @@ uint sum = 0; for (int i = 0; i < FFX_PARALLELSORT_THREADGROUP_SIZE; i++) { - sum += gs_Histogram[localID * FFX_PARALLELSORT_THREADGROUP_SIZE + i]; + sum += gs_FFX_PARALLELSORT_Histogram[localID * FFX_PARALLELSORT_THREADGROUP_SIZE + i]; } SumTable[localID * CBuffer.NumThreadGroups + groupID] = sum; } } - groupshared uint gs_LDSSums[FFX_PARALLELSORT_THREADGROUP_SIZE]; + groupshared uint gs_FFX_PARALLELSORT_LDSSums[FFX_PARALLELSORT_THREADGROUP_SIZE]; uint FFX_ParallelSort_ThreadgroupReduce(uint localSum, uint localID) { // Do wave local reduce @@ -172,14 +172,14 @@ // Note that some hardware with very small HW wave sizes (i.e. <= 8) may exhibit issues with this algorithm, and have not been tested. uint waveID = localID / WaveGetLaneCount(); if (WaveIsFirstLane()) - gs_LDSSums[waveID] = waveReduced; + gs_FFX_PARALLELSORT_LDSSums[waveID] = waveReduced; // Wait for everyone to catch up GroupMemoryBarrierWithGroupSync(); // First wave worth of threads sum up wave reductions if (!waveID) - waveReduced = WaveActiveSum( (localID < FFX_PARALLELSORT_THREADGROUP_SIZE / WaveGetLaneCount()) ? gs_LDSSums[localID] : 0); + waveReduced = WaveActiveSum( (localID < FFX_PARALLELSORT_THREADGROUP_SIZE / WaveGetLaneCount()) ? 
gs_FFX_PARALLELSORT_LDSSums[localID] : 0); // Returned the reduced sum return waveReduced; @@ -196,20 +196,20 @@ // Last element in a wave writes out partial sum to LDS if (laneID == WaveGetLaneCount() - 1) - gs_LDSSums[waveID] = wavePrefixed + localSum; + gs_FFX_PARALLELSORT_LDSSums[waveID] = wavePrefixed + localSum; // Wait for everyone to catch up GroupMemoryBarrierWithGroupSync(); // First wave prefixes partial sums if (!waveID) - gs_LDSSums[localID] = WavePrefixSum(gs_LDSSums[localID]); + gs_FFX_PARALLELSORT_LDSSums[localID] = WavePrefixSum(gs_FFX_PARALLELSORT_LDSSums[localID]); // Wait for everyone to catch up GroupMemoryBarrierWithGroupSync(); // Add the partial sums back to each wave prefix - wavePrefixed += gs_LDSSums[waveID]; + wavePrefixed += gs_FFX_PARALLELSORT_LDSSums[waveID]; return wavePrefixed; } @@ -244,7 +244,7 @@ // This is to transform uncoalesced loads into coalesced loads and // then scattered loads from LDS - groupshared int gs_LDS[FFX_PARALLELSORT_ELEMENTS_PER_THREAD][FFX_PARALLELSORT_THREADGROUP_SIZE]; + groupshared int gs_FFX_PARALLELSORT_LDS[FFX_PARALLELSORT_ELEMENTS_PER_THREAD][FFX_PARALLELSORT_THREADGROUP_SIZE]; void FFX_ParallelSort_ScanPrefix(uint numValuesToScan, uint localID, uint groupID, uint BinOffset, uint BaseIndex, bool AddPartialSums, FFX_ParallelSortCB CBuffer, RWStructuredBuffer ScanSrc, RWStructuredBuffer ScanDst, RWStructuredBuffer ScanScratch) { @@ -255,7 +255,7 @@ uint col = ((i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID) / FFX_PARALLELSORT_ELEMENTS_PER_THREAD; uint row = ((i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID) % FFX_PARALLELSORT_ELEMENTS_PER_THREAD; - gs_LDS[row][col] = (DataIndex < numValuesToScan) ? ScanSrc[BinOffset + DataIndex] : 0; + gs_FFX_PARALLELSORT_LDS[row][col] = (DataIndex < numValuesToScan) ? 
ScanSrc[BinOffset + DataIndex] : 0; } // Wait for everyone to catch up @@ -265,8 +265,8 @@ // Calculate the local scan-prefix for current thread for (uint i = 0; i < FFX_PARALLELSORT_ELEMENTS_PER_THREAD; i++) { - uint tmp = gs_LDS[i][localID]; - gs_LDS[i][localID] = threadgroupSum; + uint tmp = gs_FFX_PARALLELSORT_LDS[i][localID]; + gs_FFX_PARALLELSORT_LDS[i][localID] = threadgroupSum; threadgroupSum += tmp; } @@ -284,7 +284,7 @@ // Add the block scanned-prefixes back in for (uint i = 0; i < FFX_PARALLELSORT_ELEMENTS_PER_THREAD; i++) - gs_LDS[i][localID] += threadgroupSum; + gs_FFX_PARALLELSORT_LDS[i][localID] += threadgroupSum; // Wait for everyone to catch up GroupMemoryBarrierWithGroupSync(); @@ -298,25 +298,25 @@ uint row = ((i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID) % FFX_PARALLELSORT_ELEMENTS_PER_THREAD; if (DataIndex < numValuesToScan) - ScanDst[BinOffset + DataIndex] = gs_LDS[row][col] + partialSum; + ScanDst[BinOffset + DataIndex] = gs_FFX_PARALLELSORT_LDS[row][col] + partialSum; } } // Offset cache to avoid loading the offsets all the time - groupshared uint gs_BinOffsetCache[FFX_PARALLELSORT_THREADGROUP_SIZE]; + groupshared uint gs_FFX_PARALLELSORT_BinOffsetCache[FFX_PARALLELSORT_THREADGROUP_SIZE]; // Local histogram for offset calculations - groupshared uint gs_LocalHistogram[FFX_PARALLELSORT_SORT_BIN_COUNT]; + groupshared uint gs_FFX_PARALLELSORT_LocalHistogram[FFX_PARALLELSORT_SORT_BIN_COUNT]; // Scratch area for algorithm - groupshared uint gs_LDSScratch[FFX_PARALLELSORT_THREADGROUP_SIZE]; + groupshared uint gs_FFX_PARALLELSORT_LDSScratch[FFX_PARALLELSORT_THREADGROUP_SIZE]; void FFX_ParallelSort_Scatter_uint(uint localID, uint groupID, FFX_ParallelSortCB CBuffer, uint ShiftBit, RWStructuredBuffer SrcBuffer, RWStructuredBuffer DstBuffer, RWStructuredBuffer SumTable #ifdef kRS_ValueCopy - ,RWStructuredBuffer SrcPayload, RWStructuredBuffer DstPayload + ,RWStructuredBuffer SrcPayload, RWStructuredBuffer DstPayload #endif // kRS_ValueCopy ) { // 
Load the sort bin threadgroup offsets into LDS for faster referencing if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT) - gs_BinOffsetCache[localID] = SumTable[localID * CBuffer.NumThreadGroups + groupID]; + gs_FFX_PARALLELSORT_BinOffsetCache[localID] = SumTable[localID * CBuffer.NumThreadGroups + groupID]; // Wait for everyone to catch up GroupMemoryBarrierWithGroupSync(); @@ -363,7 +363,7 @@ { // Clear the local histogram if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT) - gs_LocalHistogram[localID] = 0; + gs_FFX_PARALLELSORT_LocalHistogram[localID] = 0; uint localKey = (DataIndex < CBuffer.NumKeys ? srcKeys[i] : 0xffffffff); #ifdef kRS_ValueCopy @@ -386,13 +386,13 @@ // Last thread stores the updated histogram counts for the thread group // Scratch = 0xsum3|sum2|sum1|sum0 for thread group if (localID == (FFX_PARALLELSORT_THREADGROUP_SIZE - 1)) - gs_LDSScratch[0] = localSum + packedHistogram; + gs_FFX_PARALLELSORT_LDSScratch[0] = localSum + packedHistogram; // Wait for everyone to catch up GroupMemoryBarrierWithGroupSync(); // Load the sums value for the thread group - packedHistogram = gs_LDSScratch[0]; + packedHistogram = gs_FFX_PARALLELSORT_LDSScratch[0]; // Add prefix offsets for all 4 bit "keys" (packedHistogram = 0xsum2_1_0|sum1_0|sum0|0) packedHistogram = (packedHistogram << 8) + (packedHistogram << 16) + (packedHistogram << 24); @@ -404,18 +404,18 @@ uint keyOffset = (localSum >> (bitKey * 8)) & 0xff; // Re-arrange the keys (store, sync, load) - gs_LDSSums[keyOffset] = localKey; + gs_FFX_PARALLELSORT_LDSSums[keyOffset] = localKey; GroupMemoryBarrierWithGroupSync(); - localKey = gs_LDSSums[localID]; + localKey = gs_FFX_PARALLELSORT_LDSSums[localID]; // Wait for everyone to catch up GroupMemoryBarrierWithGroupSync(); #ifdef kRS_ValueCopy // Re-arrange the values if we have them (store, sync, load) - gs_LDSSums[keyOffset] = localValue; + gs_FFX_PARALLELSORT_LDSSums[keyOffset] = localValue; GroupMemoryBarrierWithGroupSync(); - localValue = gs_LDSSums[localID]; + 
localValue = gs_FFX_PARALLELSORT_LDSSums[localID]; // Wait for everyone to catch up GroupMemoryBarrierWithGroupSync(); @@ -426,26 +426,26 @@ uint keyIndex = (localKey >> ShiftBit) & 0xf; // Reconstruct histogram - InterlockedAdd(gs_LocalHistogram[keyIndex], 1); + InterlockedAdd(gs_FFX_PARALLELSORT_LocalHistogram[keyIndex], 1); // Wait for everyone to catch up GroupMemoryBarrierWithGroupSync(); // Prefix histogram - uint histogramPrefixSum = WavePrefixSum(localID < FFX_PARALLELSORT_SORT_BIN_COUNT ? gs_LocalHistogram[localID] : 0); + uint histogramPrefixSum = WavePrefixSum(localID < FFX_PARALLELSORT_SORT_BIN_COUNT ? gs_FFX_PARALLELSORT_LocalHistogram[localID] : 0); // Broadcast prefix-sum via LDS if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT) - gs_LDSScratch[localID] = histogramPrefixSum; + gs_FFX_PARALLELSORT_LDSScratch[localID] = histogramPrefixSum; // Get the global offset for this key out of the cache - uint globalOffset = gs_BinOffsetCache[keyIndex]; + uint globalOffset = gs_FFX_PARALLELSORT_BinOffsetCache[keyIndex]; // Wait for everyone to catch up GroupMemoryBarrierWithGroupSync(); // Get the local offset (at this point the keys are all in increasing order from 0 -> num bins in localID 0 -> thread group size) - uint localOffset = localID - gs_LDSScratch[keyIndex]; + uint localOffset = localID - gs_FFX_PARALLELSORT_LDSScratch[keyIndex]; // Write to destination uint totalOffset = globalOffset + localOffset; @@ -464,7 +464,7 @@ // Update the cached histogram for the next set of entries if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT) - gs_BinOffsetCache[localID] += gs_LocalHistogram[localID]; + gs_FFX_PARALLELSORT_BinOffsetCache[localID] += gs_FFX_PARALLELSORT_LocalHistogram[localID]; DataIndex += FFX_PARALLELSORT_THREADGROUP_SIZE; // Increase the data offset by thread group size } diff --git a/sample/CMakeLists.txt b/sample/CMakeLists.txt index ddcd588..207ad51 100644 --- a/sample/CMakeLists.txt +++ b/sample/CMakeLists.txt @@ -1,7 +1,44 @@ 
-cmake_minimum_required(VERSION 3.4) -set(CMAKE_GENERATOR_PLATFORM x64) +cmake_minimum_required(VERSION 3.6) -project (FfxParallelSort_${GFX_API}) +option (GFX_API_DX12 "Build with DX12" ON) +option (GFX_API_VK "Build with Vulkan" ON) + +if(NOT DEFINED GFX_API) + project (FFX_ParallelSort_) +else() + project (FFX_ParallelSort_${GFX_API}) + + set_property(DIRECTORY ${CMAKE_PROJECT_DIR} PROPERTY VS_STARTUP_PROJECT ${PROJECT_NAME}) + + if(GFX_API STREQUAL DX12) + set(GFX_API_DX12 ON) + set(GFX_API_VK OFF) + elseif(GFX_API STREQUAL VK) + set(GFX_API_DX12 OFF) + set(GFX_API_VK ON) + else() + message(STATUS "----------------------------------------------------------------------------------------") + message(STATUS "") + message(STATUS "** Almost there!!") + message(STATUS "") + message(STATUS " This framework supports DX12 and VULKAN, you need to invoke cmake in one of these ways:") + message(STATUS "") + message(STATUS " Examples:") + message(STATUS " Generate selected one:") + message(STATUS " cmake -DGFX_API=DX12") + message(STATUS " cmake -DGFX_API=VK") + message(STATUS " Generate with switches (Default is ON):") + message(STATUS " cmake [-DGFX_API_DX12=ON|OFF] [-DGFX_API_VK=ON|OFF]") + message(STATUS "") + message(STATUS "----------------------------------------------------------------------------------------") + message(FATAL_ERROR "") + endif() +endif() + +# Check MSVC toolset version, Visual Studio 2019 required +if(MSVC_TOOLSET_VERSION VERSION_LESS 142) + message(FATAL_ERROR "Cannot find MSVC toolset version 142 or greater. 
Please make sure Visual Studio 2019 or newer installed") +endif() # ouput exe to bin directory SET(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_HOME_DIRECTORY}/bin) @@ -10,24 +47,25 @@ foreach( OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES} ) set( CMAKE_RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${CMAKE_HOME_DIRECTORY}/bin ) endforeach( OUTPUTCONFIG CMAKE_CONFIGURATION_TYPES ) +add_compile_options(/MP) + # reference libs used by both backends add_subdirectory(libs/cauldron) -set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT ${PROJECT_NAME}) +# application icon +set(icon_src + ${CMAKE_CURRENT_SOURCE_DIR}/libs/cauldron/src/common/Icon/GPUOpenChip.ico + ${CMAKE_CURRENT_SOURCE_DIR}/libs/cauldron/src/common/Icon/resource.h + ${CMAKE_CURRENT_SOURCE_DIR}/libs/cauldron/src/common/Icon/Cauldron_Common.rc +) -if(GFX_API STREQUAL DX12) +if(GFX_API_VK) + find_package(Vulkan REQUIRED) + add_subdirectory(src/VK) +endif() +if(GFX_API_DX12) add_subdirectory(src/DX12) -else() - message(STATUS "----------------------------------------------------------------------------------------") - message(STATUS "") - message(STATUS "** Almost there!!") - message(STATUS "") - message(STATUS " This sample supports DX12 so you need to invoke cmake this way:") - message(STATUS "") - message(STATUS " Examples:") - message(STATUS " cmake -DGFX_API=DX12") - message(STATUS "") - message(STATUS "----------------------------------------------------------------------------------------") - message(FATAL_ERROR "") endif() +set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/libs/cauldron/src/common/Icon/Cauldron_Common.rc PROPERTIES VS_TOOL_OVERRIDE "Resource compiler") +set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/libs/cauldron/src/common/Icon/GPUOpenChip.ico PROPERTIES VS_TOOL_OVERRIDE "Image") diff --git a/sample/build/GenerateSolutions.bat b/sample/build/GenerateSolutions.bat index 383eadb..5ee4fbc 100644 --- a/sample/build/GenerateSolutions.bat +++ 
b/sample/build/GenerateSolutions.bat @@ -1,4 +1,9 @@ mkdir DX12 cd DX12 cmake ..\.. -DGFX_API=DX12 +cd .. + +mkdir VK +cd VK +cmake ..\.. -DGFX_API=VK cd .. \ No newline at end of file diff --git a/sample/libs/cauldron b/sample/libs/cauldron index b3a4f62..d220600 160000 --- a/sample/libs/cauldron +++ b/sample/libs/cauldron @@ -1 +1 @@ -Subproject commit b3a4f62bf79034240b979d575e67ee51790ab435 +Subproject commit d22060043138b38e3aa01f766227ee0bd4e4f83e diff --git a/sample/src/Common/FFXParallelSort.json b/sample/src/Common/FFXParallelSort.json index 8029da5..28069b4 100644 --- a/sample/src/Common/FFXParallelSort.json +++ b/sample/src/Common/FFXParallelSort.json @@ -1,13 +1,15 @@ { "globals": { - "CpuValidationLayerEnabled": true, + "CpuValidationLayerEnabled": false, "GpuValidationLayerEnabled": false, - "fullScreen": false, + "presentationMode": 0, "width": 1920, "height": 1080, "activeScene": 0, "benchmark": false, - "stablePowerState": false + "vsync": false, + "stablePowerState": false, + "fontsize": 13 }, "BenchmarkSettings": { "timeStep": 1, diff --git a/sample/src/DX12/shaders/ParallelSortCS.hlsl b/sample/src/Common/shaders/ParallelSortCS.hlsl similarity index 55% rename from sample/src/DX12/shaders/ParallelSortCS.hlsl rename to sample/src/Common/shaders/ParallelSortCS.hlsl index a4c8574..eb2cf08 100644 --- a/sample/src/DX12/shaders/ParallelSortCS.hlsl +++ b/sample/src/Common/shaders/ParallelSortCS.hlsl @@ -1,17 +1,17 @@ // ParallelSortCS.hlsl // -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. +// Copyright(c) 2021 Advanced Micro Devices, Inc.All rights reserved. 
// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal +// of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN @@ -21,36 +21,43 @@ //-------------------------------------------------------------------------------------- // ParallelSort Shaders/Includes //-------------------------------------------------------------------------------------- +#define FFX_HLSL #include "FFX-ParallelSort/FFX_ParallelSort.h" -ConstantBuffer CBuffer : register(b0); // Constant buffer -cbuffer SetupIndirectCB : register(b1) // Setup Indirect Constant buffer +[[vk::binding(0, 0)]] ConstantBuffer CBuffer : register(b0); // Constant buffer +[[vk::binding(0, 1)]] cbuffer SetupIndirectCB : register(b1) // Setup Indirect Constant buffer { uint NumKeysIndex; uint MaxThreadGroups; }; -cbuffer RootConstants : register(b2) // Store the shift bit directly in the root signature -{ - uint CShiftBit; -} - -RWStructuredBuffer SrcBuffer : register(u0, space0); // The unsorted keys or scan data -RWStructuredBuffer SrcPayload : register(u0, space1); // The payload data - -RWStructuredBuffer SumTable : register(u0, space2); // The sum table we will write sums to -RWStructuredBuffer ReduceTable : register(u0, space3); // The reduced sum table we will write sums to -RWStructuredBuffer DstBuffer : register(u0, space4); // The sorted keys or prefixed data -RWStructuredBuffer DstPayload : register(u0, space5); // the sorted payload data - -RWStructuredBuffer ScanSrc : register(u0, space6); // Source for Scan Data -RWStructuredBuffer ScanDst : register(u0, space7); // Destination for Scan Data -RWStructuredBuffer ScanScratch : register(u0, space8); // Scratch data for Scan +struct RootConstantData { + uint CShiftBit; +}; -RWStructuredBuffer NumKeysBuffer : register(u0, space9); // Number of keys 
to sort for indirect execution -RWStructuredBuffer CBufferUAV : register(u0, space10);// UAV for constant buffer parameters for indirect execution -RWStructuredBuffer CountScatterArgs : register(u0, space11); // Count and Scatter Args for indirect execution -RWStructuredBuffer ReduceScanArgs : register(u0, space12); // Reduce and Scan Args for indirect execution +#ifdef VK_Const + [[vk::push_constant]] RootConstantData rootConstData; // Store the shift bit directly in the root signature +#else + ConstantBuffer rootConstData : register(b2); // Store the shift bit directly in the root signature +#endif // VK_Const + +[[vk::binding(0, 2)]] RWStructuredBuffer SrcBuffer : register(u0, space0); // The unsorted keys or scan data +[[vk::binding(2, 2)]] RWStructuredBuffer SrcPayload : register(u0, space1); // The payload data + +[[vk::binding(0, 4)]] RWStructuredBuffer SumTable : register(u0, space2); // The sum table we will write sums to +[[vk::binding(1, 4)]] RWStructuredBuffer ReduceTable : register(u0, space3); // The reduced sum table we will write sums to + +[[vk::binding(1, 2)]] RWStructuredBuffer DstBuffer : register(u0, space4); // The sorted keys or prefixed data +[[vk::binding(3, 2)]] RWStructuredBuffer DstPayload : register(u0, space5); // the sorted payload data + +[[vk::binding(0, 3)]] RWStructuredBuffer ScanSrc : register(u0, space6); // Source for Scan Data +[[vk::binding(1, 3)]] RWStructuredBuffer ScanDst : register(u0, space7); // Destination for Scan Data +[[vk::binding(2, 3)]] RWStructuredBuffer ScanScratch : register(u0, space8); // Scratch data for Scan + +[[vk::binding(0, 5)]] RWStructuredBuffer NumKeysBuffer : register(u0, space9); // Number of keys to sort for indirect execution +[[vk::binding(1, 5)]] RWStructuredBuffer CBufferUAV : register(u0, space10); // UAV for constant buffer parameters for indirect execution +[[vk::binding(2, 5)]] RWStructuredBuffer CountScatterArgs: register(u0, space11); // Count and Scatter Args for indirect execution 
+[[vk::binding(3, 5)]] RWStructuredBuffer ReduceScanArgs : register(u0, space12); // Reduce and Scan Args for indirect execution // FPS Count @@ -58,7 +65,7 @@ RWStructuredBuffer ReduceScanArgs : register(u0, space12); // Reduce and void FPS_Count(uint localID : SV_GroupThreadID, uint groupID : SV_GroupID) { // Call the uint version of the count part of the algorithm - FFX_ParallelSort_Count_uint(localID, groupID, CBuffer, CShiftBit, SrcBuffer, SumTable); + FFX_ParallelSort_Count_uint(localID, groupID, CBuffer, rootConstData.CShiftBit, SrcBuffer, SumTable); } // FPS Reduce @@ -90,7 +97,6 @@ void FPS_ScanAdd(uint localID : SV_GroupThreadID, uint groupID : SV_GroupID) uint BinOffset = BinID * CBuffer.NumThreadGroups; // Get the base index for this thread group - //uint BaseIndex = FFX_PARALLELSORT_ELEMENTS_PER_THREAD * FFX_PARALLELSORT_THREADGROUP_SIZE * (groupID / FFX_PARALLELSORT_SORT_BIN_COUNT); uint BaseIndex = (groupID % CBuffer.NumReduceThreadgroupPerBin) * FFX_PARALLELSORT_ELEMENTS_PER_THREAD * FFX_PARALLELSORT_THREADGROUP_SIZE; FFX_ParallelSort_ScanPrefix(CBuffer.NumThreadGroups, localID, groupID, BinOffset, BaseIndex, true, @@ -101,7 +107,7 @@ void FPS_ScanAdd(uint localID : SV_GroupThreadID, uint groupID : SV_GroupID) [numthreads(FFX_PARALLELSORT_THREADGROUP_SIZE, 1, 1)] void FPS_Scatter(uint localID : SV_GroupThreadID, uint groupID : SV_GroupID) { - FFX_ParallelSort_Scatter_uint(localID, groupID, CBuffer, CShiftBit, SrcBuffer, DstBuffer, SumTable + FFX_ParallelSort_Scatter_uint(localID, groupID, CBuffer, rootConstData.CShiftBit, SrcBuffer, DstBuffer, SumTable #ifdef kRS_ValueCopy ,SrcPayload, DstPayload #endif // kRS_ValueCopy @@ -112,4 +118,4 @@ void FPS_Scatter(uint localID : SV_GroupThreadID, uint groupID : SV_GroupID) void FPS_SetupIndirectParameters(uint localID : SV_GroupThreadID) { FFX_ParallelSort_SetupIndirectParams(NumKeysBuffer[NumKeysIndex], MaxThreadGroups, CBufferUAV, CountScatterArgs, ReduceScanArgs); -} \ No newline at end of file +} diff 
--git a/sample/src/DX12/shaders/ParallelSortVerify.hlsl b/sample/src/Common/shaders/ParallelSortVerify.hlsl similarity index 81% rename from sample/src/DX12/shaders/ParallelSortVerify.hlsl rename to sample/src/Common/shaders/ParallelSortVerify.hlsl index baf4628..8c9ce4d 100644 --- a/sample/src/DX12/shaders/ParallelSortVerify.hlsl +++ b/sample/src/Common/shaders/ParallelSortVerify.hlsl @@ -1,17 +1,17 @@ // ParallelSortVerify.hlsl // -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. // Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal +// of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN @@ -20,7 +20,7 @@ //-------------------------------------------------------------------------------------- // Render Verification Shaders/Constant buffers //-------------------------------------------------------------------------------------- -cbuffer ParallelSortRenderCB : register(b0) // If you change this, also change struct ParallelSortRenderCB in ParallelSort.h +[[vk::binding(0, 0)]] cbuffer ParallelSortRenderCB : register(b0) // If you change this, also change struct ParallelSortRenderCB in ParallelSort.h { int CB_Width; int CB_Height; @@ -28,8 +28,8 @@ cbuffer ParallelSortRenderCB : register(b0) // If you change this, also change s int CB_SortHeight; }; -RWStructuredBuffer SortBuffer : register(u0, space0); -RWTexture2D ValidationTexture : register(u0, space1); +[[vk::binding(0, 1)]] RWStructuredBuffer SortBuffer : register(u0, space0); +[[vk::binding(0, 2)]] Texture2D ValidationTexture : register(t0, space0); struct VertexOut { diff --git a/sample/src/DX12/CMakeLists.txt b/sample/src/DX12/CMakeLists.txt index 721e44c..375cea1 100644 --- a/sample/src/DX12/CMakeLists.txt +++ b/sample/src/DX12/CMakeLists.txt @@ -10,14 +10,17 @@ set(sources sample.h stdafx.cpp stdafx.h - samplerenderer.cpp - samplerenderer.h + Renderer.cpp + Renderer.h + UI.cpp + UI.h ParallelSort.cpp - ParallelSort.h) + ParallelSort.h + dpiawarescaling.manifest) set(shader_sources - ${CMAKE_CURRENT_SOURCE_DIR}/shaders/ParallelSortCS.hlsl - ${CMAKE_CURRENT_SOURCE_DIR}/shaders/ParallelSortVerify.hlsl) + ${CMAKE_CURRENT_SOURCE_DIR}/../Common/shaders/ParallelSortCS.hlsl + ${CMAKE_CURRENT_SOURCE_DIR}/../Common/shaders/ParallelSortVerify.hlsl) set(fidelityfx_sources 
${CMAKE_CURRENT_SOURCE_DIR}/../../../FFX-ParallelSort/FFX_ParallelSort.h) @@ -36,12 +39,13 @@ source_group("Common" FILES ${common_sources}) source_group("Shaders" FILES ${shader_sources}) source_group("FidelityFX" FILES ${fidelityfx_sources}) source_group("Sources" FILES ${sources}) +source_group("Icon" FILES ${icon_src}) # defined in top-level CMakeLists.txt # prevent VS from processing/compiling these files set_source_files_properties(${Shaders_src} PROPERTIES VS_TOOL_OVERRIDE "Text") -add_executable(${PROJECT_NAME} WIN32 ${common_sources} ${shader_sources} ${sources} ${fidelityfx_sources}) +add_executable(${PROJECT_NAME} WIN32 ${common_sources} ${shader_sources} ${sources} ${fidelityfx_sources} ${icon_src}) target_link_libraries(${PROJECT_NAME} LINK_PUBLIC Cauldron_DX12 ImGUI amd_ags DXC d3dcompiler D3D12 DXGI) -set_target_properties(${PROJECT_NAME} PROPERTIES VS_DEBUGGER_WORKING_DIRECTORY "${CMAKE_HOME_DIRECTORY}/bin") +set_target_properties(${PROJECT_NAME} PROPERTIES VS_DEBUGGER_WORKING_DIRECTORY "${CMAKE_HOME_DIRECTORY}/bin" DEBUG_POSTFIX "d") addManifest(${PROJECT_NAME}) diff --git a/sample/src/DX12/ParallelSort.cpp b/sample/src/DX12/ParallelSort.cpp index c7cb84c..ffc6fc7 100644 --- a/sample/src/DX12/ParallelSort.cpp +++ b/sample/src/DX12/ParallelSort.cpp @@ -1,17 +1,17 @@ // ParallelSort.cpp // -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. +// Copyright(c) 2021 Advanced Micro Devices, Inc.All rights reserved. 
// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal +// of this software and associated documentation files(the "Software"), to deal // in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell // copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions : // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN @@ -27,850 +27,856 @@ static const uint32_t NumKeys[] = { 1920 * 1080, 2560 * 1440, 3840 * 2160 }; ////////////////////////////////////////////////////////////////////////// -namespace CAULDRON_DX12 + +////////////////////////////////////////////////////////////////////////// +// For doing command-line based benchmark runs +int FFXParallelSort::KeySetOverride = -1; +void FFXParallelSort::OverrideKeySet(int ResolutionOverride) +{ + KeySetOverride = ResolutionOverride; +} +bool FFXParallelSort::PayloadOverride = false; +void FFXParallelSort::OverridePayload() +{ + PayloadOverride = true; +} +////////////////////////////////////////////////////////////////////////// + +// Create all of the sort data for the sample +void FFXParallelSort::CreateKeyPayloadBuffers() +{ + std::vector KeyData1080(NumKeys[0]); + std::vector KeyData2K(NumKeys[1]); + std::vector KeyData4K(NumKeys[2]); + + // Populate the buffers with linear access index + std::iota(KeyData1080.begin(), KeyData1080.end(), 0); + std::iota(KeyData2K.begin(), KeyData2K.end(), 0); + std::iota(KeyData4K.begin(), KeyData4K.end(), 0); + + // Shuffle the data + std::shuffle(KeyData1080.begin(), KeyData1080.end(), std::mt19937{ std::random_device{}() }); + std::shuffle(KeyData2K.begin(), KeyData2K.end(), std::mt19937{ std::random_device{}() }); + std::shuffle(KeyData4K.begin(), KeyData4K.end(), std::mt19937{ std::random_device{}() }); + + // 1080p + CD3DX12_RESOURCE_DESC ResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(sizeof(uint32_t) * NumKeys[0], D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); + m_SrcKeyBuffers[0].InitBuffer(m_pDevice, "SrcKeys1080", &ResourceDesc, sizeof(uint32_t), 
D3D12_RESOURCE_STATE_COPY_DEST); + // 2K + ResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(sizeof(uint32_t) * NumKeys[1], D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); + m_SrcKeyBuffers[1].InitBuffer(m_pDevice, "SrcKeys2K", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_COPY_DEST); + // 4K + ResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(sizeof(uint32_t) * NumKeys[2], D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); + m_SrcKeyBuffers[2].InitBuffer(m_pDevice, "SrcKeys4K", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_COPY_DEST); + m_SrcPayloadBuffers.InitBuffer(m_pDevice, "SrcPayloadBuffer", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_COPY_DEST); + + // The DstKey and DstPayload buffers will be used as src/dst when sorting. A copy of the + // source key/payload will be copied into them before hand so we can keep our original values + ResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(sizeof(uint32_t) * NumKeys[2], D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); + m_DstKeyBuffers[0].InitBuffer(m_pDevice, "DstKeyBuf0", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + m_DstKeyBuffers[1].InitBuffer(m_pDevice, "DstKeyBuf1", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + m_DstPayloadBuffers[0].InitBuffer(m_pDevice, "DstPayloadBuf0", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + m_DstPayloadBuffers[1].InitBuffer(m_pDevice, "DstPayloadBuf1", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + + // Copy data in + + // 1080 + uint8_t* pKeyDataBuffer = m_pUploadHeap->Suballocate(NumKeys[0] * sizeof(uint32_t), sizeof(uint32_t)); + memcpy(pKeyDataBuffer, KeyData1080.data() , sizeof(uint32_t) * NumKeys[0]); + m_pUploadHeap->GetCommandList()->CopyBufferRegion(m_SrcKeyBuffers[0].GetResource(), 0, m_pUploadHeap->GetResource(), pKeyDataBuffer - m_pUploadHeap->BasePtr(), sizeof(uint32_t) * NumKeys[0]); + + // 2K + pKeyDataBuffer = m_pUploadHeap->Suballocate(NumKeys[1] * 
sizeof(uint32_t), sizeof(uint32_t)); + memcpy(pKeyDataBuffer, KeyData2K.data(), sizeof(uint32_t) * NumKeys[1]); + m_pUploadHeap->GetCommandList()->CopyBufferRegion(m_SrcKeyBuffers[1].GetResource(), 0, m_pUploadHeap->GetResource(), pKeyDataBuffer - m_pUploadHeap->BasePtr(), sizeof(uint32_t) * NumKeys[1]); + + // 4K + pKeyDataBuffer = m_pUploadHeap->Suballocate(NumKeys[2] * sizeof(uint32_t), sizeof(uint32_t)); + memcpy(pKeyDataBuffer, KeyData4K.data(), sizeof(uint32_t) * NumKeys[2]); + m_pUploadHeap->GetCommandList()->CopyBufferRegion(m_SrcKeyBuffers[2].GetResource(), 0, m_pUploadHeap->GetResource(), pKeyDataBuffer - m_pUploadHeap->BasePtr(), sizeof(uint32_t) * NumKeys[2]); + uint8_t* pPayloadDataBuffer = m_pUploadHeap->Suballocate(NumKeys[2] * sizeof(uint32_t), sizeof(uint32_t)); + memcpy(pPayloadDataBuffer, KeyData4K.data(), sizeof(uint32_t) * NumKeys[2]); // Copy the 4k source data for payload (it doesn't matter what the payload is as we really only want it to measure cost of copying/sorting) + m_pUploadHeap->GetCommandList()->CopyBufferRegion(m_SrcPayloadBuffers.GetResource(), 0, m_pUploadHeap->GetResource(), pPayloadDataBuffer - m_pUploadHeap->BasePtr(), sizeof(uint32_t) * NumKeys[2]); + + + // Once we are done copying the data, put in barriers to transition the source resources to + // copy source (which is what they will stay for the duration of app runtime) + CD3DX12_RESOURCE_BARRIER Barriers[6] = { CD3DX12_RESOURCE_BARRIER::Transition(m_SrcKeyBuffers[2].GetResource(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_COPY_SOURCE), + CD3DX12_RESOURCE_BARRIER::Transition(m_SrcPayloadBuffers.GetResource(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_COPY_SOURCE), + CD3DX12_RESOURCE_BARRIER::Transition(m_SrcKeyBuffers[1].GetResource(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_COPY_SOURCE), + CD3DX12_RESOURCE_BARRIER::Transition(m_SrcKeyBuffers[0].GetResource(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_COPY_SOURCE), + + // Copy 
the data into the dst[0] buffers for use on first frame + CD3DX12_RESOURCE_BARRIER::Transition(m_DstKeyBuffers[0].GetResource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_DEST), + CD3DX12_RESOURCE_BARRIER::Transition(m_DstPayloadBuffers[0].GetResource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_DEST) }; + m_pUploadHeap->GetCommandList()->ResourceBarrier(6, Barriers); + + m_pUploadHeap->GetCommandList()->CopyBufferRegion(m_DstKeyBuffers[0].GetResource(), 0, m_SrcKeyBuffers[m_UIResolutionSize].GetResource(), 0, sizeof(uint32_t) * NumKeys[m_UIResolutionSize]); + m_pUploadHeap->GetCommandList()->CopyBufferRegion(m_DstPayloadBuffers[0].GetResource(), 0, m_SrcPayloadBuffers.GetResource(), 0, sizeof(uint32_t) * NumKeys[m_UIResolutionSize]); + + // Put the dst buffers back to UAVs for sort usage + Barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(m_DstKeyBuffers[0].GetResource(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + Barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(m_DstPayloadBuffers[0].GetResource(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + m_pUploadHeap->GetCommandList()->ResourceBarrier(2, Barriers); + + // Create UAVs + m_SrcKeyBuffers[2].CreateBufferUAV(2, nullptr, &m_SrcKeyUAVTable); + m_SrcKeyBuffers[1].CreateBufferUAV(1, nullptr, &m_SrcKeyUAVTable); + m_SrcKeyBuffers[0].CreateBufferUAV(0, nullptr, &m_SrcKeyUAVTable); + m_SrcPayloadBuffers.CreateBufferUAV(0, nullptr, &m_SrcPayloadUAV); + m_DstKeyBuffers[0].CreateBufferUAV(0, nullptr, &m_DstKeyUAVTable); + m_DstKeyBuffers[1].CreateBufferUAV(1, nullptr, &m_DstKeyUAVTable); + m_DstPayloadBuffers[0].CreateBufferUAV(0, nullptr, &m_DstPayloadUAVTable); + m_DstPayloadBuffers[1].CreateBufferUAV(1, nullptr, &m_DstPayloadUAVTable); +} + +// Compile specified radix sort shader and create pipeline +void FFXParallelSort::CompileRadixPipeline(const char* shaderFile, const DefineList* defines, const char* 
entryPoint, ID3D12PipelineState*& pPipeline) { - ////////////////////////////////////////////////////////////////////////// - // For testing in the lab - int FFXParallelSort::KeySetOverride = -1; - void FFXParallelSort::OverrideKeySet(int ResolutionOverride) - { - KeySetOverride = ResolutionOverride; - } - bool FFXParallelSort::PayloadOverride = false; - void FFXParallelSort::OverridePayload() - { - PayloadOverride = true; - } - ////////////////////////////////////////////////////////////////////////// - - void FFXParallelSort::CreateKeyPayloadBuffers() - { - std::vector KeyData1080(NumKeys[0]); - std::vector KeyData2K(NumKeys[1]); - std::vector KeyData4K(NumKeys[2]); - - // Populate the buffers with linear access index - std::iota(KeyData1080.begin(), KeyData1080.end(), 0); - std::iota(KeyData2K.begin(), KeyData2K.end(), 0); - std::iota(KeyData4K.begin(), KeyData4K.end(), 0); - - // Shuffle the data - std::shuffle(KeyData1080.begin(), KeyData1080.end(), std::mt19937{ std::random_device{}() }); - std::shuffle(KeyData2K.begin(), KeyData2K.end(), std::mt19937{ std::random_device{}() }); - std::shuffle(KeyData4K.begin(), KeyData4K.end(), std::mt19937{ std::random_device{}() }); - - // 1080p - CD3DX12_RESOURCE_DESC ResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(sizeof(uint32_t) * NumKeys[0], D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); - m_SrcKeyBuffers[0].InitBuffer(m_pDevice, "SrcKeys1080", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_COPY_DEST); - // 2K - ResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(sizeof(uint32_t) * NumKeys[1], D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); - m_SrcKeyBuffers[1].InitBuffer(m_pDevice, "SrcKeys2K", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_COPY_DEST); - // 4K - ResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(sizeof(uint32_t) * NumKeys[2], D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); - m_SrcKeyBuffers[2].InitBuffer(m_pDevice, "SrcKeys4K", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_COPY_DEST); - 
m_SrcPayloadBuffers.InitBuffer(m_pDevice, "SrcPayloadBuffer", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_COPY_DEST); - - // The DstKey and DstPayload buffers will be used as src/dst when sorting. A copy of the - // source key/payload will be copied into them before hand so we can keep our original values - ResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(sizeof(uint32_t) * NumKeys[2], D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); - m_DstKeyBuffers[0].InitBuffer(m_pDevice, "DstKeyBuf0", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_UNORDERED_ACCESS); - m_DstKeyBuffers[1].InitBuffer(m_pDevice, "DstKeyBuf1", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_UNORDERED_ACCESS); - m_DstPayloadBuffers[0].InitBuffer(m_pDevice, "DstPayloadBuf0", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_UNORDERED_ACCESS); - m_DstPayloadBuffers[1].InitBuffer(m_pDevice, "DstPayloadBuf1", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_UNORDERED_ACCESS); - - // Copy data in - - // 1080 - uint8_t* pKeyDataBuffer = m_pUploadHeap->Suballocate(NumKeys[0] * sizeof(uint32_t), sizeof(uint32_t)); - memcpy(pKeyDataBuffer, KeyData1080.data() , sizeof(uint32_t) * NumKeys[0]); - m_pUploadHeap->GetCommandList()->CopyBufferRegion(m_SrcKeyBuffers[0].GetResource(), 0, m_pUploadHeap->GetResource(), pKeyDataBuffer - m_pUploadHeap->BasePtr(), sizeof(uint32_t) * NumKeys[0]); - - // 2K - pKeyDataBuffer = m_pUploadHeap->Suballocate(NumKeys[1] * sizeof(uint32_t), sizeof(uint32_t)); - memcpy(pKeyDataBuffer, KeyData2K.data(), sizeof(uint32_t) * NumKeys[1]); - m_pUploadHeap->GetCommandList()->CopyBufferRegion(m_SrcKeyBuffers[1].GetResource(), 0, m_pUploadHeap->GetResource(), pKeyDataBuffer - m_pUploadHeap->BasePtr(), sizeof(uint32_t) * NumKeys[1]); - - // 4K - pKeyDataBuffer = m_pUploadHeap->Suballocate(NumKeys[2] * sizeof(uint32_t), sizeof(uint32_t)); - memcpy(pKeyDataBuffer, KeyData4K.data(), sizeof(uint32_t) * NumKeys[2]); - 
m_pUploadHeap->GetCommandList()->CopyBufferRegion(m_SrcKeyBuffers[2].GetResource(), 0, m_pUploadHeap->GetResource(), pKeyDataBuffer - m_pUploadHeap->BasePtr(), sizeof(uint32_t) * NumKeys[2]); - uint8_t* pPayloadDataBuffer = m_pUploadHeap->Suballocate(NumKeys[2] * sizeof(uint32_t), sizeof(uint32_t)); - memcpy(pPayloadDataBuffer, KeyData4K.data(), sizeof(uint32_t) * NumKeys[2]); // Copy the 4k source data for payload (it doesn't matter what the payload is as we really only want it to measure cost of copying/sorting) - m_pUploadHeap->GetCommandList()->CopyBufferRegion(m_SrcPayloadBuffers.GetResource(), 0, m_pUploadHeap->GetResource(), pPayloadDataBuffer - m_pUploadHeap->BasePtr(), sizeof(uint32_t) * NumKeys[2]); - - - // Once we are done copying the data, put in barriers to transition the source resources to - // copy source (which is what they will stay for the duration of app runtime) - CD3DX12_RESOURCE_BARRIER Barriers[6] = { CD3DX12_RESOURCE_BARRIER::Transition(m_SrcKeyBuffers[2].GetResource(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_COPY_SOURCE), - CD3DX12_RESOURCE_BARRIER::Transition(m_SrcPayloadBuffers.GetResource(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_COPY_SOURCE), - CD3DX12_RESOURCE_BARRIER::Transition(m_SrcKeyBuffers[1].GetResource(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_COPY_SOURCE), - CD3DX12_RESOURCE_BARRIER::Transition(m_SrcKeyBuffers[0].GetResource(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_COPY_SOURCE), - - // Copy the data into the dst[0] buffers for use on first frame - CD3DX12_RESOURCE_BARRIER::Transition(m_DstKeyBuffers[0].GetResource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_DEST), - CD3DX12_RESOURCE_BARRIER::Transition(m_DstPayloadBuffers[0].GetResource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_DEST) }; - m_pUploadHeap->GetCommandList()->ResourceBarrier(6, Barriers); - - 
m_pUploadHeap->GetCommandList()->CopyBufferRegion(m_DstKeyBuffers[0].GetResource(), 0, m_SrcKeyBuffers[m_UIResolutionSize].GetResource(), 0, sizeof(uint32_t) * NumKeys[m_UIResolutionSize]); - m_pUploadHeap->GetCommandList()->CopyBufferRegion(m_DstPayloadBuffers[0].GetResource(), 0, m_SrcPayloadBuffers.GetResource(), 0, sizeof(uint32_t) * NumKeys[m_UIResolutionSize]); - - // Put the dst buffers back to UAVs for sort usage - Barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(m_DstKeyBuffers[0].GetResource(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); - Barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(m_DstPayloadBuffers[0].GetResource(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); - m_pUploadHeap->GetCommandList()->ResourceBarrier(2, Barriers); - - // Create UAVs - m_SrcKeyBuffers[2].CreateBufferUAV(2, nullptr, &m_SrcKeyUAVTable); - m_SrcKeyBuffers[1].CreateBufferUAV(1, nullptr, &m_SrcKeyUAVTable); - m_SrcKeyBuffers[0].CreateBufferUAV(0, nullptr, &m_SrcKeyUAVTable); - m_SrcPayloadBuffers.CreateBufferUAV(0, nullptr, &m_SrcPayloadUAV); - m_DstKeyBuffers[0].CreateBufferUAV(0, nullptr, &m_DstKeyUAVTable); - m_DstKeyBuffers[1].CreateBufferUAV(1, nullptr, &m_DstKeyUAVTable); - m_DstPayloadBuffers[0].CreateBufferUAV(0, nullptr, &m_DstPayloadUAVTable); - m_DstPayloadBuffers[1].CreateBufferUAV(1, nullptr, &m_DstPayloadUAVTable); - } - - void FFXParallelSort::CompileRadixPipeline(const char* shaderFile, const DefineList* defines, const char* entryPoint, ID3D12PipelineState*& pPipeline) - { - std::string CompileFlags("-T cs_6_0"); + std::string CompileFlags("-T cs_6_0"); #ifdef _DEBUG - CompileFlags += " -Zi -Od"; + CompileFlags += " -Zi -Od"; #endif // _DEBUG - D3D12_SHADER_BYTECODE shaderByteCode = {}; - CompileShaderFromFile(shaderFile, defines, entryPoint, CompileFlags.c_str(), &shaderByteCode); - - D3D12_COMPUTE_PIPELINE_STATE_DESC descPso = {}; - descPso.CS = shaderByteCode; - descPso.Flags = 
D3D12_PIPELINE_STATE_FLAG_NONE; - descPso.pRootSignature = m_pFPSRootSignature; - descPso.NodeMask = 0; - - ThrowIfFailed(m_pDevice->GetDevice()->CreateComputePipelineState(&descPso, IID_PPV_ARGS(&pPipeline))); - SetName(pPipeline, entryPoint); - } - - void FFXParallelSort::OnCreate(Device* pDevice, ResourceViewHeaps* pResourceViewHeaps, DynamicBufferRing* pConstantBufferRing, UploadHeap* pUploadHeap, SwapChain* pSwapChain) - { - m_pDevice = pDevice; - m_pUploadHeap = pUploadHeap; - m_pResourceViewHeaps = pResourceViewHeaps; - m_pConstantBufferRing = pConstantBufferRing; - m_MaxNumThreadgroups = 800; - - // Overrides for testing - if (KeySetOverride >= 0) - m_UIResolutionSize = KeySetOverride; - if (PayloadOverride) - m_UISortPayload = true; - - // Allocate UAVs to use for data - m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(3, &m_SrcKeyUAVTable); - m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(1, &m_SrcPayloadUAV); - m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(2, &m_DstKeyUAVTable); - m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(2, &m_DstPayloadUAVTable); - m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(1, &m_FPSScratchUAV); - m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(1, &m_FPSReducedScratchUAV); - m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(1, &m_IndirectKeyCountsUAV); - m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(1, &m_IndirectConstantBufferUAV); - m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(1, &m_IndirectCountScatterArgsUAV); - m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(1, &m_IndirectReduceScanArgsUAV); - m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(3, &m_ValidateTextureUAV); - - // Create resources to test with. 
Sorts will be done for 1080p, 2K, and 4K resolution data sets - CreateKeyPayloadBuffers(); - - // We are just going to fudge the indirect execution parameters for each resolution - CD3DX12_RESOURCE_DESC ResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(sizeof(uint32_t) * 3, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); - m_IndirectKeyCounts.InitBuffer(m_pDevice, "IndirectKeyCounts", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_COPY_DEST); - m_IndirectKeyCounts.CreateBufferUAV(0, nullptr, &m_IndirectKeyCountsUAV); - uint8_t* pNumKeysBuffer = m_pUploadHeap->Suballocate(sizeof(uint32_t) * 3, D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT); - memcpy(pNumKeysBuffer, NumKeys, sizeof(uint32_t) * 3); - m_pUploadHeap->GetCommandList()->CopyBufferRegion(m_IndirectKeyCounts.GetResource(), 0, m_pUploadHeap->GetResource(), pNumKeysBuffer - m_pUploadHeap->BasePtr(), sizeof(uint32_t) * 3); - CD3DX12_RESOURCE_BARRIER Barrier = CD3DX12_RESOURCE_BARRIER::Transition(m_IndirectKeyCounts.GetResource(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); - m_pUploadHeap->GetCommandList()->ResourceBarrier(1, &Barrier); - - // Create resources for sort validation (image that goes from shuffled to sorted) - m_Validate1080pTexture.InitFromFile(m_pDevice, m_pUploadHeap, "Validate1080p.png", false, 1.f, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS ); - m_Validate1080pTexture.CreateUAV(0, &m_ValidateTextureUAV); - m_Validate2KTexture.InitFromFile(m_pDevice, m_pUploadHeap, "Validate2K.png", false, 1.f, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); - m_Validate2KTexture.CreateUAV(1, &m_ValidateTextureUAV); - m_Validate4KTexture.InitFromFile(m_pDevice, m_pUploadHeap, "Validate4K.png", false, 1.f, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); - m_Validate4KTexture.CreateUAV(2, &m_ValidateTextureUAV); - - // Finish up - m_pUploadHeap->FlushAndFinish(); - - // Allocate the scratch buffers needed for radix sort - uint32_t scratchBufferSize; - uint32_t reducedScratchBufferSize; - 
FFX_ParallelSort_CalculateScratchResourceSize(NumKeys[2], scratchBufferSize, reducedScratchBufferSize); - - ResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(scratchBufferSize, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); - m_FPSScratchBuffer.InitBuffer(m_pDevice, "Scratch", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_UNORDERED_ACCESS); - m_FPSScratchBuffer.CreateBufferUAV(0, nullptr, &m_FPSScratchUAV); - - ResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(reducedScratchBufferSize, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); - m_FPSReducedScratchBuffer.InitBuffer(m_pDevice, "ReducedScratch", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_UNORDERED_ACCESS); - m_FPSReducedScratchBuffer.CreateBufferUAV(0, nullptr, &m_FPSReducedScratchUAV); - - // Allocate the buffers for indirect execution of the algorithm - ResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(sizeof(FFX_ParallelSortCB), D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); - m_IndirectConstantBuffer.InitBuffer(m_pDevice, "IndirectConstantBuffer", &ResourceDesc, sizeof(FFX_ParallelSortCB), D3D12_RESOURCE_STATE_UNORDERED_ACCESS); - m_IndirectConstantBuffer.CreateBufferUAV(0, nullptr, &m_IndirectConstantBufferUAV); - - ResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(sizeof(uint32_t) * 3, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); - m_IndirectCountScatterArgs.InitBuffer(m_pDevice, "IndirectCount_Scatter_DispatchArgs", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_UNORDERED_ACCESS); - m_IndirectCountScatterArgs.CreateBufferUAV(0, nullptr, &m_IndirectCountScatterArgsUAV); - m_IndirectReduceScanArgs.InitBuffer(m_pDevice, "IndirectCount_Scatter_DispatchArgs", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_UNORDERED_ACCESS); - m_IndirectReduceScanArgs.CreateBufferUAV(0, nullptr, &m_IndirectReduceScanArgsUAV); - - // Create root signature for Radix sort passes - { - D3D12_DESCRIPTOR_RANGE descRange[15]; - D3D12_ROOT_PARAMETER rootParams[16]; - - // Constant buffer table (always have 1) - descRange[0] = { 
D3D12_DESCRIPTOR_RANGE_TYPE_CBV, 1, 0, 0, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; - rootParams[0].ParameterType = D3D12_ROOT_PARAMETER_TYPE_CBV; rootParams[0].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; - rootParams[0].Descriptor = { descRange[0].BaseShaderRegister, descRange[0].RegisterSpace }; - - // Constant buffer to setup indirect params (indirect) - descRange[1] = { D3D12_DESCRIPTOR_RANGE_TYPE_CBV, 1, 1, 0, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; - rootParams[1].ParameterType = D3D12_ROOT_PARAMETER_TYPE_CBV; rootParams[1].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; - rootParams[1].Descriptor = { descRange[1].BaseShaderRegister, descRange[1].RegisterSpace }; - - rootParams[2].ParameterType = D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS; rootParams[2].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; - rootParams[2].Constants = { 2, 0, 1 }; - - // SrcBuffer (sort or scan) - descRange[2] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; - rootParams[3].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[3].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; - rootParams[3].DescriptorTable = { 1, &descRange[2] }; - - // ScrPayload (sort only) - descRange[3] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 1, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; - rootParams[4].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[4].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; - rootParams[4].DescriptorTable = { 1, &descRange[3] }; - - // Scratch (sort only) - descRange[4] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 2, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; - rootParams[5].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[5].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; - rootParams[5].DescriptorTable = { 1, &descRange[4] }; - - // Scratch (reduced) - descRange[5] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 3, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; - rootParams[6].ParameterType = 
D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[6].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; - rootParams[6].DescriptorTable = { 1, &descRange[5] }; - - // DstBuffer (sort or scan) - descRange[6] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 4, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; - rootParams[7].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[7].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; - rootParams[7].DescriptorTable = { 1, &descRange[6] }; - - // DstPayload (sort only) - descRange[7] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 5, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; - rootParams[8].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[8].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; - rootParams[8].DescriptorTable = { 1, &descRange[7] }; - - // ScanSrc - descRange[8] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 6, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; - rootParams[9].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[9].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; - rootParams[9].DescriptorTable = { 1, &descRange[8] }; - - // ScanDst - descRange[9] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 7, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; - rootParams[10].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[10].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; - rootParams[10].DescriptorTable = { 1, &descRange[9] }; - - // ScanScratch - descRange[10] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 8, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; - rootParams[11].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[11].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; - rootParams[11].DescriptorTable = { 1, &descRange[10] }; - - // NumKeys (indirect) - descRange[11] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 9, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; - rootParams[12].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[12].ShaderVisibility = 
D3D12_SHADER_VISIBILITY_ALL; - rootParams[12].DescriptorTable = { 1, &descRange[11] }; - - // CBufferUAV (indirect) - descRange[12] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 10, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; - rootParams[13].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[13].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; - rootParams[13].DescriptorTable = { 1, &descRange[12] }; - - // CountScatterArgs (indirect) - descRange[13] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 11, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; - rootParams[14].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[14].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; - rootParams[14].DescriptorTable = { 1, &descRange[13] }; - - // ReduceScanArgs (indirect) - descRange[14] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 12, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; - rootParams[15].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[15].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; - rootParams[15].DescriptorTable = { 1, &descRange[14] }; - - D3D12_ROOT_SIGNATURE_DESC rootSigDesc = {}; - rootSigDesc.NumParameters = 16; - rootSigDesc.pParameters = rootParams; - rootSigDesc.NumStaticSamplers = 0; - rootSigDesc.pStaticSamplers = nullptr; - rootSigDesc.Flags = D3D12_ROOT_SIGNATURE_FLAG_NONE; - - ID3DBlob* pOutBlob, * pErrorBlob = nullptr; - ThrowIfFailed(D3D12SerializeRootSignature(&rootSigDesc, D3D_ROOT_SIGNATURE_VERSION_1, &pOutBlob, &pErrorBlob)); - ThrowIfFailed(pDevice->GetDevice()->CreateRootSignature(0, pOutBlob->GetBufferPointer(), pOutBlob->GetBufferSize(), IID_PPV_ARGS(&m_pFPSRootSignature))); - SetName(m_pFPSRootSignature, "FPS_Signature"); - - pOutBlob->Release(); - if (pErrorBlob) - pErrorBlob->Release(); - - // Also create the command signature for the indirect version - D3D12_INDIRECT_ARGUMENT_DESC dispatch = {}; - dispatch.Type = D3D12_INDIRECT_ARGUMENT_TYPE_DISPATCH; - D3D12_COMMAND_SIGNATURE_DESC desc = {}; - 
desc.ByteStride = sizeof(D3D12_DISPATCH_ARGUMENTS); - desc.NodeMask = 0; - desc.NumArgumentDescs = 1; - desc.pArgumentDescs = &dispatch; - - ThrowIfFailed(pDevice->GetDevice()->CreateCommandSignature(&desc, nullptr, IID_PPV_ARGS(&m_pFPSCommandSignature))); - m_pFPSCommandSignature->SetName(L"FPS_CommandSignature"); - } - - // Create root signature for Render of RadixBuffer info - { - CD3DX12_DESCRIPTOR_RANGE DescRange[3]; - CD3DX12_ROOT_PARAMETER RTSlot[3]; - - // Constant buffer - DescRange[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_CBV, 1, 0, 0); - RTSlot[0].InitAsConstantBufferView(0, 0, D3D12_SHADER_VISIBILITY_ALL); - - // UAV for RadixBufer - DescRange[1].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0); - RTSlot[1].InitAsDescriptorTable(1, &DescRange[1], D3D12_SHADER_VISIBILITY_ALL); - - // SRV for Validation texture - DescRange[2].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 1); - RTSlot[2].InitAsDescriptorTable(1, &DescRange[2], D3D12_SHADER_VISIBILITY_ALL); - - CD3DX12_ROOT_SIGNATURE_DESC descRootSignature = CD3DX12_ROOT_SIGNATURE_DESC(); - descRootSignature.NumParameters = 3; - descRootSignature.pParameters = RTSlot; - descRootSignature.NumStaticSamplers = 0; - descRootSignature.pStaticSamplers = nullptr; - descRootSignature.Flags = D3D12_ROOT_SIGNATURE_FLAG_NONE; - - ID3DBlob* pOutBlob, * pErrorBlob = nullptr; - ThrowIfFailed(D3D12SerializeRootSignature(&descRootSignature, D3D_ROOT_SIGNATURE_VERSION_1, &pOutBlob, &pErrorBlob)); - ThrowIfFailed(pDevice->GetDevice()->CreateRootSignature(0, pOutBlob->GetBufferPointer(), pOutBlob->GetBufferSize(), IID_PPV_ARGS(&m_pRenderRootSignature))); - SetName(m_pRenderRootSignature, "FPS_RenderResults_Signature"); - - pOutBlob->Release(); - if (pErrorBlob) - pErrorBlob->Release(); - } - - ////////////////////////////////////////////////////////////////////////// - // Create pipelines for radix sort - { - // Create all of the necessary pipelines for Sort and Scan - DefineList defines; - defines["FFX_HLSL"] = std::to_string(1); - - 
// SetupIndirectParams (indirect only) - CompileRadixPipeline("ParallelSortCS.hlsl", &defines, "FPS_SetupIndirectParameters", m_pFPSIndirectSetupParametersPipeline); - - // Radix count (sum table generation) - CompileRadixPipeline("ParallelSortCS.hlsl", &defines, "FPS_Count", m_pFPSCountPipeline); - // Radix count reduce (sum table reduction for offset prescan) - CompileRadixPipeline("ParallelSortCS.hlsl", &defines, "FPS_CountReduce", m_pFPSCountReducePipeline); - // Radix scan (prefix scan) - CompileRadixPipeline("ParallelSortCS.hlsl", &defines, "FPS_Scan", m_pFPSScanPipeline); - // Radix scan add (prefix scan + reduced prefix scan addition) - CompileRadixPipeline("ParallelSortCS.hlsl", &defines, "FPS_ScanAdd", m_pFPSScanAddPipeline); - // Radix scatter (key redistribution) - CompileRadixPipeline("ParallelSortCS.hlsl", &defines, "FPS_Scatter", m_pFPSScatterPipeline); - // Radix scatter with payload (key and payload redistribution) - defines["kRS_ValueCopy"] = std::to_string(1); - CompileRadixPipeline("ParallelSortCS.hlsl", &defines, "FPS_Scatter", m_pFPSScatterPayloadPipeline); - } - - ////////////////////////////////////////////////////////////////////////// - // Create pipelines for render pass - { - DefineList defines; + D3D12_SHADER_BYTECODE shaderByteCode = {}; + CompileShaderFromFile(shaderFile, defines, entryPoint, CompileFlags.c_str(), &shaderByteCode); + D3D12_COMPUTE_PIPELINE_STATE_DESC descPso = {}; + descPso.CS = shaderByteCode; + descPso.Flags = D3D12_PIPELINE_STATE_FLAG_NONE; + descPso.pRootSignature = m_pFPSRootSignature; + descPso.NodeMask = 0; + + ThrowIfFailed(m_pDevice->GetDevice()->CreateComputePipelineState(&descPso, IID_PPV_ARGS(&pPipeline))); + SetName(pPipeline, entryPoint); +} + +// Parallel Sort initialization +void FFXParallelSort::OnCreate(Device* pDevice, ResourceViewHeaps* pResourceViewHeaps, DynamicBufferRing* pConstantBufferRing, UploadHeap* pUploadHeap, SwapChain* pSwapChain) +{ + m_pDevice = pDevice; + m_pUploadHeap = pUploadHeap; 
+ m_pResourceViewHeaps = pResourceViewHeaps; + m_pConstantBufferRing = pConstantBufferRing; + m_MaxNumThreadgroups = 800; + + // Overrides for testing + if (KeySetOverride >= 0) + m_UIResolutionSize = KeySetOverride; + if (PayloadOverride) + m_UISortPayload = true; + + // Allocate UAVs to use for data + m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(3, &m_SrcKeyUAVTable); + m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(1, &m_SrcPayloadUAV); + m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(2, &m_DstKeyUAVTable); + m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(2, &m_DstPayloadUAVTable); + m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(1, &m_FPSScratchUAV); + m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(1, &m_FPSReducedScratchUAV); + m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(1, &m_IndirectKeyCountsUAV); + m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(1, &m_IndirectConstantBufferUAV); + m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(1, &m_IndirectCountScatterArgsUAV); + m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(1, &m_IndirectReduceScanArgsUAV); + m_pResourceViewHeaps->AllocCBV_SRV_UAVDescriptor(3, &m_ValidateTextureSRV); + + // Create resources to test with. 
Sorts will be done for 1080p, 2K, and 4K resolution data sets + CreateKeyPayloadBuffers(); + + // We are just going to fudge the indirect execution parameters for each resolution + CD3DX12_RESOURCE_DESC ResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(sizeof(uint32_t) * 3, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); + m_IndirectKeyCounts.InitBuffer(m_pDevice, "IndirectKeyCounts", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_COPY_DEST); + m_IndirectKeyCounts.CreateBufferUAV(0, nullptr, &m_IndirectKeyCountsUAV); + uint8_t* pNumKeysBuffer = m_pUploadHeap->Suballocate(sizeof(uint32_t) * 3, D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT); + memcpy(pNumKeysBuffer, NumKeys, sizeof(uint32_t) * 3); + m_pUploadHeap->GetCommandList()->CopyBufferRegion(m_IndirectKeyCounts.GetResource(), 0, m_pUploadHeap->GetResource(), pNumKeysBuffer - m_pUploadHeap->BasePtr(), sizeof(uint32_t) * 3); + CD3DX12_RESOURCE_BARRIER Barrier = CD3DX12_RESOURCE_BARRIER::Transition(m_IndirectKeyCounts.GetResource(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + m_pUploadHeap->GetCommandList()->ResourceBarrier(1, &Barrier); + + // Create resources for sort validation (image that goes from shuffled to sorted) + m_Validate1080pTexture.InitFromFile(m_pDevice, m_pUploadHeap, "Validate1080p.png", false, 1.f, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS ); + m_Validate1080pTexture.CreateSRV(0, &m_ValidateTextureSRV, 0); + m_Validate2KTexture.InitFromFile(m_pDevice, m_pUploadHeap, "Validate2K.png", false, 1.f, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); + m_Validate2KTexture.CreateSRV(1, &m_ValidateTextureSRV, 0); + m_Validate4KTexture.InitFromFile(m_pDevice, m_pUploadHeap, "Validate4K.png", false, 1.f, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); + m_Validate4KTexture.CreateSRV(2, &m_ValidateTextureSRV, 0); + + // Finish up + m_pUploadHeap->FlushAndFinish(); + + // Allocate the scratch buffers needed for radix sort + uint32_t scratchBufferSize; + uint32_t reducedScratchBufferSize; 
+ FFX_ParallelSort_CalculateScratchResourceSize(NumKeys[2], scratchBufferSize, reducedScratchBufferSize); + + ResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(scratchBufferSize, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); + m_FPSScratchBuffer.InitBuffer(m_pDevice, "Scratch", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + m_FPSScratchBuffer.CreateBufferUAV(0, nullptr, &m_FPSScratchUAV); + + ResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(reducedScratchBufferSize, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); + m_FPSReducedScratchBuffer.InitBuffer(m_pDevice, "ReducedScratch", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + m_FPSReducedScratchBuffer.CreateBufferUAV(0, nullptr, &m_FPSReducedScratchUAV); + + // Allocate the buffers for indirect execution of the algorithm + ResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(sizeof(FFX_ParallelSortCB), D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); + m_IndirectConstantBuffer.InitBuffer(m_pDevice, "IndirectConstantBuffer", &ResourceDesc, sizeof(FFX_ParallelSortCB), D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + m_IndirectConstantBuffer.CreateBufferUAV(0, nullptr, &m_IndirectConstantBufferUAV); + + ResourceDesc = CD3DX12_RESOURCE_DESC::Buffer(sizeof(uint32_t) * 3, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); + m_IndirectCountScatterArgs.InitBuffer(m_pDevice, "IndirectCount_Scatter_DispatchArgs", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + m_IndirectCountScatterArgs.CreateBufferUAV(0, nullptr, &m_IndirectCountScatterArgsUAV); + m_IndirectReduceScanArgs.InitBuffer(m_pDevice, "IndirectReduceScanArgs", &ResourceDesc, sizeof(uint32_t), D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + m_IndirectReduceScanArgs.CreateBufferUAV(0, nullptr, &m_IndirectReduceScanArgsUAV); + + // Create root signature for Radix sort passes + { + D3D12_DESCRIPTOR_RANGE descRange[15]; + D3D12_ROOT_PARAMETER rootParams[16]; + + // Constant buffer table (always have 1) + descRange[0] = { 
D3D12_DESCRIPTOR_RANGE_TYPE_CBV, 1, 0, 0, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[0].ParameterType = D3D12_ROOT_PARAMETER_TYPE_CBV; rootParams[0].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[0].Descriptor = { descRange[0].BaseShaderRegister, descRange[0].RegisterSpace }; + + // Constant buffer to setup indirect params (indirect) + descRange[1] = { D3D12_DESCRIPTOR_RANGE_TYPE_CBV, 1, 1, 0, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[1].ParameterType = D3D12_ROOT_PARAMETER_TYPE_CBV; rootParams[1].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[1].Descriptor = { descRange[1].BaseShaderRegister, descRange[1].RegisterSpace }; + + rootParams[2].ParameterType = D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS; rootParams[2].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[2].Constants = { 2, 0, 1 }; + + // SrcBuffer (sort or scan) + descRange[2] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[3].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[3].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[3].DescriptorTable = { 1, &descRange[2] }; + + // ScrPayload (sort only) + descRange[3] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 1, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[4].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[4].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[4].DescriptorTable = { 1, &descRange[3] }; + + // Scratch (sort only) + descRange[4] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 2, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[5].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[5].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[5].DescriptorTable = { 1, &descRange[4] }; + + // Scratch (reduced) + descRange[5] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 3, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[6].ParameterType = 
D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[6].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[6].DescriptorTable = { 1, &descRange[5] }; + + // DstBuffer (sort or scan) + descRange[6] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 4, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[7].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[7].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[7].DescriptorTable = { 1, &descRange[6] }; + + // DstPayload (sort only) + descRange[7] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 5, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[8].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[8].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[8].DescriptorTable = { 1, &descRange[7] }; + + // ScanSrc + descRange[8] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 6, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[9].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[9].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[9].DescriptorTable = { 1, &descRange[8] }; + + // ScanDst + descRange[9] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 7, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[10].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[10].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[10].DescriptorTable = { 1, &descRange[9] }; + + // ScanScratch + descRange[10] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 8, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[11].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[11].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[11].DescriptorTable = { 1, &descRange[10] }; + + // NumKeys (indirect) + descRange[11] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 9, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[12].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[12].ShaderVisibility = 
D3D12_SHADER_VISIBILITY_ALL; + rootParams[12].DescriptorTable = { 1, &descRange[11] }; + + // CBufferUAV (indirect) + descRange[12] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 10, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[13].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[13].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[13].DescriptorTable = { 1, &descRange[12] }; + + // CountScatterArgs (indirect) + descRange[13] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 11, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[14].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[14].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[14].DescriptorTable = { 1, &descRange[13] }; + + // ReduceScanArgs (indirect) + descRange[14] = { D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 12, D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND }; + rootParams[15].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; rootParams[15].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + rootParams[15].DescriptorTable = { 1, &descRange[14] }; + + D3D12_ROOT_SIGNATURE_DESC rootSigDesc = {}; + rootSigDesc.NumParameters = 16; + rootSigDesc.pParameters = rootParams; + rootSigDesc.NumStaticSamplers = 0; + rootSigDesc.pStaticSamplers = nullptr; + rootSigDesc.Flags = D3D12_ROOT_SIGNATURE_FLAG_NONE; + + ID3DBlob* pOutBlob, * pErrorBlob = nullptr; + ThrowIfFailed(D3D12SerializeRootSignature(&rootSigDesc, D3D_ROOT_SIGNATURE_VERSION_1, &pOutBlob, &pErrorBlob)); + ThrowIfFailed(pDevice->GetDevice()->CreateRootSignature(0, pOutBlob->GetBufferPointer(), pOutBlob->GetBufferSize(), IID_PPV_ARGS(&m_pFPSRootSignature))); + SetName(m_pFPSRootSignature, "FPS_Signature"); + + pOutBlob->Release(); + if (pErrorBlob) + pErrorBlob->Release(); + + // Also create the command signature for the indirect version + D3D12_INDIRECT_ARGUMENT_DESC dispatch = {}; + dispatch.Type = D3D12_INDIRECT_ARGUMENT_TYPE_DISPATCH; + D3D12_COMMAND_SIGNATURE_DESC desc = {}; + 
desc.ByteStride = sizeof(D3D12_DISPATCH_ARGUMENTS); + desc.NodeMask = 0; + desc.NumArgumentDescs = 1; + desc.pArgumentDescs = &dispatch; + + ThrowIfFailed(pDevice->GetDevice()->CreateCommandSignature(&desc, nullptr, IID_PPV_ARGS(&m_pFPSCommandSignature))); + m_pFPSCommandSignature->SetName(L"FPS_CommandSignature"); + } + + // Create root signature for Render of RadixBuffer info + { + CD3DX12_DESCRIPTOR_RANGE DescRange[3]; + CD3DX12_ROOT_PARAMETER RTSlot[3]; + + // Constant buffer + DescRange[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_CBV, 1, 0, 0); + RTSlot[0].InitAsConstantBufferView(0, 0, D3D12_SHADER_VISIBILITY_ALL); + + // UAV for RadixBufer + DescRange[1].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0); + RTSlot[1].InitAsDescriptorTable(1, &DescRange[1], D3D12_SHADER_VISIBILITY_ALL); + + // SRV for Validation texture + DescRange[2].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 1, 0, 0); + RTSlot[2].InitAsDescriptorTable(1, &DescRange[2], D3D12_SHADER_VISIBILITY_ALL); + + CD3DX12_ROOT_SIGNATURE_DESC descRootSignature = CD3DX12_ROOT_SIGNATURE_DESC(); + descRootSignature.NumParameters = 3; + descRootSignature.pParameters = RTSlot; + descRootSignature.NumStaticSamplers = 0; + descRootSignature.pStaticSamplers = nullptr; + descRootSignature.Flags = D3D12_ROOT_SIGNATURE_FLAG_NONE; + + ID3DBlob* pOutBlob, * pErrorBlob = nullptr; + ThrowIfFailed(D3D12SerializeRootSignature(&descRootSignature, D3D_ROOT_SIGNATURE_VERSION_1, &pOutBlob, &pErrorBlob)); + ThrowIfFailed(pDevice->GetDevice()->CreateRootSignature(0, pOutBlob->GetBufferPointer(), pOutBlob->GetBufferSize(), IID_PPV_ARGS(&m_pRenderRootSignature))); + SetName(m_pRenderRootSignature, "FPS_RenderResults_Signature"); + + pOutBlob->Release(); + if (pErrorBlob) + pErrorBlob->Release(); + } + + ////////////////////////////////////////////////////////////////////////// + // Create pipelines for radix sort + { + // Create all of the necessary pipelines for Sort and Scan + + // SetupIndirectParams (indirect only) + 
CompileRadixPipeline("ParallelSortCS.hlsl", nullptr, "FPS_SetupIndirectParameters", m_pFPSIndirectSetupParametersPipeline); + + // Radix count (sum table generation) + CompileRadixPipeline("ParallelSortCS.hlsl", nullptr, "FPS_Count", m_pFPSCountPipeline); + // Radix count reduce (sum table reduction for offset prescan) + CompileRadixPipeline("ParallelSortCS.hlsl", nullptr, "FPS_CountReduce", m_pFPSCountReducePipeline); + // Radix scan (prefix scan) + CompileRadixPipeline("ParallelSortCS.hlsl", nullptr, "FPS_Scan", m_pFPSScanPipeline); + // Radix scan add (prefix scan + reduced prefix scan addition) + CompileRadixPipeline("ParallelSortCS.hlsl", nullptr, "FPS_ScanAdd", m_pFPSScanAddPipeline); + // Radix scatter (key redistribution) + CompileRadixPipeline("ParallelSortCS.hlsl", nullptr, "FPS_Scatter", m_pFPSScatterPipeline); + + // Radix scatter with payload (key and payload redistribution) + DefineList defines; + defines["kRS_ValueCopy"] = std::to_string(1); + CompileRadixPipeline("ParallelSortCS.hlsl", &defines, "FPS_Scatter", m_pFPSScatterPayloadPipeline); + } + + ////////////////////////////////////////////////////////////////////////// + // Create pipelines for render pass + { #ifdef _DEBUG - std::string CompileFlagsVS("-T vs_6_0 -Zi -Od"); - std::string CompileFlagsPS("-T ps_6_0 -Zi -Od"); + std::string CompileFlagsVS("-T vs_6_0 -Zi -Od"); + std::string CompileFlagsPS("-T ps_6_0 -Zi -Od"); #else - std::string CompileFlagsVS("-T vs_6_0"); - std::string CompileFlagsPS("-T ps_6_0"); + std::string CompileFlagsVS("-T vs_6_0"); + std::string CompileFlagsPS("-T ps_6_0"); #endif // _DEBUG - - D3D12_SHADER_BYTECODE shaderByteCodeVS = {}; - CompileShaderFromFile("ParallelSortVerify.hlsl", &defines, "FullscreenVS", CompileFlagsVS.c_str(), &shaderByteCodeVS); - - D3D12_SHADER_BYTECODE shaderByteCodePS = {}; - CompileShaderFromFile("ParallelSortVerify.hlsl", &defines, "RenderSortValidationPS", CompileFlagsPS.c_str(), &shaderByteCodePS); - - D3D12_GRAPHICS_PIPELINE_STATE_DESC 
descPso = {}; - descPso.InputLayout = { nullptr, 0 }; - descPso.pRootSignature = m_pRenderRootSignature; - descPso.VS = shaderByteCodeVS; - descPso.PS = shaderByteCodePS; - descPso.RasterizerState = CD3DX12_RASTERIZER_DESC(D3D12_DEFAULT); - descPso.RasterizerState.CullMode = D3D12_CULL_MODE_NONE; - descPso.BlendState = CD3DX12_BLEND_DESC(D3D12_DEFAULT); - descPso.BlendState.RenderTarget[0].BlendEnable = FALSE; - descPso.DepthStencilState = CD3DX12_DEPTH_STENCIL_DESC(D3D12_DEFAULT); - descPso.DepthStencilState.DepthEnable = FALSE; - descPso.SampleMask = UINT_MAX; - descPso.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE; - descPso.NumRenderTargets = 1; - descPso.RTVFormats[0] = pSwapChain->GetFormat(); - descPso.DSVFormat = DXGI_FORMAT_D32_FLOAT; - descPso.SampleDesc.Count = 1; - descPso.NodeMask = 0; - ThrowIfFailed(m_pDevice->GetDevice()->CreateGraphicsPipelineState(&descPso, IID_PPV_ARGS(&m_pRenderResultVerificationPipeline))); - SetName(m_pRenderResultVerificationPipeline, "RenderFPSResults_Pipeline"); - } - } - - void FFXParallelSort::OnDestroy() - { - // Release verification render resources - m_pRenderResultVerificationPipeline->Release(); - m_pRenderRootSignature->Release(); - m_Validate4KTexture.OnDestroy(); - m_Validate2KTexture.OnDestroy(); - m_Validate1080pTexture.OnDestroy(); - - // Release radix sort indirect resources - m_IndirectKeyCounts.OnDestroy(); - m_IndirectConstantBuffer.OnDestroy(); - m_IndirectCountScatterArgs.OnDestroy(); - m_IndirectReduceScanArgs.OnDestroy(); - m_pFPSCommandSignature->Release(); - m_pFPSIndirectSetupParametersPipeline->Release(); - - // Release radix sort algorithm resources - m_FPSScratchBuffer.OnDestroy(); - m_FPSReducedScratchBuffer.OnDestroy(); - m_pFPSRootSignature->Release(); - m_pFPSCountPipeline->Release(); - m_pFPSCountReducePipeline->Release(); - m_pFPSScanPipeline->Release(); - m_pFPSScanAddPipeline->Release(); - m_pFPSScatterPipeline->Release(); - m_pFPSScatterPayloadPipeline->Release(); - - // 
Release all of our resources - m_SrcKeyBuffers[0].OnDestroy(); - m_SrcKeyBuffers[1].OnDestroy(); - m_SrcKeyBuffers[2].OnDestroy(); - m_SrcPayloadBuffers.OnDestroy(); - m_DstKeyBuffers[0].OnDestroy(); - m_DstKeyBuffers[1].OnDestroy(); - m_DstPayloadBuffers[0].OnDestroy(); - m_DstPayloadBuffers[1].OnDestroy(); - } - + + D3D12_SHADER_BYTECODE shaderByteCodeVS = {}; + CompileShaderFromFile("ParallelSortVerify.hlsl", nullptr, "FullscreenVS", CompileFlagsVS.c_str(), &shaderByteCodeVS); + + D3D12_SHADER_BYTECODE shaderByteCodePS = {}; + CompileShaderFromFile("ParallelSortVerify.hlsl", nullptr, "RenderSortValidationPS", CompileFlagsPS.c_str(), &shaderByteCodePS); + + D3D12_GRAPHICS_PIPELINE_STATE_DESC descPso = {}; + descPso.InputLayout = { nullptr, 0 }; + descPso.pRootSignature = m_pRenderRootSignature; + descPso.VS = shaderByteCodeVS; + descPso.PS = shaderByteCodePS; + descPso.RasterizerState = CD3DX12_RASTERIZER_DESC(D3D12_DEFAULT); + descPso.RasterizerState.CullMode = D3D12_CULL_MODE_NONE; + descPso.BlendState = CD3DX12_BLEND_DESC(D3D12_DEFAULT); + descPso.BlendState.RenderTarget[0].BlendEnable = FALSE; + descPso.DepthStencilState = CD3DX12_DEPTH_STENCIL_DESC(D3D12_DEFAULT); + descPso.DepthStencilState.DepthEnable = FALSE; + descPso.SampleMask = UINT_MAX; + descPso.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE; + descPso.NumRenderTargets = 1; + descPso.RTVFormats[0] = pSwapChain->GetFormat(); + descPso.DSVFormat = DXGI_FORMAT_D32_FLOAT; + descPso.SampleDesc.Count = 1; + descPso.NodeMask = 0; + ThrowIfFailed(m_pDevice->GetDevice()->CreateGraphicsPipelineState(&descPso, IID_PPV_ARGS(&m_pRenderResultVerificationPipeline))); + SetName(m_pRenderResultVerificationPipeline, "RenderFPSResults_Pipeline"); + } +} + +// Parallel Sort termination +void FFXParallelSort::OnDestroy() +{ + // Release verification render resources + m_pRenderResultVerificationPipeline->Release(); + m_pRenderRootSignature->Release(); + m_Validate4KTexture.OnDestroy(); + 
m_Validate2KTexture.OnDestroy(); + m_Validate1080pTexture.OnDestroy(); + + // Release radix sort indirect resources + m_IndirectKeyCounts.OnDestroy(); + m_IndirectConstantBuffer.OnDestroy(); + m_IndirectCountScatterArgs.OnDestroy(); + m_IndirectReduceScanArgs.OnDestroy(); + m_pFPSCommandSignature->Release(); + m_pFPSIndirectSetupParametersPipeline->Release(); + + // Release radix sort algorithm resources + m_FPSScratchBuffer.OnDestroy(); + m_FPSReducedScratchBuffer.OnDestroy(); + m_pFPSRootSignature->Release(); + m_pFPSCountPipeline->Release(); + m_pFPSCountReducePipeline->Release(); + m_pFPSScanPipeline->Release(); + m_pFPSScanAddPipeline->Release(); + m_pFPSScatterPipeline->Release(); + m_pFPSScatterPayloadPipeline->Release(); + + // Release all of our resources + m_SrcKeyBuffers[0].OnDestroy(); + m_SrcKeyBuffers[1].OnDestroy(); + m_SrcKeyBuffers[2].OnDestroy(); + m_SrcPayloadBuffers.OnDestroy(); + m_DstKeyBuffers[0].OnDestroy(); + m_DstKeyBuffers[1].OnDestroy(); + m_DstPayloadBuffers[0].OnDestroy(); + m_DstPayloadBuffers[1].OnDestroy(); +} + +// This allows us to validate that the sorted data is actually in ascending order. Only used when doing algorithm changes. 
#ifdef DEVELOPERMODE
-    void FFXParallelSort::CreateValidationResources(ID3D12GraphicsCommandList* pCommandList, RdxDX12ResourceInfo* pKeyDstInfo)
-    {
-        // Create the read-back resource
-        CD3DX12_HEAP_PROPERTIES readBackHeapProperties(D3D12_HEAP_TYPE_READBACK);
-        CD3DX12_RESOURCE_DESC bufferDesc = CD3DX12_RESOURCE_DESC::Buffer(sizeof(uint32_t) * NumKeys[m_UIResolutionSize], D3D12_RESOURCE_FLAG_NONE);
-        ThrowIfFailed(m_pDevice->GetDevice()->CreateCommittedResource(&readBackHeapProperties, D3D12_HEAP_FLAG_NONE, &bufferDesc, D3D12_RESOURCE_STATE_COPY_DEST,
-            nullptr, IID_PPV_ARGS(&m_ReadBackBufferResource)));
-        m_ReadBackBufferResource->SetName(L"Validation Read-back Buffer");
-
-        // And the fence for us to wait on
-        ThrowIfFailed(m_pDevice->GetDevice()->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&m_ReadBackFence)));
-        m_ReadBackFence->SetName(L"Validation Read-back Fence");
-
-        // Transition, copy, and transition back
-        pCommandList->ResourceBarrier(1, &CD3DX12_RESOURCE_BARRIER::Transition(pKeyDstInfo->pResource, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE));
-        pCommandList->CopyBufferRegion(m_ReadBackBufferResource, 0, pKeyDstInfo->pResource, 0, sizeof(uint32_t) * NumKeys[m_UIResolutionSize]);
-        pCommandList->ResourceBarrier(1, &CD3DX12_RESOURCE_BARRIER::Transition(pKeyDstInfo->pResource, D3D12_RESOURCE_STATE_COPY_SOURCE, D3D12_RESOURCE_STATE_UNORDERED_ACCESS));
-    }
-
-    void FFXParallelSort::WaitForValidationResults()
-    {
-        if (!m_ReadBackFence && !m_ReadBackBufferResource)
-            return;
-
-        // Insert the fence to wait on and create the event to trigger when it's been processed
-        ThrowIfFailed(m_pDevice->GetGraphicsQueue()->Signal(m_ReadBackFence, 1));
-        m_ReadBackFenceEvent = CreateEvent(nullptr, FALSE, FALSE, nullptr);
-        m_ReadBackFence->SetEventOnCompletion(1, m_ReadBackFenceEvent);
-
-        // Wait for fence to have been processed
-        WaitForSingleObject(m_ReadBackFenceEvent, INFINITE);
-        CloseHandle(m_ReadBackFenceEvent);
-
-        // Validate data ...
-        Trace("Validating Data");
-
-        D3D12_RANGE range;
-        range.Begin = 0;
-        range.End = sizeof(uint32_t) * NumKeys[m_UIResolutionSize];
-        void* pData;
-        m_ReadBackBufferResource->Map(0, &range, &pData);
-
-        uint32_t* SortedData = (uint32_t*)pData;
-
-        // Do the validation
-        uint32_t keysToValidate = NumKeys[m_UIResolutionSize];
-        bool dataValid = true;
-
-        for (uint32_t i = 0; i < keysToValidate - 1; i++)
-        {
-            if (SortedData[i] > SortedData[i + 1])
-            {
-                std::string message = "Sort invalidated. Entry ";
-                message += std::to_string(i);
-                message += " is larger next entry.\n";
-                Trace(message);
-                dataValid = false;
-            }
-        }
-
-        m_ReadBackBufferResource->Unmap(0, nullptr);
-
-        if (dataValid)
-            Trace("Data Valid");
-
-        // We are done with the fence and the read-back buffer
-        m_ReadBackBufferResource->Release();
-        m_ReadBackBufferResource = nullptr;
-        m_ReadBackFence->Release();
-        m_ReadBackFence = nullptr;
-    }
+// Validation step 1 of 2 (DEVELOPERMODE builds only): creates a read-back buffer and a fence, and
+// records a copy of the keys held in pKeyDstInfo->pResource into that buffer.
+// Nothing is submitted or waited on here; WaitForValidationResults() signals the fence and reads the data.
+//   pCommandList - command list the barriers and the copy are recorded on
+//   pKeyDstInfo  - resource to read back; the barriers below take it from UNORDERED_ACCESS to
+//                  COPY_SOURCE and back again, so it must be in UNORDERED_ACCESS on entry
+void FFXParallelSort::CreateValidationResources(ID3D12GraphicsCommandList* pCommandList, RdxDX12ResourceInfo* pKeyDstInfo)
+{
+    // Size in bytes of the key set selected by m_UIResolutionSize (used for both the buffer and the copy)
+    const UINT64 readBackSize = sizeof(uint32_t) * NumKeys[m_UIResolutionSize];
+
+    // Create the read-back resource
+    CD3DX12_HEAP_PROPERTIES readBackHeapProperties(D3D12_HEAP_TYPE_READBACK);
+    CD3DX12_RESOURCE_DESC bufferDesc = CD3DX12_RESOURCE_DESC::Buffer(readBackSize, D3D12_RESOURCE_FLAG_NONE);
+    ThrowIfFailed(m_pDevice->GetDevice()->CreateCommittedResource(&readBackHeapProperties, D3D12_HEAP_FLAG_NONE, &bufferDesc, D3D12_RESOURCE_STATE_COPY_DEST,
+        nullptr, IID_PPV_ARGS(&m_ReadBackBufferResource)));
+    m_ReadBackBufferResource->SetName(L"Validation Read-back Buffer");
+
+    // And the fence for us to wait on
+    ThrowIfFailed(m_pDevice->GetDevice()->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&m_ReadBackFence)));
+    m_ReadBackFence->SetName(L"Validation Read-back Fence");
+
+    // Transition, copy, and transition back.
+    // The barriers are named locals: taking the address of the temporary returned by Transition()
+    // is ill-formed C++ and only compiles as a compiler extension.
+    CD3DX12_RESOURCE_BARRIER toCopySource = CD3DX12_RESOURCE_BARRIER::Transition(pKeyDstInfo->pResource, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE);
+    pCommandList->ResourceBarrier(1, &toCopySource);
+    pCommandList->CopyBufferRegion(m_ReadBackBufferResource, 0, pKeyDstInfo->pResource, 0, readBackSize);
+    CD3DX12_RESOURCE_BARRIER backToUAV = CD3DX12_RESOURCE_BARRIER::Transition(pKeyDstInfo->pResource, D3D12_RESOURCE_STATE_COPY_SOURCE, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
+    pCommandList->ResourceBarrier(1, &backToUAV);
+}
+
+// Validation step 2 of 2 (DEVELOPERMODE builds only): signals the read-back fence on the graphics
+// queue, blocks the calling thread until that signal has been processed, then walks the read-back
+// buffer on the CPU and traces every adjacent pair of keys that is out of ascending order.
+// Releases the read-back buffer and the fence before returning.
+// Returns immediately when the objects created by CreateValidationResources() are not both present.
+void FFXParallelSort::WaitForValidationResults()
+{
+    // Both objects are dereferenced below, so bail out if either one is missing
+    if (!m_ReadBackFence || !m_ReadBackBufferResource)
+        return;
+
+    // Insert the fence to wait on and create the event to trigger when it's been processed
+    ThrowIfFailed(m_pDevice->GetGraphicsQueue()->Signal(m_ReadBackFence, 1));
+    m_ReadBackFenceEvent = CreateEvent(nullptr, FALSE, FALSE, nullptr);
+    // Check the result: if the event could not be registered, the wait below would never return
+    ThrowIfFailed(m_ReadBackFence->SetEventOnCompletion(1, m_ReadBackFenceEvent));
+
+    // Wait for fence to have been processed
+    WaitForSingleObject(m_ReadBackFenceEvent, INFINITE);
+    CloseHandle(m_ReadBackFenceEvent);
+
+    // Validate data ...
+    Trace("Validating Data");
+
+    const uint32_t keysToValidate = NumKeys[m_UIResolutionSize];
+
+    // Byte range of the buffer the CPU is about to read
+    D3D12_RANGE readRange;
+    readRange.Begin = 0;
+    readRange.End = sizeof(uint32_t) * keysToValidate;
+    void* pData = nullptr;
+    // A failed Map leaves pData unusable, so do not carry on past an error
+    ThrowIfFailed(m_ReadBackBufferResource->Map(0, &readRange, &pData));
+
+    const uint32_t* SortedData = static_cast<const uint32_t*>(pData);
+
+    // Do the validation ("i + 1 < count" instead of "i < count - 1" so a count of 0 cannot wrap around)
+    bool dataValid = true;
+
+    for (uint32_t i = 0; i + 1 < keysToValidate; i++)
+    {
+        if (SortedData[i] > SortedData[i + 1])
+        {
+            std::string message = "Sort invalidated. Entry ";
+            message += std::to_string(i);
+            message += " is larger next entry.\n";
+            Trace(message);
+            dataValid = false;
+        }
+    }
+
+    // The CPU only read from the mapping, so report an empty written range
+    const D3D12_RANGE writtenRange = { 0, 0 };
+    m_ReadBackBufferResource->Unmap(0, &writtenRange);
+
+    if (dataValid)
+        Trace("Data Valid");
+
+    // We are done with the fence and the read-back buffer
+    m_ReadBackBufferResource->Release();
+    m_ReadBackBufferResource = nullptr;
+    m_ReadBackFence->Release();
+    m_ReadBackFence = nullptr;
+}
 #endif // DEVELOPERMODE

-    void FFXParallelSort::CopySourceDataForFrame(ID3D12GraphicsCommandList* pCommandList)
-    {
-        // Copy the contents the source buffer to the dstBuffer[0] each frame in order to not
-        // lose our original data
-
-        // Copy the data into the dst[0] buffers for use on first frame
-        CD3DX12_RESOURCE_BARRIER Barriers[2] = { CD3DX12_RESOURCE_BARRIER::Transition(m_DstKeyBuffers[0].GetResource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_DEST),
-            CD3DX12_RESOURCE_BARRIER::Transition(m_DstPayloadBuffers[0].GetResource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_DEST) };
-        pCommandList->ResourceBarrier(2, Barriers);
-
-        pCommandList->CopyBufferRegion(m_DstKeyBuffers[0].GetResource(), 0, m_SrcKeyBuffers[m_UIResolutionSize].GetResource(), 0, sizeof(uint32_t) * NumKeys[m_UIResolutionSize]);
-        pCommandList->CopyBufferRegion(m_DstPayloadBuffers[0].GetResource(), 0, m_SrcPayloadBuffers.GetResource(), 0, sizeof(uint32_t) * NumKeys[m_UIResolutionSize]);
-
-        // Put the dst buffers back to UAVs for sort usage
-        Barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(m_DstKeyBuffers[0].GetResource(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
-        Barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(m_DstPayloadBuffers[0].GetResource(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
-        pCommandList->ResourceBarrier(2, Barriers);
-    }
-
-    void FFXParallelSort::Draw(ID3D12GraphicsCommandList* pCommandList, bool isBenchmarking, float benchmarkTime)
-    {
-        bool
bIndirectDispatch = m_UIIndirectSort; - - std::string markerText = "FFXParallelSort"; - if (bIndirectDispatch) markerText += " Indirect"; - UserMarker marker(pCommandList, markerText.c_str()); - - FFX_ParallelSortCB constantBufferData = { 0 }; - - // Bind the descriptor heaps - ID3D12DescriptorHeap* pDescriptorHeap = m_pResourceViewHeaps->GetCBV_SRV_UAVHeap(); - pCommandList->SetDescriptorHeaps(1, &pDescriptorHeap); - - // Bind the root signature - pCommandList->SetComputeRootSignature(m_pFPSRootSignature); - - // Fill in the constant buffer data structure (this will be done by a shader in the indirect version) - uint32_t NumThreadgroupsToRun; - uint32_t NumReducedThreadgroupsToRun; - if (!bIndirectDispatch) - { - uint32_t NumberOfKeys = NumKeys[m_UIResolutionSize]; - FFX_ParallelSort_SetConstantAndDispatchData(NumberOfKeys, m_MaxNumThreadgroups, constantBufferData, NumThreadgroupsToRun, NumReducedThreadgroupsToRun); - } - else - { - struct SetupIndirectCB - { - uint32_t NumKeysIndex; - uint32_t MaxThreadGroups; - }; - SetupIndirectCB IndirectSetupCB; - IndirectSetupCB.NumKeysIndex = m_UIResolutionSize; - IndirectSetupCB.MaxThreadGroups = m_MaxNumThreadgroups; - - // Copy the data into the constant buffer - D3D12_GPU_VIRTUAL_ADDRESS constantBuffer = m_pConstantBufferRing->AllocConstantBuffer(sizeof(SetupIndirectCB), &IndirectSetupCB); - pCommandList->SetComputeRootConstantBufferView(1, constantBuffer); // SetupIndirect Constant buffer - - // Bind other buffer - pCommandList->SetComputeRootDescriptorTable(12, m_IndirectKeyCountsUAV.GetGPU()); // Key counts - pCommandList->SetComputeRootDescriptorTable(13, m_IndirectConstantBufferUAV.GetGPU()); // Indirect Sort Constant Buffer - pCommandList->SetComputeRootDescriptorTable(14, m_IndirectCountScatterArgsUAV.GetGPU()); // Indirect Sort Count/Scatter Args - pCommandList->SetComputeRootDescriptorTable(15, m_IndirectReduceScanArgsUAV.GetGPU()); // Indirect Sort Reduce/Scan Args - - // Dispatch - 
pCommandList->SetPipelineState(m_pFPSIndirectSetupParametersPipeline); - pCommandList->Dispatch(1, 1, 1); - - // When done, transition the args buffers to INDIRECT_ARGUMENT, and the constant buffer UAV to Constant buffer - CD3DX12_RESOURCE_BARRIER barriers[5]; - barriers[0] = CD3DX12_RESOURCE_BARRIER::UAV(m_IndirectCountScatterArgs.GetResource()); - barriers[1] = CD3DX12_RESOURCE_BARRIER::UAV(m_IndirectReduceScanArgs.GetResource()); - barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition(m_IndirectConstantBuffer.GetResource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_VERTEX_AND_CONSTANT_BUFFER); - barriers[3] = CD3DX12_RESOURCE_BARRIER::Transition(m_IndirectCountScatterArgs.GetResource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_INDIRECT_ARGUMENT); - barriers[4] = CD3DX12_RESOURCE_BARRIER::Transition(m_IndirectReduceScanArgs.GetResource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_INDIRECT_ARGUMENT); - pCommandList->ResourceBarrier(5, barriers); - } - - // Setup resource/UAV pairs to use during sort - RdxDX12ResourceInfo KeySrcInfo = { m_DstKeyBuffers[0].GetResource(), m_DstKeyUAVTable.GetGPU(0) }; - RdxDX12ResourceInfo PayloadSrcInfo = { m_DstPayloadBuffers[0].GetResource(), m_DstPayloadUAVTable.GetGPU(0) }; - RdxDX12ResourceInfo KeyTmpInfo = { m_DstKeyBuffers[1].GetResource(), m_DstKeyUAVTable.GetGPU(1) }; - RdxDX12ResourceInfo PayloadTmpInfo = { m_DstPayloadBuffers[1].GetResource(), m_DstPayloadUAVTable.GetGPU(1) }; - RdxDX12ResourceInfo ScratchBufferInfo = { m_FPSScratchBuffer.GetResource(), m_FPSScratchUAV.GetGPU() }; - RdxDX12ResourceInfo ReducedScratchBufferInfo = { m_FPSReducedScratchBuffer.GetResource(), m_FPSReducedScratchUAV.GetGPU() }; - - // Buffers to ping-pong between when writing out sorted values - const RdxDX12ResourceInfo* ReadBufferInfo(&KeySrcInfo), * WriteBufferInfo(&KeyTmpInfo); - const RdxDX12ResourceInfo* ReadPayloadBufferInfo(&PayloadSrcInfo), * WritePayloadBufferInfo(&PayloadTmpInfo); - 
bool bHasPayload = m_UISortPayload; - - // Setup barriers for the run - CD3DX12_RESOURCE_BARRIER barriers[3]; - - // Perform Radix Sort (currently only support 32-bit key/payload sorting - for (uint32_t Shift = 0; Shift < 32u; Shift += FFX_PARALLELSORT_SORT_BITS_PER_PASS) - { - // Update the bit shift - pCommandList->SetComputeRoot32BitConstant(2, Shift, 0); - - // Copy the data into the constant buffer - D3D12_GPU_VIRTUAL_ADDRESS constantBuffer; - if (bIndirectDispatch) - constantBuffer = m_IndirectConstantBuffer.GetResource()->GetGPUVirtualAddress(); - else - constantBuffer = m_pConstantBufferRing->AllocConstantBuffer(sizeof(FFX_ParallelSortCB), &constantBufferData); - - // Bind to root signature - pCommandList->SetComputeRootConstantBufferView(0, constantBuffer); // Constant buffer - pCommandList->SetComputeRootDescriptorTable(3, ReadBufferInfo->resourceGPUHandle); // SrcBuffer - pCommandList->SetComputeRootDescriptorTable(5, ScratchBufferInfo.resourceGPUHandle); // Scratch buffer - - // Sort Count - { - pCommandList->SetPipelineState(m_pFPSCountPipeline); - - if (bIndirectDispatch) - { - pCommandList->ExecuteIndirect(m_pFPSCommandSignature, 1, m_IndirectCountScatterArgs.GetResource(), 0, nullptr, 0); - } - else - { - pCommandList->Dispatch(NumThreadgroupsToRun, 1, 1); - } - } - - // UAV barrier on the sum table - barriers[0] = CD3DX12_RESOURCE_BARRIER::UAV(ScratchBufferInfo.pResource); - pCommandList->ResourceBarrier(1, barriers); - - pCommandList->SetComputeRootDescriptorTable(6, ReducedScratchBufferInfo.resourceGPUHandle); // Scratch reduce buffer - - // Sort Reduce - { - pCommandList->SetPipelineState(m_pFPSCountReducePipeline); - - if (bIndirectDispatch) - { - pCommandList->ExecuteIndirect(m_pFPSCommandSignature, 1, m_IndirectReduceScanArgs.GetResource(), 0, nullptr, 0); - } - else - { - pCommandList->Dispatch(NumReducedThreadgroupsToRun, 1, 1); - } - - // UAV barrier on the reduced sum table - barriers[0] = 
CD3DX12_RESOURCE_BARRIER::UAV(ReducedScratchBufferInfo.pResource); - pCommandList->ResourceBarrier(1, barriers); - } - - // Sort Scan - { - // First do scan prefix of reduced values - pCommandList->SetComputeRootDescriptorTable(9, ReducedScratchBufferInfo.resourceGPUHandle); - pCommandList->SetComputeRootDescriptorTable(10, ReducedScratchBufferInfo.resourceGPUHandle); - - pCommandList->SetPipelineState(m_pFPSScanPipeline); - if (!bIndirectDispatch) - { - assert(NumReducedThreadgroupsToRun < FFX_PARALLELSORT_ELEMENTS_PER_THREAD * FFX_PARALLELSORT_THREADGROUP_SIZE && "Need to account for bigger reduced histogram scan"); - } - pCommandList->Dispatch(1, 1, 1); - - // UAV barrier on the reduced sum table - barriers[0] = CD3DX12_RESOURCE_BARRIER::UAV(ReducedScratchBufferInfo.pResource); - pCommandList->ResourceBarrier(1, barriers); - - // Next do scan prefix on the histogram with partial sums that we just did - pCommandList->SetComputeRootDescriptorTable(9, ScratchBufferInfo.resourceGPUHandle); - pCommandList->SetComputeRootDescriptorTable(10, ScratchBufferInfo.resourceGPUHandle); - pCommandList->SetComputeRootDescriptorTable(11, ReducedScratchBufferInfo.resourceGPUHandle); - - pCommandList->SetPipelineState(m_pFPSScanAddPipeline); - if (bIndirectDispatch) - { - pCommandList->ExecuteIndirect(m_pFPSCommandSignature, 1, m_IndirectReduceScanArgs.GetResource(), 0, nullptr, 0); - } - else - { - pCommandList->Dispatch(NumReducedThreadgroupsToRun, 1, 1); - } - } - - // UAV barrier on the sum table - barriers[0] = CD3DX12_RESOURCE_BARRIER::UAV(ScratchBufferInfo.pResource); - pCommandList->ResourceBarrier(1, barriers); - - if (bHasPayload) - { - pCommandList->SetComputeRootDescriptorTable(4, ReadPayloadBufferInfo->resourceGPUHandle); // ScrPayload - pCommandList->SetComputeRootDescriptorTable(8, WritePayloadBufferInfo->resourceGPUHandle); // DstPayload - } - - pCommandList->SetComputeRootDescriptorTable(7, WriteBufferInfo->resourceGPUHandle); // DstBuffer - - // Sort Scatter - { 
- pCommandList->SetPipelineState(bHasPayload ? m_pFPSScatterPayloadPipeline : m_pFPSScatterPipeline); - - if (bIndirectDispatch) - { - pCommandList->ExecuteIndirect(m_pFPSCommandSignature, 1, m_IndirectCountScatterArgs.GetResource(), 0, nullptr, 0); - } - else - { - pCommandList->Dispatch(NumThreadgroupsToRun, 1, 1); - } - } - - // Finish doing everything and barrier for the next pass - int numBarriers = 0; - barriers[numBarriers++] = CD3DX12_RESOURCE_BARRIER::UAV(WriteBufferInfo->pResource); - if (bHasPayload) - barriers[numBarriers++] = CD3DX12_RESOURCE_BARRIER::UAV(WritePayloadBufferInfo->pResource); - pCommandList->ResourceBarrier(numBarriers, barriers); - - // Swap read/write sources - std::swap(ReadBufferInfo, WriteBufferInfo); - if (bHasPayload) - std::swap(ReadPayloadBufferInfo, WritePayloadBufferInfo); - } - - // When we are all done, transition indirect buffers back to UAV for the next frame (if doing indirect dispatch) - if (bIndirectDispatch) - { - barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(m_IndirectCountScatterArgs.GetResource(), D3D12_RESOURCE_STATE_INDIRECT_ARGUMENT, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); - barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(m_IndirectReduceScanArgs.GetResource(), D3D12_RESOURCE_STATE_INDIRECT_ARGUMENT, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); - barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition(m_IndirectConstantBuffer.GetResource(), D3D12_RESOURCE_STATE_VERTEX_AND_CONSTANT_BUFFER, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); - pCommandList->ResourceBarrier(3, barriers); - } - - // Do we need to validate the results? 
If so, create a read back buffer to use for this frame +// Because we are sorting the data every frame, need to reset to unsorted version of data before running sort +void FFXParallelSort::CopySourceDataForFrame(ID3D12GraphicsCommandList* pCommandList) +{ + // Copy the contents the source buffer to the dstBuffer[0] each frame in order to not + // lose our original data + + // Copy the data into the dst[0] buffers for use on first frame + CD3DX12_RESOURCE_BARRIER Barriers[2] = { CD3DX12_RESOURCE_BARRIER::Transition(m_DstKeyBuffers[0].GetResource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_DEST), + CD3DX12_RESOURCE_BARRIER::Transition(m_DstPayloadBuffers[0].GetResource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_DEST) }; + pCommandList->ResourceBarrier(2, Barriers); + + pCommandList->CopyBufferRegion(m_DstKeyBuffers[0].GetResource(), 0, m_SrcKeyBuffers[m_UIResolutionSize].GetResource(), 0, sizeof(uint32_t) * NumKeys[m_UIResolutionSize]); + pCommandList->CopyBufferRegion(m_DstPayloadBuffers[0].GetResource(), 0, m_SrcPayloadBuffers.GetResource(), 0, sizeof(uint32_t) * NumKeys[m_UIResolutionSize]); + + // Put the dst buffers back to UAVs for sort usage + Barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(m_DstKeyBuffers[0].GetResource(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + Barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(m_DstPayloadBuffers[0].GetResource(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + pCommandList->ResourceBarrier(2, Barriers); +} + +// Perform Parallel Sort (radix-based sort) +void FFXParallelSort::Sort(ID3D12GraphicsCommandList* pCommandList, bool isBenchmarking, float benchmarkTime) +{ + bool bIndirectDispatch = m_UIIndirectSort; + + std::string markerText = "FFXParallelSort"; + if (bIndirectDispatch) markerText += " Indirect"; + UserMarker marker(pCommandList, markerText.c_str()); + + FFX_ParallelSortCB constantBufferData = { 0 }; + 
+ // Bind the descriptor heaps + ID3D12DescriptorHeap* pDescriptorHeap = m_pResourceViewHeaps->GetCBV_SRV_UAVHeap(); + pCommandList->SetDescriptorHeaps(1, &pDescriptorHeap); + + // Bind the root signature + pCommandList->SetComputeRootSignature(m_pFPSRootSignature); + + // Fill in the constant buffer data structure (this will be done by a shader in the indirect version) + uint32_t NumThreadgroupsToRun; + uint32_t NumReducedThreadgroupsToRun; + if (!bIndirectDispatch) + { + uint32_t NumberOfKeys = NumKeys[m_UIResolutionSize]; + FFX_ParallelSort_SetConstantAndDispatchData(NumberOfKeys, m_MaxNumThreadgroups, constantBufferData, NumThreadgroupsToRun, NumReducedThreadgroupsToRun); + } + else + { + struct SetupIndirectCB + { + uint32_t NumKeysIndex; + uint32_t MaxThreadGroups; + }; + SetupIndirectCB IndirectSetupCB; + IndirectSetupCB.NumKeysIndex = m_UIResolutionSize; + IndirectSetupCB.MaxThreadGroups = m_MaxNumThreadgroups; + + // Copy the data into the constant buffer + D3D12_GPU_VIRTUAL_ADDRESS constantBuffer = m_pConstantBufferRing->AllocConstantBuffer(sizeof(SetupIndirectCB), &IndirectSetupCB); + pCommandList->SetComputeRootConstantBufferView(1, constantBuffer); // SetupIndirect Constant buffer + + // Bind other buffer + pCommandList->SetComputeRootDescriptorTable(12, m_IndirectKeyCountsUAV.GetGPU()); // Key counts + pCommandList->SetComputeRootDescriptorTable(13, m_IndirectConstantBufferUAV.GetGPU()); // Indirect Sort Constant Buffer + pCommandList->SetComputeRootDescriptorTable(14, m_IndirectCountScatterArgsUAV.GetGPU()); // Indirect Sort Count/Scatter Args + pCommandList->SetComputeRootDescriptorTable(15, m_IndirectReduceScanArgsUAV.GetGPU()); // Indirect Sort Reduce/Scan Args + + // Dispatch + pCommandList->SetPipelineState(m_pFPSIndirectSetupParametersPipeline); + pCommandList->Dispatch(1, 1, 1); + + // When done, transition the args buffers to INDIRECT_ARGUMENT, and the constant buffer UAV to Constant buffer + CD3DX12_RESOURCE_BARRIER barriers[5]; + 
barriers[0] = CD3DX12_RESOURCE_BARRIER::UAV(m_IndirectCountScatterArgs.GetResource()); + barriers[1] = CD3DX12_RESOURCE_BARRIER::UAV(m_IndirectReduceScanArgs.GetResource()); + barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition(m_IndirectConstantBuffer.GetResource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_VERTEX_AND_CONSTANT_BUFFER); + barriers[3] = CD3DX12_RESOURCE_BARRIER::Transition(m_IndirectCountScatterArgs.GetResource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_INDIRECT_ARGUMENT); + barriers[4] = CD3DX12_RESOURCE_BARRIER::Transition(m_IndirectReduceScanArgs.GetResource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_INDIRECT_ARGUMENT); + pCommandList->ResourceBarrier(5, barriers); + } + + // Setup resource/UAV pairs to use during sort + RdxDX12ResourceInfo KeySrcInfo = { m_DstKeyBuffers[0].GetResource(), m_DstKeyUAVTable.GetGPU(0) }; + RdxDX12ResourceInfo PayloadSrcInfo = { m_DstPayloadBuffers[0].GetResource(), m_DstPayloadUAVTable.GetGPU(0) }; + RdxDX12ResourceInfo KeyTmpInfo = { m_DstKeyBuffers[1].GetResource(), m_DstKeyUAVTable.GetGPU(1) }; + RdxDX12ResourceInfo PayloadTmpInfo = { m_DstPayloadBuffers[1].GetResource(), m_DstPayloadUAVTable.GetGPU(1) }; + RdxDX12ResourceInfo ScratchBufferInfo = { m_FPSScratchBuffer.GetResource(), m_FPSScratchUAV.GetGPU() }; + RdxDX12ResourceInfo ReducedScratchBufferInfo = { m_FPSReducedScratchBuffer.GetResource(), m_FPSReducedScratchUAV.GetGPU() }; + + // Buffers to ping-pong between when writing out sorted values + const RdxDX12ResourceInfo* ReadBufferInfo(&KeySrcInfo), * WriteBufferInfo(&KeyTmpInfo); + const RdxDX12ResourceInfo* ReadPayloadBufferInfo(&PayloadSrcInfo), * WritePayloadBufferInfo(&PayloadTmpInfo); + bool bHasPayload = m_UISortPayload; + + // Setup barriers for the run + CD3DX12_RESOURCE_BARRIER barriers[3]; + + // Perform Radix Sort (currently only support 32-bit key/payload sorting + for (uint32_t Shift = 0; Shift < 32u; Shift += 
FFX_PARALLELSORT_SORT_BITS_PER_PASS) + { + // Update the bit shift + pCommandList->SetComputeRoot32BitConstant(2, Shift, 0); + + // Copy the data into the constant buffer + D3D12_GPU_VIRTUAL_ADDRESS constantBuffer; + if (bIndirectDispatch) + constantBuffer = m_IndirectConstantBuffer.GetResource()->GetGPUVirtualAddress(); + else + constantBuffer = m_pConstantBufferRing->AllocConstantBuffer(sizeof(FFX_ParallelSortCB), &constantBufferData); + + // Bind to root signature + pCommandList->SetComputeRootConstantBufferView(0, constantBuffer); // Constant buffer + pCommandList->SetComputeRootDescriptorTable(3, ReadBufferInfo->resourceGPUHandle); // SrcBuffer + pCommandList->SetComputeRootDescriptorTable(5, ScratchBufferInfo.resourceGPUHandle); // Scratch buffer + + // Sort Count + { + pCommandList->SetPipelineState(m_pFPSCountPipeline); + + if (bIndirectDispatch) + { + pCommandList->ExecuteIndirect(m_pFPSCommandSignature, 1, m_IndirectCountScatterArgs.GetResource(), 0, nullptr, 0); + } + else + { + pCommandList->Dispatch(NumThreadgroupsToRun, 1, 1); + } + } + + // UAV barrier on the sum table + barriers[0] = CD3DX12_RESOURCE_BARRIER::UAV(ScratchBufferInfo.pResource); + pCommandList->ResourceBarrier(1, barriers); + + pCommandList->SetComputeRootDescriptorTable(6, ReducedScratchBufferInfo.resourceGPUHandle); // Scratch reduce buffer + + // Sort Reduce + { + pCommandList->SetPipelineState(m_pFPSCountReducePipeline); + + if (bIndirectDispatch) + { + pCommandList->ExecuteIndirect(m_pFPSCommandSignature, 1, m_IndirectReduceScanArgs.GetResource(), 0, nullptr, 0); + } + else + { + pCommandList->Dispatch(NumReducedThreadgroupsToRun, 1, 1); + } + + // UAV barrier on the reduced sum table + barriers[0] = CD3DX12_RESOURCE_BARRIER::UAV(ReducedScratchBufferInfo.pResource); + pCommandList->ResourceBarrier(1, barriers); + } + + // Sort Scan + { + // First do scan prefix of reduced values + pCommandList->SetComputeRootDescriptorTable(9, ReducedScratchBufferInfo.resourceGPUHandle); + 
pCommandList->SetComputeRootDescriptorTable(10, ReducedScratchBufferInfo.resourceGPUHandle); + + pCommandList->SetPipelineState(m_pFPSScanPipeline); + if (!bIndirectDispatch) + { + assert(NumReducedThreadgroupsToRun < FFX_PARALLELSORT_ELEMENTS_PER_THREAD * FFX_PARALLELSORT_THREADGROUP_SIZE && "Need to account for bigger reduced histogram scan"); + } + pCommandList->Dispatch(1, 1, 1); + + // UAV barrier on the reduced sum table + barriers[0] = CD3DX12_RESOURCE_BARRIER::UAV(ReducedScratchBufferInfo.pResource); + pCommandList->ResourceBarrier(1, barriers); + + // Next do scan prefix on the histogram with partial sums that we just did + pCommandList->SetComputeRootDescriptorTable(9, ScratchBufferInfo.resourceGPUHandle); + pCommandList->SetComputeRootDescriptorTable(10, ScratchBufferInfo.resourceGPUHandle); + pCommandList->SetComputeRootDescriptorTable(11, ReducedScratchBufferInfo.resourceGPUHandle); + + pCommandList->SetPipelineState(m_pFPSScanAddPipeline); + if (bIndirectDispatch) + { + pCommandList->ExecuteIndirect(m_pFPSCommandSignature, 1, m_IndirectReduceScanArgs.GetResource(), 0, nullptr, 0); + } + else + { + pCommandList->Dispatch(NumReducedThreadgroupsToRun, 1, 1); + } + } + + // UAV barrier on the sum table + barriers[0] = CD3DX12_RESOURCE_BARRIER::UAV(ScratchBufferInfo.pResource); + pCommandList->ResourceBarrier(1, barriers); + + if (bHasPayload) + { + pCommandList->SetComputeRootDescriptorTable(4, ReadPayloadBufferInfo->resourceGPUHandle); // ScrPayload + pCommandList->SetComputeRootDescriptorTable(8, WritePayloadBufferInfo->resourceGPUHandle); // DstPayload + } + + pCommandList->SetComputeRootDescriptorTable(7, WriteBufferInfo->resourceGPUHandle); // DstBuffer + + // Sort Scatter + { + pCommandList->SetPipelineState(bHasPayload ? 
m_pFPSScatterPayloadPipeline : m_pFPSScatterPipeline); + + if (bIndirectDispatch) + { + pCommandList->ExecuteIndirect(m_pFPSCommandSignature, 1, m_IndirectCountScatterArgs.GetResource(), 0, nullptr, 0); + } + else + { + pCommandList->Dispatch(NumThreadgroupsToRun, 1, 1); + } + } + + // Finish doing everything and barrier for the next pass + int numBarriers = 0; + barriers[numBarriers++] = CD3DX12_RESOURCE_BARRIER::UAV(WriteBufferInfo->pResource); + if (bHasPayload) + barriers[numBarriers++] = CD3DX12_RESOURCE_BARRIER::UAV(WritePayloadBufferInfo->pResource); + pCommandList->ResourceBarrier(numBarriers, barriers); + + // Swap read/write sources + std::swap(ReadBufferInfo, WriteBufferInfo); + if (bHasPayload) + std::swap(ReadPayloadBufferInfo, WritePayloadBufferInfo); + } + + // When we are all done, transition indirect buffers back to UAV for the next frame (if doing indirect dispatch) + if (bIndirectDispatch) + { + barriers[0] = CD3DX12_RESOURCE_BARRIER::Transition(m_IndirectCountScatterArgs.GetResource(), D3D12_RESOURCE_STATE_INDIRECT_ARGUMENT, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + barriers[1] = CD3DX12_RESOURCE_BARRIER::Transition(m_IndirectReduceScanArgs.GetResource(), D3D12_RESOURCE_STATE_INDIRECT_ARGUMENT, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + barriers[2] = CD3DX12_RESOURCE_BARRIER::Transition(m_IndirectConstantBuffer.GetResource(), D3D12_RESOURCE_STATE_VERTEX_AND_CONSTANT_BUFFER, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + pCommandList->ResourceBarrier(3, barriers); + } + + // Do we need to validate the results? 
If so, create a read back buffer to use for this frame #ifdef DEVELOPERMODE - if (m_UIValidateSortResults && !isBenchmarking) - { - CreateValidationResources(pCommandList, &KeySrcInfo); - // Only do this for 1 frame - m_UIValidateSortResults = false; - } + if (m_UIValidateSortResults && !isBenchmarking) + { + CreateValidationResources(pCommandList, &KeySrcInfo); + // Only do this for 1 frame + m_UIValidateSortResults = false; + } #endif // DEVELOPERMODE - } - - void FFXParallelSort::DrawGui() - { - if (ImGui::CollapsingHeader("FidelityFX Parallel Sort", ImGuiTreeNodeFlags_DefaultOpen)) - { - static const char* ResolutionSizeStrings[] = { "1920x1080", "2560x1440", "3840x2160" }; - - ImVec2 textSize = ImGui::CalcTextSize("3840x2160"); - if (KeySetOverride < 0) - { - ImGui::PushItemWidth(textSize.x * 2); - ImGui::Combo("Sort Buffer Resolution", &m_UIResolutionSize, ResolutionSizeStrings, _countof(ResolutionSizeStrings)); - ImGui::PopItemWidth(); - } - - ImGui::Checkbox("Sort Payload", &m_UISortPayload); - ImGui::Checkbox("Use Indirect Execution", &m_UIIndirectSort); +} + +// Render Parallel Sort related GUI +void FFXParallelSort::DrawGui() +{ + if (ImGui::CollapsingHeader("FFX Parallel Sort", ImGuiTreeNodeFlags_DefaultOpen)) + { + static const char* ResolutionSizeStrings[] = { "1920x1080", "2560x1440", "3840x2160" }; + + ImVec2 textSize = ImGui::CalcTextSize("3840x2160"); + if (KeySetOverride < 0) + { + ImGui::PushItemWidth(textSize.x * 2); + ImGui::Combo("Sort Buffer Resolution", &m_UIResolutionSize, ResolutionSizeStrings, _countof(ResolutionSizeStrings)); + ImGui::PopItemWidth(); + } + + ImGui::Checkbox("Sort Payload", &m_UISortPayload); + ImGui::Checkbox("Use Indirect Execution", &m_UIIndirectSort); #ifdef DEVELOPERMODE - ImGui::Checkbox("Validate Sort Results", &m_UIValidateSortResults); + if (ImGui::Button("Validate Sort Results")) + m_UIValidateSortResults = true; #endif // DEVELOPERMODE - ImGui::RadioButton("Render Unsorted Keys", &m_UIVisualOutput, 0); - 
ImGui::RadioButton("Render Sorted Keys", &m_UIVisualOutput, 1); - } - } - - void FFXParallelSort::DrawVisualization(ID3D12GraphicsCommandList* pCommandList, uint32_t RTWidth, uint32_t RTHeight) - { - // Setup the constant buffer - ParallelSortRenderCB ConstantBuffer; - ConstantBuffer.Width = RTWidth; - ConstantBuffer.Height = RTHeight; - static const uint32_t SortWidths[] = { 1920, 2560, 3840 }; - static const uint32_t SortHeights[] = { 1080, 1440, 2160 }; - ConstantBuffer.SortWidth = SortWidths[m_UIResolutionSize]; - ConstantBuffer.SortHeight = SortHeights[m_UIResolutionSize]; - - // Bind root signature and descriptor heaps - ID3D12DescriptorHeap* pDescriptorHeap = m_pResourceViewHeaps->GetCBV_SRV_UAVHeap(); - pCommandList->SetDescriptorHeaps(1, &pDescriptorHeap); - pCommandList->SetGraphicsRootSignature(m_pRenderRootSignature); - - // Bind constant buffer - D3D12_GPU_VIRTUAL_ADDRESS GPUCB = m_pConstantBufferRing->AllocConstantBuffer(sizeof(ParallelSortRenderCB), &ConstantBuffer); - pCommandList->SetGraphicsRootConstantBufferView(0, GPUCB); - - // If we are showing unsorted values, need to transition the source data buffer from copy source to UAV and back - if (!m_UIVisualOutput) - { - CD3DX12_RESOURCE_BARRIER barrier = CD3DX12_RESOURCE_BARRIER::Transition(m_SrcKeyBuffers[m_UIResolutionSize].GetResource(), D3D12_RESOURCE_STATE_COPY_SOURCE, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); - pCommandList->ResourceBarrier(1, &barrier); - pCommandList->SetGraphicsRootDescriptorTable(1, m_SrcKeyUAVTable.GetGPU(m_UIResolutionSize)); - } - else - pCommandList->SetGraphicsRootDescriptorTable(1, m_DstKeyUAVTable.GetGPU(0)); - - // Bind validation texture - pCommandList->SetGraphicsRootDescriptorTable(2, m_ValidateTextureUAV.GetGPU(m_UIResolutionSize)); - - D3D12_VIEWPORT vp = {}; - vp.Width = (float)RTWidth; - vp.Height = (float)RTHeight; - vp.MinDepth = 0.0f; - vp.MaxDepth = 1.0f; - vp.TopLeftX = vp.TopLeftY = 0.0f; - pCommandList->RSSetViewports(1, &vp); - - // Set the shader and 
dispatch - pCommandList->IASetVertexBuffers(0, 0, nullptr); - pCommandList->IASetPrimitiveTopology(D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST); - pCommandList->SetPipelineState(m_pRenderResultVerificationPipeline); - pCommandList->DrawInstanced(3, 1, 0, 0); - - // If we are showing unsorted values, need to transition the source data buffer from copy source to UAV and back - if (!m_UIVisualOutput) - { - CD3DX12_RESOURCE_BARRIER barrier = CD3DX12_RESOURCE_BARRIER::Transition(m_SrcKeyBuffers[m_UIResolutionSize].GetResource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE); - pCommandList->ResourceBarrier(1, &barrier); - } - } -} \ No newline at end of file + ImGui::RadioButton("Render Unsorted Keys", &m_UIVisualOutput, 0); + ImGui::RadioButton("Render Sorted Keys", &m_UIVisualOutput, 1); + } +} + +// Renders the image with the sorted/unsorted indicies for visual representation +void FFXParallelSort::DrawVisualization(ID3D12GraphicsCommandList* pCommandList, uint32_t RTWidth, uint32_t RTHeight) +{ + // Setup the constant buffer + ParallelSortRenderCB ConstantBuffer; + ConstantBuffer.Width = RTWidth; + ConstantBuffer.Height = RTHeight; + static const uint32_t SortWidths[] = { 1920, 2560, 3840 }; + static const uint32_t SortHeights[] = { 1080, 1440, 2160 }; + ConstantBuffer.SortWidth = SortWidths[m_UIResolutionSize]; + ConstantBuffer.SortHeight = SortHeights[m_UIResolutionSize]; + + // Bind root signature and descriptor heaps + ID3D12DescriptorHeap* pDescriptorHeap = m_pResourceViewHeaps->GetCBV_SRV_UAVHeap(); + pCommandList->SetDescriptorHeaps(1, &pDescriptorHeap); + pCommandList->SetGraphicsRootSignature(m_pRenderRootSignature); + + // Bind constant buffer + D3D12_GPU_VIRTUAL_ADDRESS GPUCB = m_pConstantBufferRing->AllocConstantBuffer(sizeof(ParallelSortRenderCB), &ConstantBuffer); + pCommandList->SetGraphicsRootConstantBufferView(0, GPUCB); + + // If we are showing unsorted values, need to transition the source data buffer from copy source to UAV and 
back + if (!m_UIVisualOutput) + { + CD3DX12_RESOURCE_BARRIER barrier = CD3DX12_RESOURCE_BARRIER::Transition(m_SrcKeyBuffers[m_UIResolutionSize].GetResource(), D3D12_RESOURCE_STATE_COPY_SOURCE, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + pCommandList->ResourceBarrier(1, &barrier); + pCommandList->SetGraphicsRootDescriptorTable(1, m_SrcKeyUAVTable.GetGPU(m_UIResolutionSize)); + } + else + pCommandList->SetGraphicsRootDescriptorTable(1, m_DstKeyUAVTable.GetGPU(0)); + + // Bind validation texture + pCommandList->SetGraphicsRootDescriptorTable(2, m_ValidateTextureSRV.GetGPU(m_UIResolutionSize)); + + D3D12_VIEWPORT vp = {}; + vp.Width = (float)RTWidth; + vp.Height = (float)RTHeight; + vp.MinDepth = 0.0f; + vp.MaxDepth = 1.0f; + vp.TopLeftX = vp.TopLeftY = 0.0f; + pCommandList->RSSetViewports(1, &vp); + + // Set the shader and dispatch + pCommandList->IASetVertexBuffers(0, 0, nullptr); + pCommandList->IASetPrimitiveTopology(D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST); + pCommandList->SetPipelineState(m_pRenderResultVerificationPipeline); + pCommandList->DrawInstanced(3, 1, 0, 0); + + // If we are showing unsorted values, need to transition the source data buffer from copy source to UAV and back + if (!m_UIVisualOutput) + { + CD3DX12_RESOURCE_BARRIER barrier = CD3DX12_RESOURCE_BARRIER::Transition(m_SrcKeyBuffers[m_UIResolutionSize].GetResource(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE); + pCommandList->ResourceBarrier(1, &barrier); + } +} diff --git a/sample/src/DX12/ParallelSort.h b/sample/src/DX12/ParallelSort.h index 96d8861..c778191 100644 --- a/sample/src/DX12/ParallelSort.h +++ b/sample/src/DX12/ParallelSort.h @@ -1,17 +1,17 @@ // ParallelSort.h // -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. +// Copyright(c) 2021 Advanced Micro Devices, Inc.All rights reserved. 
// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal +// of this software and associated documentation files(the "Software"), to deal // in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell // copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions : // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN @@ -20,135 +20,130 @@ #pragma once #include +using namespace CAULDRON_DX12; + // Uncomment the following line to enable developer mode which compiles in data verification mechanism //#define DEVELOPERMODE -#define SORT_BITS_PER_PASS 4 -#define SORT_BIN_COUNT (1 << SORT_BITS_PER_PASS) -#define THREADGROUP_SIZE 64 -#define ELEMENTS_PER_THREAD 4 // (256 / THREADGROUP_SIZE) -#define ITEMS_PER_WI 16 -#define INV_ITEMS_PER_WI 1/16 - -struct ParallelSortRenderCB // If you change this, also change struct ParallelSortRenderCB in ParallelSortVerify.hlsl +struct ParallelSortRenderCB // If you change this, also change struct ParallelSortRenderCB in ParallelSortVerify.hlsl { - int32_t Width; - int32_t Height; - int32_t SortWidth; - int32_t SortHeight; + int32_t Width; + int32_t Height; + int32_t SortWidth; + int32_t SortHeight; }; // Convenience struct for passing resource/UAV pairs around typedef struct RdxDX12ResourceInfo { - ID3D12Resource* pResource; ///< Pointer to the resource -- used for barriers and syncs (must NOT be nullptr) - D3D12_GPU_DESCRIPTOR_HANDLE resourceGPUHandle; ///< The GPU Descriptor Handle to use for binding the resource + ID3D12Resource* pResource; ///< Pointer to the resource -- used for barriers and syncs (must NOT be nullptr) + D3D12_GPU_DESCRIPTOR_HANDLE resourceGPUHandle; ///< The GPU Descriptor Handle to use for binding the resource } RdxDX12ResourceInfo; namespace CAULDRON_DX12 { - class Device; - class ResourceViewHeaps; - class DynamicBufferRing; - class StaticBufferPool; - - class FFXParallelSort - { - public: - void OnCreate(Device* pDevice, ResourceViewHeaps* pResourceViewHeaps, DynamicBufferRing* 
pConstantBufferRing, UploadHeap* pUploadHeap, SwapChain* pSwapChain); - void OnDestroy(); - - void Draw(ID3D12GraphicsCommandList* pCommandList, bool isBenchmarking, float benchmarkTime); + class Device; + class ResourceViewHeaps; + class DynamicBufferRing; + class StaticBufferPool; +} + +class FFXParallelSort +{ +public: + void OnCreate(Device* pDevice, ResourceViewHeaps* pResourceViewHeaps, DynamicBufferRing* pConstantBufferRing, UploadHeap* pUploadHeap, SwapChain* pSwapChain); + void OnDestroy(); + + void Sort(ID3D12GraphicsCommandList* pCommandList, bool isBenchmarking, float benchmarkTime); #ifdef DEVELOPERMODE - void WaitForValidationResults(); + void WaitForValidationResults(); #endif // DEVELOPERMODE - void CopySourceDataForFrame(ID3D12GraphicsCommandList* pCommandList); - void DrawGui(); - void DrawVisualization(ID3D12GraphicsCommandList* pCommandList, uint32_t RTWidth, uint32_t RTHeight); - - // Temp -- For command line overrides - static void OverrideKeySet(int ResolutionOverride); - static void OverridePayload(); - // Temp -- For command line overrides - - private: - void CreateKeyPayloadBuffers(); - void CompileRadixPipeline(const char* shaderFile, const DefineList* defines, const char* entryPoint, ID3D12PipelineState*& pPipeline); + void CopySourceDataForFrame(ID3D12GraphicsCommandList* pCommandList); + void DrawGui(); + void DrawVisualization(ID3D12GraphicsCommandList* pCommandList, uint32_t RTWidth, uint32_t RTHeight); + + // Temp -- For command line overrides + static void OverrideKeySet(int ResolutionOverride); + static void OverridePayload(); + // Temp -- For command line overrides + +private: + void CreateKeyPayloadBuffers(); + void CompileRadixPipeline(const char* shaderFile, const DefineList* defines, const char* entryPoint, ID3D12PipelineState*& pPipeline); #ifdef DEVELOPERMODE - void CreateValidationResources(ID3D12GraphicsCommandList* pCommandList, RdxDX12ResourceInfo* pKeyDstInfo); + void 
CreateValidationResources(ID3D12GraphicsCommandList* pCommandList, RdxDX12ResourceInfo* pKeyDstInfo); #endif // DEVELOPERMODE - // Temp -- For command line overrides - static int KeySetOverride; - static bool PayloadOverride; - // Temp -- For command line overrides - - Device* m_pDevice = nullptr; - UploadHeap* m_pUploadHeap = nullptr; - ResourceViewHeaps* m_pResourceViewHeaps = nullptr; - DynamicBufferRing* m_pConstantBufferRing = nullptr; - uint32_t m_MaxNumThreadgroups = 320; // Use a generic thread group size when not on AMD hardware (taken from experiments to determine best performance threshold) - - // Sample resources - CAULDRON_DX12::Texture m_SrcKeyBuffers[3]; // 32 bit source key buffers (for 1080, 2K, 4K resolution) - CAULDRON_DX12::CBV_SRV_UAV m_SrcKeyUAVTable; // 32 bit source key UAVs (for 1080, 2K, 4K resolution) - - CAULDRON_DX12::Texture m_SrcPayloadBuffers; // 32 bit source payload buffers - CAULDRON_DX12::CBV_SRV_UAV m_SrcPayloadUAV; // 32 bit source payload UAVs - - CAULDRON_DX12::Texture m_DstKeyBuffers[2]; // 32 bit destination key buffers (when not doing in place writes) - CAULDRON_DX12::CBV_SRV_UAV m_DstKeyUAVTable; // 32 bit destination key UAVs - - CAULDRON_DX12::Texture m_DstPayloadBuffers[2]; // 32 bit destination payload buffers (when not doing in place writes) - CAULDRON_DX12::CBV_SRV_UAV m_DstPayloadUAVTable; // 32 bit destination payload UAVs - - // Resources for parallel sort algorithm - CAULDRON_DX12::Texture m_FPSScratchBuffer; // Sort scratch buffer - CAULDRON_DX12::CBV_SRV_UAV m_FPSScratchUAV; // UAV needed for sort scratch buffer - CAULDRON_DX12::Texture m_FPSReducedScratchBuffer; // Sort reduced scratch buffer - CAULDRON_DX12::CBV_SRV_UAV m_FPSReducedScratchUAV; // UAV needed for sort reduced scratch buffer - - ID3D12RootSignature* m_pFPSRootSignature = nullptr; - ID3D12PipelineState* m_pFPSCountPipeline = nullptr; - ID3D12PipelineState* m_pFPSCountReducePipeline = nullptr; - ID3D12PipelineState* m_pFPSScanPipeline = nullptr; 
- ID3D12PipelineState* m_pFPSScanAddPipeline = nullptr; - ID3D12PipelineState* m_pFPSScatterPipeline = nullptr; - ID3D12PipelineState* m_pFPSScatterPayloadPipeline = nullptr; - - // Resources for indirect execution of algorithm - CAULDRON_DX12::Texture m_IndirectKeyCounts; // Buffer to hold num keys for indirect dispatch - CAULDRON_DX12::CBV_SRV_UAV m_IndirectKeyCountsUAV; // UAV needed for num keys buffer - CAULDRON_DX12::Texture m_IndirectConstantBuffer; // Buffer to hold radix sort constant buffer data for indirect dispatch - CAULDRON_DX12::CBV_SRV_UAV m_IndirectConstantBufferUAV; // UAV needed for indirect constant buffer - CAULDRON_DX12::Texture m_IndirectCountScatterArgs; // Buffer to hold dispatch arguments used for Count/Scatter parts of the algorithm - CAULDRON_DX12::CBV_SRV_UAV m_IndirectCountScatterArgsUAV; // UAV needed for count/scatter args buffer - CAULDRON_DX12::Texture m_IndirectReduceScanArgs; // Buffer to hold dispatch arguments used for Reduce/Scan parts of the algorithm - CAULDRON_DX12::CBV_SRV_UAV m_IndirectReduceScanArgsUAV; // UAV needed for reduce/scan args buffer - - ID3D12CommandSignature* m_pFPSCommandSignature; - ID3D12PipelineState* m_pFPSIndirectSetupParametersPipeline = nullptr; - - // Resources for verification render - ID3D12RootSignature* m_pRenderRootSignature = nullptr; - ID3D12PipelineState* m_pRenderResultVerificationPipeline = nullptr; - CAULDRON_DX12::Texture m_Validate4KTexture; - CAULDRON_DX12::Texture m_Validate2KTexture; - CAULDRON_DX12::Texture m_Validate1080pTexture; - CAULDRON_DX12::CBV_SRV_UAV m_ValidateTextureUAV; - - // For correctness validation - ID3D12Resource* m_ReadBackBufferResource; // For sort validation - ID3D12Fence* m_ReadBackFence; // To know when we can check sort results - HANDLE m_ReadBackFenceEvent; + // Temp -- For command line overrides + static int KeySetOverride; + static bool PayloadOverride; + // Temp -- For command line overrides + + Device* m_pDevice = nullptr; + UploadHeap* m_pUploadHeap = 
nullptr; + ResourceViewHeaps* m_pResourceViewHeaps = nullptr; + DynamicBufferRing* m_pConstantBufferRing = nullptr; + uint32_t m_MaxNumThreadgroups = 320; // Use a generic thread group size when not on AMD hardware (taken from experiments to determine best performance threshold) + + // Sample resources + Texture m_SrcKeyBuffers[3]; // 32 bit source key buffers (for 1080, 2K, 4K resolution) + CBV_SRV_UAV m_SrcKeyUAVTable; // 32 bit source key UAVs (for 1080, 2K, 4K resolution) + + Texture m_SrcPayloadBuffers; // 32 bit source payload buffers + CBV_SRV_UAV m_SrcPayloadUAV; // 32 bit source payload UAVs + + Texture m_DstKeyBuffers[2]; // 32 bit destination key buffers (when not doing in place writes) + CBV_SRV_UAV m_DstKeyUAVTable; // 32 bit destination key UAVs + + Texture m_DstPayloadBuffers[2]; // 32 bit destination payload buffers (when not doing in place writes) + CBV_SRV_UAV m_DstPayloadUAVTable; // 32 bit destination payload UAVs + + // Resources for parallel sort algorithm + Texture m_FPSScratchBuffer; // Sort scratch buffer + CBV_SRV_UAV m_FPSScratchUAV; // UAV needed for sort scratch buffer + Texture m_FPSReducedScratchBuffer; // Sort reduced scratch buffer + CBV_SRV_UAV m_FPSReducedScratchUAV; // UAV needed for sort reduced scratch buffer + + ID3D12RootSignature* m_pFPSRootSignature = nullptr; + ID3D12PipelineState* m_pFPSCountPipeline = nullptr; + ID3D12PipelineState* m_pFPSCountReducePipeline = nullptr; + ID3D12PipelineState* m_pFPSScanPipeline = nullptr; + ID3D12PipelineState* m_pFPSScanAddPipeline = nullptr; + ID3D12PipelineState* m_pFPSScatterPipeline = nullptr; + ID3D12PipelineState* m_pFPSScatterPayloadPipeline = nullptr; + + // Resources for indirect execution of algorithm + Texture m_IndirectKeyCounts; // Buffer to hold num keys for indirect dispatch + CBV_SRV_UAV m_IndirectKeyCountsUAV; // UAV needed for num keys buffer + Texture m_IndirectConstantBuffer; // Buffer to hold radix sort constant buffer data for indirect dispatch + CBV_SRV_UAV 
m_IndirectConstantBufferUAV; // UAV needed for indirect constant buffer + Texture m_IndirectCountScatterArgs; // Buffer to hold dispatch arguments used for Count/Scatter parts of the algorithm + CBV_SRV_UAV m_IndirectCountScatterArgsUAV; // UAV needed for count/scatter args buffer + Texture m_IndirectReduceScanArgs; // Buffer to hold dispatch arguments used for Reduce/Scan parts of the algorithm + CBV_SRV_UAV m_IndirectReduceScanArgsUAV; // UAV needed for reduce/scan args buffer + + ID3D12CommandSignature* m_pFPSCommandSignature; + ID3D12PipelineState* m_pFPSIndirectSetupParametersPipeline = nullptr; + + // Resources for verification render + ID3D12RootSignature* m_pRenderRootSignature = nullptr; + ID3D12PipelineState* m_pRenderResultVerificationPipeline = nullptr; + Texture m_Validate4KTexture; + Texture m_Validate2KTexture; + Texture m_Validate1080pTexture; + CBV_SRV_UAV m_ValidateTextureSRV; + + // For correctness validation + ID3D12Resource* m_ReadBackBufferResource; // For sort validation + ID3D12Fence* m_ReadBackFence; // To know when we can check sort results + HANDLE m_ReadBackFenceEvent; #ifdef DEVELOPERMODE - bool m_UIValidateSortResults = false; // Validate the results + bool m_UIValidateSortResults = false; // Validate the results #endif // DEVELOPERMODE - // Options for UI and test to run - int m_UIResolutionSize = 0; - bool m_UISortPayload = false; - bool m_UIIndirectSort = false; - int m_UIVisualOutput = 0; - }; -} \ No newline at end of file + // Options for UI and test to run + int m_UIResolutionSize = 0; + bool m_UISortPayload = false; + bool m_UIIndirectSort = false; + int m_UIVisualOutput = 0; +}; \ No newline at end of file diff --git a/sample/src/DX12/UI.cpp b/sample/src/DX12/UI.cpp new file mode 100644 index 0000000..df1c8d8 --- /dev/null +++ b/sample/src/DX12/UI.cpp @@ -0,0 +1,174 @@ +// AMD SampleDX12 sample code +// +// Copyright(c) 2021 Advanced Micro Devices, Inc.All rights reserved. 
+// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#include "UI.h" +#include "Sample.h" +#include "imgui.h" +#include "base/FrameworkWindows.h" + +// To use the 'disabled UI state' functionality (ImGuiItemFlags_Disabled), include internal header +// https://github.com/ocornut/imgui/issues/211#issuecomment-339241929 +#include "imgui_internal.h" +static void DisableUIStateBegin(const bool& bEnable) +{ + if (!bEnable) + { + ImGui::PushItemFlag(ImGuiItemFlags_Disabled, true); + ImGui::PushStyleVar(ImGuiStyleVar_Alpha, ImGui::GetStyle().Alpha * 0.5f); + } +}; +static void DisableUIStateEnd(const bool& bEnable) +{ + if (!bEnable) + { + ImGui::PopItemFlag(); + ImGui::PopStyleVar(); + } +}; + +void Sample::BuildUI() +{ + ImGuiIO& io = ImGui::GetIO(); + ImGuiStyle& style = ImGui::GetStyle(); + style.FrameBorderSize = 1.0f; + + const uint32_t W = this->GetWidth(); + const uint32_t H = this->GetHeight(); + + const uint32_t PROFILER_WINDOW_PADDING_X = 10; + const uint32_t PROFILER_WINDOW_PADDING_Y = 10; + const uint32_t PROFILER_WINDOW_SIZE_X = 330; + const uint32_t PROFILER_WINDOW_SIZE_Y = 450; + const uint32_t PROFILER_WINDOW_POS_X = W - PROFILER_WINDOW_PADDING_X - PROFILER_WINDOW_SIZE_X; + const uint32_t PROFILER_WINDOW_POS_Y = PROFILER_WINDOW_PADDING_Y; + + const uint32_t CONTROLS_WINDOW_POS_X = 10; + const uint32_t CONTROLS_WINDOW_POS_Y = 10; + const uint32_t CONTROLW_WINDOW_SIZE_X = 350; + const uint32_t CONTROLW_WINDOW_SIZE_Y = 780; // assuming > 720p + + // Render CONTROLS window + // + ImGui::SetNextWindowPos(ImVec2(CONTROLS_WINDOW_POS_X, CONTROLS_WINDOW_POS_Y), ImGuiCond_FirstUseEver); + ImGui::SetNextWindowSize(ImVec2(CONTROLW_WINDOW_SIZE_X, CONTROLW_WINDOW_SIZE_Y), ImGuiCond_FirstUseEver); + + if (m_UIState.bShowControlsWindow) + { + ImGui::Begin("CONTROLS (F1)", &m_UIState.bShowControlsWindow); + + // Render UI for Radix Sort + m_pRenderer->RenderParallelSortUI(); + + ImGui::Spacing(); + ImGui::Spacing(); + + if (ImGui::CollapsingHeader("Presentation Mode", ImGuiTreeNodeFlags_DefaultOpen)) + { + const char* 
fullscreenModes[] = { "Windowed", "BorderlessFullscreen", "ExclusiveFullscreen" }; + if (ImGui::Combo("Fullscreen Mode", (int*)&m_fullscreenMode, fullscreenModes, _countof(fullscreenModes))) + { + if (m_previousFullscreenMode != m_fullscreenMode) + { + HandleFullScreen(); + m_previousFullscreenMode = m_fullscreenMode; + } + } + } + + ImGui::End(); // CONTROLS + } + + // Render PROFILER window + // + if (m_UIState.bShowProfilerWindow) + { + constexpr size_t NUM_FRAMES = 128; + static float FRAME_TIME_ARRAY[NUM_FRAMES] = { 0 }; + + // track highest frame rate and determine the max value of the graph based on the measured highest value + static float RECENT_HIGHEST_FRAME_TIME = 0.0f; + constexpr int FRAME_TIME_GRAPH_MAX_FPS[] = { 800, 240, 120, 90, 60, 45, 30, 15, 10, 5, 4, 3, 2, 1 }; + static float FRAME_TIME_GRAPH_MAX_VALUES[_countof(FRAME_TIME_GRAPH_MAX_FPS)] = { 0 }; // us + for (int i = 0; i < _countof(FRAME_TIME_GRAPH_MAX_FPS); ++i) { FRAME_TIME_GRAPH_MAX_VALUES[i] = 1000000.f / FRAME_TIME_GRAPH_MAX_FPS[i]; } + + //scrolling data and average FPS computing + const std::vector& timeStamps = m_pRenderer->GetTimingValues(); + const bool bTimeStampsAvailable = timeStamps.size() > 0; + if (bTimeStampsAvailable) + { + RECENT_HIGHEST_FRAME_TIME = 0; + FRAME_TIME_ARRAY[NUM_FRAMES - 1] = timeStamps.back().m_microseconds; + for (uint32_t i = 0; i < NUM_FRAMES - 1; i++) + { + FRAME_TIME_ARRAY[i] = FRAME_TIME_ARRAY[i + 1]; + } + RECENT_HIGHEST_FRAME_TIME = max(RECENT_HIGHEST_FRAME_TIME, FRAME_TIME_ARRAY[NUM_FRAMES - 1]); + } + const float& frameTime_us = FRAME_TIME_ARRAY[NUM_FRAMES - 1]; + const float frameTime_ms = frameTime_us * 0.001f; + const int fps = bTimeStampsAvailable ? 
static_cast(1000000.0f / frameTime_us) : 0; + + // UI + ImGui::SetNextWindowPos(ImVec2((float)PROFILER_WINDOW_POS_X, (float)PROFILER_WINDOW_POS_Y), ImGuiCond_FirstUseEver); + ImGui::SetNextWindowSize(ImVec2(PROFILER_WINDOW_SIZE_X, PROFILER_WINDOW_SIZE_Y), ImGuiCond_FirstUseEver); + ImGui::Begin("PROFILER (F2)", &m_UIState.bShowProfilerWindow); + + ImGui::Text("Resolution : %ix%i", m_Width, m_Height); + ImGui::Text("API : %s", m_systemInfo.mGfxAPI.c_str()); + ImGui::Text("GPU : %s", m_systemInfo.mGPUName.c_str()); + ImGui::Text("CPU : %s", m_systemInfo.mCPUName.c_str()); + ImGui::Text("FPS : %d (%.2f ms)", fps, frameTime_ms); + + if (ImGui::CollapsingHeader("GPU Timings", ImGuiTreeNodeFlags_DefaultOpen)) + { + std::string msOrUsButtonText = m_UIState.bShowMilliseconds ? "Switch to microseconds" : "Switch to milliseconds"; + if (ImGui::Button(msOrUsButtonText.c_str())) { + m_UIState.bShowMilliseconds = !m_UIState.bShowMilliseconds; + } + ImGui::Spacing(); + + // find the index of the FrameTimeGraphMaxValue as the next higher-than-recent-highest-frame-time in the pre-determined value list + size_t iFrameTimeGraphMaxValue = 0; + for (int i = 0; i < _countof(FRAME_TIME_GRAPH_MAX_VALUES); ++i) + { + if (RECENT_HIGHEST_FRAME_TIME < FRAME_TIME_GRAPH_MAX_VALUES[i]) // FRAME_TIME_GRAPH_MAX_VALUES are in increasing order + { + iFrameTimeGraphMaxValue = min(_countof(FRAME_TIME_GRAPH_MAX_VALUES) - 1, i + 1); + break; + } + } + ImGui::PlotLines("", FRAME_TIME_ARRAY, NUM_FRAMES, 0, "GPU frame time (us)", 0.0f, FRAME_TIME_GRAPH_MAX_VALUES[iFrameTimeGraphMaxValue], ImVec2(0, 80)); + + for (uint32_t i = 0; i < timeStamps.size(); i++) + { + float value = m_UIState.bShowMilliseconds ? timeStamps[i].m_microseconds / 1000.0f : timeStamps[i].m_microseconds; + const char* pStrUnit = m_UIState.bShowMilliseconds ? 
"ms" : "us"; + ImGui::Text("%-18s: %7.2f %s", timeStamps[i].m_label.c_str(), value, pStrUnit); + } + } + ImGui::End(); // PROFILER + } +} + +void UIState::Initialize() +{ + // init GUI state + this->bShowControlsWindow = true; + this->bShowProfilerWindow = true; +} diff --git a/sample/src/DX12/UI.h b/sample/src/DX12/UI.h new file mode 100644 index 0000000..f76cba3 --- /dev/null +++ b/sample/src/DX12/UI.h @@ -0,0 +1,41 @@ +// AMD SampleDX12 sample code +// +// Copyright(c) 2021 Advanced Micro Devices, Inc.All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#pragma once + +#include "PostProc/MagnifierPS.h" +#include + +struct UIState +{ + // + // WINDOW MANAGEMENT + // + bool bShowControlsWindow; + bool bShowProfilerWindow; + + // + // PROFILER CONTROLS + // + bool bShowMilliseconds; + + // ----------------------------------------------- + + void Initialize(); +}; \ No newline at end of file diff --git a/sample/src/DX12/samplerenderer.cpp b/sample/src/DX12/renderer.cpp similarity index 65% rename from sample/src/DX12/samplerenderer.cpp rename to sample/src/DX12/renderer.cpp index bc8bc3e..b08f594 100644 --- a/sample/src/DX12/samplerenderer.cpp +++ b/sample/src/DX12/renderer.cpp @@ -1,25 +1,24 @@ // samplerenderer.cpp // -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. +// Copyright(c) 2021 Advanced Micro Devices, Inc.All rights reserved. // Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal +// of this software and associated documentation files(the "Software"), to deal // in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell // copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions : // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. -#include "stdafx.h" - -#include "SampleRenderer.h" +#include "Renderer.h" +#include "UI.h" //-------------------------------------------------------------------------------------- @@ -27,7 +26,7 @@ // OnCreate // //-------------------------------------------------------------------------------------- -void SampleRenderer::OnCreate(Device* pDevice, SwapChain *pSwapChain) +void Renderer::OnCreate(Device* pDevice, SwapChain* pSwapChain, float FontSize) { m_pDevice = pDevice; @@ -40,38 +39,36 @@ void SampleRenderer::OnCreate(Device* pDevice, SwapChain *pSwapChain) const uint32_t dsvDescriptorCount = 10; const uint32_t rtvDescriptorCount = 60; const uint32_t samplerDescriptorCount = 20; - m_resourceViewHeaps.OnCreate(pDevice, cbvDescriptorCount, srvDescriptorCount, uavDescriptorCount, dsvDescriptorCount, rtvDescriptorCount, samplerDescriptorCount); + m_ResourceViewHeaps.OnCreate(pDevice, cbvDescriptorCount, srvDescriptorCount, uavDescriptorCount, dsvDescriptorCount, rtvDescriptorCount, samplerDescriptorCount); // Create a commandlist ring for the Direct queue uint32_t commandListsPerBackBuffer = 8; - m_CommandListRing.OnCreate(pDevice, backBufferCount, commandListsPerBackBuffer, pDevice->GetGraphicsQueue()->GetDesc()); + m_CommandListRing.OnCreate(pDevice, BackBufferCount, commandListsPerBackBuffer, pDevice->GetGraphicsQueue()->GetDesc()); // Create a 'dynamic' constant buffer const uint32_t constantBuffersMemSize = 20 * 1024 * 1024; - m_ConstantBufferRing.OnCreate(pDevice, backBufferCount, constantBuffersMemSize, &m_resourceViewHeaps); + m_ConstantBufferRing.OnCreate(pDevice, BackBufferCount, constantBuffersMemSize, 
&m_ResourceViewHeaps); // Create a 'static' pool for vertices, indices and constant buffers const uint32_t staticGeometryMemSize = (2 * 128) * 1024 * 1024; - m_VidMemBufferPool.OnCreate(pDevice, staticGeometryMemSize, USE_VID_MEM, "StaticGeom"); + m_VidMemBufferPool.OnCreate(pDevice, staticGeometryMemSize, true, "StaticGeom"); // initialize the GPU time stamps module - m_GPUTimer.OnCreate(pDevice, backBufferCount); + m_GPUTimer.OnCreate(pDevice, BackBufferCount); // Quick helper to upload resources, it has it's own commandList and uses sub-allocation. const uint32_t uploadHeapMemSize = 100 * 1024 * 1024; m_UploadHeap.OnCreate(pDevice, uploadHeapMemSize); // initialize an upload heap (uses sub-allocation for faster results) // Initialize UI rendering resources - m_ImGUI.OnCreate(pDevice, &m_UploadHeap, &m_resourceViewHeaps, &m_ConstantBufferRing, pSwapChain->GetFormat()); + m_ImGUI.OnCreate(pDevice, &m_UploadHeap, &m_ResourceViewHeaps, &m_ConstantBufferRing, pSwapChain->GetFormat(), FontSize); // Create FFX Parallel Sort pass - m_FPS.OnCreate(pDevice, &m_resourceViewHeaps, &m_ConstantBufferRing, &m_UploadHeap, pSwapChain); + m_ParallelSort.OnCreate(pDevice, &m_ResourceViewHeaps, &m_ConstantBufferRing, &m_UploadHeap, pSwapChain); // Make sure upload heap has finished uploading before continuing -#if (USE_VID_MEM==true) m_VidMemBufferPool.UploadData(m_UploadHeap.GetCommandList()); m_UploadHeap.FlushAndFinish(); -#endif } //-------------------------------------------------------------------------------------- @@ -79,17 +76,16 @@ void SampleRenderer::OnCreate(Device* pDevice, SwapChain *pSwapChain) // OnDestroy // //-------------------------------------------------------------------------------------- -void SampleRenderer::OnDestroy() +void Renderer::OnDestroy() { - m_FPS.OnDestroy(); - + m_ParallelSort.OnDestroy(); m_ImGUI.OnDestroy(); m_UploadHeap.OnDestroy(); m_GPUTimer.OnDestroy(); m_VidMemBufferPool.OnDestroy(); m_ConstantBufferRing.OnDestroy(); - 
m_resourceViewHeaps.OnDestroy(); + m_ResourceViewHeaps.OnDestroy(); m_CommandListRing.OnDestroy(); } @@ -98,19 +94,14 @@ void SampleRenderer::OnDestroy() // OnCreateWindowSizeDependentResources // //-------------------------------------------------------------------------------------- -void SampleRenderer::OnCreateWindowSizeDependentResources(SwapChain *pSwapChain, uint32_t Width, uint32_t Height) +void Renderer::OnCreateWindowSizeDependentResources(SwapChain *pSwapChain, uint32_t Width, uint32_t Height) { m_Width = Width; m_Height = Height; - // Set the viewport - m_viewport = { 0.0f, 0.0f, static_cast(Width), static_cast(Height), 0.0f, 1.0f }; - - // Create scissor rectangle - m_rectScissor = { 0, 0, (LONG)Width, (LONG)Height }; - - // Create depth buffer - m_depthBuffer.InitDepthStencil(m_pDevice, "depthbuffer", &CD3DX12_RESOURCE_DESC::Tex2D(DXGI_FORMAT_R32_TYPELESS, Width, Height, 1, 1, 4, 0, D3D12_RESOURCE_FLAG_ALLOW_DEPTH_STENCIL | D3D12_RESOURCE_FLAG_DENY_SHADER_RESOURCE)); + // Set the viewport & scissors rect + m_Viewport = { 0.0f, 0.0f, static_cast(Width), static_cast(Height), 0.0f, 1.0f }; + m_RectScissor = { 0, 0, (LONG)Width, (LONG)Height }; } //-------------------------------------------------------------------------------------- @@ -118,9 +109,14 @@ void SampleRenderer::OnCreateWindowSizeDependentResources(SwapChain *pSwapChain, // OnDestroyWindowSizeDependentResources // //-------------------------------------------------------------------------------------- -void SampleRenderer::OnDestroyWindowSizeDependentResources() +void Renderer::OnDestroyWindowSizeDependentResources() { - m_depthBuffer.OnDestroy(); +} + +void Renderer::OnUpdateDisplayDependentResources(SwapChain* pSwapChain) +{ + // Update pipelines in case the format of the RTs changed (this happens when going HDR) + m_ImGUI.UpdatePipeline(pSwapChain->GetFormat()); } //-------------------------------------------------------------------------------------- @@ -128,7 +124,7 @@ void 
SampleRenderer::OnDestroyWindowSizeDependentResources() // OnRender // //-------------------------------------------------------------------------------------- -void SampleRenderer::OnRender(State* pState, SwapChain* pSwapChain) +void Renderer::OnRender(const UIState* pState, SwapChain* pSwapChain, float Time, bool bIsBenchmarking) { // Timing values UINT64 gpuTicksPerSecond; @@ -139,19 +135,17 @@ void SampleRenderer::OnRender(State* pState, SwapChain* pSwapChain) m_ConstantBufferRing.OnBeginFrame(); m_GPUTimer.OnBeginFrame(gpuTicksPerSecond, &m_TimeStamps); - m_GPUTimer.GetTimeStampUser({ "time (s)", pState->time }); - // command buffer calls ID3D12GraphicsCommandList* pCmdLst1 = m_CommandListRing.GetNewCommandList(); - pCmdLst1->ResourceBarrier(1, &CD3DX12_RESOURCE_BARRIER::Transition(pSwapChain->GetCurrentBackBufferResource(), D3D12_RESOURCE_STATE_PRESENT, D3D12_RESOURCE_STATE_RENDER_TARGET)); + pCmdLst1->ResourceBarrier(1, &CD3DX12_RESOURCE_BARRIER::Transition(pSwapChain->GetCurrentBackBufferResource(), D3D12_RESOURCE_STATE_PRESENT, D3D12_RESOURCE_STATE_RENDER_TARGET)); // Copy the data to sort for the frame (don't time this -- external to process) - m_FPS.CopySourceDataForFrame(pCmdLst1); + m_ParallelSort.CopySourceDataForFrame(pCmdLst1); m_GPUTimer.GetTimeStamp(pCmdLst1, "Begin Frame"); // Do sort tests ----------------------------------------------------------------------- - m_FPS.Draw(pCmdLst1, pState->m_isBenchmarking, pState->time); - m_GPUTimer.GetTimeStamp(pCmdLst1, "Parallel Sort"); + m_ParallelSort.Sort(pCmdLst1, bIsBenchmarking, Time); + m_GPUTimer.GetTimeStamp(pCmdLst1, "FFX Parallel Sort"); // submit command buffer #1 ThrowIfFailed(pCmdLst1->Close()); @@ -160,29 +154,31 @@ void SampleRenderer::OnRender(State* pState, SwapChain* pSwapChain) // Check against parallel sort validation if needed (just returns if not needed) #ifdef DEVELOPERMODE - m_FPS.WaitForValidationResults(); + m_ParallelSort.WaitForValidationResults(); #endif // DEVELOPERMODE // 
Wait for swapchain (we are going to render to it) ----------------------------------- pSwapChain->WaitForSwapChain(); ID3D12GraphicsCommandList* pCmdLst2 = m_CommandListRing.GetNewCommandList(); - pCmdLst2->RSSetViewports(1, &m_viewport); - pCmdLst2->RSSetScissorRects(1, &m_rectScissor); - pCmdLst2->OMSetRenderTargets(1, pSwapChain->GetCurrentBackBufferRTV(), true, nullptr); + pCmdLst2->RSSetViewports(1, &m_Viewport); + pCmdLst2->RSSetScissorRects(1, &m_RectScissor); + pCmdLst2->OMSetRenderTargets(1, pSwapChain->GetCurrentBackBufferRTV(), true, nullptr); float clearColor[4] = { 0, 0, 0, 0 }; pCmdLst2->ClearRenderTargetView(*pSwapChain->GetCurrentBackBufferRTV(), clearColor, 0, nullptr); - // Render sort source/results over everything except the HUD -------------------------- - m_FPS.DrawVisualization(pCmdLst2, m_Width, m_Height); - - // Render HUD - m_ImGUI.Draw(pCmdLst2); - m_GPUTimer.GetTimeStamp(pCmdLst2, "ImGUI Rendering"); + // Render sort source/results over everything except the HUD -------------------------- + m_ParallelSort.DrawVisualization(pCmdLst2, m_Width, m_Height); + + // Render HUD ------------------------------------------------------------------------ + { + m_ImGUI.Draw(pCmdLst2); + m_GPUTimer.GetTimeStamp(pCmdLst2, "ImGUI Rendering"); + } - if (pState->m_pScreenShotName != nullptr) + if (!m_pScreenShotName.empty()) { - m_saveTexture.CopyRenderTargetIntoStagingTexture(m_pDevice->GetDevice(), pCmdLst2, pSwapChain->GetCurrentBackBufferResource(), D3D12_RESOURCE_STATE_RENDER_TARGET); + m_SaveTexture.CopyRenderTargetIntoStagingTexture(m_pDevice->GetDevice(), pCmdLst2, pSwapChain->GetCurrentBackBufferResource(), D3D12_RESOURCE_STATE_RENDER_TARGET); } // Transition swap chain into present mode @@ -196,9 +192,10 @@ void SampleRenderer::OnRender(State* pState, SwapChain* pSwapChain) ID3D12CommandList* CmdListList2[] = { pCmdLst2 }; m_pDevice->GetGraphicsQueue()->ExecuteCommandLists(1, CmdListList2); - if (pState->m_pScreenShotName != nullptr) + // Handle 
screenshot request + if (!m_pScreenShotName.empty()) { - m_saveTexture.SaveStagingTextureAsJpeg(m_pDevice->GetDevice(), m_pDevice->GetGraphicsQueue(), pState->m_pScreenShotName->c_str()); - pState->m_pScreenShotName = nullptr; + m_SaveTexture.SaveStagingTextureAsJpeg(m_pDevice->GetDevice(), m_pDevice->GetGraphicsQueue(), m_pScreenShotName.c_str()); + m_pScreenShotName.clear(); } } diff --git a/sample/src/DX12/renderer.h b/sample/src/DX12/renderer.h new file mode 100644 index 0000000..bb2b24d --- /dev/null +++ b/sample/src/DX12/renderer.h @@ -0,0 +1,80 @@ +// samplerenderer.h +// +// Copyright(c) 2021 Advanced Micro Devices, Inc.All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#pragma once + +#include "stdafx.h" +#include "PostProc/MagnifierPS.h" + +struct UIState; + +// We are queuing (BackBufferCount + 0.5) frames, so we need to triple buffer the resources that get modified each frame +static const int BackBufferCount = 3; + +using namespace CAULDRON_DX12; + +// +// This class deals with the GPU side of the sample. +class Renderer +{ +public: + + void OnCreate(Device* pDevice, SwapChain* pSwapChain, float FontSize); + void OnDestroy(); + + void OnCreateWindowSizeDependentResources(SwapChain* pSwapChain, uint32_t Width, uint32_t Height); + void OnDestroyWindowSizeDependentResources(); + + void OnUpdateDisplayDependentResources(SwapChain* pSwapChain); + + const std::vector& GetTimingValues() const { return m_TimeStamps; } + std::string& GetScreenshotFileName() { return m_pScreenShotName; } + + void OnRender(const UIState *pState, SwapChain *pSwapChain, float Time, bool bIsBenchmarking); + + void RenderParallelSortUI() { m_ParallelSort.DrawGui(); } + +private: + Device* m_pDevice; + + uint32_t m_Width; + uint32_t m_Height; + D3D12_VIEWPORT m_Viewport; + D3D12_RECT m_RectScissor; + + // Initialize helper classes + ResourceViewHeaps m_ResourceViewHeaps; + UploadHeap m_UploadHeap; + DynamicBufferRing m_ConstantBufferRing; + StaticBufferPool m_VidMemBufferPool; + CommandListRing m_CommandListRing; + GPUTimestamps m_GPUTimer; + + FFXParallelSort m_ParallelSort; + + // GUI + ImGUI m_ImGUI; + + // For benchmarking + std::vector m_TimeStamps; + + // screen shot + std::string m_pScreenShotName = ""; + SaveTexture m_SaveTexture; +}; diff --git a/sample/src/DX12/sample.cpp b/sample/src/DX12/sample.cpp index efb0008..83bae25 100644 --- a/sample/src/DX12/sample.cpp +++ b/sample/src/DX12/sample.cpp @@ -1,17 +1,17 @@ // sample.cpp // -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. +// Copyright(c) 2021 Advanced Micro Devices, Inc.All rights reserved. 
// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal +// of this software and associated documentation files(the "Software"), to deal // in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell // copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions : // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN @@ -23,83 +23,66 @@ #include #include -// Uncomment to generate a pix capture on first frame -//#define GENERATE_PIXCAPTURE - -#ifdef GENERATE_PIXCAPTURE -// Uncomment to enable PIX capture of first frame -#include -#include -#include - -Microsoft::WRL::ComPtr ga; -#endif // GENERATE_PIXCAPTURE - -Sample::Sample(LPCSTR name) : FrameworkWindows(name) -{ - m_lastFrameTime = MillisecondsNow(); - m_time = 0; - m_bPlay = true; -} - //-------------------------------------------------------------------------------------- // // OnParseCommandLine // //-------------------------------------------------------------------------------------- -void Sample::OnParseCommandLine(LPSTR lpCmdLine, uint32_t* pWidth, uint32_t* pHeight, bool *pbFullScreen) +void Sample::OnParseCommandLine(LPSTR lpCmdLine, uint32_t* pWidth, uint32_t* pHeight) { // set some default values - *pWidth = 1920; - *pHeight = 1080; - *pbFullScreen = false; - m_state.m_isBenchmarking = false; + *pWidth = 1920; + *pHeight = 1080; + m_VsyncEnabled = false; + m_bIsBenchmarking = false; + m_fontSize = 13.f; // default value overridden by a json file if available m_isCpuValidationLayerEnabled = false; m_isGpuValidationLayerEnabled = false; m_stablePowerState = false; - //read globals + // Read globals auto process = [&](json jData) { *pWidth = jData.value("width", *pWidth); *pHeight = jData.value("height", *pHeight); - *pbFullScreen = jData.value("fullScreen", *pbFullScreen); + m_fullscreenMode = jData.value("presentationMode", m_fullscreenMode); m_isCpuValidationLayerEnabled = jData.value("CpuValidationLayerEnabled", m_isCpuValidationLayerEnabled); m_isGpuValidationLayerEnabled = 
jData.value("GpuValidationLayerEnabled", m_isGpuValidationLayerEnabled); - m_state.m_isBenchmarking = jData.value("benchmark", m_state.m_isBenchmarking); + m_VsyncEnabled = jData.value("vsync", m_VsyncEnabled); + m_bIsBenchmarking = jData.value("benchmark", m_bIsBenchmarking); m_stablePowerState = jData.value("stablePowerState", m_stablePowerState); + m_fontSize = jData.value("fontsize", m_fontSize); }; - // read config file (and override values from commandline if so) - // - { - std::ifstream f("FFXParallelSort.json"); - if (!f) - { - MessageBox(nullptr, "Config file not found!\n", "Cauldron Panic!", MB_ICONERROR); - exit(0); - } - - try - { - f >> m_jsonConfigFile; - } - catch (json::parse_error) - { - MessageBox(nullptr, "Error parsing FFXParallelSort.json!\n", "Cauldron Panic!", MB_ICONERROR); - exit(0); - } - } - - json globals = m_jsonConfigFile["globals"]; - process(globals); + // Read config file (and override values from commandline if so) + { + std::ifstream f("FFXParallelSort.json"); + if (!f) + { + MessageBox(nullptr, "Config file not found!\n", "Cauldron Panic!", MB_ICONERROR); + exit(0); + } + + try + { + f >> m_jsonConfigFile; + } + catch (json::parse_error) + { + MessageBox(nullptr, "Error parsing FFXParallelSort.json!\n", "Cauldron Panic!", MB_ICONERROR); + exit(0); + } + } + + json globals = m_jsonConfigFile["globals"]; + process(globals); // Process the command line to see if we need to do anything for the sample (i.e. benchmarking, setup certain settings, etc.) std::string charString = lpCmdLine; if (!charString.compare("")) return; // No parameters - // Need to first convert the char string to a wide character set (Note: Why aren't all strings wide in the framework)? 
+ // Need to first convert the char string to a wide character set std::wstring wideString; wideString.assign(charString.begin(), charString.end()); @@ -113,7 +96,7 @@ void Sample::OnParseCommandLine(LPSTR lpCmdLine, uint32_t* pWidth, uint32_t* pHe // Enable benchmarking if (!wideString.compare(L"-benchmark")) { - m_state.m_isBenchmarking = true; + m_bIsBenchmarking = true; ++CurrentArg; } @@ -148,46 +131,31 @@ void Sample::OnParseCommandLine(LPSTR lpCmdLine, uint32_t* pWidth, uint32_t* pHe // OnCreate // //-------------------------------------------------------------------------------------- -void Sample::OnCreate(HWND hWnd) +void Sample::OnCreate() { - // Create Device - m_device.OnCreate("myapp", "myEngine", m_isCpuValidationLayerEnabled, m_isGpuValidationLayerEnabled, hWnd); - m_device.CreatePipelineCache(); - - // set stable power state - if (m_stablePowerState) - m_device.GetDevice()->SetStablePowerState(TRUE); - - //init the shader compiler + // Init the shader compiler InitDirectXCompiler(); CreateShaderCache(); - // Create Swapchain - uint32_t dwNumberOfBackBuffers = 2; - m_swapChain.OnCreate(&m_device, dwNumberOfBackBuffers, hWnd); - // Create a instance of the renderer and initialize it, we need to do that for each GPU - m_Node = new SampleRenderer(); - m_Node->OnCreate(&m_device, &m_swapChain); - - // init GUI (non gfx stuff) - ImGUI_Init((void *)hWnd); - - if (m_state.m_isBenchmarking) - { - std::string deviceName; - std::string driverVersion; - m_device.GetDeviceInfo(&deviceName, &driverVersion); - BenchmarkConfig(m_jsonConfigFile["BenchmarkSettings"], -1, nullptr, deviceName, driverVersion); - } - -#ifdef GENERATE_PIXCAPTURE - // Uncomment to enable PIX capture of first frame - HRESULT hr = DXGIGetDebugInterface1(0, IID_PPV_ARGS(&ga)); - // hr will be E_NOINTERFACE if not attached for GPU capture - if (hr == E_NOINTERFACE) - ga = nullptr; -#endif // GENERATE_PIXCAPTURE + m_pRenderer = new Renderer(); + m_pRenderer->OnCreate(&m_device, &m_swapChain, 
m_fontSize); + + // set benchmarking state if enabled + if (m_bIsBenchmarking) + { + std::string deviceName; + std::string driverVersion; + m_device.GetDeviceInfo(&deviceName, &driverVersion); + BenchmarkConfig(m_jsonConfigFile["BenchmarkSettings"], -1, nullptr, deviceName, driverVersion); + } + + // Init GUI (non gfx stuff) + ImGUI_Init((void*)m_windowHwnd); + m_UIState.Initialize(); + + OnResize(); + OnUpdateDisplay(); } //-------------------------------------------------------------------------------------- @@ -201,21 +169,13 @@ void Sample::OnDestroy() m_device.GPUFlush(); - // Fullscreen state should always be false before exiting the app. - m_swapChain.SetFullScreen(false); - - m_Node->OnDestroyWindowSizeDependentResources(); - m_Node->OnDestroy(); + m_pRenderer->OnDestroyWindowSizeDependentResources(); + m_pRenderer->OnDestroy(); - delete m_Node; - - m_swapChain.OnDestroyWindowSizeDependentResources(); - m_swapChain.OnDestroy(); + delete m_pRenderer; //shut down the shader compiler DestroyShaderCache(&m_device); - - m_device.OnDestroy(); } //-------------------------------------------------------------------------------------- @@ -223,108 +183,86 @@ void Sample::OnDestroy() // OnEvent, win32 sends us events and we forward them to ImGUI // //-------------------------------------------------------------------------------------- +static void ToggleBool(bool& b) { b = !b; } bool Sample::OnEvent(MSG msg) { if (ImGUI_WndProcHandler(msg.hwnd, msg.message, msg.wParam, msg.lParam)) return true; + // handle function keys (F1, F2...) 
here, rest of the input is handled + // by imGUI later in HandleInput() function + const WPARAM& KeyPressed = msg.wParam; + switch (msg.message) + { + case WM_KEYUP: + case WM_SYSKEYUP: + /* WINDOW TOGGLES */ + if (KeyPressed == VK_F1) m_UIState.bShowControlsWindow ^= 1; + if (KeyPressed == VK_F2) m_UIState.bShowProfilerWindow ^= 1; + break; + } + return true; } //-------------------------------------------------------------------------------------- // -// SetFullScreen +// OnResize // //-------------------------------------------------------------------------------------- -void Sample::SetFullScreen(bool fullscreen) +void Sample::OnResize() { - m_device.GPUFlush(); - m_swapChain.SetFullScreen(fullscreen); + // Destroy resources (if we are not minimized) + if (m_Width && m_Height && m_pRenderer) + { + m_pRenderer->OnDestroyWindowSizeDependentResources(); + m_pRenderer->OnCreateWindowSizeDependentResources(&m_swapChain, m_Width, m_Height); + } } //-------------------------------------------------------------------------------------- // -// OnResize +// UpdateDisplay // //-------------------------------------------------------------------------------------- -void Sample::OnResize(uint32_t width, uint32_t height, DisplayModes displayMode) +void Sample::OnUpdateDisplay() { - if (m_Width != width || m_Height != height) + // Destroy resources (if we are not minimized) + if (m_pRenderer) { - // Flush GPU - m_device.GPUFlush(); - - // destroy resources (if were not minimized) - if (m_Width > 0 && m_Height > 0) - { - if (m_Node!=nullptr) - { - m_Node->OnDestroyWindowSizeDependentResources(); - } - m_swapChain.OnDestroyWindowSizeDependentResources(); - } - - m_Width = width; - m_Height = height; - - // if resizing but not minimizing the recreate it with the new size - if (m_Width > 0 && m_Height > 0) - { - m_swapChain.OnCreateWindowSizeDependentResources(m_Width, m_Height, false, displayMode); - if (m_Node != nullptr) - { - 
m_Node->OnCreateWindowSizeDependentResources(&m_swapChain, m_Width, m_Height); - } - } + m_pRenderer->OnUpdateDisplayDependentResources(&m_swapChain); } } //-------------------------------------------------------------------------------------- // -// BuildUI, all UI code should be here +// OnUpdate // //-------------------------------------------------------------------------------------- -void Sample::BuildUI() +void Sample::OnUpdate() { - ImGuiStyle& style = ImGui::GetStyle(); - style.FrameBorderSize = 1.0f; - - ImGui::SetNextWindowPos(ImVec2(10, 10), ImGuiCond_FirstUseEver); - ImGui::SetNextWindowSize(ImVec2(250, 700), ImGuiCond_FirstUseEver); - - bool opened = true; - ImGui::Begin("Stats", &opened); + ImGuiIO& io = ImGui::GetIO(); - if (ImGui::CollapsingHeader("Info", ImGuiTreeNodeFlags_DefaultOpen)) + //If the mouse was not used by the GUI then it's for the camera + if (io.WantCaptureMouse) { - ImGui::Text("Resolution : %ix%i", m_Width, m_Height); + io.MouseDelta.x = 0; + io.MouseDelta.y = 0; + io.MouseWheel = 0; } - // Render UI for Radix Sort - this->m_Node->RenderParallelSortUI(); + // Keyboard & Mouse + HandleInput(io); - if (ImGui::CollapsingHeader("Profiler", ImGuiTreeNodeFlags_DefaultOpen)) - { - std::vector timeStamps = m_Node->GetTimingValues(); - if (timeStamps.size() > 0) - { - for (uint32_t i = 0; i < timeStamps.size(); i++) - { - ImGui::Text("%-22s: %7.1f", timeStamps[i].m_label.c_str(), timeStamps[i].m_microseconds); - } - - //scrolling data and average computing - static float values[128]; - values[127] = timeStamps.back().m_microseconds; - for (uint32_t i = 0; i < 128 - 1; i++) { values[i] = values[i + 1]; } - ImGui::PlotLines("", values, 128, 0, "GPU frame time (us)", 0.0f, 30000.0f, ImVec2(0, 80)); - } - } + // Increase time + m_time += (float)m_deltaTime / 1000.0f; // time in seconds +} - ImGui::End(); +void Sample::HandleInput(const ImGuiIO& io) +{ + auto fnIsKeyTriggered = [&io](char key) { return io.KeysDown[key] && 
io.KeysDownDuration[key] == 0.0f; }; - // Process I/O - ImGuiIO& io = ImGui::GetIO(); + // Handle Keyboard/Mouse input here } //-------------------------------------------------------------------------------------- @@ -334,63 +272,30 @@ void Sample::BuildUI() //-------------------------------------------------------------------------------------- void Sample::OnRender() { - // Get timings - // - double timeNow = MillisecondsNow(); - float deltaTime = (float)(timeNow - m_lastFrameTime); - m_lastFrameTime = timeNow; + // Do any start of frame necessities + BeginFrame(); ImGUI_UpdateIO(); ImGui::NewFrame(); - if (m_state.m_isBenchmarking) + if (m_bIsBenchmarking) { - // benchmarking takes control of the time, and exits the app when the animation is done - std::vector timeStamps = m_Node->GetTimingValues(); - m_time = BenchmarkLoop(timeStamps, nullptr, &m_state.m_pScreenShotName); + // Benchmarking takes control of the time, and exits the app when the animation is done + std::vector timeStamps = m_pRenderer->GetTimingValues(); + m_time = BenchmarkLoop(timeStamps, nullptr, m_pRenderer->GetScreenshotFileName()); } else { // Build the UI. Note that the rendering of the UI happens later. 
BuildUI(); - - // Set animation time - // - if (m_bPlay) - { - m_time += (float)deltaTime / 1000.0f; - } + OnUpdate(); } - // Update time - m_state.time = m_time; - -#ifdef GENERATE_PIXCAPTURE - // Uncomment to enable PIX capture of first frame - static uint32_t frameID = 0; - if (!frameID) - { - // Use renderdoc or PIX to take a capture of the first frame if enabled/attached - if (ga) - ga->BeginCapture(); + // Do Render frame using AFR + m_pRenderer->OnRender(&m_UIState, &m_swapChain, m_time, m_bIsBenchmarking); - } -#endif // GENERATE_PIXCAPTURE - - // Do Render frame using AFR - m_Node->OnRender(&m_state, &m_swapChain); - m_swapChain.Present(); - -#ifdef GENERATE_PIXCAPTURE - // Uncomment to enable PIX capture of first frame - if (!frameID) - { - if (ga) - ga->EndCapture(); - - frameID++; - } -#endif // GENERATE_PIXCAPTURE + // Framework will handle Present and some other end of frame logic + EndFrame(); } @@ -404,7 +309,7 @@ int WINAPI WinMain(HINSTANCE hInstance, LPSTR lpCmdLine, int nCmdShow) { - LPCSTR Name = "FidelityFX Parallel Sort v1.0"; + LPCSTR Name = "FidelityFX Parallel Sort DX12 v1.1"; // create new DX sample return RunFramework(hInstance, lpCmdLine, nCmdShow, new Sample(Name)); diff --git a/sample/src/DX12/sample.h b/sample/src/DX12/sample.h index 8d450d7..5862b89 100644 --- a/sample/src/DX12/sample.h +++ b/sample/src/DX12/sample.h @@ -1,17 +1,17 @@ // sample.h // -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. +// Copyright(c) 2021 Advanced Micro Devices, Inc.All rights reserved. 
// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal +// of this software and associated documentation files(the "Software"), to deal // in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell // copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions : // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN @@ -19,52 +19,38 @@ #pragma once -#include "SampleRenderer.h" +#include "base/FrameworkWindows.h" +#include "Renderer.h" +#include "UI.h" -// -// This is the main class, it manages the state of the sample and does all the high level work without touching the GPU directly. -// This class uses the GPU via the the SampleRenderer class. We would have a SampleRenderer instance for each GPU. 
-// -// This class takes care of: -// -// - loading a scene (just the CPU data) -// - updating the camera -// - keeping track of time -// - handling the keyboard -// - updating the animation -// - building the UI (but do not renders it) -// - uses the SampleRenderer to update all the state to the GPU and do the rendering -// +// This class encapsulates the 'application' and is responsible for handling window events and scene updates (simulation) +// Rendering and rendering resource management is done by the Renderer class class Sample : public FrameworkWindows { public: - Sample(LPCSTR name); - void OnParseCommandLine(LPSTR lpCmdLine, uint32_t* pWidth, uint32_t* pHeight, bool* pbFullScreen); - void OnCreate(HWND hWnd); - void OnDestroy(); - void BuildUI(); - void OnRender(); - bool OnEvent(MSG msg); - void OnResize(uint32_t Width, uint32_t Height) { OnResize(Width, Height, DISPLAYMODE_SDR); } - void OnResize(uint32_t Width, uint32_t Height, DisplayModes displayMode); - void SetFullScreen(bool fullscreen); + Sample(LPCSTR name) : FrameworkWindows(name) { m_time = 0.f; } + void OnParseCommandLine(LPSTR lpCmdLine, uint32_t* pWidth, uint32_t* pHeight) override; + void OnCreate() override; + void OnDestroy() override; + void OnRender() override; + bool OnEvent(MSG msg) override; + void OnResize() override; + void OnUpdateDisplay() override; + + void BuildUI(); + void OnUpdate(); + void HandleInput(const ImGuiIO& io); private: - Device m_device; - SwapChain m_swapChain; + // Benchmarking support + bool m_bIsBenchmarking; + float m_time; - SampleRenderer* m_Node = nullptr; - SampleRenderer::State m_state; + Renderer* m_pRenderer = NULL; + UIState m_UIState; + float m_fontSize; - float m_time; // WallClock in seconds. 
- double m_lastFrameTime; - - // json config file - json m_jsonConfigFile; - bool m_stablePowerState; - bool m_isCpuValidationLayerEnabled; - bool m_isGpuValidationLayerEnabled; - - bool m_bPlay; + // json config file + json m_jsonConfigFile; }; diff --git a/sample/src/DX12/stdafx.cpp b/sample/src/DX12/stdafx.cpp index 9f1b280..541c2a6 100644 --- a/sample/src/DX12/stdafx.cpp +++ b/sample/src/DX12/stdafx.cpp @@ -1,17 +1,17 @@ // stdafx.cpp // -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. +// Copyright(c) 2021 Advanced Micro Devices, Inc.All rights reserved. // Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal +// of this software and associated documentation files(the "Software"), to deal // in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell // copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions : // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN diff --git a/sample/src/DX12/stdafx.h b/sample/src/DX12/stdafx.h index c6c160e..d61d7ec 100644 --- a/sample/src/DX12/stdafx.h +++ b/sample/src/DX12/stdafx.h @@ -1,17 +1,17 @@ // stdafx.h // -// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. +// Copyright(c) 2021 Advanced Micro Devices, Inc.All rights reserved. // Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal +// of this software and associated documentation files(the "Software"), to deal // in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell // copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions : // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN @@ -41,6 +41,7 @@ using namespace DirectX; #include "Base/Imgui.h" #include "Base/ImguiHelper.h" #include "Base/Fence.h" +#include "Base/FrameworkWindows.h" #include "Base/Helper.h" #include "Base/Device.h" #include "Base/Texture.h" @@ -52,6 +53,7 @@ using namespace DirectX; #include "Base/StaticBufferPool.h" #include "Base/DynamicBufferRing.h" #include "Base/ResourceViewHeaps.h" +#include "Base/SaveTexture.h" #include "Base/ShaderCompilerHelper.h" #include "Base/StaticConstantBufferPool.h" @@ -63,7 +65,6 @@ using namespace DirectX; #include "Misc/Misc.h" #include "Misc/Error.h" #include "Misc/Camera.h" -#include "Misc/FrameworkWindows.h" #include "PostProc/TAA.h" #include "PostProc/Bloom.h" @@ -78,6 +79,7 @@ using namespace DirectX; #include "PostProc/ShadowResolvePass.h" #include "ParallelSort.h" +#include "UI.h" #include "Widgets/wireframe.h" diff --git a/sample/src/VK/CMakeLists.txt b/sample/src/VK/CMakeLists.txt new file mode 100644 index 0000000..d15ce7c --- /dev/null +++ b/sample/src/VK/CMakeLists.txt @@ -0,0 +1,51 @@ +project (${PROJECT_NAME}) + +include(${CMAKE_CURRENT_SOURCE_DIR}/../../common.cmake) + +add_compile_options(/MP) +add_compile_definitions(FFX_CPP) + +set(sources + sample.cpp + sample.h + stdafx.cpp + stdafx.h + Renderer.cpp + Renderer.h + UI.cpp + UI.h + ParallelSort.cpp + ParallelSort.h + dpiawarescaling.manifest) + +set(shader_sources + ${CMAKE_CURRENT_SOURCE_DIR}/../Common/shaders/ParallelSortCS.hlsl + ${CMAKE_CURRENT_SOURCE_DIR}/../Common/shaders/ParallelSortVerify.hlsl) + +set(fidelityfx_sources + ${CMAKE_CURRENT_SOURCE_DIR}/../../../FFX-ParallelSort/FFX_ParallelSort.h) + +set(common_sources + 
${CMAKE_CURRENT_SOURCE_DIR}/../Common/FFXParallelSort.json + ${CMAKE_CURRENT_SOURCE_DIR}/../Common/Validate4K.png + ${CMAKE_CURRENT_SOURCE_DIR}/../Common/Validate2K.png + ${CMAKE_CURRENT_SOURCE_DIR}/../Common/Validate1080p.png) + +copyCommand("${shader_sources}" ${CMAKE_HOME_DIRECTORY}/bin/ShaderLibVK) +copyCommand("${fidelityfx_sources}" ${CMAKE_HOME_DIRECTORY}/bin/ShaderLibVK/FFX-ParallelSort) +copyCommand("${common_sources}" ${CMAKE_HOME_DIRECTORY}/bin) + +source_group("Common" FILES ${common_sources}) +source_group("Shaders" FILES ${shader_sources}) +source_group("FidelityFX" FILES ${fidelityfx_sources}) +source_group("Sources" FILES ${sources}) +source_group("Icon" FILES ${icon_src}) # defined in top-level CMakeLists.txt + +# prevent VS from processing/compiling these files +set_source_files_properties(${Shaders_src} PROPERTIES VS_TOOL_OVERRIDE "Text") + +add_executable(${PROJECT_NAME} WIN32 ${common_sources} ${shader_sources} ${sources} ${fidelityfx_sources} ${icon_src}) +target_link_libraries(${PROJECT_NAME} LINK_PUBLIC Cauldron_VK ImGUI Vulkan::Vulkan) +set_target_properties(${PROJECT_NAME} PROPERTIES VS_DEBUGGER_WORKING_DIRECTORY "${CMAKE_HOME_DIRECTORY}/bin" DEBUG_POSTFIX "d") + +addManifest(${PROJECT_NAME}) \ No newline at end of file diff --git a/sample/src/VK/ParallelSort.cpp b/sample/src/VK/ParallelSort.cpp new file mode 100644 index 0000000..f9a09aa --- /dev/null +++ b/sample/src/VK/ParallelSort.cpp @@ -0,0 +1,1154 @@ +// ParallelSort.cpp +// +// Copyright(c) 2021 Advanced Micro Devices, Inc.All rights reserved. 
+// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#include "stdafx.h" +#include "../../../FFX-ParallelSort/FFX_ParallelSort.h" + +#include +#include +#include + +static const uint32_t NumKeys[] = { 1920 * 1080, 2560 * 1440, 3840 * 2160 }; + + +////////////////////////////////////////////////////////////////////////// +// For doing command-line based benchmark runs +int FFXParallelSort::KeySetOverride = -1; +void FFXParallelSort::OverrideKeySet(int ResolutionOverride) +{ + KeySetOverride = ResolutionOverride; +} +bool FFXParallelSort::PayloadOverride = false; +void FFXParallelSort::OverridePayload() +{ + PayloadOverride = true; +} +////////////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////////// +// Helper functions for Vulkan + +// Transition barrier +VkBufferMemoryBarrier BufferTransition(VkBuffer buffer, VkAccessFlags before, VkAccessFlags after, uint32_t size) +{ + VkBufferMemoryBarrier bufferBarrier = {}; + bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + bufferBarrier.srcAccessMask = before; + bufferBarrier.dstAccessMask = after; + bufferBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + bufferBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + bufferBarrier.buffer = buffer; + bufferBarrier.size = size; + + return bufferBarrier; +} + +// Constant buffer binding +void FFXParallelSort::BindConstantBuffer(VkDescriptorBufferInfo& GPUCB, VkDescriptorSet& DescriptorSet, uint32_t Binding/*=0*/, uint32_t Count/*=1*/) +{ + VkWriteDescriptorSet write_set = { VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET }; + write_set.pNext = nullptr; + write_set.dstSet = DescriptorSet; + write_set.dstBinding = Binding; + write_set.dstArrayElement = 0; + write_set.descriptorCount = Count; + write_set.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + write_set.pImageInfo = nullptr; + write_set.pBufferInfo = &GPUCB; + write_set.pTexelBufferView = nullptr; + vkUpdateDescriptorSets(m_pDevice->GetDevice(), 1, &write_set, 0, 
nullptr); +} + +// UAV Buffer binding +void FFXParallelSort::BindUAVBuffer(VkBuffer* pBuffer, VkDescriptorSet& DescriptorSet, uint32_t Binding/*=0*/, uint32_t Count/*=1*/) +{ + std::vector<VkDescriptorBufferInfo> bufferInfos; + for (uint32_t i = 0; i < Count; i++) + { + VkDescriptorBufferInfo bufferInfo; + bufferInfo.buffer = pBuffer[i]; + bufferInfo.offset = 0; + bufferInfo.range = VK_WHOLE_SIZE; + bufferInfos.push_back(bufferInfo); + } + + VkWriteDescriptorSet write_set = { VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET }; + write_set.pNext = nullptr; + write_set.dstSet = DescriptorSet; + write_set.dstBinding = Binding; + write_set.dstArrayElement = 0; + write_set.descriptorCount = Count; + write_set.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + write_set.pImageInfo = nullptr; + write_set.pBufferInfo = bufferInfos.data(); + write_set.pTexelBufferView = nullptr; + + vkUpdateDescriptorSets(m_pDevice->GetDevice(), 1, &write_set, 0, nullptr); +} +////////////////////////////////////////////////////////////////////////// + +// Create all of the sort data for the sample +void FFXParallelSort::CreateKeyPayloadBuffers() +{ + std::vector<uint32_t> KeyData1080(NumKeys[0]); + std::vector<uint32_t> KeyData2K(NumKeys[1]); + std::vector<uint32_t> KeyData4K(NumKeys[2]); + + // Populate the buffers with linear access index + std::iota(KeyData1080.begin(), KeyData1080.end(), 0); + std::iota(KeyData2K.begin(), KeyData2K.end(), 0); + std::iota(KeyData4K.begin(), KeyData4K.end(), 0); + + // Shuffle the data + std::shuffle(KeyData1080.begin(), KeyData1080.end(), std::mt19937{ std::random_device{}() }); + std::shuffle(KeyData2K.begin(), KeyData2K.end(), std::mt19937{ std::random_device{}() }); + std::shuffle(KeyData4K.begin(), KeyData4K.end(), std::mt19937{ std::random_device{}() }); + + VkBufferCreateInfo bufferCreateInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; + bufferCreateInfo.pNext = nullptr; + bufferCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + bufferCreateInfo.usage = VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | 
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT; + + VmaAllocationCreateInfo allocCreateInfo = {}; + allocCreateInfo.memoryTypeBits = 0; + allocCreateInfo.pool = VK_NULL_HANDLE; + allocCreateInfo.preferredFlags = 0; + allocCreateInfo.requiredFlags = 0; + allocCreateInfo.usage = VMA_MEMORY_USAGE_UNKNOWN; + + // 1080p + bufferCreateInfo.size = sizeof(uint32_t) * NumKeys[0]; + allocCreateInfo.pUserData = "SrcKeys1080"; + if (VK_SUCCESS != vmaCreateBuffer(m_pDevice->GetAllocator(), &bufferCreateInfo, &allocCreateInfo, &m_SrcKeyBuffers[0], &m_SrcKeyBufferAllocations[0], nullptr)) + { + Trace("Failed to create buffer for SrcKeys1080"); + } + // 2K + bufferCreateInfo.size = sizeof(uint32_t) * NumKeys[1]; + allocCreateInfo.pUserData = "SrcKeys2K"; + if (VK_SUCCESS != vmaCreateBuffer(m_pDevice->GetAllocator(), &bufferCreateInfo, &allocCreateInfo, &m_SrcKeyBuffers[1], &m_SrcKeyBufferAllocations[1], nullptr)) + { + Trace("Failed to create buffer for SrcKeys2K"); + } + // 4K + bufferCreateInfo.size = sizeof(uint32_t) * NumKeys[2]; + allocCreateInfo.pUserData = "SrcKeys4K"; + if (VK_SUCCESS != vmaCreateBuffer(m_pDevice->GetAllocator(), &bufferCreateInfo, &allocCreateInfo, &m_SrcKeyBuffers[2], &m_SrcKeyBufferAllocations[2], nullptr)) + { + Trace("Failed to create buffer for SrcKeys4K"); + } + allocCreateInfo.pUserData = "SrcPayloadBuffer"; + if (VK_SUCCESS != vmaCreateBuffer(m_pDevice->GetAllocator(), &bufferCreateInfo, &allocCreateInfo, &m_SrcPayloadBuffers, &m_SrcPayloadBufferAllocation, nullptr)) + { + Trace("Failed to create buffer for SrcPayloadBuffer"); + } + + // Clear out transfer bit on remaining buffers + bufferCreateInfo.usage &= ~VK_BUFFER_USAGE_TRANSFER_SRC_BIT; + + // The DstKey and DstPayload buffers will be used as src/dst when sorting. 
A copy of the + // source key/payload will be copied into them before hand so we can keep our original values + bufferCreateInfo.size = sizeof(uint32_t) * NumKeys[2]; + allocCreateInfo.pUserData = "DstKeyBuf0"; + if (VK_SUCCESS != vmaCreateBuffer(m_pDevice->GetAllocator(), &bufferCreateInfo, &allocCreateInfo, &m_DstKeyBuffers[0], &m_DstKeyBufferAllocations[0], nullptr)) + { + Trace("Failed to create buffer for DstKeyBuf0"); + } + + allocCreateInfo.pUserData = "DstKeyBuf1"; + if (VK_SUCCESS != vmaCreateBuffer(m_pDevice->GetAllocator(), &bufferCreateInfo, &allocCreateInfo, &m_DstKeyBuffers[1], &m_DstKeyBufferAllocations[1], nullptr)) + { + Trace("Failed to create buffer for DstKeyBuf1"); + } + + allocCreateInfo.pUserData = "DstPayloadBuf0"; + if (VK_SUCCESS != vmaCreateBuffer(m_pDevice->GetAllocator(), &bufferCreateInfo, &allocCreateInfo, &m_DstPayloadBuffers[0], &m_DstPayloadBufferAllocations[0], nullptr)) + { + Trace("Failed to create buffer for DstPayloadBuf0"); + } + + allocCreateInfo.pUserData = "DstPayloadBuf1"; + if (VK_SUCCESS != vmaCreateBuffer(m_pDevice->GetAllocator(), &bufferCreateInfo, &allocCreateInfo, &m_DstPayloadBuffers[1], &m_DstPayloadBufferAllocations[1], nullptr)) + { + Trace("Failed to create buffer for DstPayloadBuf1"); + } + + // Copy data in + VkBufferCopy copyInfo = { 0 }; + // 1080 + uint8_t* pKeyDataBuffer = m_pUploadHeap->Suballocate(NumKeys[0] * sizeof(uint32_t), sizeof(uint32_t)); + memcpy(pKeyDataBuffer, KeyData1080.data() , sizeof(uint32_t) * NumKeys[0]); + copyInfo.srcOffset = pKeyDataBuffer - m_pUploadHeap->BasePtr(); + copyInfo.size = sizeof(uint32_t) * NumKeys[0]; + vkCmdCopyBuffer(m_pUploadHeap->GetCommandList(), m_pUploadHeap->GetResource(), m_SrcKeyBuffers[0], 1, ©Info); + + // 2K + pKeyDataBuffer = m_pUploadHeap->Suballocate(NumKeys[1] * sizeof(uint32_t), sizeof(uint32_t)); + memcpy(pKeyDataBuffer, KeyData2K.data(), sizeof(uint32_t) * NumKeys[1]); + copyInfo.srcOffset = pKeyDataBuffer - m_pUploadHeap->BasePtr(); + 
copyInfo.size = sizeof(uint32_t) * NumKeys[1]; + vkCmdCopyBuffer(m_pUploadHeap->GetCommandList(), m_pUploadHeap->GetResource(), m_SrcKeyBuffers[1], 1, ©Info); + + // 4K + pKeyDataBuffer = m_pUploadHeap->Suballocate(NumKeys[2] * sizeof(uint32_t), sizeof(uint32_t)); + memcpy(pKeyDataBuffer, KeyData4K.data(), sizeof(uint32_t) * NumKeys[2]); + copyInfo.srcOffset = pKeyDataBuffer - m_pUploadHeap->BasePtr(); + copyInfo.size = sizeof(uint32_t) * NumKeys[2]; + vkCmdCopyBuffer(m_pUploadHeap->GetCommandList(), m_pUploadHeap->GetResource(), m_SrcKeyBuffers[2], 1, ©Info); + + uint8_t* pPayloadDataBuffer = m_pUploadHeap->Suballocate(NumKeys[2] * sizeof(uint32_t), sizeof(uint32_t)); + memcpy(pPayloadDataBuffer, KeyData4K.data(), sizeof(uint32_t) * NumKeys[2]); // Copy the 4k source data for payload (it doesn't matter what the payload is as we really only want it to measure cost of copying/sorting) + copyInfo.srcOffset = pPayloadDataBuffer - m_pUploadHeap->BasePtr(); + copyInfo.size = sizeof(uint32_t) * NumKeys[2]; + vkCmdCopyBuffer(m_pUploadHeap->GetCommandList(), m_pUploadHeap->GetResource(), m_SrcPayloadBuffers, 1, ©Info); + + // Once we are done copying the data, put in barriers to transition the source resources to + // copy source (which is what they will stay for the duration of app runtime) + VkBufferMemoryBarrier Barriers[6] = { BufferTransition(m_SrcKeyBuffers[2], VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT, sizeof(uint32_t) * NumKeys[2]), + BufferTransition(m_SrcPayloadBuffers, VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT, sizeof(uint32_t) * NumKeys[2]), + BufferTransition(m_SrcKeyBuffers[1], VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT, sizeof(uint32_t) * NumKeys[1]), + BufferTransition(m_SrcKeyBuffers[0], VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT, sizeof(uint32_t) * NumKeys[0]), + + // Copy the data into the dst[0] buffers for use on first frame + BufferTransition(m_DstKeyBuffers[0], VK_ACCESS_SHADER_READ_BIT | 
VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, sizeof(uint32_t) * NumKeys[2]) , + BufferTransition(m_DstPayloadBuffers[0], VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, sizeof(uint32_t) * NumKeys[2]) }; + + vkCmdPipelineBarrier(m_pUploadHeap->GetCommandList(), VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, nullptr, 6, Barriers, 0, nullptr); + + copyInfo.srcOffset = 0; + copyInfo.size = sizeof(uint32_t) * NumKeys[m_UIResolutionSize]; + vkCmdCopyBuffer(m_pUploadHeap->GetCommandList(), m_SrcKeyBuffers[m_UIResolutionSize], m_DstKeyBuffers[0], 1, ©Info); + vkCmdCopyBuffer(m_pUploadHeap->GetCommandList(), m_SrcPayloadBuffers, m_DstPayloadBuffers[0], 1, ©Info); + + // Put the dst buffers back to UAVs for sort usage + Barriers[0] = BufferTransition(m_DstKeyBuffers[0], VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, sizeof(uint32_t) * NumKeys[m_UIResolutionSize]); + Barriers[1] = BufferTransition(m_DstPayloadBuffers[0], VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, sizeof(uint32_t) * NumKeys[m_UIResolutionSize]); + vkCmdPipelineBarrier(m_pUploadHeap->GetCommandList(), VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, nullptr, 2, Barriers, 0, nullptr); +} + +// Compile specified radix sort shader and create pipeline +void FFXParallelSort::CompileRadixPipeline(const char* shaderFile, const DefineList* defines, const char* entryPoint, VkPipeline& pPipeline) +{ + std::string CompileFlags("-T cs_6_0"); +#ifdef _DEBUG + CompileFlags += " -Zi -Od"; +#endif // _DEBUG + + VkPipelineShaderStageCreateInfo stage_create_info = { VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO }; + + VkResult vkResult = VKCompileFromFile(m_pDevice->GetDevice(), VK_SHADER_STAGE_COMPUTE_BIT, shaderFile, entryPoint, "-T cs_6_0", defines, &stage_create_info); + stage_create_info.flags = 0; + assert(vkResult == 
VK_SUCCESS); + + VkComputePipelineCreateInfo create_info = { VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO }; + create_info.pNext = nullptr; + create_info.basePipelineHandle = VK_NULL_HANDLE; + create_info.basePipelineIndex = 0; + create_info.flags = 0; + create_info.layout = m_SortPipelineLayout; + create_info.stage = stage_create_info; + vkResult = vkCreateComputePipelines(m_pDevice->GetDevice(), VK_NULL_HANDLE, 1, &create_info, nullptr, &pPipeline); + assert(vkResult == VK_SUCCESS); +} + +// Parallel Sort initialization +void FFXParallelSort::OnCreate(Device* pDevice, ResourceViewHeaps* pResourceViewHeaps, DynamicBufferRing* pConstantBufferRing, UploadHeap* pUploadHeap, SwapChain* pSwapChain) +{ + m_pDevice = pDevice; + m_pUploadHeap = pUploadHeap; + m_pResourceViewHeaps = pResourceViewHeaps; + m_pConstantBufferRing = pConstantBufferRing; + m_MaxNumThreadgroups = 800; + + // Overrides for testing + if (KeySetOverride >= 0) + m_UIResolutionSize = KeySetOverride; + if (PayloadOverride) + m_UISortPayload = true; + + // Create resources to test with. 
Sorts will be done for 1080p, 2K, and 4K resolution data sets + CreateKeyPayloadBuffers(); + + VkBufferCreateInfo bufferCreateInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; + bufferCreateInfo.pNext = nullptr; + bufferCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + bufferCreateInfo.usage = VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; // | VK_BUFFER_USAGE_TRANSFER_SRC_BIT; + + VmaAllocationCreateInfo allocCreateInfo = {}; + allocCreateInfo.memoryTypeBits = 0; + allocCreateInfo.pool = VK_NULL_HANDLE; + allocCreateInfo.preferredFlags = 0; + allocCreateInfo.requiredFlags = 0; + allocCreateInfo.usage = VMA_MEMORY_USAGE_UNKNOWN; + + // We are just going to fudge the indirect execution parameters for each resolution + bufferCreateInfo.size = sizeof(uint32_t) * 3; + allocCreateInfo.pUserData = "IndirectKeyCounts"; + if (VK_SUCCESS != vmaCreateBuffer(m_pDevice->GetAllocator(), &bufferCreateInfo, &allocCreateInfo, &m_IndirectKeyCounts, &m_IndirectKeyCountsAllocation, nullptr)) + { + Trace("Failed to create buffer for IndirectKeyCounts"); + } + + VkBufferCopy copyInfo = { 0 }; + uint8_t* pNumKeysBuffer = m_pUploadHeap->Suballocate(sizeof(uint32_t) * 3, sizeof(uint32_t)); + memcpy(pNumKeysBuffer, NumKeys, sizeof(uint32_t) * 3); + copyInfo.srcOffset = pNumKeysBuffer - m_pUploadHeap->BasePtr(); + copyInfo.size = sizeof(uint32_t) * 3; + vkCmdCopyBuffer(m_pUploadHeap->GetCommandList(), m_pUploadHeap->GetResource(), m_IndirectKeyCounts, 1, ©Info); + + VkBufferMemoryBarrier barrier = BufferTransition(m_IndirectKeyCounts, VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, sizeof(uint32_t) * 3); + vkCmdPipelineBarrier(m_pUploadHeap->GetCommandList(), VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, nullptr, 1, &barrier, 0, nullptr); + + // Create resources for sort validation (image that goes from shuffled to sorted) + 
m_Validate1080pTexture.InitFromFile(m_pDevice, m_pUploadHeap, "Validate1080p.png", false,VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT); + m_Validate1080pTexture.CreateSRV(&m_ValidationImageViews[0], 0); + m_Validate2KTexture.InitFromFile(m_pDevice, m_pUploadHeap, "Validate2K.png", false, VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT); + m_Validate2KTexture.CreateSRV(&m_ValidationImageViews[1], 0); + m_Validate4KTexture.InitFromFile(m_pDevice, m_pUploadHeap, "Validate4K.png", false, VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT); + m_Validate4KTexture.CreateSRV(&m_ValidationImageViews[2], 0); + + // Finish up + m_pUploadHeap->FlushAndFinish(); + + // Allocate the scratch buffers needed for radix sort + FFX_ParallelSort_CalculateScratchResourceSize(NumKeys[2], m_ScratchBufferSize, m_ReducedScratchBufferSize); + + bufferCreateInfo.size = m_ScratchBufferSize; + allocCreateInfo.pUserData = "Scratch"; + if (VK_SUCCESS != vmaCreateBuffer(m_pDevice->GetAllocator(), &bufferCreateInfo, &allocCreateInfo, &m_FPSScratchBuffer, &m_FPSScratchBufferAllocation, nullptr)) + { + Trace("Failed to create buffer for Scratch"); + } + + bufferCreateInfo.size = m_ReducedScratchBufferSize; + allocCreateInfo.pUserData = "ReducedScratch"; + if (VK_SUCCESS != vmaCreateBuffer(m_pDevice->GetAllocator(), &bufferCreateInfo, &allocCreateInfo, &m_FPSReducedScratchBuffer, &m_FPSReducedScratchBufferAllocation, nullptr)) + { + Trace("Failed to create buffer for ReducedScratch"); + } + + // Allocate the buffers for indirect execution of the algorithm + + bufferCreateInfo.size = sizeof(uint32_t) * 3; + bufferCreateInfo.usage = VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; + allocCreateInfo.pUserData = "IndirectCount_Scatter_DispatchArgs"; + if (VK_SUCCESS != 
vmaCreateBuffer(m_pDevice->GetAllocator(), &bufferCreateInfo, &allocCreateInfo, &m_IndirectCountScatterArgs, &m_IndirectCountScatterArgsAllocation, nullptr)) + { + Trace("Failed to create buffer for IndirectCount_Scatter_DispatchArgs"); + } + + allocCreateInfo.pUserData = "IndirectReduceScanArgs"; + if (VK_SUCCESS != vmaCreateBuffer(m_pDevice->GetAllocator(), &bufferCreateInfo, &allocCreateInfo, &m_IndirectReduceScanArgs, &m_IndirectReduceScanArgsAllocation, nullptr)) + { + Trace("Failed to create buffer for IndirectCount_Scatter_DispatchArgs"); + } + + bufferCreateInfo.size = sizeof(FFX_ParallelSortCB); + bufferCreateInfo.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; + allocCreateInfo.pUserData = "IndirectConstantBuffer"; + if (VK_SUCCESS != vmaCreateBuffer(m_pDevice->GetAllocator(), &bufferCreateInfo, &allocCreateInfo, &m_IndirectConstantBuffer, &m_IndirectConstantBufferAllocation, nullptr)) + { + Trace("Failed to create buffer for IndirectConstantBuffer"); + } + + // Create Pipeline layout for Sort pass + { + // Create binding for Radix sort passes + VkDescriptorSetLayoutBinding layout_bindings_set_0[] = { + { 0, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr } // Constant buffer table + }; + + VkDescriptorSetLayoutBinding layout_bindings_set_1[] = { + { 0, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr } // Constant buffer to setup indirect params (indirect) + }; + + VkDescriptorSetLayoutBinding layout_bindings_set_InputOutputs[] = { + { 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr }, // SrcBuffer (sort) + { 1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr }, // DstBuffer (sort) + { 2, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr }, // ScrPayload (sort only) + { 3, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr }, // 
DstPayload (sort only) + }; + + VkDescriptorSetLayoutBinding layout_bindings_set_Scan[] = { + { 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr }, // ScanSrc + { 1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr }, // ScanDst + { 2, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr }, // ScanScratch + }; + + VkDescriptorSetLayoutBinding layout_bindings_set_Scratch[] = { + { 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr }, // Scratch (sort only) + { 1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr }, // Scratch (reduced) + }; + + VkDescriptorSetLayoutBinding layout_bindings_set_Indirect[] = { + { 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr }, // NumKeys (indirect) + { 1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr }, // CBufferUAV (indirect) + { 2, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr }, // CountScatterArgs (indirect) + { 3, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr } // ReduceScanArgs (indirect) + }; + + VkDescriptorSetLayoutCreateInfo descriptor_set_layout_create_info = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO }; + descriptor_set_layout_create_info.pNext = nullptr; + descriptor_set_layout_create_info.flags = 0; + descriptor_set_layout_create_info.pBindings = layout_bindings_set_0; + descriptor_set_layout_create_info.bindingCount = 1; + VkResult vkResult = vkCreateDescriptorSetLayout(m_pDevice->GetDevice(), &descriptor_set_layout_create_info, nullptr, &m_SortDescriptorSetLayoutConstants); + assert(vkResult == VK_SUCCESS); + bool bDescriptorAlloc = true; + bDescriptorAlloc &= m_pResourceViewHeaps->AllocDescriptor(m_SortDescriptorSetLayoutConstants, &m_SortDescriptorSetConstants[0]); + bDescriptorAlloc &= m_pResourceViewHeaps->AllocDescriptor(m_SortDescriptorSetLayoutConstants, &m_SortDescriptorSetConstants[1]); + bDescriptorAlloc &= 
m_pResourceViewHeaps->AllocDescriptor(m_SortDescriptorSetLayoutConstants, &m_SortDescriptorSetConstants[2]); + assert(bDescriptorAlloc == true); + + descriptor_set_layout_create_info.pBindings = layout_bindings_set_1; + descriptor_set_layout_create_info.bindingCount = 1; + vkResult = vkCreateDescriptorSetLayout(m_pDevice->GetDevice(), &descriptor_set_layout_create_info, nullptr, &m_SortDescriptorSetLayoutConstantsIndirect); + assert(vkResult == VK_SUCCESS); + bDescriptorAlloc &= m_pResourceViewHeaps->AllocDescriptor(m_SortDescriptorSetLayoutConstantsIndirect, &m_SortDescriptorSetConstantsIndirect[0]); + bDescriptorAlloc &= m_pResourceViewHeaps->AllocDescriptor(m_SortDescriptorSetLayoutConstantsIndirect, &m_SortDescriptorSetConstantsIndirect[1]); + bDescriptorAlloc &= m_pResourceViewHeaps->AllocDescriptor(m_SortDescriptorSetLayoutConstantsIndirect, &m_SortDescriptorSetConstantsIndirect[2]); + assert(bDescriptorAlloc == true); + + descriptor_set_layout_create_info.pBindings = layout_bindings_set_InputOutputs; + descriptor_set_layout_create_info.bindingCount = 4; + vkResult = vkCreateDescriptorSetLayout(m_pDevice->GetDevice(), &descriptor_set_layout_create_info, nullptr, &m_SortDescriptorSetLayoutInputOutputs); + assert(vkResult == VK_SUCCESS); + bDescriptorAlloc = m_pResourceViewHeaps->AllocDescriptor(m_SortDescriptorSetLayoutInputOutputs, &m_SortDescriptorSetInputOutput[0]); + assert(bDescriptorAlloc == true); + bDescriptorAlloc = m_pResourceViewHeaps->AllocDescriptor(m_SortDescriptorSetLayoutInputOutputs, &m_SortDescriptorSetInputOutput[1]); + assert(bDescriptorAlloc == true); + + descriptor_set_layout_create_info.pBindings = layout_bindings_set_Scan; + descriptor_set_layout_create_info.bindingCount = 3; + vkResult = vkCreateDescriptorSetLayout(m_pDevice->GetDevice(), &descriptor_set_layout_create_info, nullptr, &m_SortDescriptorSetLayoutScan); + assert(vkResult == VK_SUCCESS); + bDescriptorAlloc = 
m_pResourceViewHeaps->AllocDescriptor(m_SortDescriptorSetLayoutScan, &m_SortDescriptorSetScanSets[0]); + assert(bDescriptorAlloc == true); + bDescriptorAlloc = m_pResourceViewHeaps->AllocDescriptor(m_SortDescriptorSetLayoutScan, &m_SortDescriptorSetScanSets[1]); + assert(bDescriptorAlloc == true); + + descriptor_set_layout_create_info.pBindings = layout_bindings_set_Scratch; + descriptor_set_layout_create_info.bindingCount = 2; + vkResult = vkCreateDescriptorSetLayout(m_pDevice->GetDevice(), &descriptor_set_layout_create_info, nullptr, &m_SortDescriptorSetLayoutScratch); + assert(vkResult == VK_SUCCESS); + bDescriptorAlloc = m_pResourceViewHeaps->AllocDescriptor(m_SortDescriptorSetLayoutScratch, &m_SortDescriptorSetScratch); + assert(bDescriptorAlloc == true); + + descriptor_set_layout_create_info.pBindings = layout_bindings_set_Indirect; + descriptor_set_layout_create_info.bindingCount = 4; + vkResult = vkCreateDescriptorSetLayout(m_pDevice->GetDevice(), &descriptor_set_layout_create_info, nullptr, &m_SortDescriptorSetLayoutIndirect); + assert(vkResult == VK_SUCCESS); + bDescriptorAlloc = m_pResourceViewHeaps->AllocDescriptor(m_SortDescriptorSetLayoutIndirect, &m_SortDescriptorSetIndirect); + assert(bDescriptorAlloc == true); + + // Create constant range representing our static constant + VkPushConstantRange constant_range; + constant_range.stageFlags = VK_SHADER_STAGE_ALL; + constant_range.offset = 0; + constant_range.size = 4; + + // Create the pipeline layout (Root signature) + VkPipelineLayoutCreateInfo layout_create_info = { VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO }; + layout_create_info.pNext = nullptr; + layout_create_info.flags = 0; + layout_create_info.setLayoutCount = 6; + VkDescriptorSetLayout layouts[] = { m_SortDescriptorSetLayoutConstants, m_SortDescriptorSetLayoutConstantsIndirect, m_SortDescriptorSetLayoutInputOutputs, + m_SortDescriptorSetLayoutScan, m_SortDescriptorSetLayoutScratch, m_SortDescriptorSetLayoutIndirect }; + 
layout_create_info.pSetLayouts = layouts; + layout_create_info.pushConstantRangeCount = 1; + layout_create_info.pPushConstantRanges = &constant_range; + VkResult bCreatePipelineLayout = vkCreatePipelineLayout(m_pDevice->GetDevice(), &layout_create_info, nullptr, &m_SortPipelineLayout); + assert(bCreatePipelineLayout == VK_SUCCESS); + } + + // Create Pipeline layout for Render of RadixBuffer info + { + // Create binding for Radix sort passes + VkDescriptorSetLayoutBinding layout_bindings_set_0[] = { + { 0, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr } // Constant buffer table + }; + + VkDescriptorSetLayoutBinding layout_bindings_set_1[] = { + { 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_ALL, nullptr } // Sort Buffer + }; + + VkDescriptorSetLayoutBinding layout_bindings_set_2[] = { + { 0, VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, 1, VK_SHADER_STAGE_ALL, nullptr } // ValidationTexture + }; + + // Create descriptor set layout and descriptor set + VkDescriptorSetLayoutCreateInfo descriptor_set_layout_create_info = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO }; + descriptor_set_layout_create_info.pNext = nullptr; + descriptor_set_layout_create_info.flags = 0; + descriptor_set_layout_create_info.bindingCount = 1; + descriptor_set_layout_create_info.pBindings = layout_bindings_set_0; + VkResult vkResult = vkCreateDescriptorSetLayout(m_pDevice->GetDevice(), &descriptor_set_layout_create_info, nullptr, &m_RenderDescriptorSetLayout0); + assert(vkResult == VK_SUCCESS); + bool bDescriptorAlloc = m_pResourceViewHeaps->AllocDescriptor(m_RenderDescriptorSetLayout0, &m_RenderDescriptorSet0); + assert(bDescriptorAlloc == true); + descriptor_set_layout_create_info.pBindings = layout_bindings_set_1; + vkResult = vkCreateDescriptorSetLayout(m_pDevice->GetDevice(), &descriptor_set_layout_create_info, nullptr, &m_RenderDescriptorSetLayout1); + assert(vkResult == VK_SUCCESS); + bDescriptorAlloc &= 
m_pResourceViewHeaps->AllocDescriptor(m_RenderDescriptorSetLayout1, &m_RenderDescriptorSet1[0]); + bDescriptorAlloc &= m_pResourceViewHeaps->AllocDescriptor(m_RenderDescriptorSetLayout1, &m_RenderDescriptorSet1[1]); + bDescriptorAlloc &= m_pResourceViewHeaps->AllocDescriptor(m_RenderDescriptorSetLayout1, &m_RenderDescriptorSet1[2]); + bDescriptorAlloc &= m_pResourceViewHeaps->AllocDescriptor(m_RenderDescriptorSetLayout1, &m_RenderDescriptorSet1[3]); + assert(bDescriptorAlloc == true); + descriptor_set_layout_create_info.pBindings = layout_bindings_set_2; + vkResult = vkCreateDescriptorSetLayout(m_pDevice->GetDevice(), &descriptor_set_layout_create_info, nullptr, &m_RenderDescriptorSetLayout2); + assert(vkResult == VK_SUCCESS); + bDescriptorAlloc &= m_pResourceViewHeaps->AllocDescriptor(m_RenderDescriptorSetLayout2, &m_RenderDescriptorSet2[0]); + bDescriptorAlloc &= m_pResourceViewHeaps->AllocDescriptor(m_RenderDescriptorSetLayout2, &m_RenderDescriptorSet2[1]); + bDescriptorAlloc &= m_pResourceViewHeaps->AllocDescriptor(m_RenderDescriptorSetLayout2, &m_RenderDescriptorSet2[2]); + assert(bDescriptorAlloc == true); + + // Create the pipeline layout (Root signature) + VkPipelineLayoutCreateInfo layout_create_info = { VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO }; + layout_create_info.pNext = nullptr; + layout_create_info.flags = 0; + layout_create_info.setLayoutCount = 3; + VkDescriptorSetLayout layouts[] = { m_RenderDescriptorSetLayout0, m_RenderDescriptorSetLayout1, m_RenderDescriptorSetLayout2 }; + layout_create_info.pSetLayouts = layouts; + layout_create_info.pushConstantRangeCount = 0; + layout_create_info.pPushConstantRanges = nullptr; + VkResult bCreatePipelineLayout = vkCreatePipelineLayout(m_pDevice->GetDevice(), &layout_create_info, nullptr, &m_RenderPipelineLayout); + assert(bCreatePipelineLayout == VK_SUCCESS); + } + + ////////////////////////////////////////////////////////////////////////// + // Create pipelines for radix sort + { + // Create all of the 
necessary pipelines for Sort and Scan + + // SetupIndirectParams (indirect only) + DefineList defines; + defines["VK_Const"] = std::to_string(1); + CompileRadixPipeline("ParallelSortCS.hlsl", &defines, "FPS_SetupIndirectParameters", m_FPSIndirectSetupParametersPipeline); + + // Radix count (sum table generation) + CompileRadixPipeline("ParallelSortCS.hlsl", &defines, "FPS_Count", m_FPSCountPipeline); + // Radix count reduce (sum table reduction for offset prescan) + CompileRadixPipeline("ParallelSortCS.hlsl", &defines, "FPS_CountReduce", m_FPSCountReducePipeline); + // Radix scan (prefix scan) + CompileRadixPipeline("ParallelSortCS.hlsl", &defines, "FPS_Scan", m_FPSScanPipeline); + // Radix scan add (prefix scan + reduced prefix scan addition) + CompileRadixPipeline("ParallelSortCS.hlsl", &defines, "FPS_ScanAdd", m_FPSScanAddPipeline); + // Radix scatter (key redistribution) + CompileRadixPipeline("ParallelSortCS.hlsl", &defines, "FPS_Scatter", m_FPSScatterPipeline); + + // Radix scatter with payload (key and payload redistribution) + defines["kRS_ValueCopy"] = std::to_string(1); + CompileRadixPipeline("ParallelSortCS.hlsl", &defines, "FPS_Scatter", m_FPSScatterPayloadPipeline); + } + + ////////////////////////////////////////////////////////////////////////// + // Create pipelines for render pass + { +#ifdef _DEBUG + std::string CompileFlagsVS("-T vs_6_0 -Zi -Od"); + std::string CompileFlagsPS("-T ps_6_0 -Zi -Od"); +#else + std::string CompileFlagsVS("-T vs_6_0"); + std::string CompileFlagsPS("-T ps_6_0"); +#endif // _DEBUG + + // VS + VkPipelineShaderStageCreateInfo stage_create_info_VS = { VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO }; + VkResult vkResult = VKCompileFromFile(m_pDevice->GetDevice(), VK_SHADER_STAGE_VERTEX_BIT, "ParallelSortVerify.hlsl", "FullscreenVS", CompileFlagsVS.c_str(), nullptr, &stage_create_info_VS); + stage_create_info_VS.flags = 0; + assert(vkResult == VK_SUCCESS); + // PS + VkPipelineShaderStageCreateInfo stage_create_info_PS = 
{ VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO }; + vkResult = VKCompileFromFile(m_pDevice->GetDevice(), VK_SHADER_STAGE_FRAGMENT_BIT, "ParallelSortVerify.hlsl", "RenderSortValidationPS", CompileFlagsPS.c_str(), nullptr, &stage_create_info_PS); + stage_create_info_PS.flags = 0; + assert(vkResult == VK_SUCCESS); + + // Pipeline creation + VkGraphicsPipelineCreateInfo create_info = { VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO }; + create_info.pNext = nullptr; + create_info.flags = 0; + create_info.stageCount = 2; + VkPipelineShaderStageCreateInfo stages[] = { stage_create_info_VS, stage_create_info_PS }; + create_info.pStages = stages; + + VkPipelineVertexInputStateCreateInfo vi = {}; + vi.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; + vi.pNext = NULL; + vi.flags = 0; + vi.vertexBindingDescriptionCount = 0; + vi.pVertexBindingDescriptions = nullptr; + vi.vertexAttributeDescriptionCount = 0; + vi.pVertexAttributeDescriptions = nullptr; + create_info.pVertexInputState = &vi; + + VkPipelineInputAssemblyStateCreateInfo ia; + ia.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; + ia.pNext = NULL; + ia.flags = 0; + ia.primitiveRestartEnable = VK_FALSE; + ia.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; + create_info.pInputAssemblyState = &ia; + create_info.pTessellationState = nullptr; + + VkPipelineViewportStateCreateInfo vp = {}; + vp.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; + vp.pNext = NULL; + vp.flags = 0; + vp.viewportCount = 1; + vp.scissorCount = 1; + vp.pScissors = NULL; + vp.pViewports = NULL; + create_info.pViewportState = &vp; + + VkPipelineRasterizationStateCreateInfo rs; + rs.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; + rs.pNext = NULL; + rs.flags = 0; + rs.polygonMode = VK_POLYGON_MODE_FILL; + rs.cullMode = VK_CULL_MODE_NONE; + rs.frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE; + rs.depthClampEnable = VK_FALSE; + rs.rasterizerDiscardEnable = VK_FALSE; + 
rs.depthBiasEnable = VK_FALSE; + rs.depthBiasConstantFactor = 0; + rs.depthBiasClamp = 0; + rs.depthBiasSlopeFactor = 0; + rs.lineWidth = 1.0f; + create_info.pRasterizationState = &rs; + + VkPipelineMultisampleStateCreateInfo ms; + ms.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; + ms.pNext = NULL; + ms.flags = 0; + ms.pSampleMask = NULL; + ms.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; + ms.sampleShadingEnable = VK_FALSE; + ms.alphaToCoverageEnable = VK_FALSE; + ms.alphaToOneEnable = VK_FALSE; + ms.minSampleShading = 0.0; + create_info.pMultisampleState = &ms; + + VkPipelineDepthStencilStateCreateInfo ds; + ds.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; + ds.pNext = NULL; + ds.flags = 0; + ds.depthTestEnable = VK_FALSE; + ds.depthWriteEnable = VK_FALSE; + ds.depthCompareOp = VK_COMPARE_OP_LESS_OR_EQUAL; + ds.depthBoundsTestEnable = VK_FALSE; + ds.stencilTestEnable = VK_FALSE; + ds.back.failOp = VK_STENCIL_OP_KEEP; + ds.back.passOp = VK_STENCIL_OP_KEEP; + ds.back.compareOp = VK_COMPARE_OP_ALWAYS; + ds.back.compareMask = 0; + ds.back.reference = 0; + ds.back.depthFailOp = VK_STENCIL_OP_KEEP; + ds.back.writeMask = 0; + ds.minDepthBounds = 0; + ds.maxDepthBounds = 0; + ds.stencilTestEnable = VK_FALSE; + ds.front = ds.back; + create_info.pDepthStencilState = &ds; + + VkPipelineColorBlendAttachmentState att_state[1]; + att_state[0].colorWriteMask = 0xf; + att_state[0].blendEnable = VK_FALSE; + att_state[0].alphaBlendOp = VK_BLEND_OP_ADD; + att_state[0].colorBlendOp = VK_BLEND_OP_ADD; + att_state[0].srcColorBlendFactor = VK_BLEND_FACTOR_SRC_ALPHA; + att_state[0].dstColorBlendFactor = VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA; + att_state[0].srcAlphaBlendFactor = VK_BLEND_FACTOR_ONE; + att_state[0].dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO; + + VkPipelineColorBlendStateCreateInfo cb; + cb.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; + cb.flags = 0; + cb.pNext = NULL; + cb.attachmentCount = 1; + cb.pAttachments 
= att_state; + cb.logicOpEnable = VK_FALSE; + cb.logicOp = VK_LOGIC_OP_NO_OP; + cb.blendConstants[0] = 1.0f; + cb.blendConstants[1] = 1.0f; + cb.blendConstants[2] = 1.0f; + cb.blendConstants[3] = 1.0f; + create_info.pColorBlendState = &cb; + + std::vector dynamicStateEnables = { + VK_DYNAMIC_STATE_VIEWPORT, + VK_DYNAMIC_STATE_SCISSOR, + VK_DYNAMIC_STATE_BLEND_CONSTANTS + }; + VkPipelineDynamicStateCreateInfo dynamicState = {}; + dynamicState.sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO; + dynamicState.pNext = NULL; + dynamicState.pDynamicStates = dynamicStateEnables.data(); + dynamicState.dynamicStateCount = (uint32_t)dynamicStateEnables.size(); + create_info.pDynamicState = &dynamicState; + + create_info.layout = m_RenderPipelineLayout; + create_info.renderPass = pSwapChain->GetRenderPass(); + create_info.subpass = 0; + create_info.basePipelineHandle = VK_NULL_HANDLE; + create_info.basePipelineIndex = 0; + + vkResult = vkCreateGraphicsPipelines(m_pDevice->GetDevice(), m_pDevice->GetPipelineCache(), 1, &create_info, NULL, &m_RenderResultVerificationPipeline); + assert(vkResult == VK_SUCCESS); + } + + // Do binding setups + { + VkBuffer BufferMaps[4]; + + // Map inputs/outputs + BufferMaps[0] = m_DstKeyBuffers[0]; + BufferMaps[1] = m_DstKeyBuffers[1]; + BufferMaps[2] = m_DstPayloadBuffers[0]; + BufferMaps[3] = m_DstPayloadBuffers[1]; + BindUAVBuffer(BufferMaps, m_SortDescriptorSetInputOutput[0], 0, 4); + + BufferMaps[0] = m_DstKeyBuffers[1]; + BufferMaps[1] = m_DstKeyBuffers[0]; + BufferMaps[2] = m_DstPayloadBuffers[1]; + BufferMaps[3] = m_DstPayloadBuffers[0]; + BindUAVBuffer(BufferMaps, m_SortDescriptorSetInputOutput[1], 0, 4); + + // Map scan sets (reduced, scratch) + BufferMaps[0] = BufferMaps[1] = m_FPSReducedScratchBuffer; + BindUAVBuffer(BufferMaps, m_SortDescriptorSetScanSets[0], 0, 2); + + BufferMaps[0] = BufferMaps[1] = m_FPSScratchBuffer; + BufferMaps[2] = m_FPSReducedScratchBuffer; + BindUAVBuffer(BufferMaps, 
m_SortDescriptorSetScanSets[1], 0, 3); + + // Map Scratch areas (fixed) + BufferMaps[0] = m_FPSScratchBuffer; + BufferMaps[1] = m_FPSReducedScratchBuffer; + BindUAVBuffer(BufferMaps, m_SortDescriptorSetScratch, 0, 2); + + // Map indirect buffers + BufferMaps[0] = m_IndirectKeyCounts; + BufferMaps[1] = m_IndirectConstantBuffer; + BufferMaps[2] = m_IndirectCountScatterArgs; + BufferMaps[3] = m_IndirectReduceScanArgs; + BindUAVBuffer(BufferMaps, m_SortDescriptorSetIndirect, 0, 4); + + // Bind validation textures + for (int i = 0; i < 3; ++i) + { + VkDescriptorImageInfo imageinfo; + imageinfo.imageView = m_ValidationImageViews[i]; + imageinfo.imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + imageinfo.sampler = VK_NULL_HANDLE; + + VkWriteDescriptorSet write_set = { VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET }; + write_set.pNext = nullptr; + write_set.dstSet = m_RenderDescriptorSet2[i]; + write_set.dstBinding = 0; + write_set.dstArrayElement = 0; + write_set.descriptorCount = 1; + write_set.descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + write_set.pImageInfo = &imageinfo; + write_set.pBufferInfo = nullptr; + write_set.pTexelBufferView = nullptr; + + vkUpdateDescriptorSets(m_pDevice->GetDevice(), 1, &write_set, 0, nullptr); + } + + // Bind buffers from which we will pull the indices into the image buffer + BindUAVBuffer(&m_SrcKeyBuffers[0], m_RenderDescriptorSet1[0]); + BindUAVBuffer(&m_SrcKeyBuffers[1], m_RenderDescriptorSet1[1]); + BindUAVBuffer(&m_SrcKeyBuffers[2], m_RenderDescriptorSet1[2]); + BindUAVBuffer(&m_DstKeyBuffers[0], m_RenderDescriptorSet1[3]); + } +} + +// Parallel Sort termination +void FFXParallelSort::OnDestroy() +{ + // Release verification render resources + vkDestroyPipelineLayout(m_pDevice->GetDevice(), m_RenderPipelineLayout, nullptr); + vkDestroyDescriptorSetLayout(m_pDevice->GetDevice(), m_RenderDescriptorSetLayout0, nullptr); + m_pResourceViewHeaps->FreeDescriptor(m_RenderDescriptorSet0); + 
vkDestroyDescriptorSetLayout(m_pDevice->GetDevice(), m_RenderDescriptorSetLayout1, nullptr); + m_pResourceViewHeaps->FreeDescriptor(m_RenderDescriptorSet1[0]); + m_pResourceViewHeaps->FreeDescriptor(m_RenderDescriptorSet1[1]); + m_pResourceViewHeaps->FreeDescriptor(m_RenderDescriptorSet1[2]); + m_pResourceViewHeaps->FreeDescriptor(m_RenderDescriptorSet1[3]); + vkDestroyDescriptorSetLayout(m_pDevice->GetDevice(), m_RenderDescriptorSetLayout2, nullptr); + + vkDestroyPipeline(m_pDevice->GetDevice(), m_RenderResultVerificationPipeline, nullptr); + + m_pResourceViewHeaps->FreeDescriptor(m_RenderDescriptorSet2[0]); + m_pResourceViewHeaps->FreeDescriptor(m_RenderDescriptorSet2[1]); + m_pResourceViewHeaps->FreeDescriptor(m_RenderDescriptorSet2[2]); + m_Validate4KTexture.OnDestroy(); + m_Validate2KTexture.OnDestroy(); + m_Validate1080pTexture.OnDestroy(); + vkDestroyImageView(m_pDevice->GetDevice(), m_ValidationImageViews[0], nullptr); + vkDestroyImageView(m_pDevice->GetDevice(), m_ValidationImageViews[1], nullptr); + vkDestroyImageView(m_pDevice->GetDevice(), m_ValidationImageViews[2], nullptr); + + // Release radix sort indirect resources + vmaDestroyBuffer(m_pDevice->GetAllocator(), m_IndirectKeyCounts, m_IndirectKeyCountsAllocation); + vmaDestroyBuffer(m_pDevice->GetAllocator(), m_IndirectConstantBuffer, m_IndirectConstantBufferAllocation); + vmaDestroyBuffer(m_pDevice->GetAllocator(), m_IndirectCountScatterArgs, m_IndirectCountScatterArgsAllocation); + vmaDestroyBuffer(m_pDevice->GetAllocator(), m_IndirectReduceScanArgs, m_IndirectReduceScanArgsAllocation); + vkDestroyPipeline(m_pDevice->GetDevice(), m_FPSIndirectSetupParametersPipeline, nullptr); + + // Release radix sort algorithm resources + vmaDestroyBuffer(m_pDevice->GetAllocator(), m_FPSScratchBuffer, m_FPSScratchBufferAllocation); + vmaDestroyBuffer(m_pDevice->GetAllocator(), m_FPSReducedScratchBuffer, m_FPSReducedScratchBufferAllocation); + + vkDestroyPipelineLayout(m_pDevice->GetDevice(), m_SortPipelineLayout, 
nullptr); + vkDestroyDescriptorSetLayout(m_pDevice->GetDevice(), m_SortDescriptorSetLayoutConstants, nullptr); + m_pResourceViewHeaps->FreeDescriptor(m_SortDescriptorSetConstants[0]); + m_pResourceViewHeaps->FreeDescriptor(m_SortDescriptorSetConstants[1]); + m_pResourceViewHeaps->FreeDescriptor(m_SortDescriptorSetConstants[2]); + vkDestroyDescriptorSetLayout(m_pDevice->GetDevice(), m_SortDescriptorSetLayoutConstantsIndirect, nullptr); + m_pResourceViewHeaps->FreeDescriptor(m_SortDescriptorSetConstantsIndirect[0]); + m_pResourceViewHeaps->FreeDescriptor(m_SortDescriptorSetConstantsIndirect[1]); + m_pResourceViewHeaps->FreeDescriptor(m_SortDescriptorSetConstantsIndirect[2]); + vkDestroyDescriptorSetLayout(m_pDevice->GetDevice(), m_SortDescriptorSetLayoutInputOutputs, nullptr); + m_pResourceViewHeaps->FreeDescriptor(m_SortDescriptorSetInputOutput[0]); + m_pResourceViewHeaps->FreeDescriptor(m_SortDescriptorSetInputOutput[1]); + + vkDestroyDescriptorSetLayout(m_pDevice->GetDevice(), m_SortDescriptorSetLayoutScan, nullptr); + m_pResourceViewHeaps->FreeDescriptor(m_SortDescriptorSetScanSets[0]); + m_pResourceViewHeaps->FreeDescriptor(m_SortDescriptorSetScanSets[1]); + + vkDestroyDescriptorSetLayout(m_pDevice->GetDevice(), m_SortDescriptorSetLayoutScratch, nullptr); + m_pResourceViewHeaps->FreeDescriptor(m_SortDescriptorSetScratch); + + vkDestroyDescriptorSetLayout(m_pDevice->GetDevice(), m_SortDescriptorSetLayoutIndirect, nullptr); + m_pResourceViewHeaps->FreeDescriptor(m_SortDescriptorSetIndirect); + + vkDestroyPipeline(m_pDevice->GetDevice(), m_FPSCountPipeline, nullptr); + vkDestroyPipeline(m_pDevice->GetDevice(), m_FPSCountReducePipeline, nullptr); + vkDestroyPipeline(m_pDevice->GetDevice(), m_FPSScanPipeline, nullptr); + vkDestroyPipeline(m_pDevice->GetDevice(), m_FPSScanAddPipeline, nullptr); + vkDestroyPipeline(m_pDevice->GetDevice(), m_FPSScatterPipeline, nullptr); + vkDestroyPipeline(m_pDevice->GetDevice(), m_FPSScatterPayloadPipeline, nullptr); + + // Release 
all of our resources + vmaDestroyBuffer(m_pDevice->GetAllocator(), m_SrcKeyBuffers[0], m_SrcKeyBufferAllocations[0]); + vmaDestroyBuffer(m_pDevice->GetAllocator(), m_SrcKeyBuffers[1], m_SrcKeyBufferAllocations[1]); + vmaDestroyBuffer(m_pDevice->GetAllocator(), m_SrcKeyBuffers[2], m_SrcKeyBufferAllocations[2]); + vmaDestroyBuffer(m_pDevice->GetAllocator(), m_SrcPayloadBuffers, m_SrcPayloadBufferAllocation); + vmaDestroyBuffer(m_pDevice->GetAllocator(), m_DstKeyBuffers[0], m_DstKeyBufferAllocations[0]); + vmaDestroyBuffer(m_pDevice->GetAllocator(), m_DstKeyBuffers[1], m_DstKeyBufferAllocations[1]); + vmaDestroyBuffer(m_pDevice->GetAllocator(), m_DstPayloadBuffers[0], m_DstPayloadBufferAllocations[0]); + vmaDestroyBuffer(m_pDevice->GetAllocator(), m_DstPayloadBuffers[1], m_DstPayloadBufferAllocations[1]); +} + +// Because we are sorting the data every frame, need to reset to unsorted version of data before running sort +// Records on commandList, in order: a barrier moving m_DstKeyBuffers[0] and m_DstPayloadBuffers[0] from shader +// read/write access to transfer-write, one copy of NumKeys[m_UIResolutionSize] 32-bit values into each of them +// (keys from m_SrcKeyBuffers[m_UIResolutionSize], payloads from m_SrcPayloadBuffers), and a barrier back to shader read/write. +void FFXParallelSort::CopySourceDataForFrame(VkCommandBuffer commandList) +{ + // Copy the contents of the source buffers to the dst buffers [0] each frame in order to not + // lose our original data + VkBufferMemoryBarrier Barriers[2] = { + BufferTransition(m_DstKeyBuffers[0], VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, sizeof(uint32_t) * NumKeys[m_UIResolutionSize]) , + BufferTransition(m_DstPayloadBuffers[0], VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, sizeof(uint32_t) * NumKeys[m_UIResolutionSize]) + }; + vkCmdPipelineBarrier(commandList, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, nullptr, 2, Barriers, 0, nullptr); + + // One region (offsets 0, same byte count) is shared by the key copy and the payload copy; dstOffset is zero through the { 0 } initializer + VkBufferCopy copyInfo = { 0 }; + copyInfo.srcOffset = 0; + copyInfo.size = sizeof(uint32_t) * NumKeys[m_UIResolutionSize]; + vkCmdCopyBuffer(commandList, m_SrcKeyBuffers[m_UIResolutionSize], m_DstKeyBuffers[0], 1, &copyInfo); + vkCmdCopyBuffer(commandList, m_SrcPayloadBuffers, m_DstPayloadBuffers[0], 1, &copyInfo); + + // Put the 
dst buffers back to UAVs for sort usage + Barriers[0] = BufferTransition(m_DstKeyBuffers[0], VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, sizeof(uint32_t) * NumKeys[m_UIResolutionSize]); + Barriers[1] = BufferTransition(m_DstPayloadBuffers[0], VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, sizeof(uint32_t) * NumKeys[m_UIResolutionSize]); + vkCmdPipelineBarrier(commandList, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, nullptr, 2, Barriers, 0, nullptr); +} + +// Perform Parallel Sort (radix-based sort) +void FFXParallelSort::Sort(VkCommandBuffer commandList, bool isBenchmarking, float benchmarkTime) +{ + bool bIndirectDispatch = m_UIIndirectSort; + + // To control which descriptor set to use for updating data + static uint32_t frameCount = 0; + uint32_t frameConstants = (++frameCount) % 3; + + std::string markerText = "FFXParallelSort"; + if (bIndirectDispatch) markerText += " Indirect"; + SetPerfMarkerBegin(commandList, markerText.c_str()); + + // Buffers to ping-pong between when writing out sorted values + VkBuffer* ReadBufferInfo(&m_DstKeyBuffers[0]), * WriteBufferInfo(&m_DstKeyBuffers[1]); + VkBuffer* ReadPayloadBufferInfo(&m_DstPayloadBuffers[0]), * WritePayloadBufferInfo(&m_DstPayloadBuffers[1]); + bool bHasPayload = m_UISortPayload; + + // Setup barriers for the run + VkBufferMemoryBarrier Barriers[3]; + FFX_ParallelSortCB constantBufferData = { 0 }; + + // Fill in the constant buffer data structure (this will be done by a shader in the indirect version) + uint32_t NumThreadgroupsToRun; + uint32_t NumReducedThreadgroupsToRun; + if (!bIndirectDispatch) + { + uint32_t NumberOfKeys = NumKeys[m_UIResolutionSize]; + FFX_ParallelSort_SetConstantAndDispatchData(NumberOfKeys, m_MaxNumThreadgroups, constantBufferData, NumThreadgroupsToRun, NumReducedThreadgroupsToRun); + } + else + { + struct SetupIndirectCB + { + uint32_t NumKeysIndex; + uint32_t 
MaxThreadGroups; + }; + SetupIndirectCB IndirectSetupCB; + IndirectSetupCB.NumKeysIndex = m_UIResolutionSize; + IndirectSetupCB.MaxThreadGroups = m_MaxNumThreadgroups; + + // Copy the data into the constant buffer + VkDescriptorBufferInfo constantBuffer = m_pConstantBufferRing->AllocConstantBuffer(sizeof(SetupIndirectCB), (void*)&IndirectSetupCB); + BindConstantBuffer(constantBuffer, m_SortDescriptorSetConstantsIndirect[frameConstants]); + + // Dispatch + vkCmdBindDescriptorSets(commandList, VK_PIPELINE_BIND_POINT_COMPUTE, m_SortPipelineLayout, 1, 1, &m_SortDescriptorSetConstantsIndirect[frameConstants], 0, nullptr); + vkCmdBindDescriptorSets(commandList, VK_PIPELINE_BIND_POINT_COMPUTE, m_SortPipelineLayout, 5, 1, &m_SortDescriptorSetIndirect, 0, nullptr); + vkCmdBindPipeline(commandList, VK_PIPELINE_BIND_POINT_COMPUTE, m_FPSIndirectSetupParametersPipeline); + vkCmdDispatch(commandList, 1, 1, 1); + + // When done, transition the args buffers to INDIRECT_ARGUMENT, and the constant buffer UAV to Constant buffer + VkBufferMemoryBarrier barriers[5]; + barriers[0] = BufferTransition(m_IndirectCountScatterArgs, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, sizeof(uint32_t) * 3); + barriers[1] = BufferTransition(m_IndirectReduceScanArgs, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, sizeof(uint32_t) * 3); + barriers[2] = BufferTransition(m_IndirectConstantBuffer, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, sizeof(FFX_ParallelSortCB)); + barriers[3] = BufferTransition(m_IndirectCountScatterArgs, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_INDIRECT_COMMAND_READ_BIT, sizeof(uint32_t) * 3); + barriers[4] = BufferTransition(m_IndirectReduceScanArgs, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_INDIRECT_COMMAND_READ_BIT, sizeof(uint32_t) * 3); + 
vkCmdPipelineBarrier(commandList, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, nullptr, 5, barriers, 0, nullptr); + } + + // Bind the scratch descriptor sets + vkCmdBindDescriptorSets(commandList, VK_PIPELINE_BIND_POINT_COMPUTE, m_SortPipelineLayout, 4, 1, &m_SortDescriptorSetScratch, 0, nullptr); + + // Copy the data into the constant buffer and bind + if (bIndirectDispatch) + { + //constantBuffer = m_IndirectConstantBuffer.GetResource()->GetGPUVirtualAddress(); + VkDescriptorBufferInfo constantBuffer; + constantBuffer.buffer = m_IndirectConstantBuffer; + constantBuffer.offset = 0; + constantBuffer.range = VK_WHOLE_SIZE; + BindConstantBuffer(constantBuffer, m_SortDescriptorSetConstants[frameConstants]); + } + else + { + VkDescriptorBufferInfo constantBuffer = m_pConstantBufferRing->AllocConstantBuffer(sizeof(FFX_ParallelSortCB), (void*)&constantBufferData); + BindConstantBuffer(constantBuffer, m_SortDescriptorSetConstants[frameConstants]); + } + // Bind constants + vkCmdBindDescriptorSets(commandList, VK_PIPELINE_BIND_POINT_COMPUTE, m_SortPipelineLayout, 0, 1, &m_SortDescriptorSetConstants[frameConstants], 0, nullptr); + + // Perform Radix Sort (currently only support 32-bit key/payload sorting + uint32_t inputSet = 0; + for (uint32_t Shift = 0; Shift < 32u; Shift += FFX_PARALLELSORT_SORT_BITS_PER_PASS) + { + // Update the bit shift + vkCmdPushConstants(commandList, m_SortPipelineLayout, VK_SHADER_STAGE_ALL, 0, 4, &Shift); + + // Bind input/output for this pass + vkCmdBindDescriptorSets(commandList, VK_PIPELINE_BIND_POINT_COMPUTE, m_SortPipelineLayout, 2, 1, &m_SortDescriptorSetInputOutput[inputSet], 0, nullptr); + + // Sort Count + { + vkCmdBindPipeline(commandList, VK_PIPELINE_BIND_POINT_COMPUTE, m_FPSCountPipeline); + + if (bIndirectDispatch) + vkCmdDispatchIndirect(commandList, m_IndirectCountScatterArgs, 0); + else + vkCmdDispatch(commandList, NumThreadgroupsToRun, 1, 1); + } + + // UAV barrier on the sum table + Barriers[0] = 
BufferTransition(m_FPSScratchBuffer, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, m_ScratchBufferSize); + vkCmdPipelineBarrier(commandList, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, nullptr, 1, Barriers, 0, nullptr); + + // Sort Reduce + { + vkCmdBindPipeline(commandList, VK_PIPELINE_BIND_POINT_COMPUTE, m_FPSCountReducePipeline); + + if (bIndirectDispatch) + vkCmdDispatchIndirect(commandList, m_IndirectReduceScanArgs, 0); + else + vkCmdDispatch(commandList, NumReducedThreadgroupsToRun, 1, 1); + + // UAV barrier on the reduced sum table + Barriers[0] = BufferTransition(m_FPSReducedScratchBuffer, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, m_ReducedScratchBufferSize); + vkCmdPipelineBarrier(commandList, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, nullptr, 1, Barriers, 0, nullptr); + } + + // Sort Scan + { + // First do scan prefix of reduced values + vkCmdBindDescriptorSets(commandList, VK_PIPELINE_BIND_POINT_COMPUTE, m_SortPipelineLayout, 3, 1, &m_SortDescriptorSetScanSets[0], 0, nullptr); + vkCmdBindPipeline(commandList, VK_PIPELINE_BIND_POINT_COMPUTE, m_FPSScanPipeline); + + if (!bIndirectDispatch) + { + assert(NumReducedThreadgroupsToRun < FFX_PARALLELSORT_ELEMENTS_PER_THREAD * FFX_PARALLELSORT_THREADGROUP_SIZE && "Need to account for bigger reduced histogram scan"); + } + vkCmdDispatch(commandList, 1, 1, 1); + + // UAV barrier on the reduced sum table + Barriers[0] = BufferTransition(m_FPSReducedScratchBuffer, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, m_ReducedScratchBufferSize); + vkCmdPipelineBarrier(commandList, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, nullptr, 1, Barriers, 0, nullptr); + + // Next do scan prefix on the histogram with partial sums that 
we just did + vkCmdBindDescriptorSets(commandList, VK_PIPELINE_BIND_POINT_COMPUTE, m_SortPipelineLayout, 3, 1, &m_SortDescriptorSetScanSets[1], 0, nullptr); + + vkCmdBindPipeline(commandList, VK_PIPELINE_BIND_POINT_COMPUTE, m_FPSScanAddPipeline); + if (bIndirectDispatch) + vkCmdDispatchIndirect(commandList, m_IndirectReduceScanArgs, 0); + else + vkCmdDispatch(commandList, NumReducedThreadgroupsToRun, 1, 1); + } + + // UAV barrier on the sum table + Barriers[0] = BufferTransition(m_FPSScratchBuffer, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, m_ScratchBufferSize); + vkCmdPipelineBarrier(commandList, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, nullptr, 1, Barriers, 0, nullptr); + + // Sort Scatter + { + vkCmdBindPipeline(commandList, VK_PIPELINE_BIND_POINT_COMPUTE, bHasPayload ? m_FPSScatterPayloadPipeline : m_FPSScatterPipeline); + + if (bIndirectDispatch) + vkCmdDispatchIndirect(commandList, m_IndirectCountScatterArgs, 0); + else + vkCmdDispatch(commandList, NumThreadgroupsToRun, 1, 1); + } + + // Finish doing everything and barrier for the next pass + int numBarriers = 0; + Barriers[numBarriers++] = BufferTransition(*WriteBufferInfo, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, sizeof(uint32_t) * NumKeys[2]); + if (bHasPayload) + Barriers[numBarriers++] = BufferTransition(*WritePayloadBufferInfo, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, sizeof(uint32_t) * NumKeys[2]); + vkCmdPipelineBarrier(commandList, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, nullptr, numBarriers, Barriers, 0, nullptr); + + // Swap read/write sources + std::swap(ReadBufferInfo, WriteBufferInfo); + if (bHasPayload) + std::swap(ReadPayloadBufferInfo, WritePayloadBufferInfo); + inputSet = !inputSet; + } + + // When we are all 
done, transition indirect buffers back to UAV for the next frame (if doing indirect dispatch) + if (bIndirectDispatch) + { + VkBufferMemoryBarrier barriers[3]; + barriers[0] = BufferTransition(m_IndirectConstantBuffer, VK_ACCESS_SHADER_READ_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, sizeof(FFX_ParallelSortCB)); + barriers[1] = BufferTransition(m_IndirectCountScatterArgs, VK_ACCESS_INDIRECT_COMMAND_READ_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, sizeof(uint32_t) * 3); + barriers[2] = BufferTransition(m_IndirectReduceScanArgs, VK_ACCESS_INDIRECT_COMMAND_READ_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, sizeof(uint32_t) * 3); + vkCmdPipelineBarrier(commandList, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, nullptr, 3, barriers, 0, nullptr); + } + + // Close out the perf capture + SetPerfMarkerEnd(commandList); +} + +// Render Parallel Sort related GUI +// Emits the "FFX Parallel Sort" collapsing header: a resolution combo (only shown while KeySetOverride < 0), +// the payload and indirect-execution checkboxes, and the unsorted/sorted radio buttons that write m_UIVisualOutput (0 / 1). +void FFXParallelSort::DrawGui() +{ + if (ImGui::CollapsingHeader("FFX Parallel Sort", ImGuiTreeNodeFlags_DefaultOpen)) + { + static const char* ResolutionSizeStrings[] = { "1920x1080", "2560x1440", "3840x2160" }; + + // The widest entry is measured so the combo can be given twice that width + ImVec2 textSize = ImGui::CalcTextSize("3840x2160"); + if (KeySetOverride < 0) + { + ImGui::PushItemWidth(textSize.x * 2); + ImGui::Combo("Sort Buffer Resolution", &m_UIResolutionSize, ResolutionSizeStrings, _countof(ResolutionSizeStrings)); + ImGui::PopItemWidth(); + } + + ImGui::Checkbox("Sort Payload", &m_UISortPayload); + ImGui::Checkbox("Use Indirect Execution", &m_UIIndirectSort); + + ImGui::RadioButton("Render Unsorted Keys", &m_UIVisualOutput, 0); + ImGui::RadioButton("Render Sorted Keys", &m_UIVisualOutput, 1); + } +} + +// Renders the image with the sorted/unsorted indices for visual representation +// Uploads a ParallelSortRenderCB holding the render-target size and the selected sort resolution, binds three +// descriptor sets to m_RenderPipelineLayout and issues a 3-vertex draw with m_RenderResultVerificationPipeline on commandList. +void FFXParallelSort::DrawVisualization(VkCommandBuffer commandList, uint32_t RTWidth, uint32_t RTHeight) +{ + // Setup the constant buffer + ParallelSortRenderCB ConstantBuffer; + ConstantBuffer.Width = RTWidth; + 
ConstantBuffer.Height = RTHeight; + static const uint32_t SortWidths[] = { 1920, 2560, 3840 }; + static const uint32_t SortHeights[] = { 1080, 1440, 2160 }; + ConstantBuffer.SortWidth = SortWidths[m_UIResolutionSize]; + ConstantBuffer.SortHeight = SortHeights[m_UIResolutionSize]; + + // Bind constant buffer + VkDescriptorBufferInfo GPUCB = m_pConstantBufferRing->AllocConstantBuffer(sizeof(ParallelSortRenderCB), (void*)&ConstantBuffer); + BindConstantBuffer(GPUCB, m_RenderDescriptorSet0); + + // If we are showing unsorted values, need to transition the source data buffer from copy source to UAV and back + // NOTE(review): this barrier is recorded on m_pUploadHeap->GetCommandList(), not on the commandList that receives the draw below - confirm this is intentional + int descriptorIndex = 0; + if (!m_UIVisualOutput) + { + VkBufferMemoryBarrier Barrier = BufferTransition(m_SrcKeyBuffers[m_UIResolutionSize], VK_ACCESS_TRANSFER_READ_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, sizeof(uint32_t) * NumKeys[m_UIResolutionSize]); + vkCmdPipelineBarrier(m_pUploadHeap->GetCommandList(), VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, nullptr, 1, &Barrier, 0, nullptr); + descriptorIndex = m_UIResolutionSize; + } + else + descriptorIndex = 3; + + // Bind buffer from which we will pull the indices into the image buffer + // (set index 0-2 selects the unsorted source keys for the chosen resolution, index 3 is used when showing sorted keys) + VkDescriptorSet descriptorSets[] = { m_RenderDescriptorSet0, m_RenderDescriptorSet1[descriptorIndex], m_RenderDescriptorSet2[m_UIResolutionSize] }; + + // Bind pipeline layout and descriptor sets + vkCmdBindPipeline(commandList, VK_PIPELINE_BIND_POINT_GRAPHICS, m_RenderResultVerificationPipeline); + vkCmdBindDescriptorSets(commandList, VK_PIPELINE_BIND_POINT_GRAPHICS, m_RenderPipelineLayout, 0, 3, descriptorSets, 0, nullptr); + + // Viewport starts at y = RTHeight and uses a negative height + VkViewport viewport; + viewport.x = 0; + viewport.y = (float)RTHeight; + viewport.width = (float)RTWidth; + viewport.height = -(float)(RTHeight); + viewport.minDepth = (float)0.0f; + viewport.maxDepth = (float)1.0f; + + // Create scissor rectangle + VkRect2D scissor; + scissor.extent.width = RTWidth; + scissor.extent.height = RTHeight; + 
scissor.offset.x = 0; + scissor.offset.y = 0; + + // Draw + vkCmdSetViewport(commandList, 0, 1, &viewport); + vkCmdSetScissor(commandList, 0, 1, &scissor); + vkCmdDraw(commandList, 3, 1, 0, 0); + + // If we were showing unsorted values, transition the source data buffer from UAV back to copy source + // NOTE(review): as with the first barrier, this one is recorded on m_pUploadHeap->GetCommandList() rather than commandList - confirm + if (!m_UIVisualOutput) + { + VkBufferMemoryBarrier Barrier = BufferTransition(m_SrcKeyBuffers[m_UIResolutionSize], VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT, sizeof(uint32_t) * NumKeys[m_UIResolutionSize]); + vkCmdPipelineBarrier(m_pUploadHeap->GetCommandList(), VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, nullptr, 1, &Barrier, 0, nullptr); + } +} \ No newline at end of file diff --git a/sample/src/VK/ParallelSort.h b/sample/src/VK/ParallelSort.h new file mode 100644 index 0000000..5bf629f --- /dev/null +++ b/sample/src/VK/ParallelSort.h @@ -0,0 +1,152 @@ +// ParallelSort.h +// +// Copyright(c) 2021 Advanced Micro Devices, Inc.All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once +#include "vulkan/vulkan.h" + +// Forward declarations come first so the using-directive below names a namespace this header has already declared +namespace CAULDRON_VK +{ + class Device; + class ResourceViewHeaps; + class DynamicBufferRing; + class StaticBufferPool; +} + +using namespace CAULDRON_VK; + +// Constant data filled in and uploaded by FFXParallelSort::DrawVisualization +struct ParallelSortRenderCB // If you change this, also change struct ParallelSortRenderCB in ParallelSortVerify.hlsl +{ + int32_t Width; + int32_t Height; + int32_t SortWidth; + int32_t SortHeight; +}; + +// Sample-side wrapper around the parallel sort: holds the key/payload/scratch/indirect buffers, descriptor sets and +// pipelines, and exposes per-frame entry points to reset the input, run the sort, draw the GUI and draw the verification image. +class FFXParallelSort +{ +public: + void OnCreate(Device* pDevice, ResourceViewHeaps* pResourceViewHeaps, DynamicBufferRing* pConstantBufferRing, UploadHeap* pUploadHeap, SwapChain* pSwapChain); + void OnDestroy(); + + void Sort(VkCommandBuffer commandList, bool isBenchmarking, float benchmarkTime); + void CopySourceDataForFrame(VkCommandBuffer commandList); + void DrawGui(); + void DrawVisualization(VkCommandBuffer commandList, uint32_t RTWidth, uint32_t RTHeight); + + // Temp -- For command line overrides + static void OverrideKeySet(int ResolutionOverride); + static void OverridePayload(); + // Temp -- For command line overrides + +private: + void CreateKeyPayloadBuffers(); + void CompileRadixPipeline(const char* shaderFile, const DefineList* defines, const char* entryPoint, VkPipeline& pPipeline); + void BindConstantBuffer(VkDescriptorBufferInfo& GPUCB, VkDescriptorSet& DescriptorSet, uint32_t Binding = 0, uint32_t Count = 1); + void BindUAVBuffer(VkBuffer* pBuffer, VkDescriptorSet& DescriptorSet, uint32_t Binding = 0, uint32_t Count = 1); + + // Temp -- For command line 
overrides + static int KeySetOverride; + static bool PayloadOverride; + // Temp -- For command line overrides + + Device* m_pDevice = nullptr; + UploadHeap* m_pUploadHeap = nullptr; + ResourceViewHeaps* m_pResourceViewHeaps = nullptr; + DynamicBufferRing* m_pConstantBufferRing = nullptr; + uint32_t m_MaxNumThreadgroups = 800; + + uint32_t m_ScratchBufferSize = 0; + uint32_t m_ReducedScratchBufferSize = 0; + + // Every handle below starts out null (arrays zero-filled) so an object that was never created holds no indeterminate values + + // Sample resources + VkBuffer m_SrcKeyBuffers[3] = {}; // 32 bit source key buffers (for 1080, 2K, 4K resolution) + VmaAllocation m_SrcKeyBufferAllocations[3] = {}; + + VkBuffer m_SrcPayloadBuffers = VK_NULL_HANDLE; // 32 bit source payload buffers + VmaAllocation m_SrcPayloadBufferAllocation = VK_NULL_HANDLE; + + VkBuffer m_DstKeyBuffers[2] = {}; // 32 bit destination key buffers (when not doing in place writes) + VmaAllocation m_DstKeyBufferAllocations[2] = {}; + + VkBuffer m_DstPayloadBuffers[2] = {}; // 32 bit destination payload buffers (when not doing in place writes) + VmaAllocation m_DstPayloadBufferAllocations[2] = {}; + + VkBuffer m_FPSScratchBuffer = VK_NULL_HANDLE; // Sort scratch buffer + VmaAllocation m_FPSScratchBufferAllocation = VK_NULL_HANDLE; + + VkBuffer m_FPSReducedScratchBuffer = VK_NULL_HANDLE; // Sort reduced scratch buffer + VmaAllocation m_FPSReducedScratchBufferAllocation = VK_NULL_HANDLE; + + VkDescriptorSetLayout m_SortDescriptorSetLayoutConstants = VK_NULL_HANDLE; + VkDescriptorSet m_SortDescriptorSetConstants[3] = {}; + VkDescriptorSetLayout m_SortDescriptorSetLayoutConstantsIndirect = VK_NULL_HANDLE; + VkDescriptorSet m_SortDescriptorSetConstantsIndirect[3] = {}; + + VkDescriptorSetLayout m_SortDescriptorSetLayoutInputOutputs = VK_NULL_HANDLE; + VkDescriptorSetLayout m_SortDescriptorSetLayoutScan = VK_NULL_HANDLE; + VkDescriptorSetLayout m_SortDescriptorSetLayoutScratch = VK_NULL_HANDLE; + VkDescriptorSetLayout m_SortDescriptorSetLayoutIndirect = VK_NULL_HANDLE; + + VkDescriptorSet m_SortDescriptorSetInputOutput[2] = {}; + VkDescriptorSet m_SortDescriptorSetScanSets[2] = {}; + VkDescriptorSet m_SortDescriptorSetScratch = VK_NULL_HANDLE; + VkDescriptorSet m_SortDescriptorSetIndirect = VK_NULL_HANDLE; + VkPipelineLayout m_SortPipelineLayout = VK_NULL_HANDLE; + + VkPipeline m_FPSCountPipeline = VK_NULL_HANDLE; + VkPipeline m_FPSCountReducePipeline = VK_NULL_HANDLE; + 
VkPipeline m_FPSScanPipeline = VK_NULL_HANDLE; + VkPipeline m_FPSScanAddPipeline = VK_NULL_HANDLE; + VkPipeline m_FPSScatterPipeline = VK_NULL_HANDLE; + VkPipeline m_FPSScatterPayloadPipeline = VK_NULL_HANDLE; + + // Resources for indirect execution of algorithm + VkBuffer m_IndirectKeyCounts = VK_NULL_HANDLE; // Buffer to hold num keys for indirect dispatch + VmaAllocation m_IndirectKeyCountsAllocation = VK_NULL_HANDLE; + VkBuffer m_IndirectConstantBuffer = VK_NULL_HANDLE; // Buffer to hold radix sort constant buffer data for indirect dispatch + VmaAllocation m_IndirectConstantBufferAllocation = VK_NULL_HANDLE; + VkBuffer m_IndirectCountScatterArgs = VK_NULL_HANDLE; // Buffer to hold dispatch arguments used for Count/Scatter parts of the algorithm + VmaAllocation m_IndirectCountScatterArgsAllocation = VK_NULL_HANDLE; + VkBuffer m_IndirectReduceScanArgs = VK_NULL_HANDLE; // Buffer to hold dispatch arguments used for Reduce/Scan parts of the algorithm + VmaAllocation m_IndirectReduceScanArgsAllocation = VK_NULL_HANDLE; + + VkPipeline m_FPSIndirectSetupParametersPipeline = VK_NULL_HANDLE; + + // Resources for verification render + Texture m_Validate4KTexture; + Texture m_Validate2KTexture; + Texture m_Validate1080pTexture; + VkImageView m_ValidationImageViews[3] = {}; + + VkDescriptorSetLayout m_RenderDescriptorSetLayout0 = VK_NULL_HANDLE; + VkDescriptorSet m_RenderDescriptorSet0 = VK_NULL_HANDLE; + VkDescriptorSetLayout m_RenderDescriptorSetLayout1 = VK_NULL_HANDLE; + VkDescriptorSet m_RenderDescriptorSet1[4] = {}; + VkDescriptorSetLayout m_RenderDescriptorSetLayout2 = VK_NULL_HANDLE; + VkDescriptorSet m_RenderDescriptorSet2[3] = {}; + VkPipelineLayout m_RenderPipelineLayout = VK_NULL_HANDLE; + + VkPipeline m_RenderResultVerificationPipeline = VK_NULL_HANDLE; + + // Options for UI and test to run + int m_UIResolutionSize = 0; + bool m_UISortPayload = false; + bool m_UIIndirectSort = false; + int m_UIVisualOutput = 0; +}; \ No newline at end of file diff --git a/sample/src/VK/UI.cpp b/sample/src/VK/UI.cpp new file mode 100644 index 0000000..9b2632d --- /dev/null +++ b/sample/src/VK/UI.cpp @@ -0,0 +1,178 @@ +// AMD SampleVK sample code +// +// Copyright(c) 2021 Advanced Micro Devices, Inc.All rights reserved. 
+// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#include "stdafx.h" + +#include "UI.h" +#include "Sample.h" +#include "imgui.h" + +#include "base/FrameworkWindows.h" + +// To use the 'disabled UI state' functionality (ImGuiItemFlags_Disabled), include internal header +// https://github.com/ocornut/imgui/issues/211#issuecomment-339241929 +#include "imgui_internal.h" +static void DisableUIStateBegin(const bool& bEnable) +{ + if (!bEnable) + { + ImGui::PushItemFlag(ImGuiItemFlags_Disabled, true); + ImGui::PushStyleVar(ImGuiStyleVar_Alpha, ImGui::GetStyle().Alpha * 0.5f); + } +}; +static void DisableUIStateEnd(const bool& bEnable) +{ + if (!bEnable) + { + ImGui::PopItemFlag(); + ImGui::PopStyleVar(); + } +}; + +// Builds the ImGui UI: the CONTROLS window (parallel sort options and presentation mode) and the +// PROFILER window (resolution/API/GPU/CPU/FPS text, a GPU frame-time plot and one line per timing marker). +void Sample::BuildUI() +{ + ImGuiStyle& style = ImGui::GetStyle(); + style.FrameBorderSize = 1.0f; + + const uint32_t W = this->GetWidth(); + + const uint32_t PROFILER_WINDOW_PADDING_X = 10; + const uint32_t PROFILER_WINDOW_PADDING_Y = 10; + const uint32_t PROFILER_WINDOW_SIZE_X = 330; + const uint32_t PROFILER_WINDOW_SIZE_Y = 450; + // Unsigned arithmetic: clamp to 0 instead of wrapping around when the window is narrower than padding + profiler width + const uint32_t PROFILER_WINDOW_POS_X = (W > PROFILER_WINDOW_PADDING_X + PROFILER_WINDOW_SIZE_X) ? (W - PROFILER_WINDOW_PADDING_X - PROFILER_WINDOW_SIZE_X) : 0; + const uint32_t PROFILER_WINDOW_POS_Y = PROFILER_WINDOW_PADDING_Y; + + const uint32_t CONTROLS_WINDOW_POS_X = 10; + const uint32_t CONTROLS_WINDOW_POS_Y = 10; + const uint32_t CONTROLW_WINDOW_SIZE_X = 350; + const uint32_t CONTROLW_WINDOW_SIZE_Y = 780; // assuming > 720p + + // Render CONTROLS window + // + ImGui::SetNextWindowPos(ImVec2(CONTROLS_WINDOW_POS_X, CONTROLS_WINDOW_POS_Y), ImGuiCond_FirstUseEver); + ImGui::SetNextWindowSize(ImVec2(CONTROLW_WINDOW_SIZE_X, CONTROLW_WINDOW_SIZE_Y), ImGuiCond_FirstUseEver); + + if (m_UIState.bShowControlsWindow) + { + ImGui::Begin("CONTROLS (F1)", &m_UIState.bShowControlsWindow); + + // Render UI for Radix Sort + m_pRenderer->RenderParallelSortUI(); + + ImGui::Spacing(); + ImGui::Spacing(); + + if (ImGui::CollapsingHeader("Presentation Mode", 
ImGuiTreeNodeFlags_DefaultOpen)) + { + const char* fullscreenModes[] = { "Windowed", "BorderlessFullscreen", "ExclusiveFullscreen" }; + if (ImGui::Combo("Fullscreen Mode", (int*)&m_fullscreenMode, fullscreenModes, _countof(fullscreenModes))) + { + if (m_previousFullscreenMode != m_fullscreenMode) + { + HandleFullScreen(); + m_previousFullscreenMode = m_fullscreenMode; + } + } + } + + ImGui::End(); // CONTROLS + } + + + // Render PROFILER window + // + if (m_UIState.bShowProfilerWindow) + { + constexpr size_t NUM_FRAMES = 128; + static float FRAME_TIME_ARRAY[NUM_FRAMES] = { 0 }; + + // track highest frame rate and determine the max value of the graph based on the measured highest value + static float RECENT_HIGHEST_FRAME_TIME = 0.0f; + constexpr int FRAME_TIME_GRAPH_MAX_FPS[] = { 800, 240, 120, 90, 60, 45, 30, 15, 10, 5, 4, 3, 2, 1 }; + static float FRAME_TIME_GRAPH_MAX_VALUES[_countof(FRAME_TIME_GRAPH_MAX_FPS)] = { 0 }; // us + for (int i = 0; i < _countof(FRAME_TIME_GRAPH_MAX_FPS); ++i) { FRAME_TIME_GRAPH_MAX_VALUES[i] = 1000000.f / FRAME_TIME_GRAPH_MAX_FPS[i]; } + + //scrolling data and average FPS computing + const auto& timeStamps = m_pRenderer->GetTimingValues(); + const bool bTimeStampsAvailable = timeStamps.size() > 0; + if (bTimeStampsAvailable) + { + // Scroll the history one slot to the left first and only then append the newest sample, so the newest value is not duplicated; + // the highest frame time is tracked over the whole window instead of being reset to the newest sample + RECENT_HIGHEST_FRAME_TIME = 0; + for (uint32_t i = 0; i < NUM_FRAMES - 1; i++) + { + FRAME_TIME_ARRAY[i] = FRAME_TIME_ARRAY[i + 1]; + RECENT_HIGHEST_FRAME_TIME = max(RECENT_HIGHEST_FRAME_TIME, FRAME_TIME_ARRAY[i]); + } + FRAME_TIME_ARRAY[NUM_FRAMES - 1] = timeStamps.back().m_microseconds; + RECENT_HIGHEST_FRAME_TIME = max(RECENT_HIGHEST_FRAME_TIME, FRAME_TIME_ARRAY[NUM_FRAMES - 1]); + } + const float& frameTime_us = FRAME_TIME_ARRAY[NUM_FRAMES - 1]; + const float frameTime_ms = frameTime_us * 0.001f; + // A zero frame time would divide by zero; report 0 FPS in that case + const int fps = (bTimeStampsAvailable && frameTime_us > 0.0f) ? 
static_cast<int>(1000000.0f / frameTime_us) : 0; + + // UI + ImGui::SetNextWindowPos(ImVec2((float)PROFILER_WINDOW_POS_X, (float)PROFILER_WINDOW_POS_Y), ImGuiCond_FirstUseEver); + ImGui::SetNextWindowSize(ImVec2(PROFILER_WINDOW_SIZE_X, PROFILER_WINDOW_SIZE_Y), ImGuiCond_FirstUseEver); + ImGui::Begin("PROFILER (F2)", &m_UIState.bShowProfilerWindow); + + ImGui::Text("Resolution : %ix%i", m_Width, m_Height); + ImGui::Text("API : %s", m_systemInfo.mGfxAPI.c_str()); + ImGui::Text("GPU : %s", m_systemInfo.mGPUName.c_str()); + ImGui::Text("CPU : %s", m_systemInfo.mCPUName.c_str()); + ImGui::Text("FPS : %d (%.2f ms)", fps, frameTime_ms); + + if (ImGui::CollapsingHeader("GPU Timings", ImGuiTreeNodeFlags_DefaultOpen)) + { + std::string msOrUsButtonText = m_UIState.bShowMilliseconds ? "Switch to microseconds(us)" : "Switch to milliseconds(ms)"; + if (ImGui::Button(msOrUsButtonText.c_str())) { + m_UIState.bShowMilliseconds = !m_UIState.bShowMilliseconds; + } + ImGui::Spacing(); + + // find the index of the FrameTimeGraphMaxValue as the next higher-than-recent-highest-frame-time in the pre-determined value list + // (defaults to the largest scale so a frame time above every table entry is not plotted against the smallest one) + size_t iFrameTimeGraphMaxValue = _countof(FRAME_TIME_GRAPH_MAX_VALUES) - 1; + for (int i = 0; i < _countof(FRAME_TIME_GRAPH_MAX_VALUES); ++i) + { + if (RECENT_HIGHEST_FRAME_TIME < FRAME_TIME_GRAPH_MAX_VALUES[i]) // FRAME_TIME_GRAPH_MAX_VALUES are in increasing order + { + iFrameTimeGraphMaxValue = min(_countof(FRAME_TIME_GRAPH_MAX_VALUES) - 1, i + 1); + break; + } + } + ImGui::PlotLines("", FRAME_TIME_ARRAY, NUM_FRAMES, 0, "GPU frame time (us)", 0.0f, FRAME_TIME_GRAPH_MAX_VALUES[iFrameTimeGraphMaxValue], ImVec2(0, 80)); + + for (uint32_t i = 0; i < timeStamps.size(); i++) + { + float value = m_UIState.bShowMilliseconds ? timeStamps[i].m_microseconds / 1000.0f : timeStamps[i].m_microseconds; + const char* pStrUnit = m_UIState.bShowMilliseconds ? 
"ms" : "us"; + ImGui::Text("%-18s: %7.2f %s", timeStamps[i].m_label.c_str(), value, pStrUnit); + } + } + ImGui::End(); // PROFILER + } +} + +void UIState::Initialize() +{ + // init GUI state + this->bShowControlsWindow = true; + this->bShowProfilerWindow = true; +} diff --git a/sample/src/VK/UI.h b/sample/src/VK/UI.h new file mode 100644 index 0000000..25c8489 --- /dev/null +++ b/sample/src/VK/UI.h @@ -0,0 +1,40 @@ +// AMD SampleVK sample code +// +// Copyright(c) 2021 Advanced Micro Devices, Inc.All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#pragma once + +#include + +struct UIState +{ + // + // WINDOW MANAGEMENT + // + bool bShowControlsWindow; + bool bShowProfilerWindow; + + // + // PROFILER CONTROLS + // + bool bShowMilliseconds; + + // ----------------------------------------------- + + void Initialize(); +}; \ No newline at end of file diff --git a/sample/src/VK/dpiawarescaling.manifest b/sample/src/VK/dpiawarescaling.manifest new file mode 100644 index 0000000..be73c8a --- /dev/null +++ b/sample/src/VK/dpiawarescaling.manifest @@ -0,0 +1,8 @@ + + + + + true/PM + + + diff --git a/sample/src/VK/renderer.cpp b/sample/src/VK/renderer.cpp new file mode 100644 index 0000000..7cbb3db --- /dev/null +++ b/sample/src/VK/renderer.cpp @@ -0,0 +1,269 @@ +// samplerenderer.cpp +// +// Copyright(c) 2021 Advanced Micro Devices, Inc.All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#include "Renderer.h" +#include "UI.h" + +//-------------------------------------------------------------------------------------- +// +// OnCreate +// +//-------------------------------------------------------------------------------------- +void Renderer::OnCreate(Device* pDevice, SwapChain *pSwapChain, float FontSize) +{ + m_pDevice = pDevice; + + // Initialize helpers + + // Create all the heaps for the resource views + const uint32_t cbvDescriptorCount = 4000; + const uint32_t srvDescriptorCount = 8000; + const uint32_t uavDescriptorCount = 10; + const uint32_t samplerDescriptorCount = 20; + m_ResourceViewHeaps.OnCreate(pDevice, cbvDescriptorCount, srvDescriptorCount, uavDescriptorCount, samplerDescriptorCount); + + // Create a command list ring for the graphics queue + uint32_t commandListsPerBackBuffer = 8; + m_CommandListRing.OnCreate(pDevice, BackBufferCount, commandListsPerBackBuffer); + + // Create a 'dynamic' constant buffer + const uint32_t constantBuffersMemSize = 20 * 1024 * 1024; + m_ConstantBufferRing.OnCreate(pDevice, BackBufferCount, constantBuffersMemSize, "Uniforms"); + + // Create a 'static' pool for vertices, indices and constant buffers + const uint32_t staticGeometryMemSize = (2 * 128) * 1024 * 1024; + m_VidMemBufferPool.OnCreate(pDevice, staticGeometryMemSize, true, "StaticGeom"); + + // initialize the GPU time stamps module + m_GPUTimer.OnCreate(pDevice, BackBufferCount); + + // Quick helper to upload resources, it has its own command list and uses sub-allocation. 
+ const uint32_t uploadHeapMemSize = 100 * 1024 * 1024; + m_UploadHeap.OnCreate(pDevice, uploadHeapMemSize); // initialize an upload heap (uses sub-allocation for faster results) + + // Initialize UI rendering resources + m_ImGUI.OnCreate(m_pDevice, pSwapChain->GetRenderPass(), &m_UploadHeap, &m_ConstantBufferRing, FontSize); + + // Create FFX Parallel Sort pass + m_ParallelSort.OnCreate(pDevice, &m_ResourceViewHeaps, &m_ConstantBufferRing, &m_UploadHeap, pSwapChain); + + // Make sure upload heap has finished uploading before continuing + m_VidMemBufferPool.UploadData(m_UploadHeap.GetCommandList()); + m_UploadHeap.FlushAndFinish(); +} + +//-------------------------------------------------------------------------------------- +// +// OnDestroy +// +//-------------------------------------------------------------------------------------- +void Renderer::OnDestroy() +{ + m_ParallelSort.OnDestroy(); + m_ImGUI.OnDestroy(); + + m_UploadHeap.OnDestroy(); + m_GPUTimer.OnDestroy(); + m_VidMemBufferPool.OnDestroy(); + m_ConstantBufferRing.OnDestroy(); + m_ResourceViewHeaps.OnDestroy(); + m_CommandListRing.OnDestroy(); +} + +//-------------------------------------------------------------------------------------- +// +// OnCreateWindowSizeDependentResources +// +//-------------------------------------------------------------------------------------- +void Renderer::OnCreateWindowSizeDependentResources(SwapChain* pSwapChain, uint32_t Width, uint32_t Height) +{ + m_Width = Width; + m_Height = Height; + + // Set the viewport & scissor rect (y = Height with a negative height flips the viewport vertically) + m_Viewport.x = 0; + m_Viewport.y = (float)Height; + m_Viewport.width = (float)Width; + m_Viewport.height = -(float)(Height); + m_Viewport.minDepth = (float)0.0f; + m_Viewport.maxDepth = (float)1.0f; + m_RectScissor.extent.width = Width; + m_RectScissor.extent.height = Height; + m_RectScissor.offset.x = 0; + m_RectScissor.offset.y = 0; +} + 
+//-------------------------------------------------------------------------------------- +// +// 
OnDestroyWindowSizeDependentResources +// +//-------------------------------------------------------------------------------------- +void Renderer::OnDestroyWindowSizeDependentResources() +{ +} + +void Renderer::OnUpdateDisplayDependentResources(SwapChain* pSwapChain) +{ + // Update pipelines in case the format of the RTs changed (this happens when going HDR) + m_ImGUI.UpdatePipeline(pSwapChain->GetRenderPass()); +} + +//-------------------------------------------------------------------------------------- +// +// OnRender +// +//-------------------------------------------------------------------------------------- +void Renderer::OnRender(const UIState* pState, SwapChain* pSwapChain, float Time, bool bIsBenchmarking) +{ + // Let our resource managers do some house keeping + m_ConstantBufferRing.OnBeginFrame(); + + // command buffer calls + VkCommandBuffer cmdBuf1 = m_CommandListRing.GetNewCommandList(); + + { + VkCommandBufferBeginInfo cmd_buf_info; + cmd_buf_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + cmd_buf_info.pNext = NULL; + cmd_buf_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + cmd_buf_info.pInheritanceInfo = NULL; + VkResult res = vkBeginCommandBuffer(cmdBuf1, &cmd_buf_info); + assert(res == VK_SUCCESS); + } + + m_GPUTimer.OnBeginFrame(cmdBuf1, &m_TimeStamps); + + // Copy the data to sort for the frame (don't time this -- external to process) + m_ParallelSort.CopySourceDataForFrame(cmdBuf1); + m_GPUTimer.GetTimeStamp(cmdBuf1, "Begin Frame"); + + // Do sort tests ----------------------------------------------------------------------- + m_ParallelSort.Sort(cmdBuf1, bIsBenchmarking, Time); + m_GPUTimer.GetTimeStamp(cmdBuf1, "FFX Parallel Sort"); + + // submit command buffer #1 + { + VkResult res = vkEndCommandBuffer(cmdBuf1); + assert(res == VK_SUCCESS); + + VkSubmitInfo submit_info; + submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submit_info.pNext = NULL; + submit_info.waitSemaphoreCount = 0; + submit_info.pWaitSemaphores 
= NULL; + submit_info.pWaitDstStageMask = NULL; + submit_info.commandBufferCount = 1; + submit_info.pCommandBuffers = &cmdBuf1; + submit_info.signalSemaphoreCount = 0; + submit_info.pSignalSemaphores = NULL; + res = vkQueueSubmit(m_pDevice->GetGraphicsQueue(), 1, &submit_info, VK_NULL_HANDLE); + assert(res == VK_SUCCESS); + } + + // Wait for swapchain (we are going to render to it) ----------------------------------- + int imageIndex = pSwapChain->WaitForSwapChain(); + + m_CommandListRing.OnBeginFrame(); + + VkCommandBuffer cmdBuf2 = m_CommandListRing.GetNewCommandList(); + + { + VkCommandBufferBeginInfo cmd_buf_info; + cmd_buf_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + cmd_buf_info.pNext = NULL; + cmd_buf_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + cmd_buf_info.pInheritanceInfo = NULL; + VkResult res = vkBeginCommandBuffer(cmdBuf2, &cmd_buf_info); + assert(res == VK_SUCCESS); + } + + SetPerfMarkerBegin(cmdBuf2, "rendering to swap chain"); + + // prepare render pass + { + VkRenderPassBeginInfo rp_begin = {}; + rp_begin.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; + rp_begin.pNext = NULL; + rp_begin.renderPass = pSwapChain->GetRenderPass(); + rp_begin.framebuffer = pSwapChain->GetFramebuffer(imageIndex); + rp_begin.renderArea.offset.x = 0; + rp_begin.renderArea.offset.y = 0; + rp_begin.renderArea.extent.width = m_Width; + rp_begin.renderArea.extent.height = m_Height; + rp_begin.clearValueCount = 0; + rp_begin.pClearValues = nullptr; + vkCmdBeginRenderPass(cmdBuf2, &rp_begin, VK_SUBPASS_CONTENTS_INLINE); + + VkClearValue clearColor; + clearColor.color = { 0.f, 0.f, 0.f, 0.f }; + VkClearAttachment clearAttachment; + clearAttachment.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + clearAttachment.colorAttachment = 0; + clearAttachment.clearValue = clearColor; + VkClearRect clearRect; + clearRect.baseArrayLayer = 0; + clearRect.layerCount = 1; + clearRect.rect.offset = { 0, 0 }; + clearRect.rect.extent.width = m_Width; + 
clearRect.rect.extent.height = m_Height; + vkCmdClearAttachments(cmdBuf2, 1, &clearAttachment, 1, &clearRect); + } + + vkCmdSetScissor(cmdBuf2, 0, 1, &m_RectScissor); + vkCmdSetViewport(cmdBuf2, 0, 1, &m_Viewport); + + // Render sort source/results over everything except the HUD -------------------------- + m_ParallelSort.DrawVisualization(cmdBuf2, m_Width, m_Height); + + // Render HUD + m_ImGUI.Draw(cmdBuf2); + m_GPUTimer.GetTimeStamp(cmdBuf2, "ImGUI Rendering"); + + m_GPUTimer.OnEndFrame(); + + vkCmdEndRenderPass(cmdBuf2); + + SetPerfMarkerEnd(cmdBuf2); + + // Close & Submit the command list ---------------------------------------------------- + { + VkResult res = vkEndCommandBuffer(cmdBuf2); + assert(res == VK_SUCCESS); + + VkSemaphore ImageAvailableSemaphore; + VkSemaphore RenderFinishedSemaphores; + VkFence CmdBufExecutedFences; + pSwapChain->GetSemaphores(&ImageAvailableSemaphore, &RenderFinishedSemaphores, &CmdBufExecutedFences); + + VkPipelineStageFlags submitWaitStage = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + VkSubmitInfo submit_info2; + submit_info2.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submit_info2.pNext = NULL; + submit_info2.waitSemaphoreCount = 1; + submit_info2.pWaitSemaphores = &ImageAvailableSemaphore; + submit_info2.pWaitDstStageMask = &submitWaitStage; + submit_info2.commandBufferCount = 1; + submit_info2.pCommandBuffers = &cmdBuf2; + submit_info2.signalSemaphoreCount = 1; + submit_info2.pSignalSemaphores = &RenderFinishedSemaphores; + + res = vkQueueSubmit(m_pDevice->GetGraphicsQueue(), 1, &submit_info2, CmdBufExecutedFences); + assert(res == VK_SUCCESS); + } +} diff --git a/sample/src/DX12/samplerenderer.h b/sample/src/VK/renderer.h similarity index 54% rename from sample/src/DX12/samplerenderer.h rename to sample/src/VK/renderer.h index aa8ffa1..04a3e87 100644 --- a/sample/src/DX12/samplerenderer.h +++ b/sample/src/VK/renderer.h @@ -1,86 +1,78 @@ // samplerenderer.h // -// Copyright (c) 2020 Advanced Micro Devices, Inc. 
All rights reserved. +// Copyright(c) 2021 Advanced Micro Devices, Inc.All rights reserved. // Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal +// of this software and associated documentation files(the "Software"), to deal // in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell // copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: +// furnished to do so, subject to the following conditions : // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#pragma once -#include "base\\SaveTexture.h" -// We are queuing (backBufferCount + 0.5) frames, so we need to triple buffer the resources that get modified each frame -static const int backBufferCount = 3; +#include "stdafx.h" +#include "PostProc/MagnifierPS.h" + +struct UIState; + +// We are queuing (BackBufferCount + 0.5) frames, so we need to triple buffer the resources that get modified each frame +static const int BackBufferCount = 3; -#define USE_VID_MEM true -#define USE_AOMASK false #define USE_SHADOWMASK false -using namespace CAULDRON_DX12; +using namespace CAULDRON_VK; // // This class deals with the GPU side of the sample. // -class SampleRenderer +class Renderer { public: - struct State - { - float time; - bool m_isBenchmarking; - const std::string *m_pScreenShotName = nullptr; - }; - - void OnCreate(Device* pDevice, SwapChain *pSwapChain); + + void OnCreate(Device* pDevice, SwapChain* pSwapChain, float FontSize); void OnDestroy(); - void OnCreateWindowSizeDependentResources(SwapChain *pSwapChain, uint32_t Width, uint32_t Height); + void OnCreateWindowSizeDependentResources(SwapChain* pSwapChain, uint32_t Width, uint32_t Height); void OnDestroyWindowSizeDependentResources(); - const std::vector &GetTimingValues() { return m_TimeStamps; } + void OnUpdateDisplayDependentResources(SwapChain* pSwapChain); - void OnRender(State *pState, SwapChain *pSwapChain); + const std::vector& GetTimingValues() const { return m_TimeStamps; } - void RenderParallelSortUI() { m_FPS.DrawGui(); } + void OnRender(const UIState* pState, SwapChain* pSwapChain, float Time, bool bIsBenchmarking); + + void RenderParallelSortUI() { m_ParallelSort.DrawGui(); } private: - Device *m_pDevice; + Device* m_pDevice; uint32_t m_Width; uint32_t m_Height; - D3D12_VIEWPORT m_viewport; - D3D12_RECT m_rectScissor; + VkRect2D m_RectScissor; + VkViewport m_Viewport; // Initialize helper classes - ResourceViewHeaps m_resourceViewHeaps; + ResourceViewHeaps m_ResourceViewHeaps; UploadHeap 
m_UploadHeap; DynamicBufferRing m_ConstantBufferRing; StaticBufferPool m_VidMemBufferPool; CommandListRing m_CommandListRing; GPUTimestamps m_GPUTimer; - FFXParallelSort m_FPS; + FFXParallelSort m_ParallelSort; // GUI ImGUI m_ImGUI; - // Temporary render targets - - // Depth buffer - Texture m_depthBuffer; - // For benchmarking std::vector m_TimeStamps; - SaveTexture m_saveTexture; }; diff --git a/sample/src/VK/sample.cpp b/sample/src/VK/sample.cpp new file mode 100644 index 0000000..996ed10 --- /dev/null +++ b/sample/src/VK/sample.cpp @@ -0,0 +1,317 @@ +// sample.cpp +// +// Copyright(c) 2021 Advanced Micro Devices, Inc.All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#include "stdafx.h" +#include "Sample.h" + +#include +#include + +//-------------------------------------------------------------------------------------- +// +// OnParseCommandLine +// +//-------------------------------------------------------------------------------------- +void Sample::OnParseCommandLine(LPSTR lpCmdLine, uint32_t* pWidth, uint32_t* pHeight) +{ + // set some default values + *pWidth = 1920; + *pHeight = 1080; + m_VsyncEnabled = false; + m_bIsBenchmarking = false; + m_fontSize = 13.f; // default value overridden by a json file if available + m_isCpuValidationLayerEnabled = false; + m_isGpuValidationLayerEnabled = false; + m_stablePowerState = false; + + // Read globals + auto process = [&](json jData) + { + *pWidth = jData.value("width", *pWidth); + *pHeight = jData.value("height", *pHeight); + m_fullscreenMode = jData.value("presentationMode", m_fullscreenMode); + m_isCpuValidationLayerEnabled = jData.value("CpuValidationLayerEnabled", m_isCpuValidationLayerEnabled); + m_isGpuValidationLayerEnabled = jData.value("GpuValidationLayerEnabled", m_isGpuValidationLayerEnabled); + m_VsyncEnabled = jData.value("vsync", m_VsyncEnabled); + m_bIsBenchmarking = jData.value("benchmark", m_bIsBenchmarking); + m_stablePowerState = jData.value("stablePowerState", m_stablePowerState); + m_fontSize = jData.value("fontsize", m_fontSize); + }; + + // Read config file (command line arguments parsed further below can override these values) + { + std::ifstream f("FFXParallelSort.json"); + if (!f) + { + MessageBox(nullptr, "Config file not found!\n", "Cauldron Panic!", MB_ICONERROR); + exit(0); + } + + try + { + f >> m_jsonConfigFile; + } + catch (json::parse_error) + { + MessageBox(nullptr, "Error parsing FFXParallelSort.json!\n", "Cauldron Panic!", MB_ICONERROR); + exit(0); + } + } + + json globals = m_jsonConfigFile["globals"]; + process(globals); + + // Process the command line to see if we need to do anything for the sample (e.g. benchmarking, setup certain settings, etc.) 
+ std::string charString = lpCmdLine; + if (!charString.compare("")) + return; // No parameters + + // Need to first convert the char string to a wide character set. NOTE(review): the array returned by CommandLineToArgvW below is never released with LocalFree in this function - confirm whether that is acceptable + std::wstring wideString; + wideString.assign(charString.begin(), charString.end()); + + LPWSTR* ArgList; + int ArgCount, CurrentArg(0); + ArgList = CommandLineToArgvW(wideString.c_str(), &ArgCount); + while (CurrentArg < ArgCount) + { + wideString = ArgList[CurrentArg]; + + // Enable benchmarking + if (!wideString.compare(L"-benchmark")) + { + m_bIsBenchmarking = true; + ++CurrentArg; + } + + // Select the key set to sort (0-2) + else if (!wideString.compare(L"-keyset")) + { + assert(ArgCount > CurrentArg + 1 && "Incorrect usage of -keyset <0-2>"); + // Get the parameter + int keySet = std::stoi(ArgList[CurrentArg + 1]); + assert(keySet >= 0 && keySet < 3 && "Incorrect usage of -keyset <0-2>"); + FFXParallelSort::OverrideKeySet(keySet); + CurrentArg += 2; + } + + // Set payload sort + else if (!wideString.compare(L"-payload")) + { + FFXParallelSort::OverridePayload(); + ++CurrentArg; + } + + else + { + assert(false && "Unsupported command line parameter"); + exit(0); + } + } +} + +//-------------------------------------------------------------------------------------- +// +// OnCreate +// +//-------------------------------------------------------------------------------------- +void Sample::OnCreate() +{ + // Init the shader compiler + InitDirectXCompiler(); + CreateShaderCache(); + + // Create an instance of the renderer and initialize it, we need to do that for each GPU + m_pRenderer = new Renderer(); + m_pRenderer->OnCreate(&m_device, &m_swapChain, m_fontSize); + + // set benchmarking state if enabled + if (m_bIsBenchmarking) + { + std::string deviceName; + std::string driverVersion; + m_device.GetDeviceInfo(&deviceName, &driverVersion); + 
BenchmarkConfig(m_jsonConfigFile["BenchmarkSettings"], -1, nullptr, deviceName, driverVersion); + } + + // Init GUI (non gfx stuff) + ImGUI_Init((void*)m_windowHwnd); + 
m_UIState.Initialize(); + + OnResize(); + OnUpdateDisplay(); +} + +//-------------------------------------------------------------------------------------- +// +// OnDestroy +// +//-------------------------------------------------------------------------------------- +void Sample::OnDestroy() +{ + ImGUI_Shutdown(); + + m_device.GPUFlush(); + + m_pRenderer->OnDestroyWindowSizeDependentResources(); + m_pRenderer->OnDestroy(); + + delete m_pRenderer; + + // shut down the shader compiler + DestroyShaderCache(&m_device); +} + +//-------------------------------------------------------------------------------------- +// +// OnEvent, win32 sends us events and we forward them to ImGUI +// +//-------------------------------------------------------------------------------------- +static void ToggleBool(bool& b) { b = !b; } +bool Sample::OnEvent(MSG msg) +{ + if (ImGUI_WndProcHandler(msg.hwnd, msg.message, msg.wParam, msg.lParam)) + return true; + + // handle function keys (F1, F2...) here, rest of the input is handled + // by imGUI later in HandleInput() function + const WPARAM& KeyPressed = msg.wParam; + switch (msg.message) + { + case WM_KEYUP: + case WM_SYSKEYUP: + /* WINDOW TOGGLES */ + if (KeyPressed == VK_F1) m_UIState.bShowControlsWindow ^= 1; + if (KeyPressed == VK_F2) m_UIState.bShowProfilerWindow ^= 1; + break; + } + + return true; +} + +//-------------------------------------------------------------------------------------- +// +// OnResize +// +//-------------------------------------------------------------------------------------- +void Sample::OnResize() +{ + // Recreate window-size-dependent resources (if we are not minimized) + if (m_Width && m_Height && m_pRenderer) + { + m_pRenderer->OnDestroyWindowSizeDependentResources(); + m_pRenderer->OnCreateWindowSizeDependentResources(&m_swapChain, m_Width, m_Height); + } +} + +//-------------------------------------------------------------------------------------- +// +// OnUpdateDisplay +// 
+//-------------------------------------------------------------------------------------- +void Sample::OnUpdateDisplay() +{ + // Update display-dependent resources (only if the renderer exists) + if (m_pRenderer) + { + m_pRenderer->OnUpdateDisplayDependentResources(&m_swapChain); + } +} + +//-------------------------------------------------------------------------------------- +// +// OnUpdate +// +//-------------------------------------------------------------------------------------- +void Sample::OnUpdate() +{ + ImGuiIO& io = ImGui::GetIO(); + + // If ImGui wants to capture the mouse, zero the mouse deltas and wheel before handling input + if (io.WantCaptureMouse) + { + io.MouseDelta.x = 0; + io.MouseDelta.y = 0; + io.MouseWheel = 0; + } + + // Keyboard & Mouse + HandleInput(io); + + // Increase time + m_time += (float)m_deltaTime / 1000.0f; // time in seconds +} + +void Sample::HandleInput(const ImGuiIO& io) +{ + auto fnIsKeyTriggered = [&io](char key) { return io.KeysDown[key] && io.KeysDownDuration[key] == 0.0f; }; + + // Handle Keyboard/Mouse input here +} + +//-------------------------------------------------------------------------------------- +// +// OnRender, updates the state from the UI, animates, transforms and renders the scene +// +//-------------------------------------------------------------------------------------- +void Sample::OnRender() +{ + // Do any start of frame necessities + BeginFrame(); + + ImGUI_UpdateIO(); + ImGui::NewFrame(); + + if (m_bIsBenchmarking) + { + // Benchmarking takes control of the time, and exits the app when the animation is done + std::vector timeStamps = m_pRenderer->GetTimingValues(); + std::string Filename; + m_time = BenchmarkLoop(timeStamps, nullptr, Filename); + } + else + { + // Build the UI. Note that the rendering of the UI happens later. 
+ BuildUI(); + OnUpdate(); + } + + // Do Render frame using AFR + m_pRenderer->OnRender(&m_UIState, &m_swapChain, m_time, m_bIsBenchmarking); + + // Framework will handle Present and some other end of frame logic + EndFrame(); +} + + +//-------------------------------------------------------------------------------------- +// +// WinMain +// +//-------------------------------------------------------------------------------------- +int WINAPI WinMain(HINSTANCE hInstance, + HINSTANCE hPrevInstance, + LPSTR lpCmdLine, + int nCmdShow) +{ + LPCSTR Name = "FidelityFX Parallel Sort VK v1.1"; + + // create new VK sample + return RunFramework(hInstance, lpCmdLine, nCmdShow, new Sample(Name)); +} diff --git a/sample/src/VK/sample.h b/sample/src/VK/sample.h new file mode 100644 index 0000000..43fa8fc --- /dev/null +++ b/sample/src/VK/sample.h @@ -0,0 +1,56 @@ +// sample.h +// +// Copyright(c) 2021 Advanced Micro Devices, Inc.All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once + +#include "base/FrameworkWindows.h" +#include "Renderer.h" +#include "UI.h" + +// This class encapsulates the 'application' and is responsible for handling window events and scene updates (simulation) +// Rendering and rendering resource management is done by the Renderer class + +class Sample : public FrameworkWindows +{ +public: + Sample(LPCSTR name) : FrameworkWindows(name) { m_time = 0.f; } + void OnParseCommandLine(LPSTR lpCmdLine, uint32_t* pWidth, uint32_t* pHeight) override; + void OnCreate() override; + void OnDestroy() override; + void OnRender() override; + bool OnEvent(MSG msg) override; + void OnResize() override; + void OnUpdateDisplay() override; + + void BuildUI(); + void OnUpdate(); + void HandleInput(const ImGuiIO& io); + +private: + // Benchmarking support + bool m_bIsBenchmarking; + float m_time; + + Renderer* m_pRenderer = NULL; + UIState m_UIState; + float m_fontSize; + + // json config file + json m_jsonConfigFile; +}; diff --git a/sample/src/VK/stdafx.cpp b/sample/src/VK/stdafx.cpp new file mode 100644 index 0000000..541c2a6 --- /dev/null +++ b/sample/src/VK/stdafx.cpp @@ -0,0 +1,23 @@ +// stdafx.cpp +// +// Copyright(c) 2021 Advanced Micro Devices, Inc.All rights reserved. 
+// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include "stdafx.h" + +// TODO: reference any additional headers you need in STDAFX.H +// and not in this file diff --git a/sample/src/VK/stdafx.h b/sample/src/VK/stdafx.h new file mode 100644 index 0000000..6016047 --- /dev/null +++ b/sample/src/VK/stdafx.h @@ -0,0 +1,81 @@ +// stdafx.h +// +// Copyright(c) 2021 Advanced Micro Devices, Inc.All rights reserved. 
+// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE.
+ +#pragma once + +#define WIN32_LEAN_AND_MEAN // Exclude rarely-used stuff from Windows headers +// Windows Header Files: +#include +#include + +// C RunTime Header Files +#include +#include +#include +#include +#include + +#include "vulkan/vulkan.h" + +// we are using DirectXMath +#include +using namespace DirectX; + +// TODO: reference additional headers your program requires here +#include "Base/Imgui.h" +#include "Base/ImguiHelper.h" +#include "Base/Helper.h" +#include "Base/Device.h" +#include "Base/FrameworkWindows.h" +#include "Base/Texture.h" +#include "Base/SwapChain.h" +#include "Base/UploadHeap.h" +#include "Base/GPUTimestamps.h" +#include "Base/CommandListRing.h" +#include "Base/StaticBufferPool.h" +#include "Base/DynamicBufferRing.h" +#include "Base/ResourceViewHeaps.h" +#include "Base/ShaderCompilerHelper.h" + +#include "GLTF/GltfPbrPass.h" +#include "GLTF/GltfBBoxPass.h" +#include "GLTF/GltfDepthPass.h" +#include "GLTF/GltfMotionVectorsPass.h" + +#include "Misc/Misc.h" +#include "Misc/Error.h" +#include "Misc/Camera.h" + +#include "PostProc/TAA.h" +#include "PostProc/Bloom.h" +#include "PostProc/BlurPS.h" +#include "PostProc/SkyDome.h" +#include "PostProc/SkyDomeProc.h" +#include "PostProc/PostProcCS.h" +#include "PostProc/ToneMapping.h" +#include "PostProc/ToneMappingCS.h" +#include "PostProc/ColorConversionPS.h" +#include "PostProc/DownSamplePS.h" + +#include "ParallelSort.h" + +#include "Widgets/wireframe.h" + + +using namespace CAULDRON_VK;