FidelityFX Parallel Sort v1.1
rys committed Apr 26, 2021
1 parent 0ab7e34 commit 40b1a9c
Showing 33 changed files with 4,024 additions and 1,420 deletions.
22 changes: 21 additions & 1 deletion .gitlab-ci.yml
@@ -1,9 +1,11 @@
variables:
-  SampleName: FfxParallelSort
+  SampleName: FFX_ParallelSort
  GIT_SUBMODULE_STRATEGY: normal

stages:
  - build
  - deploy

build_dx12:
  tags:
    - windows
@@ -15,17 +17,34 @@ build_dx12:
  artifacts:
    paths:
      - sample/bin/

build_vk:
  tags:
    - windows
    - amd64
  stage: build
  script:
    - 'cmake -S sample -B sample/build/VK -G "Visual Studio 16 2019" -A x64 -DGFX_API=VK -DBUILD_INSTALLER=ON'
    - 'cmake --build sample/build/VK --config Release'
  artifacts:
    paths:
      - sample/bin/

package_sample:
  tags:
    - windows
    - amd64
  stage: deploy
  dependencies:
    - build_dx12
    - build_vk
  script:
    - echo "Packaging build"
    - copy %VULKAN_SDK%\Bin\glslc.exe .\sample\bin
    - echo cd .\sample\bin\ > %SampleName%_DX12.bat
    - echo start %SampleName%_DX12.exe >> %SampleName%_DX12.bat
    - echo cd .\sample\bin\ > %SampleName%_VK.bat
    - echo start %SampleName%_VK.exe >> %SampleName%_VK.bat
  artifacts:
    name: "%SampleName%-%CI_COMMIT_TAG%-%CI_COMMIT_REF_NAME%-%CI_COMMIT_SHORT_SHA%"
    paths:
@@ -35,3 +54,4 @@ package_sample:
      - "readme.md"
      - "license.txt"
      - "%SampleName%_DX12.bat"
+      - "%SampleName%_VK.bat"
4 changes: 2 additions & 2 deletions LICENSE.txt
@@ -1,4 +1,4 @@
-Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -16,4 +16,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
\ No newline at end of file
+THE SOFTWARE.
32 changes: 27 additions & 5 deletions README.md
@@ -1,15 +1,37 @@
# FidelityFX Parallel Sort

Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

## Parallel Sort

The AMD FidelityFX Parallel Sort provides an open source header implementation to easily integrate a highly optimized compute-based radix sort into your game.

Features of the implementation:

-- Direct and Indirect execution support
+- Direct and indirect execution support
- RDNA+ optimized algorithm
-- Support for DirectX 12
+- Support for the Vulkan and Direct3D 12 APIs
- Shaders written in HLSL utilizing SM 6.0 wave-level operations
-- DirectX 12 sample
+- A sample application is provided for both Direct3D 12 and Vulkan

-Resources:
+## Resources

-Introduction to GPU Radix Sort - http://www.heterogeneouscompute.org/wordpress/wp-content/uploads/2011/06/RadixSort.pdf
+[Introduction to GPU Radix Sort](http://www.heterogeneouscompute.org/wordpress/wp-content/uploads/2011/06/RadixSort.pdf)
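
For orientation, here is a minimal sketch of how one of the header's passes might be wrapped in a user compute shader. This is not part of the commit: the binding names, register assignments, and the ShiftBit root constant are illustrative assumptions; only FFX_ParallelSortCB, FFX_PARALLELSORT_THREADGROUP_SIZE, and FFX_ParallelSort_Count_uint come from the header shown below.

// Hypothetical integration sketch (HLSL, SM 6.0). All bindings are assumptions.
#include "FFX_ParallelSort.h"

ConstantBuffer<FFX_ParallelSortCB> SortCB : register(b0);
cbuffer SortRootConstants : register(b1) { uint ShiftBit; } // 0, 4, ..., 28
RWStructuredBuffer<uint> SrcKeys  : register(u0);
RWStructuredBuffer<uint> SumTable : register(u1);

[numthreads(FFX_PARALLELSORT_THREADGROUP_SIZE, 1, 1)]
void CS_Count(uint localID : SV_GroupThreadID, uint groupID : SV_GroupID)
{
    // One of eight 4-bit counting passes; the host advances ShiftBit by 4
    // per pass and ping-pongs source/destination buffers between passes.
    FFX_ParallelSort_Count_uint(localID, groupID, SortCB, ShiftBit, SrcKeys, SumTable);
}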
68 changes: 34 additions & 34 deletions ffx-parallelsort/FFX_ParallelSort.h
@@ -97,12 +97,12 @@
uint NumScanValues;
};

-groupshared uint gs_Histogram[FFX_PARALLELSORT_THREADGROUP_SIZE * FFX_PARALLELSORT_SORT_BIN_COUNT];
+groupshared uint gs_FFX_PARALLELSORT_Histogram[FFX_PARALLELSORT_THREADGROUP_SIZE * FFX_PARALLELSORT_SORT_BIN_COUNT];
void FFX_ParallelSort_Count_uint(uint localID, uint groupID, FFX_ParallelSortCB CBuffer, uint ShiftBit, RWStructuredBuffer<uint> SrcBuffer, RWStructuredBuffer<uint> SumTable)
{
// Start by clearing our local counts in LDS
for (int i = 0; i < FFX_PARALLELSORT_SORT_BIN_COUNT; i++)
-gs_Histogram[(i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID] = 0;
+gs_FFX_PARALLELSORT_Histogram[(i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID] = 0;

// Wait for everyone to catch up
GroupMemoryBarrierWithGroupSync();
@@ -141,7 +141,7 @@
if (DataIndex < CBuffer.NumKeys)
{
uint localKey = (srcKeys[i] >> ShiftBit) & 0xf;
-InterlockedAdd(gs_Histogram[(localKey * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID], 1);
+InterlockedAdd(gs_FFX_PARALLELSORT_Histogram[(localKey * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID], 1);
DataIndex += FFX_PARALLELSORT_THREADGROUP_SIZE;
}
}
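
// Illustrative digit extraction for the counting loop above (key value
// assumed, not from the diff): for srcKeys[i] = 0x1234ABCD and ShiftBit = 8,
//   localKey = (0x1234ABCD >> 8) & 0xf = 0xB   -> counted into bin 11.
// Eight passes at ShiftBit = 0, 4, ..., 28 cover all 32 bits of a key.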
@@ -156,13 +156,13 @@
uint sum = 0;
for (int i = 0; i < FFX_PARALLELSORT_THREADGROUP_SIZE; i++)
{
-sum += gs_Histogram[localID * FFX_PARALLELSORT_THREADGROUP_SIZE + i];
+sum += gs_FFX_PARALLELSORT_Histogram[localID * FFX_PARALLELSORT_THREADGROUP_SIZE + i];
}
SumTable[localID * CBuffer.NumThreadGroups + groupID] = sum;
}
}
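
// Illustrative note on the SumTable write above (counts assumed, not from
// the diff): sums are stored bin-major, SumTable[bin * NumThreadGroups +
// group], so one linear exclusive scan over the whole table gives every
// (bin, group) pair its global write offset for the scatter pass. With 2
// thread groups and counts bin0 = {3, 1}, bin1 = {2, 4}:
//   SumTable       = 3 1 2 4
//   exclusive scan = 0 3 4 6   -> bin1's keys from group 0 start at offset 4.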

-groupshared uint gs_LDSSums[FFX_PARALLELSORT_THREADGROUP_SIZE];
+groupshared uint gs_FFX_PARALLELSORT_LDSSums[FFX_PARALLELSORT_THREADGROUP_SIZE];
uint FFX_ParallelSort_ThreadgroupReduce(uint localSum, uint localID)
{
// Do wave local reduce
@@ -172,14 +172,14 @@
// Note that some hardware with very small HW wave sizes (i.e. <= 8) may exhibit issues with this algorithm, and has not been tested.
uint waveID = localID / WaveGetLaneCount();
if (WaveIsFirstLane())
-gs_LDSSums[waveID] = waveReduced;
+gs_FFX_PARALLELSORT_LDSSums[waveID] = waveReduced;

// Wait for everyone to catch up
GroupMemoryBarrierWithGroupSync();

// First wave worth of threads sum up wave reductions
if (!waveID)
-waveReduced = WaveActiveSum( (localID < FFX_PARALLELSORT_THREADGROUP_SIZE / WaveGetLaneCount()) ? gs_LDSSums[localID] : 0);
+waveReduced = WaveActiveSum( (localID < FFX_PARALLELSORT_THREADGROUP_SIZE / WaveGetLaneCount()) ? gs_FFX_PARALLELSORT_LDSSums[localID] : 0);

// Return the reduced sum
return waveReduced;
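
// Illustrative shape of the two-level reduction above (sizes assumed, not
// from the diff): FFX_PARALLELSORT_THREADGROUP_SIZE = 128, wave size = 32.
//   Level 1: each of the 4 waves WaveActiveSum()s its 32 localSum values;
//            the first lane of each wave stores one partial into LDS.
//   Level 2: wave 0 WaveActiveSum()s those partials (lanes with localID >=
//            128 / 32 = 4 contribute 0), producing the thread group total.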
@@ -196,20 +196,20 @@

// Last element in a wave writes out partial sum to LDS
if (laneID == WaveGetLaneCount() - 1)
-gs_LDSSums[waveID] = wavePrefixed + localSum;
+gs_FFX_PARALLELSORT_LDSSums[waveID] = wavePrefixed + localSum;

// Wait for everyone to catch up
GroupMemoryBarrierWithGroupSync();

// First wave prefixes partial sums
if (!waveID)
-gs_LDSSums[localID] = WavePrefixSum(gs_LDSSums[localID]);
+gs_FFX_PARALLELSORT_LDSSums[localID] = WavePrefixSum(gs_FFX_PARALLELSORT_LDSSums[localID]);

// Wait for everyone to catch up
GroupMemoryBarrierWithGroupSync();

// Add the partial sums back to each wave prefix
-wavePrefixed += gs_LDSSums[waveID];
+wavePrefixed += gs_FFX_PARALLELSORT_LDSSums[waveID];

return wavePrefixed;
}
@@ -244,7 +244,7 @@

// This is to transform uncoalesced loads into coalesced loads and
// then scattered loads from LDS
-groupshared int gs_LDS[FFX_PARALLELSORT_ELEMENTS_PER_THREAD][FFX_PARALLELSORT_THREADGROUP_SIZE];
+groupshared int gs_FFX_PARALLELSORT_LDS[FFX_PARALLELSORT_ELEMENTS_PER_THREAD][FFX_PARALLELSORT_THREADGROUP_SIZE];
void FFX_ParallelSort_ScanPrefix(uint numValuesToScan, uint localID, uint groupID, uint BinOffset, uint BaseIndex, bool AddPartialSums,
FFX_ParallelSortCB CBuffer, RWStructuredBuffer<uint> ScanSrc, RWStructuredBuffer<uint> ScanDst, RWStructuredBuffer<uint> ScanScratch)
{
@@ -255,7 +255,7 @@

uint col = ((i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID) / FFX_PARALLELSORT_ELEMENTS_PER_THREAD;
uint row = ((i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID) % FFX_PARALLELSORT_ELEMENTS_PER_THREAD;
-gs_LDS[row][col] = (DataIndex < numValuesToScan) ? ScanSrc[BinOffset + DataIndex] : 0;
+gs_FFX_PARALLELSORT_LDS[row][col] = (DataIndex < numValuesToScan) ? ScanSrc[BinOffset + DataIndex] : 0;
}
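
// Illustrative trace of the coalescing remap above (sizes assumed, not from
// the diff): FFX_PARALLELSORT_THREADGROUP_SIZE = 4, ELEMENTS_PER_THREAD = 2.
// Flat index f = i * 4 + localID, col = f / 2, row = f % 2:
//   i = 0: threads 0..3 read elements 0..3 -> LDS[0][0], [1][0], [0][1], [1][1]
//   i = 1: threads 0..3 read elements 4..7 -> LDS[0][2], [1][2], [0][3], [1][3]
// Global reads stay contiguous per iteration (coalesced), while each thread
// later scans its own column LDS[0..1][localID] out of LDS.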

// Wait for everyone to catch up
@@ -265,8 +265,8 @@
// Calculate the local scan-prefix for current thread
for (uint i = 0; i < FFX_PARALLELSORT_ELEMENTS_PER_THREAD; i++)
{
-uint tmp = gs_LDS[i][localID];
-gs_LDS[i][localID] = threadgroupSum;
+uint tmp = gs_FFX_PARALLELSORT_LDS[i][localID];
+gs_FFX_PARALLELSORT_LDS[i][localID] = threadgroupSum;
threadgroupSum += tmp;
}

@@ -284,7 +284,7 @@

// Add the block scanned-prefixes back in
for (uint i = 0; i < FFX_PARALLELSORT_ELEMENTS_PER_THREAD; i++)
-gs_LDS[i][localID] += threadgroupSum;
+gs_FFX_PARALLELSORT_LDS[i][localID] += threadgroupSum;

// Wait for everyone to catch up
GroupMemoryBarrierWithGroupSync();
@@ -298,25 +298,25 @@
uint row = ((i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID) % FFX_PARALLELSORT_ELEMENTS_PER_THREAD;

if (DataIndex < numValuesToScan)
-ScanDst[BinOffset + DataIndex] = gs_LDS[row][col] + partialSum;
+ScanDst[BinOffset + DataIndex] = gs_FFX_PARALLELSORT_LDS[row][col] + partialSum;
}
}

// Offset cache to avoid loading the offsets all the time
-groupshared uint gs_BinOffsetCache[FFX_PARALLELSORT_THREADGROUP_SIZE];
+groupshared uint gs_FFX_PARALLELSORT_BinOffsetCache[FFX_PARALLELSORT_THREADGROUP_SIZE];
// Local histogram for offset calculations
-groupshared uint gs_LocalHistogram[FFX_PARALLELSORT_SORT_BIN_COUNT];
+groupshared uint gs_FFX_PARALLELSORT_LocalHistogram[FFX_PARALLELSORT_SORT_BIN_COUNT];
// Scratch area for algorithm
-groupshared uint gs_LDSScratch[FFX_PARALLELSORT_THREADGROUP_SIZE];
+groupshared uint gs_FFX_PARALLELSORT_LDSScratch[FFX_PARALLELSORT_THREADGROUP_SIZE];
void FFX_ParallelSort_Scatter_uint(uint localID, uint groupID, FFX_ParallelSortCB CBuffer, uint ShiftBit, RWStructuredBuffer<uint> SrcBuffer, RWStructuredBuffer<uint> DstBuffer, RWStructuredBuffer<uint> SumTable
#ifdef kRS_ValueCopy
-,RWStructuredBuffer<uint> SrcPayload, RWStructuredBuffer<uint> DstPayload
+,RWStructuredBuffer<uint> SrcPayload, RWStructuredBuffer<uint> DstPayload
#endif // kRS_ValueCopy
)
{
// Load the sort bin threadgroup offsets into LDS for faster referencing
if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT)
-gs_BinOffsetCache[localID] = SumTable[localID * CBuffer.NumThreadGroups + groupID];
+gs_FFX_PARALLELSORT_BinOffsetCache[localID] = SumTable[localID * CBuffer.NumThreadGroups + groupID];

// Wait for everyone to catch up
GroupMemoryBarrierWithGroupSync();
@@ -363,7 +363,7 @@
{
// Clear the local histogram
if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT)
-gs_LocalHistogram[localID] = 0;
+gs_FFX_PARALLELSORT_LocalHistogram[localID] = 0;

uint localKey = (DataIndex < CBuffer.NumKeys ? srcKeys[i] : 0xffffffff);
#ifdef kRS_ValueCopy
@@ -386,13 +386,13 @@
// Last thread stores the updated histogram counts for the thread group
// Scratch = 0xsum3|sum2|sum1|sum0 for thread group
if (localID == (FFX_PARALLELSORT_THREADGROUP_SIZE - 1))
-gs_LDSScratch[0] = localSum + packedHistogram;
+gs_FFX_PARALLELSORT_LDSScratch[0] = localSum + packedHistogram;

// Wait for everyone to catch up
GroupMemoryBarrierWithGroupSync();

// Load the sums value for the thread group
-packedHistogram = gs_LDSScratch[0];
+packedHistogram = gs_FFX_PARALLELSORT_LDSScratch[0];

// Add prefix offsets for all 4 bit "keys" (packedHistogram = 0xsum2_1_0|sum1_0|sum0|0)
packedHistogram = (packedHistogram << 8) + (packedHistogram << 16) + (packedHistogram << 24);
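
// Illustrative arithmetic for the packed-prefix line above (values assumed,
// not from the diff). Four per-bucket byte counts live in one uint:
//   packedHistogram = 0x02010302   // sum3=2, sum2=1, sum1=3, sum0=2
//   (x << 8) + (x << 16) + (x << 24) = 0x06050200
// Byte k now holds sum0 + .. + sum(k-1): 0x00, 0x02, 0x05, 0x06 -- the
// exclusive prefix giving each bucket's starting slot within the group.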
@@ -404,18 +404,18 @@
uint keyOffset = (localSum >> (bitKey * 8)) & 0xff;

// Re-arrange the keys (store, sync, load)
-gs_LDSSums[keyOffset] = localKey;
+gs_FFX_PARALLELSORT_LDSSums[keyOffset] = localKey;
GroupMemoryBarrierWithGroupSync();
-localKey = gs_LDSSums[localID];
+localKey = gs_FFX_PARALLELSORT_LDSSums[localID];

// Wait for everyone to catch up
GroupMemoryBarrierWithGroupSync();

#ifdef kRS_ValueCopy
// Re-arrange the values if we have them (store, sync, load)
-gs_LDSSums[keyOffset] = localValue;
+gs_FFX_PARALLELSORT_LDSSums[keyOffset] = localValue;
GroupMemoryBarrierWithGroupSync();
-localValue = gs_LDSSums[localID];
+localValue = gs_FFX_PARALLELSORT_LDSSums[localID];

// Wait for everyone to catch up
GroupMemoryBarrierWithGroupSync();
@@ -426,26 +426,26 @@
uint keyIndex = (localKey >> ShiftBit) & 0xf;

// Reconstruct histogram
-InterlockedAdd(gs_LocalHistogram[keyIndex], 1);
+InterlockedAdd(gs_FFX_PARALLELSORT_LocalHistogram[keyIndex], 1);

// Wait for everyone to catch up
GroupMemoryBarrierWithGroupSync();

// Prefix histogram
-uint histogramPrefixSum = WavePrefixSum(localID < FFX_PARALLELSORT_SORT_BIN_COUNT ? gs_LocalHistogram[localID] : 0);
+uint histogramPrefixSum = WavePrefixSum(localID < FFX_PARALLELSORT_SORT_BIN_COUNT ? gs_FFX_PARALLELSORT_LocalHistogram[localID] : 0);

// Broadcast prefix-sum via LDS
if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT)
-gs_LDSScratch[localID] = histogramPrefixSum;
+gs_FFX_PARALLELSORT_LDSScratch[localID] = histogramPrefixSum;

// Get the global offset for this key out of the cache
-uint globalOffset = gs_BinOffsetCache[keyIndex];
+uint globalOffset = gs_FFX_PARALLELSORT_BinOffsetCache[keyIndex];

// Wait for everyone to catch up
GroupMemoryBarrierWithGroupSync();

// Get the local offset (at this point the keys are all in increasing order from 0 -> num bins in localID 0 -> thread group size)
-uint localOffset = localID - gs_LDSScratch[keyIndex];
+uint localOffset = localID - gs_FFX_PARALLELSORT_LDSScratch[keyIndex];

// Write to destination
uint totalOffset = globalOffset + localOffset;
@@ -464,7 +464,7 @@

// Update the cached histogram for the next set of entries
if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT)
-gs_BinOffsetCache[localID] += gs_LocalHistogram[localID];
+gs_FFX_PARALLELSORT_BinOffsetCache[localID] += gs_FFX_PARALLELSORT_LocalHistogram[localID];

DataIndex += FFX_PARALLELSORT_THREADGROUP_SIZE; // Increase the data offset by thread group size
}
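
The bulk of this file's changes rename every groupshared (LDS) variable from a generic gs_* name to a gs_FFX_PARALLELSORT_* name. A plausible motivation, sketched below with a hypothetical user shader that is not from this commit, is avoiding symbol collisions when the header is included into shader code that declares its own LDS arrays:

// User shader, hypothetical:
groupshared uint gs_Histogram[64];   // user's own LDS scratch
#include "FFX_ParallelSort.h"        // pre-1.1 header also declared gs_Histogram
                                     // -> redefinition error at shader compile
// With the gs_FFX_PARALLELSORT_* prefix the header's LDS names are
// effectively namespaced, so such includes compile cleanly.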