Skip to content

Commit

Permalink
[WIP][LLPC] Scalarize non-uniform loads inside the waterfall loop
Browse files Browse the repository at this point in the history
  • Loading branch information
kmitropoulou committed Nov 10, 2023
1 parent 267ae83 commit a5df20e
Show file tree
Hide file tree
Showing 8 changed files with 569 additions and 105 deletions.
3 changes: 2 additions & 1 deletion include/vkgcDefs.h
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ struct optional_bool : private std::optional<bool> {
using std::optional<bool>::has_value;
using std::optional<bool>::value;
using std::optional<bool>::value_or;
using std::optional<bool>::operator*;
};

/// Enumerates result codes of LLPC operations.
Expand Down Expand Up @@ -873,7 +874,7 @@ struct PipelineShaderOptions {
unsigned ldsSpillLimitDwords;

/// Attempt to scalarize waterfall descriptor loads.
bool scalarizeWaterfallLoads;
optional_bool scalarizeWaterfallLoads;

/// Forces rearrangement of threadId within a group into blocks of 8*8 or 8*4
bool overrideForceThreadIdSwizzling;
Expand Down
287 changes: 260 additions & 27 deletions lgc/builder/BuilderImpl.cpp

Large diffs are not rendered by default.

11 changes: 5 additions & 6 deletions llpc/context/llpcPipelineContext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -612,13 +612,12 @@ ShaderOptions PipelineContext::computeShaderOptions(const PipelineShaderInfo &sh
}
}

if (ScalarizeWaterfallDescriptorLoads.getNumOccurrences() > 0) {
if (ScalarizeWaterfallDescriptorLoads.getNumOccurrences() > 0)
shaderOptions.scalarizeWaterfallLoads = ScalarizeWaterfallDescriptorLoads;
} else {
shaderOptions.scalarizeWaterfallLoads = shaderInfo.options.scalarizeWaterfallLoads;
// Enable waterfall load scalarization when vgpr limit is set.
if (shaderOptions.vgprLimit != 0 && shaderOptions.vgprLimit != UINT_MAX)
shaderOptions.scalarizeWaterfallLoads = true;
else {
shaderOptions.scalarizeWaterfallLoads = true;
if (shaderInfo.options.scalarizeWaterfallLoads.has_value())
shaderOptions.scalarizeWaterfallLoads = *shaderInfo.options.scalarizeWaterfallLoads;
}

shaderOptions.sgprLimit = shaderInfo.options.sgprLimit;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,24 @@ void main()
_3 = texture(_11[nonuniformEXT(_12)], vec2(0.0));
}

// BEGIN_SHADERTEST
/*
; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s
; Make sure that the begin indices chosen are the non-uniform offsets rather than the whole resource desc
; Make sure that there's a waterfall.readfirstlane for both the image resource desc and sample desc
; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
; SHADERTEST-DAG: call i32 @llvm.amdgcn.waterfall.begin.i32
; SHADERTEST-DAG: call i32 @llvm.amdgcn.waterfall.begin.i32
; SHADERTEST-DAG: call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32
; SHADERTEST-DAG: call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32
; SHADERTEST: AMDLLPC SUCCESS
*/
// END_SHADERTEST
// RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s
// Make sure that the begin indices chosen are the non-uniform offsets rather than the whole resource desc
// Make sure that a single waterfall.readfirstlane on the offset feeds both the image resource desc and sampler desc loads
// SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
// SHADERTEST: %[[mul:[0-9]+]] = mul i32 %{{.*}}, 48
// SHADERTEST-NEXT: %[[begin:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul]])
// SHADERTEST-NEXT: %[[readfirstlane:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin]], i32 %[[mul]])
// SHADERTEST-NEXT: %[[sext:[0-9]+]] = sext i32 %[[readfirstlane]] to i64
// SHADERTEST-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]]
// SHADERTEST-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32
// SHADERTEST-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 3
// SHADERTEST-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], 268435455
// SHADERTEST-NEXT: %[[cmp:[0-9]+]] = icmp slt i32 %[[extract]], 0
// SHADERTEST-NEXT: %[[select:[0-9]+]] = select i1 %[[cmp]], i32 %[[extract]], i32 %[[and]]
// SHADERTEST-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[select]], i64 3
// SHADERTEST-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
// SHADERTEST-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]]
// SHADERTEST-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16
// SHADERTEST-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half 0xH0000, half 0xH0000, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0)
// SHADERTEST-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]])
// SHADERTEST: AMDLLPC SUCCESS
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
// Make sure that there is a single begin index
// Make sure that there is a single waterfall.readfirstlane for the offset

#version 450
#extension GL_EXT_nonuniform_qualifier : require

Expand All @@ -16,18 +13,57 @@ void main()
_3 = texture(_11[nonuniformEXT(_12)], _6);
}

// BEGIN_SHADERTEST
//
// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s
// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v %gfxip %s | FileCheck -check-prefix=GFX %s
// Explicitly check GFX10.3 ASIC variants:
// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=SHADERTEST %s
// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=SHADERTEST %s
// SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results
// SHADERTEST: call i32 @llvm.amdgcn.waterfall.begin.i32
// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.begin.i32
// SHADERTEST: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32
// SHADERTEST-NOT: call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32
// SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.waterfall.end.v4f32
// SHADERTEST: AMDLLPC SUCCESS
//
// END_SHADERTEST
// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.0 %s | FileCheck -check-prefix=GFX_10_3_0 %s
// RUN: amdllpc -scalarize-waterfall-descriptor-loads -v --gfxip=10.3.2 %s | FileCheck -check-prefix=GFX_10_3_2 %s

// GFX-LABEL: {{^// LLPC}} pipeline patching results
// GFX: %[[mul:[0-9]+]] = mul i32 %{{.*}}, 48
// GFX-NEXT: %[[begin:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul]])
// GFX-NEXT: %[[readfirstlane:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin]], i32 %[[mul]])
// GFX-NEXT: %[[sext:[0-9]+]] = sext i32 %[[readfirstlane]] to i64
// GFX-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]]
// GFX-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32
// GFX-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 3
// GFX-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], 268435455
// GFX-NEXT: %[[cmp:[0-9]+]] = icmp slt i32 %[[extract]], 0
// GFX-NEXT: %[[select:[0-9]+]] = select i1 %[[cmp]], i32 %[[extract]], i32 %[[and]]
// GFX-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[select]], i64 3
// GFX-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
// GFX-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]]
// GFX-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16
// GFX-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0)
// GFX-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]])
// GFX: AMDLLPC SUCCESS

// GFX_10_3_0-LABEL: {{^// LLPC}} pipeline patching results
// GFX_10_3_0: %[[mul:[0-9]+]] = mul i32 %{{.*}}, 48
// GFX_10_3_0-NEXT: %[[begin:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul]])
// GFX_10_3_0-NEXT: %[[readfirstlane:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin]], i32 %[[mul]])
// GFX_10_3_0-NEXT: %[[sext:[0-9]+]] = sext i32 %[[readfirstlane]] to i64
// GFX_10_3_0-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]]
// GFX_10_3_0-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32
// GFX_10_3_0-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]]
// GFX_10_3_0-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16
// GFX_10_3_0-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[load1]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0)
// GFX_10_3_0-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]])
// GFX_10_3_0: AMDLLPC SUCCESS


// GFX_10_3_2-LABEL: {{^// LLPC}} pipeline patching results
// GFX_10_3_2: %[[mul:[0-9]+]] = mul i32 %{{.*}}, 48
// GFX_10_3_2-NEXT: %[[begin:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 %[[mul]])
// GFX_10_3_2-NEXT: %[[readfirstlane:[0-9]+]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 %[[begin]], i32 %[[mul]])
// GFX_10_3_2-NEXT: %[[sext:[0-9]+]] = sext i32 %[[readfirstlane]] to i64
// GFX_10_3_2-NEXT: %[[gep1:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]]
// GFX_10_3_2-NEXT: %[[load1:[0-9]+]] = load <8 x i32>, ptr addrspace(4) %[[gep1]], align 32
// GFX_10_3_2-NEXT: %[[extract:[.a-z0-9]+]] = extractelement <8 x i32> %[[load1]], i64 6
// GFX_10_3_2-NEXT: %[[and:[0-9]+]] = and i32 %[[extract]], -1048577
// GFX_10_3_2-NEXT: %[[insert:[.a-z0-9]+]] = insertelement <8 x i32> %[[load1]], i32 %[[and]], i64 6
// GFX_10_3_2-NEXT: %[[shufflevector:[0-9]+]] = shufflevector <8 x i32> %[[insert]], <8 x i32> %[[load1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
// GFX_10_3_2-NEXT: %[[gep2:[0-9]+]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i64 %[[sext]]
// GFX_10_3_2-NEXT: %[[load2:[0-9]+]] = load <4 x i32>, ptr addrspace(4) %[[gep2]], align 16
// GFX_10_3_2-NEXT: %[[image_call:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %{{.*}}, float %{{.*}}, <8 x i32> %[[shufflevector]], <4 x i32> %[[load2]], i1 false, i32 0, i32 0)
// GFX_10_3_2-NEXT: %[[end:[0-9]+]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 %[[begin]], <4 x float> %[[image_call]])
// GFX_10_3_2: AMDLLPC SUCCESS
Loading

0 comments on commit a5df20e

Please sign in to comment.