Skip to content

Commit

Permalink
[WIP][LLPC] Scalarize non-uniform loads inside the waterfall loop
Browse files Browse the repository at this point in the history
  • Loading branch information
kmitropoulou committed Nov 10, 2023
1 parent 86ca151 commit 8edc895
Show file tree
Hide file tree
Showing 12 changed files with 1,083 additions and 105 deletions.
3 changes: 2 additions & 1 deletion include/vkgcDefs.h
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,7 @@ struct optional_bool : private std::optional<bool> {
using std::optional<bool>::has_value;
using std::optional<bool>::value;
using std::optional<bool>::value_or;
using std::optional<bool>::operator*;
};

/// Enumerates result codes of LLPC operations.
Expand Down Expand Up @@ -882,7 +883,7 @@ struct PipelineShaderOptions {
unsigned ldsSpillLimitDwords;

/// Attempt to scalarize waterfall descriptor loads.
bool scalarizeWaterfallLoads;
optional_bool scalarizeWaterfallLoads;

/// Force rearranges threadId within group into blocks of 8*8 or 8*4
bool overrideForceThreadIdSwizzling;
Expand Down
287 changes: 260 additions & 27 deletions lgc/builder/BuilderImpl.cpp

Large diffs are not rendered by default.

80 changes: 80 additions & 0 deletions lgc/test/scalarizationDescriptorLoadsNegativeTest1.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --tool lgc
; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o - %s 2>&1 | FileCheck --check-prefixes=CHECK %s
; ModuleID = 'lgcPipeline'
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8-p32:32:32"
target triple = "amdgcn--amdpal"

; Function Attrs: nounwind
; Test input: a shader entry point that samples an image through a descriptor
; whose address is computed from a shader input value.
;
; The function reads a <2 x float> and an i32 generic input, scales the i32 by
; the descriptor stride, uses the product as a byte offset from the descriptor
; pointer, loads an <8 x i32> from that address and hands it to
; lgc.create.image.sample together with a constant <4 x i32> operand. The
; sampled <4 x float> is written out as a generic output.
;
; In the expected output at the end of this file the <8 x i32> load stays ahead
; of the llvm.amdgcn.waterfall.begin calls, i.e. it is not moved in between the
; waterfall begin/end intrinsics.
define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spirv.ExecutionModel !2 !lgc.shaderstage !3 {
.entry:
; %0 is later used as the <2 x float> address operand of the image sample.
%0 = call <2 x float> (...) @lgc.create.read.generic.input.v2f32(i32 1, i32 0, i32 0, i32 0, i32 17, i32 poison)
; %1 is the per-invocation value that selects the descriptor (it feeds the mul below).
%1 = call i32 (...) @lgc.create.read.generic.input.i32(i32 3, i32 0, i32 0, i32 0, i32 17, i32 poison)
; Descriptor base pointer and stride, both requested with the same (i32 1, i32 1, i64 0, i32 7) arguments.
; NOTE(review): the trailing "i64 0, i32 7" presumably matches the DescriptorCombinedTexture node in
; !lgc.user.data.nodes - confirm against the lgc descriptor builder interface.
%2 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 7)
%3 = call i32 (...) @lgc.create.get.desc.stride.i32(i32 1, i32 1, i64 0, i32 7)
; Byte offset = input value * stride, sign-extended to i64 for the i8 GEP.
%4 = mul i32 %1, %3
%5 = sext i32 %4 to i64
%6 = getelementptr i8, ptr addrspace(4) %2, i64 %5
; The descriptor load whose placement this test pins down.
%7 = load <8 x i32>, ptr addrspace(4) %6, align 32
; The <4 x i32> operand is a literal constant here rather than a loaded descriptor.
; NOTE(review): the flags value i32 24 presumably marks the image and sampler operands as
; non-uniform (the expected output wraps the sample in waterfall intrinsics) - confirm.
%8 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %7, <4 x i32> <i32 42, i32 42, i32 42, i32 42>, i32 1, <2 x float> %0)
call void (...) @lgc.create.write.generic.output(<4 x float> %8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison)
ret void
}

declare spir_func void @spirv.NonUniform.i32(i32) local_unnamed_addr

; Function Attrs: nounwind memory(none)
declare ptr addrspace(4) @lgc.create.get.desc.ptr.p4(...) local_unnamed_addr #1

; Function Attrs: nounwind memory(none)
declare i32 @lgc.create.get.desc.stride.i32(...) local_unnamed_addr #1

; Function Attrs: nounwind willreturn memory(read)
declare <4 x float> @lgc.create.image.sample.v4f32(...) local_unnamed_addr #2

; Function Attrs: nounwind willreturn memory(read)
declare i32 @lgc.create.read.generic.input.i32(...) local_unnamed_addr #2

; Function Attrs: nounwind willreturn memory(read)
declare <2 x float> @lgc.create.read.generic.input.v2f32(...) local_unnamed_addr #2

; Function Attrs: nounwind
declare void @lgc.create.write.generic.output(...) local_unnamed_addr #3

attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" }
attributes #1 = { nounwind memory(none) }
attributes #2 = { nounwind willreturn memory(read) }
attributes #3 = { nounwind }

!lgc.user.data.nodes = !{!0, !1}

!0 = !{!"DescriptorTableVaPtr", i32 7, i32 64, i32 0, i32 1, i32 1}
!1 = !{!"DescriptorCombinedTexture", i32 3, i32 64, i32 0, i32 192, i64 0, i32 7, i32 12}
!2 = !{i32 4}
!3 = !{i32 6}

; CHECK-LABEL: @lgc.shader.FS.main(
; CHECK-NEXT: .entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc()
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
; CHECK-NEXT: [[TMP2:%.*]] = call <2 x float> (...) @lgc.input.import.interpolated.v2f32(i1 false, i32 1, i32 0, i32 0, i32 poison, i32 1, i32 poison)
; CHECK-NEXT: [[TMP3:%.*]] = call i32 (...) @lgc.input.import.interpolated.i32(i1 false, i32 3, i32 0, i32 0, i32 poison, i32 1, i32 poison)
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @lgc.load.user.data.i32(i32 0)
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP4]], i64 0
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr addrspace(4)
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = mul i32 [[TMP3]], 48
; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP8]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP11]], align 32
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP2]], i64 0
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[TMP2]], i64 1
; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP9]])
; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.v4i32(i32 [[TMP15]], <4 x i32> <i32 42, i32 42, i32 42, i32 42>)
; CHECK-NEXT: [[TMP17:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 [[TMP16]], <8 x i32> [[TMP12]])
; CHECK-NEXT: [[TMP18:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 [[TMP16]], <4 x i32> <i32 42, i32 42, i32 42, i32 42>)
; CHECK-NEXT: [[TMP19:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float [[TMP13]], float [[TMP14]], <8 x i32> [[TMP17]], <4 x i32> [[TMP18]], i1 false, i32 0, i32 0)
; CHECK-NEXT: [[TMP20:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP16]], <4 x float> [[TMP19]])
; CHECK-NEXT: call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[TMP20]]) #[[ATTR5:[0-9]+]]
; CHECK-NEXT: ret void
;
86 changes: 86 additions & 0 deletions lgc/test/scalarizationDescriptorLoadsNegativeTest2.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --tool lgc
; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o - %s 2>&1 | FileCheck --check-prefixes=CHECK %s
; ModuleID = 'lgcPipeline'
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8-p32:32:32"
target triple = "amdgcn--amdpal"

declare <4 x i32> @foo1(<4 x i32> %V)

; Function Attrs: nounwind
; Test input: a shader entry point that samples an image where the <4 x i32>
; operand of the sample is produced by a call to an external function.
;
; The function reads a <2 x float> and an i32 generic input, scales the i32 by
; the descriptor stride, uses the product as a byte offset from the descriptor
; pointer, and loads both an <8 x i32> and a <4 x i32> from that same address.
; The <4 x i32> value is passed through @foo1 (declared only, no body in this
; file) before it reaches lgc.create.image.sample. The sampled <4 x float> is
; written out as a generic output.
;
; In the expected output at the end of this file both loads and the @foo1 call
; stay ahead of the llvm.amdgcn.waterfall.begin calls, i.e. none of them is
; moved in between the waterfall begin/end intrinsics.
define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spirv.ExecutionModel !2 !lgc.shaderstage !3 {
.entry:
; %0 is later used as the <2 x float> address operand of the image sample.
%0 = call <2 x float> (...) @lgc.create.read.generic.input.v2f32(i32 1, i32 0, i32 0, i32 0, i32 17, i32 poison)
; %1 is the per-invocation value that selects the descriptor (it feeds the mul below).
%1 = call i32 (...) @lgc.create.read.generic.input.i32(i32 3, i32 0, i32 0, i32 0, i32 17, i32 poison)
; Descriptor base pointer and stride, both requested with the same (i32 1, i32 1, i64 0, i32 7) arguments.
; NOTE(review): the trailing "i64 0, i32 7" presumably matches the DescriptorCombinedTexture node in
; !lgc.user.data.nodes - confirm against the lgc descriptor builder interface.
%2 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 7)
%3 = call i32 (...) @lgc.create.get.desc.stride.i32(i32 1, i32 1, i64 0, i32 7)
; Byte offset = input value * stride, sign-extended to i64 for the i8 GEP.
%4 = mul i32 %1, %3
%5 = sext i32 %4 to i64
%6 = getelementptr i8, ptr addrspace(4) %2, i64 %5
; Two loads of different widths from the same computed address.
%7 = load <8 x i32>, ptr addrspace(4) %6, align 32
%8 = load <4 x i32>, ptr addrspace(4) %6, align 32
; The loaded <4 x i32> goes through an opaque external call before it is used by the sample.
%9 = call <4 x i32> @foo1(<4 x i32> %8)
; NOTE(review): the flags value i32 24 presumably marks the image and sampler operands as
; non-uniform (the expected output wraps the sample in waterfall intrinsics) - confirm.
%10 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %7, <4 x i32> %9, i32 1, <2 x float> %0)
call void (...) @lgc.create.write.generic.output(<4 x float> %10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison)
ret void
}

declare spir_func void @spirv.NonUniform.i32(i32) local_unnamed_addr

; Function Attrs: nounwind memory(none)
declare ptr addrspace(4) @lgc.create.get.desc.ptr.p4(...) local_unnamed_addr #1

; Function Attrs: nounwind memory(none)
declare i32 @lgc.create.get.desc.stride.i32(...) local_unnamed_addr #1

; Function Attrs: nounwind willreturn memory(read)
declare <4 x float> @lgc.create.image.sample.v4f32(...) local_unnamed_addr #2

; Function Attrs: nounwind willreturn memory(read)
declare i32 @lgc.create.read.generic.input.i32(...) local_unnamed_addr #2

; Function Attrs: nounwind willreturn memory(read)
declare <2 x float> @lgc.create.read.generic.input.v2f32(...) local_unnamed_addr #2

; Function Attrs: nounwind
declare void @lgc.create.write.generic.output(...) local_unnamed_addr #3

attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" }
attributes #1 = { nounwind memory(none) }
attributes #2 = { nounwind willreturn memory(read) }
attributes #3 = { nounwind }

!lgc.user.data.nodes = !{!0, !1}

!0 = !{!"DescriptorTableVaPtr", i32 7, i32 64, i32 0, i32 1, i32 1}
!1 = !{!"DescriptorCombinedTexture", i32 3, i32 64, i32 0, i32 192, i64 0, i32 7, i32 12}
!2 = !{i32 4}
!3 = !{i32 6}

; CHECK-LABEL: @lgc.shader.FS.main(
; CHECK-NEXT: .entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc()
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
; CHECK-NEXT: [[TMP2:%.*]] = call <2 x float> (...) @lgc.input.import.interpolated.v2f32(i1 false, i32 1, i32 0, i32 0, i32 poison, i32 1, i32 poison)
; CHECK-NEXT: [[TMP3:%.*]] = call i32 (...) @lgc.input.import.interpolated.i32(i1 false, i32 3, i32 0, i32 0, i32 poison, i32 1, i32 poison)
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @lgc.load.user.data.i32(i32 0)
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP4]], i64 0
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr addrspace(4)
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = mul i32 [[TMP3]], 48
; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP8]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP11]], align 32
; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP11]], align 32
; CHECK-NEXT: [[TMP14:%.*]] = call <4 x i32> @foo1(<4 x i32> [[TMP13]])
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP2]], i64 0
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP2]], i64 1
; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP9]])
; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.v4i32(i32 [[TMP17]], <4 x i32> [[TMP14]])
; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 [[TMP18]], <8 x i32> [[TMP12]])
; CHECK-NEXT: [[TMP20:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 [[TMP18]], <4 x i32> [[TMP14]])
; CHECK-NEXT: [[TMP21:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float [[TMP15]], float [[TMP16]], <8 x i32> [[TMP19]], <4 x i32> [[TMP20]], i1 false, i32 0, i32 0)
; CHECK-NEXT: [[TMP22:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP18]], <4 x float> [[TMP21]])
; CHECK-NEXT: call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[TMP22]]) #[[ATTR5:[0-9]+]]
; CHECK-NEXT: ret void
;
Loading

0 comments on commit 8edc895

Please sign in to comment.