[WIP][LLPC] Scalarize non-uniform loads inside the waterfall loop

GPUOpen-Drivers · Nov 10, 2023 · 8edc895
1 parent 86ca151
commit 8edc895
Show file tree

Hide file tree

Showing 12 changed files with 1,083 additions and 105 deletions.
diff --git a/include/vkgcDefs.h b/include/vkgcDefs.h
@@ -244,6 +244,7 @@ struct optional_bool : private std::optional<bool> {
   using std::optional<bool>::has_value;
   using std::optional<bool>::value;
   using std::optional<bool>::value_or;
+  using std::optional<bool>::operator*;
 };
 
 /// Enumerates result codes of LLPC operations.
@@ -882,7 +883,7 @@ struct PipelineShaderOptions {
   unsigned ldsSpillLimitDwords;
 
   /// Attempt to scalarize waterfall descriptor loads.
-  bool scalarizeWaterfallLoads;
+  optional_bool scalarizeWaterfallLoads;
 
   /// Force rearranges threadId within group into blocks of 8*8 or 8*4
   bool overrideForceThreadIdSwizzling;

diff --git a/lgc/builder/BuilderImpl.cpp b/lgc/builder/BuilderImpl.cpp
diff --git a/lgc/test/scalarizationDescriptorLoadsNegativeTest1.ll b/lgc/test/scalarizationDescriptorLoadsNegativeTest1.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --tool lgc
+; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o - %s 2>&1 | FileCheck --check-prefixes=CHECK %s
+; ModuleID = 'lgcPipeline'
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8-p32:32:32"
+target triple = "amdgcn--amdpal"
+
+; FS entry point: samples an image through a descriptor loaded at a runtime-computed offset. Function Attrs: nounwind
+define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spirv.ExecutionModel !2 !lgc.shaderstage !3 {
+.entry:
+  %0 = call <2 x float> (...) @lgc.create.read.generic.input.v2f32(i32 1, i32 0, i32 0, i32 0, i32 17, i32 poison) ; passed below as the last operand of the image sample
+  %1 = call i32 (...) @lgc.create.read.generic.input.i32(i32 3, i32 0, i32 0, i32 0, i32 17, i32 poison) ; multiplied by the descriptor stride below, i.e. acts as the descriptor index
+  %2 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 7) ; descriptor base pointer (presumably set 0 / binding 7, matching node !1 -- TODO confirm)
+  %3 = call i32 (...) @lgc.create.get.desc.stride.i32(i32 1, i32 1, i64 0, i32 7) ; stride queried with the same arguments as %2; appears as the constant 48 in the expected output
+  %4 = mul i32 %1, %3 ; offset = index * stride; this is the value the expected waterfall.begin is keyed on
+  %5 = sext i32 %4 to i64
+  %6 = getelementptr i8, ptr addrspace(4) %2, i64 %5 ; i8 GEP, so the offset is applied in bytes
+  %7 = load <8 x i32>, ptr addrspace(4) %6, align 32 ; the expected output keeps this load ahead of llvm.amdgcn.waterfall.begin (presumably why this is a "negative" test)
+  %8 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %7, <4 x i32> <i32 42, i32 42, i32 42, i32 42>, i32 1, <2 x float> %0) ; the <4 x i32> operand is a constant vector here, not a loaded value
+  call void (...) @lgc.create.write.generic.output(<4 x float> %8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison) ; writes the sampled value out
+  ret void
+}
+
+declare spir_func void @spirv.NonUniform.i32(i32) local_unnamed_addr
+
+; Function Attrs: nounwind memory(none)
+declare ptr addrspace(4) @lgc.create.get.desc.ptr.p4(...) local_unnamed_addr #1
+
+; Function Attrs: nounwind memory(none)
+declare i32 @lgc.create.get.desc.stride.i32(...) local_unnamed_addr #1
+
+; Function Attrs: nounwind willreturn memory(read)
+declare <4 x float> @lgc.create.image.sample.v4f32(...) local_unnamed_addr #2
+
+; Function Attrs: nounwind willreturn memory(read)
+declare i32 @lgc.create.read.generic.input.i32(...) local_unnamed_addr #2
+
+; Function Attrs: nounwind willreturn memory(read)
+declare <2 x float> @lgc.create.read.generic.input.v2f32(...) local_unnamed_addr #2
+
+; Function Attrs: nounwind
+declare void @lgc.create.write.generic.output(...) local_unnamed_addr #3
+
+attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" }
+attributes #1 = { nounwind memory(none) }
+attributes #2 = { nounwind willreturn memory(read) }
+attributes #3 = { nounwind }
+
+!lgc.user.data.nodes = !{!0, !1}
+
+!0 = !{!"DescriptorTableVaPtr", i32 7, i32 64, i32 0, i32 1, i32 1}
+!1 = !{!"DescriptorCombinedTexture", i32 3, i32 64, i32 0, i32 192, i64 0, i32 7, i32 12}
+!2 = !{i32 4}
+!3 = !{i32 6}
+
+; CHECK-LABEL: @lgc.shader.FS.main(
+; CHECK-NEXT:  .entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc()
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x float> (...) @lgc.input.import.interpolated.v2f32(i1 false, i32 1, i32 0, i32 0, i32 poison, i32 1, i32 poison)
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 (...) @lgc.input.import.interpolated.i32(i1 false, i32 3, i32 0, i32 0, i32 poison, i32 1, i32 poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @lgc.load.user.data.i32(i32 0)
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP4]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr addrspace(4)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i32 [[TMP3]], 48
+; CHECK-NEXT:    [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP8]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP11]], align 32
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x float> [[TMP2]], i64 0
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x float> [[TMP2]], i64 1
+; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP9]])
+; CHECK-NEXT:    [[TMP16:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.v4i32(i32 [[TMP15]], <4 x i32> <i32 42, i32 42, i32 42, i32 42>)
+; CHECK-NEXT:    [[TMP17:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 [[TMP16]], <8 x i32> [[TMP12]])
+; CHECK-NEXT:    [[TMP18:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 [[TMP16]], <4 x i32> <i32 42, i32 42, i32 42, i32 42>)
+; CHECK-NEXT:    [[TMP19:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float [[TMP13]], float [[TMP14]], <8 x i32> [[TMP17]], <4 x i32> [[TMP18]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[TMP20:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP16]], <4 x float> [[TMP19]])
+; CHECK-NEXT:    call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[TMP20]]) #[[ATTR5:[0-9]+]]
+; CHECK-NEXT:    ret void
+;
diff --git a/lgc/test/scalarizationDescriptorLoadsNegativeTest2.ll b/lgc/test/scalarizationDescriptorLoadsNegativeTest2.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --tool lgc
+; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o - %s 2>&1 | FileCheck --check-prefixes=CHECK %s
+; ModuleID = 'lgcPipeline'
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8-p32:32:32"
+target triple = "amdgcn--amdpal"
+
+declare <4 x i32> @foo1(<4 x i32> %V)
+
+; FS entry point: samples an image using two values loaded from the same runtime-computed descriptor address, one of them routed through @foo1. Function Attrs: nounwind
+define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spirv.ExecutionModel !2 !lgc.shaderstage !3 {
+.entry:
+  %0 = call <2 x float> (...) @lgc.create.read.generic.input.v2f32(i32 1, i32 0, i32 0, i32 0, i32 17, i32 poison) ; passed below as the last operand of the image sample
+  %1 = call i32 (...) @lgc.create.read.generic.input.i32(i32 3, i32 0, i32 0, i32 0, i32 17, i32 poison) ; multiplied by the descriptor stride below, i.e. acts as the descriptor index
+  %2 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 7) ; descriptor base pointer (presumably set 0 / binding 7, matching node !1 -- TODO confirm)
+  %3 = call i32 (...) @lgc.create.get.desc.stride.i32(i32 1, i32 1, i64 0, i32 7) ; stride queried with the same arguments as %2; appears as the constant 48 in the expected output
+  %4 = mul i32 %1, %3 ; offset = index * stride; one of the values the expected waterfall.begin chain is keyed on
+  %5 = sext i32 %4 to i64
+  %6 = getelementptr i8, ptr addrspace(4) %2, i64 %5 ; i8 GEP, so the offset is applied in bytes
+  %7 = load <8 x i32>, ptr addrspace(4) %6, align 32 ; the expected output keeps this load ahead of llvm.amdgcn.waterfall.begin
+  %8 = load <4 x i32>, ptr addrspace(4) %6, align 32 ; second load from the same address %6; also stays ahead of the waterfall intrinsics in the expected output
+  %9 = call <4 x i32> @foo1(<4 x i32> %8) ; external call between the load and its use; the call result (not the load) feeds the image sample
+  %10 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %7, <4 x i32> %9, i32 1, <2 x float> %0) ; the expected output adds a second waterfall.begin keyed on the @foo1 result
+  call void (...) @lgc.create.write.generic.output(<4 x float> %10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison) ; writes the sampled value out
+  ret void
+}
+
+declare spir_func void @spirv.NonUniform.i32(i32) local_unnamed_addr
+
+; Function Attrs: nounwind memory(none)
+declare ptr addrspace(4) @lgc.create.get.desc.ptr.p4(...) local_unnamed_addr #1
+
+; Function Attrs: nounwind memory(none)
+declare i32 @lgc.create.get.desc.stride.i32(...) local_unnamed_addr #1
+
+; Function Attrs: nounwind willreturn memory(read)
+declare <4 x float> @lgc.create.image.sample.v4f32(...) local_unnamed_addr #2
+
+; Function Attrs: nounwind willreturn memory(read)
+declare i32 @lgc.create.read.generic.input.i32(...) local_unnamed_addr #2
+
+; Function Attrs: nounwind willreturn memory(read)
+declare <2 x float> @lgc.create.read.generic.input.v2f32(...) local_unnamed_addr #2
+
+; Function Attrs: nounwind
+declare void @lgc.create.write.generic.output(...) local_unnamed_addr #3
+
+attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" }
+attributes #1 = { nounwind memory(none) }
+attributes #2 = { nounwind willreturn memory(read) }
+attributes #3 = { nounwind }
+
+!lgc.user.data.nodes = !{!0, !1}
+
+!0 = !{!"DescriptorTableVaPtr", i32 7, i32 64, i32 0, i32 1, i32 1}
+!1 = !{!"DescriptorCombinedTexture", i32 3, i32 64, i32 0, i32 192, i64 0, i32 7, i32 12}
+!2 = !{i32 4}
+!3 = !{i32 6}
+
+; CHECK-LABEL: @lgc.shader.FS.main(
+; CHECK-NEXT:  .entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc()
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x float> (...) @lgc.input.import.interpolated.v2f32(i1 false, i32 1, i32 0, i32 0, i32 poison, i32 1, i32 poison)
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 (...) @lgc.input.import.interpolated.i32(i1 false, i32 3, i32 0, i32 0, i32 poison, i32 1, i32 poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @lgc.load.user.data.i32(i32 0)
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP4]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr addrspace(4)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i32 [[TMP3]], 48
+; CHECK-NEXT:    [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP8]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP11]], align 32
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP11]], align 32
+; CHECK-NEXT:    [[TMP14:%.*]] = call <4 x i32> @foo1(<4 x i32> [[TMP13]])
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x float> [[TMP2]], i64 0
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x float> [[TMP2]], i64 1
+; CHECK-NEXT:    [[TMP17:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP9]])
+; CHECK-NEXT:    [[TMP18:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.v4i32(i32 [[TMP17]], <4 x i32> [[TMP14]])
+; CHECK-NEXT:    [[TMP19:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 [[TMP18]], <8 x i32> [[TMP12]])
+; CHECK-NEXT:    [[TMP20:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 [[TMP18]], <4 x i32> [[TMP14]])
+; CHECK-NEXT:    [[TMP21:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float [[TMP15]], float [[TMP16]], <8 x i32> [[TMP19]], <4 x i32> [[TMP20]], i1 false, i32 0, i32 0)
+; CHECK-NEXT:    [[TMP22:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP18]], <4 x float> [[TMP21]])
+; CHECK-NEXT:    call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[TMP22]]) #[[ATTR5:[0-9]+]]
+; CHECK-NEXT:    ret void
+;