forked from GPUOpen-Drivers/llpc
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[LLPC] Scalarize non-uniform loads inside the waterfall loop
- Loading branch information
1 parent
7eb4d8a
commit 195b936
Showing
14 changed files
with
1,299 additions
and
119 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --tool lgc | ||
; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o - %s 2>&1 | FileCheck --check-prefixes=CHECK %s | ||
; ModuleID = 'lgcPipeline' | ||
source_filename = "lgcPipeline" | ||
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8-p32:32:32" | ||
target triple = "amdgcn--amdpal" | ||
|
||
; Test input: a vertex shader that does an image load followed by an image store, | ||
; each through its own descriptor. Both descriptors are fetched with an index taken | ||
; from a different element of the same generic input, so the two byte offsets differ. | ||
; The CHECK lines at the end of this file expect two separate waterfall regions | ||
; (begin/readfirstlane ... end, and begin/readfirstlane ... last.use), each of which | ||
; re-loads its descriptor from the readfirstlane'd offset. | ||
; Function Attrs: nounwind | ||
define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spirv.ExecutionModel !14 !lgc.shaderstage !15 { | ||
.entry: | ||
; Read a <4 x i32> generic input; elements 0 and 1 are used below as descriptor indices. | ||
; NOTE(review): the first operand (2) is presumably the input location - confirm. | ||
%0 = call <4 x i32> (...) @lgc.create.read.generic.input.v4i32(i32 2, i32 0, i32 0, i32 0, i32 0, i32 poison) | ||
%.fr = freeze <4 x i32> %0 | ||
%__llpc_input_proxy_4.0.vec.extract = extractelement <4 x i32> %.fr, i64 0 | ||
%__llpc_input_proxy_4.4.vec.extract = extractelement <4 x i32> %.fr, i64 1 | ||
; Descriptor base pointer and stride; the CHECK lines expect the stride to become the constant 16. | ||
%1 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 4, i32 4, i64 0, i32 0) | ||
%2 = call i32 (...) @lgc.create.get.desc.stride.i32(i32 4, i32 4, i64 0, i32 0) | ||
; Descriptor selected by element 1 (%6): consumed by the image store below. | ||
%3 = mul i32 %__llpc_input_proxy_4.4.vec.extract, %2 | ||
%4 = sext i32 %3 to i64 | ||
%5 = getelementptr i8, ptr addrspace(4) %1, i64 %4 | ||
%6 = load <4 x i32>, ptr addrspace(4) %5, align 16, !invariant.load !16 | ||
; Descriptor selected by element 0 (%10): consumed by the image load below. | ||
%7 = mul i32 %__llpc_input_proxy_4.0.vec.extract, %2 | ||
%8 = sext i32 %7 to i64 | ||
%9 = getelementptr i8, ptr addrspace(4) %1, i64 %8 | ||
%10 = load <4 x i32>, ptr addrspace(4) %9, align 16, !invariant.load !16 | ||
; NOTE(review): the operand 8 passed to both image ops is presumably the non-uniform | ||
; descriptor flag that requests the waterfall lowering - confirm against the lgc Builder flags. | ||
%11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 0, i32 8, <4 x i32> %10, i32 0) | ||
; The loaded texel (%11) is written back through the other descriptor (%6). | ||
call void (...) @lgc.create.image.store(<4 x float> %11, i32 0, i32 8, <4 x i32> %6, i32 1) | ||
ret void | ||
} | ||
|
||
; Function Attrs: nounwind willreturn memory(read) | ||
declare <4 x i32> @lgc.create.read.generic.input.v4i32(...) local_unnamed_addr #1 | ||
|
||
; Function Attrs: nounwind memory(none) | ||
declare ptr addrspace(4) @lgc.create.get.desc.ptr.p4(...) local_unnamed_addr #2 | ||
|
||
; Function Attrs: nounwind memory(none) | ||
declare i32 @lgc.create.get.desc.stride.i32(...) local_unnamed_addr #2 | ||
|
||
; Function Attrs: nounwind willreturn memory(read) | ||
declare <4 x float> @lgc.create.image.load.v4f32(...) local_unnamed_addr #1 | ||
|
||
; Function Attrs: nounwind memory(write) | ||
declare void @lgc.create.image.store(...) local_unnamed_addr #3 | ||
|
||
attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" } | ||
attributes #1 = { nounwind willreturn memory(read) } | ||
attributes #2 = { nounwind memory(none) } | ||
attributes #3 = { nounwind memory(write) } | ||
|
||
!lgc.client = !{!0} | ||
!lgc.options = !{!1} | ||
!lgc.options.VS = !{!2} | ||
!lgc.options.FS = !{!3} | ||
!lgc.user.data.nodes = !{!4, !5, !6, !7} | ||
!lgc.vertex.inputs = !{!8, !9, !10} | ||
!lgc.color.export.formats = !{!11} | ||
!lgc.rasterizer.state = !{!12} | ||
!amdgpu.pal.metadata.msgpack = !{!13} | ||
|
||
!0 = !{!"Vulkan"} | ||
!1 = !{i32 -1055878566, i32 -1332805290, i32 1045905781, i32 -589165353, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 256, i32 256, i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16777216} | ||
!2 = !{i32 1639417258, i32 -1495429105, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} | ||
!3 = !{i32 -1409621709, i32 -171549995, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} | ||
!4 = !{!"DescriptorTableVaPtr", i32 7, i32 66, i32 0, i32 1, i32 1} | ||
!5 = !{!"DescriptorTexelBuffer", i32 4, i32 66, i32 0, i32 16384, i64 0, i32 0, i32 4} | ||
!6 = !{!"StreamOutTableVaPtr", i32 11, i32 4, i32 1, i32 1, i32 0} | ||
!7 = !{!"IndirectUserDataVaPtr", i32 8, i32 2, i32 2, i32 1, i32 4} | ||
!8 = !{i32 0, i32 0, i32 0, i32 40, i32 14, i32 7} | ||
!9 = !{i32 1, i32 0, i32 16, i32 40, i32 11, i32 7} | ||
!10 = !{i32 2, i32 0, i32 24, i32 40, i32 14, i32 5} | ||
!11 = !{i32 14, i32 7, i32 0, i32 0, i32 15} | ||
!12 = !{i32 0, i32 0, i32 0, i32 1} | ||
!13 = !{!"\82\B0amdpal.pipelines\91\83\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\D6\9C\B4\09\0A8A\DA\CF3\09\AF\FF\11\A9U\06\AD.llpc_version\A470.1\AEamdpal.version\92\03\00"} | ||
!14 = !{i32 0} | ||
!15 = !{i32 1} | ||
!16 = !{} | ||
; CHECK-LABEL: @lgc.shader.VS.main( | ||
; CHECK-NEXT: .entry: | ||
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() | ||
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> | ||
; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @lgc.input.import.generic.v4i32(i1 false, i32 2, i32 0, i32 0, i32 poison) | ||
; CHECK-NEXT: [[DOTFR:%.*]] = freeze <4 x i32> [[TMP2]] | ||
; CHECK-NEXT: [[__LLPC_INPUT_PROXY_4_0_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[DOTFR]], i64 0 | ||
; CHECK-NEXT: [[__LLPC_INPUT_PROXY_4_4_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[DOTFR]], i64 1 | ||
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @lgc.load.user.data.i32(i32 0) | ||
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP3]], i64 0 | ||
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to i64 | ||
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr addrspace(4) | ||
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP6]], i32 0 | ||
; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[__LLPC_INPUT_PROXY_4_4_VEC_EXTRACT]], 16 | ||
; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 | ||
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP9]] | ||
; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP10]], align 16, !invariant.load !16 | ||
; CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[__LLPC_INPUT_PROXY_4_0_VEC_EXTRACT]], 16 | ||
; CHECK-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64 | ||
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP13]] | ||
; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP14]], align 16, !invariant.load !16 | ||
; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP12]]) | ||
; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP16]], i32 [[TMP12]]) | ||
; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64 | ||
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP18]] | ||
; CHECK-NEXT: [[TMP20:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP19]], align 16, !invariant.load !16 | ||
; CHECK-NEXT: [[TMP21:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[TMP20]], i32 0, i32 0, i32 0, i32 0) | ||
; CHECK-NEXT: [[TMP22:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP16]], <4 x float> [[TMP21]]) | ||
; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP8]]) | ||
; CHECK-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP23]], i32 [[TMP8]]) | ||
; CHECK-NEXT: [[TMP25:%.*]] = sext i32 [[TMP24]] to i64 | ||
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP25]] | ||
; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP26]], align 16, !invariant.load !16 | ||
; CHECK-NEXT: [[TMP28:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.last.use.v4i32(i32 [[TMP23]], <4 x i32> [[TMP27]]) | ||
; CHECK-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> [[TMP22]], <4 x i32> [[TMP28]], i32 1, i32 0, i32 0, i32 0) | ||
; CHECK-NEXT: ret void | ||
; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --tool lgc | ||
; RUN: lgc -mcpu=gfx1010 -print-after=lgc-builder-replayer -o - %s 2>&1 | FileCheck --check-prefixes=CHECK %s | ||
; ModuleID = 'lgcPipeline' | ||
source_filename = "lgcPipeline" | ||
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8-p32:32:32" | ||
target triple = "amdgcn--amdpal" | ||
|
||
; Test input: a vertex shader with a single image store where both the stored data | ||
; and the descriptor are loaded from the same address (same base %1, same offset %4). | ||
; The CHECK lines at the end of this file expect exactly one waterfall region, in which | ||
; only the descriptor is re-loaded from the readfirstlane'd offset; the load feeding the | ||
; texel operand stays outside the region and is bitcast to <4 x float>. | ||
; Function Attrs: nounwind | ||
define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spirv.ExecutionModel !14 !lgc.shaderstage !15 { | ||
.entry: | ||
; Read a <4 x i32> generic input; only element 0 is used, as the descriptor index. | ||
; NOTE(review): the first operand (2) is presumably the input location - confirm. | ||
%0 = call <4 x i32> (...) @lgc.create.read.generic.input.v4i32(i32 2, i32 0, i32 0, i32 0, i32 0, i32 poison) | ||
%.fr = freeze <4 x i32> %0 | ||
%__llpc_input_proxy_4.0.vec.extract = extractelement <4 x i32> %.fr, i64 0 | ||
; Descriptor base pointer and stride; the CHECK lines expect the stride to become the constant 16. | ||
%1 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 4, i32 4, i64 0, i32 0) | ||
%2 = call i32 (...) @lgc.create.get.desc.stride.i32(i32 4, i32 4, i64 0, i32 0) | ||
%3 = mul i32 %__llpc_input_proxy_4.0.vec.extract, %2 | ||
%4 = sext i32 %3 to i64 | ||
; First load (%6): used as the descriptor operand of the image store. | ||
%5 = getelementptr i8, ptr addrspace(4) %1, i64 %4 | ||
%6 = load <4 x i32>, ptr addrspace(4) %5, align 16, !invariant.load !16 | ||
; Second load from the identical address (%8): used as the texel operand of the image store. | ||
%7 = getelementptr i8, ptr addrspace(4) %1, i64 %4 | ||
%8 = load <4 x i32>, ptr addrspace(4) %7, align 16, !invariant.load !16 | ||
; NOTE(review): the operand 8 is presumably the non-uniform descriptor flag that requests | ||
; the waterfall lowering - confirm against the lgc Builder flags. | ||
call void (...) @lgc.create.image.store(<4 x i32> %8, i32 0, i32 8, <4 x i32> %6, i32 1) | ||
ret void | ||
} | ||
|
||
; Function Attrs: nounwind willreturn memory(read) | ||
declare <4 x i32> @lgc.create.read.generic.input.v4i32(...) local_unnamed_addr #1 | ||
|
||
; Function Attrs: nounwind memory(none) | ||
declare ptr addrspace(4) @lgc.create.get.desc.ptr.p4(...) local_unnamed_addr #2 | ||
|
||
; Function Attrs: nounwind memory(none) | ||
declare i32 @lgc.create.get.desc.stride.i32(...) local_unnamed_addr #2 | ||
|
||
; Function Attrs: nounwind memory(write) | ||
declare void @lgc.create.image.store(...) local_unnamed_addr #3 | ||
|
||
attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign" } | ||
attributes #1 = { nounwind willreturn memory(read) } | ||
attributes #2 = { nounwind memory(none) } | ||
attributes #3 = { nounwind memory(write) } | ||
attributes #4 = { nounwind } | ||
|
||
!lgc.client = !{!0} | ||
!lgc.options = !{!1} | ||
!lgc.options.VS = !{!2} | ||
!lgc.options.FS = !{!3} | ||
!lgc.user.data.nodes = !{!4, !5, !6, !7} | ||
!lgc.vertex.inputs = !{!8, !9, !10} | ||
!lgc.color.export.formats = !{!11} | ||
!lgc.rasterizer.state = !{!12} | ||
!amdgpu.pal.metadata.msgpack = !{!13} | ||
|
||
!0 = !{!"Vulkan"} | ||
!1 = !{i32 -1055878566, i32 -1332805290, i32 1045905781, i32 -589165353, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 256, i32 256, i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16777216} | ||
!2 = !{i32 1639417258, i32 -1495429105, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} | ||
!3 = !{i32 -1409621709, i32 -171549995, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64, i32 64, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 1800, i32 0, i32 0, i32 1} | ||
!4 = !{!"DescriptorTableVaPtr", i32 7, i32 66, i32 0, i32 1, i32 1} | ||
!5 = !{!"DescriptorTexelBuffer", i32 4, i32 66, i32 0, i32 16384, i64 0, i32 0, i32 4} | ||
!6 = !{!"StreamOutTableVaPtr", i32 11, i32 4, i32 1, i32 1, i32 0} | ||
!7 = !{!"IndirectUserDataVaPtr", i32 8, i32 2, i32 2, i32 1, i32 4} | ||
!8 = !{i32 0, i32 0, i32 0, i32 40, i32 14, i32 7} | ||
!9 = !{i32 1, i32 0, i32 16, i32 40, i32 11, i32 7} | ||
!10 = !{i32 2, i32 0, i32 24, i32 40, i32 14, i32 5} | ||
!11 = !{i32 14, i32 7, i32 0, i32 0, i32 15} | ||
!12 = !{i32 0, i32 0, i32 0, i32 1} | ||
!13 = !{!"\82\B0amdpal.pipelines\91\83\B0.spill_threshold\CD\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\D6\9C\B4\09\0A8A\DA\CF3\09\AF\FF\11\A9U\06\AD.llpc_version\A470.1\AEamdpal.version\92\03\00"} | ||
!14 = !{i32 0} | ||
!15 = !{i32 1} | ||
!16 = !{} | ||
; CHECK-LABEL: @lgc.shader.VS.main( | ||
; CHECK-NEXT: .entry: | ||
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() | ||
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> | ||
; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @lgc.input.import.generic.v4i32(i1 false, i32 2, i32 0, i32 0, i32 poison) | ||
; CHECK-NEXT: [[DOTFR:%.*]] = freeze <4 x i32> [[TMP2]] | ||
; CHECK-NEXT: [[__LLPC_INPUT_PROXY_4_0_VEC_EXTRACT:%.*]] = extractelement <4 x i32> [[DOTFR]], i64 0 | ||
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @lgc.load.user.data.i32(i32 0) | ||
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP3]], i64 0 | ||
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to i64 | ||
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr addrspace(4) | ||
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP6]], i32 0 | ||
; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[__LLPC_INPUT_PROXY_4_0_VEC_EXTRACT]], 16 | ||
; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 | ||
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP9]] | ||
; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP10]], align 16, !invariant.load !16 | ||
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP9]] | ||
; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP12]], align 16, !invariant.load !16 | ||
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <4 x float> | ||
; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP8]]) | ||
; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP15]], i32 [[TMP8]]) | ||
; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64 | ||
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP17]] | ||
; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP18]], align 16, !invariant.load !16 | ||
; CHECK-NEXT: [[TMP20:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.last.use.v4i32(i32 [[TMP15]], <4 x i32> [[TMP19]]) | ||
; CHECK-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> [[TMP14]], <4 x i32> [[TMP20]], i32 1, i32 0, i32 0, i32 0) | ||
; CHECK-NEXT: ret void | ||
; |
Oops, something went wrong.