GPUOpen-Drivers · perlfu · Nov 16, 2023 · Nov 14, 2023
@@ -1754,9 +1754,10 @@ Value *PatchInOutImportExport::performFsParameterLoad(BuilderBase &builder, Valu
     compValue = builder.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp, builder.getInt32Ty(),
                                         {compValue, builder.getInt32(static_cast<unsigned>(dppCtrl)),
                                          builder.getInt32(15), builder.getInt32(15), builder.getTrue()});
-    // NOTE: Make mov_dpp and its source instructions run in WQM to make sure the mov_dpp could fetch
-    // correct data from possible inactive lanes.
-    compValue = builder.CreateIntrinsic(Intrinsic::amdgcn_wqm, builder.getInt32Ty(), compValue);
+    // NOTE: mov_dpp must run in strict WQM so it can read lanes that may be inactive under the normal exec mask/WQM.
+    // lds_param_load always runs in strict WQM, but exec/WQM may differ from it because of discards or divergence.
+    // Ideally we would set the FI bit on the mov_dpp instead, but there is currently no backend support for it.
+    compValue = builder.CreateIntrinsic(Intrinsic::amdgcn_strict_wqm, builder.getInt32Ty(), compValue);
     compValue = builder.CreateBitCast(compValue, builder.getFloatTy());
   } else {
     Value *args[] = {

@@ -0,0 +1,34 @@
+// Check that flat parameter load uses DPP in strict WQM
+
+// RUN: amdllpc %gfxip --v %s |\
+// RUN:   FileCheck %s --check-prefix=CHECK
+//
+// CHECK-LABEL: {{^}}// LLPC pipeline patching results
+// CHECK:       call void @llvm.amdgcn.kill(i1 false)
+// CHECK:       [[P0:%.*]] = call float @llvm.amdgcn.lds.param.load(i32 immarg 2, i32 immarg 2, i32 %PrimMask)
+// CHECK:       [[P1:%.*]] = bitcast float [[P0]] to i32
+// CHECK:       [[P2:%.*]] = call i32 @llvm.amdgcn.mov.dpp.i32(i32 [[P1]], i32 0, i32 15, i32 15, i1 true)
+// CHECK:       [[P3:%.*]] = call i32 @llvm.amdgcn.strict.wqm.i32(i32 [[P2]])
+// CHECK-LABEL: {{^}}===== AMDLLPC SUCCESS =====
+
+#version 450
+
+layout (location = 0) in vec3 texCoordIn;
+layout (location = 1) in float discardPixel;
+layout (location = 2) flat in vec4 p0;
+
+layout (binding = 0) uniform sampler2D image1;
+layout (binding = 1) uniform sampler s0;
+layout (binding = 2) uniform textureCube t0;
+
+layout (location = 0) out vec4 fragColor;
+
+void main() {
+  fragColor = texture(image1, texCoordIn.xy);
+
+  if (discardPixel > 0.0)
+    discard; // the CHECK lines expect an llvm.amdgcn.kill call ahead of the parameter load
+  // Flat input read placed after the discard: this is the load the CHECK lines follow through mov.dpp and strict.wqm.
+  float lod = p0.z;
+  fragColor += textureLod(samplerCube(t0, s0), texCoordIn, lod);
+}