Fix performance regression of normalize
PR #2778 tried to resolve the signed-zero issue of normalize, but it unconditionally adds
v_cmp and v_cndmask instructions, which causes a performance drop. Therefore, we add a check
of the NSZ (no signed zeros) fast-math flag: when NSZ is specified, we still follow the
previous handling (the fmul_legacy path).
amdrexu committed Nov 21, 2023
1 parent: 6d724b7 · commit: cecf495
Showing 1 changed file with 14 additions and 4 deletions.
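Before the diff itself, here is a minimal scalar sketch (not taken from the repository) contrasting the two lowering strategies on a zero input component. The fmulLegacy helper is a hypothetical stand-in that models the behavior of v_mul_legacy_f32 / llvm.amdgcn.fmul.legacy, where a zero operand forces a +0.0 result and therefore drops the sign of a signed-zero input; the compare/select path keeps ordinary IEEE multiplication, which preserves it.

#include <cmath>
#include <cstdio>

// Emulation of the legacy multiply: a zero operand forces a +0.0 result, even against inf/NaN.
static float fmulLegacy(float a, float b) {
  if (a == 0.0f || b == 0.0f)
    return 0.0f;
  return a * b;
}

int main() {
  float component = -0.0f;           // one component of an all-zero input vector
  float dot = component * component; // dot(x, x) == +0.0
  float rsq = 1.0f / std::sqrt(dot); // 1.0 / sqrt(0.0) == +inf

  // Legacy path (taken when NSZ is set): zero times inf is forced to +0.0, so the sign is lost.
  float legacyResult = fmulLegacy(component, rsq);

  // Compare/select path (taken when NSZ is not set): replace rsq with 0.0 when dot == 0,
  // then an ordinary IEEE multiply preserves the sign of the zero component (-0.0 * 0.0 == -0.0).
  float selectRsq = (dot == 0.0f) ? 0.0f : rsq;
  float preciseResult = component * selectRsq;

  std::printf("legacy : %g (signbit=%d)\n", legacyResult, std::signbit(legacyResult));
  std::printf("precise: %g (signbit=%d)\n", preciseResult, std::signbit(preciseResult));
  return 0;
}

With a -0.0 input component, the legacy path prints a +0 result while the compare/select path prints -0; that sign difference is exactly what the NSZ flag permits the compiler to ignore, which is why the cheaper legacy path is only taken when NSZ is set.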
lgc/builder/ArithBuilder.cpp: 14 additions & 4 deletions
@@ -833,13 +833,23 @@ Value *BuilderImpl::CreateNormalizeVector(Value *x, const Twine &instName) {
   Value *dot = CreateDotProduct(x, x);
   Value *sqrt = CreateSqrt(dot);
   Value *rsq = CreateFDiv(ConstantFP::get(sqrt->getType(), 1.0), sqrt);
+  Value *result = nullptr;
   if (x->getType()->getScalarType()->isFloatTy()) {
     // Make sure a FP32 zero vector is normalized to a FP32 zero vector, rather than NaNs.
-    auto zero = ConstantFP::get(getFloatTy(), 0.0);
-    auto isZeroDot = CreateFCmpOEQ(dot, zero);
-    rsq = CreateSelect(isZeroDot, zero, rsq);
+    if (!getFastMathFlags().noSignedZeros()) {
+      // When NSZ is not specified, we avoid using fmul_legacy since the sign of the input is dropped.
+      auto zero = ConstantFP::get(getFloatTy(), 0.0);
+      auto isZeroDot = CreateFCmpOEQ(dot, zero);
+      rsq = CreateSelect(isZeroDot, zero, rsq);
+      result = scalarize(x, [this, rsq](Value *x) -> Value * { return CreateFMul(x, rsq); });
+    } else {
+      result = scalarize(x, [this, rsq](Value *x) -> Value * {
+        return CreateIntrinsic(Intrinsic::amdgcn_fmul_legacy, {}, {x, rsq});
+      });
+    }
+  } else {
+    result = scalarize(x, [this, rsq](Value *x) -> Value * { return CreateFMul(x, rsq); });
   }
-  Value *result = scalarize(x, [this, rsq](Value *x) -> Value * { return CreateFMul(x, rsq); });
   result->setName(instName);
   return result;
 }
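As a caller-side usage note, the following is a hypothetical sketch (not part of this commit) of how a frontend could opt in to the cheaper path. CreateNormalizeVector consults the builder's fast-math flags, so setting NSZ before emitting the call selects the fmul_legacy handling and avoids the extra v_cmp/v_cndmask. The helper name, the include path, and the lgc::Builder reference are assumptions made for illustration.

#include "lgc/Builder.h" // assumed include path for the LGC builder interface

// Hypothetical helper: normalize a vector after declaring that signed zeros do not matter.
llvm::Value *normalizeNoSignedZeros(lgc::Builder &builder, llvm::Value *inputVec) {
  llvm::FastMathFlags fmf = builder.getFastMathFlags();
  fmf.setNoSignedZeros(); // with NSZ set, CreateNormalizeVector takes the fmul_legacy path
  builder.setFastMathFlags(fmf);
  return builder.CreateNormalizeVector(inputVec, "normalize");
}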
