Fix performance regression of normalize
PR #2778 tried to resolve the signed-zero issue of normalize, but it unconditionally adds
v_cmp and v_cndmask instructions, which causes a performance drop. Therefore, we add a check
of the NSZ (no signed zeros) fast-math flag: when NSZ is specified, we still follow the
previous handling (the fmul_legacy path).
amdrexu committed Nov 21, 2023
1 parent: 6d724b7 · commit: cecf495
Showing 1 changed file with 14 additions and 4 deletions.
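Before the diff itself, here is a minimal scalar sketch (not taken from the repository) contrasting the two lowering strategies on a zero input component. The fmulLegacy helper is a hypothetical stand-in that models the behavior of v_mul_legacy_f32 / llvm.amdgcn.fmul.legacy, where a zero operand forces a +0.0 result and therefore drops the sign of a signed-zero input; the compare/select path keeps ordinary IEEE multiplication, which preserves it.

#include <cmath>
#include <cstdio>

// Emulation of the legacy multiply: a zero operand forces a +0.0 result, even against inf/NaN.
static float fmulLegacy(float a, float b) {
  if (a == 0.0f || b == 0.0f)
    return 0.0f;
  return a * b;
}

int main() {
  float component = -0.0f;           // one component of an all-zero input vector
  float dot = component * component; // dot(x, x) == +0.0
  float rsq = 1.0f / std::sqrt(dot); // 1.0 / sqrt(0.0) == +inf

  // Legacy path (taken when NSZ is set): zero times inf is forced to +0.0, so the sign is lost.
  float legacyResult = fmulLegacy(component, rsq);

  // Compare/select path (taken when NSZ is not set): replace rsq with 0.0 when dot == 0,
  // then an ordinary IEEE multiply preserves the sign of the zero component (-0.0 * 0.0 == -0.0).
  float selectRsq = (dot == 0.0f) ? 0.0f : rsq;
  float preciseResult = component * selectRsq;

  std::printf("legacy : %g (signbit=%d)\n", legacyResult, std::signbit(legacyResult));
  std::printf("precise: %g (signbit=%d)\n", preciseResult, std::signbit(preciseResult));
  return 0;
}

With a -0.0 input component, the legacy path prints a +0 result while the compare/select path prints -0; that sign difference is exactly what the NSZ flag permits the compiler to ignore, which is why the cheaper legacy path is only taken when NSZ is set.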
lgc/builder/ArithBuilder.cpp: 14 additions & 4 deletions
@@ -833,13 +833,23 @@ Value *BuilderImpl::CreateNormalizeVector(Value *x, const Twine &instName) {
   Value *dot = CreateDotProduct(x, x);
   Value *sqrt = CreateSqrt(dot);
   Value *rsq = CreateFDiv(ConstantFP::get(sqrt->getType(), 1.0), sqrt);
+  Value *result = nullptr;
   if (x->getType()->getScalarType()->isFloatTy()) {
     // Make sure a FP32 zero vector is normalized to a FP32 zero vector, rather than NaNs.
-    auto zero = ConstantFP::get(getFloatTy(), 0.0);
-    auto isZeroDot = CreateFCmpOEQ(dot, zero);
-    rsq = CreateSelect(isZeroDot, zero, rsq);
+    if (!getFastMathFlags().noSignedZeros()) {
+      // When NSZ is not specified, we avoid using fmul_legacy since the sign of the input is dropped.
+      auto zero = ConstantFP::get(getFloatTy(), 0.0);
+      auto isZeroDot = CreateFCmpOEQ(dot, zero);
+      rsq = CreateSelect(isZeroDot, zero, rsq);
+      result = scalarize(x, [this, rsq](Value *x) -> Value * { return CreateFMul(x, rsq); });
+    } else {
+      result = scalarize(x, [this, rsq](Value *x) -> Value * {
+        return CreateIntrinsic(Intrinsic::amdgcn_fmul_legacy, {}, {x, rsq});
+      });
+    }
+  } else {
+    result = scalarize(x, [this, rsq](Value *x) -> Value * { return CreateFMul(x, rsq); });
   }
-  Value *result = scalarize(x, [this, rsq](Value *x) -> Value * { return CreateFMul(x, rsq); });
   result->setName(instName);
   return result;
 }
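As a caller-side usage note, the following is a hypothetical sketch (not part of this commit) of how a frontend could opt in to the cheaper path. CreateNormalizeVector consults the builder's fast-math flags, so setting NSZ before emitting the call selects the fmul_legacy handling and avoids the extra v_cmp/v_cndmask. The helper name, the include path, and the lgc::Builder reference are assumptions made for illustration.

#include "lgc/Builder.h" // assumed include path for the LGC builder interface

// Hypothetical helper: normalize a vector after declaring that signed zeros do not matter.
llvm::Value *normalizeNoSignedZeros(lgc::Builder &builder, llvm::Value *inputVec) {
  llvm::FastMathFlags fmf = builder.getFastMathFlags();
  fmf.setNoSignedZeros(); // with NSZ set, CreateNormalizeVector takes the fmul_legacy path
  builder.setFastMathFlags(fmf);
  return builder.CreateNormalizeVector(inputVec, "normalize");
}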
