diff --git a/FEXCore/Source/Interface/Core/CPUBackend.cpp b/FEXCore/Source/Interface/Core/CPUBackend.cpp index b9ecca4332..629b490ca8 100644 --- a/FEXCore/Source/Interface/Core/CPUBackend.cpp +++ b/FEXCore/Source/Interface/Core/CPUBackend.cpp @@ -39,6 +39,12 @@ namespace CPU { {0xC90F'DAA2'2168'C235ULL, 0x0000'0000'0000'4000ULL}, // NAMED_VECTOR_X87_PI {0x9A20'9A84'FBCF'F799ULL, 0x0000'0000'0000'3FFDULL}, // NAMED_VECTOR_X87_LOG10_2 {0xB172'17F7'D1CF'79ACULL, 0x0000'0000'0000'3FFEULL}, // NAMED_VECTOR_X87_LOG_2 + {0x4F00'0000'4F00'0000ULL, 0x4F00'0000'4F00'0000ULL}, // NAMED_VECTOR_CVTMAX_F32_I32 + {0x5F00'0000'5F00'0000ULL, 0x5F00'0000'5F00'0000ULL}, // NAMED_VECTOR_CVTMAX_F32_I64 + {0x41E0'0000'0000'0000ULL, 0x41E0'0000'0000'0000ULL}, // NAMED_VECTOR_CVTMAX_F64_I32 + {0x43E0'0000'0000'0000ULL, 0x43E0'0000'0000'0000ULL}, // NAMED_VECTOR_CVTMAX_F64_I64 + {0x8000'0000'8000'0000ULL, 0x8000'0000'8000'0000ULL}, // NAMED_VECTOR_CVTMAX_I32 + {0x8000'0000'0000'0000ULL, 0x8000'0000'0000'0000ULL}, // NAMED_VECTOR_CVTMAX_I64 }; constexpr static auto PSHUFLW_LUT {[]() consteval { diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp index 9b08ea64f4..1c477988a5 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp @@ -5089,9 +5089,9 @@ void OpDispatchBuilder::InstallHostSpecificOpcodeHandlers() { {OPD(1, 0b10, 0x5A), 1, &OpDispatchBuilder::AVXInsertScalar_CVT_Float_To_Float}, {OPD(1, 0b11, 0x5A), 1, &OpDispatchBuilder::AVXInsertScalar_CVT_Float_To_Float}, - {OPD(1, 0b00, 0x5B), 1, &OpDispatchBuilder::AVXVector_CVT_Int_To_Float}, - {OPD(1, 0b01, 0x5B), 1, &OpDispatchBuilder::AVXVector_CVT_Float_To_Int}, - {OPD(1, 0b10, 0x5B), 1, &OpDispatchBuilder::AVXVector_CVT_Float_To_Int}, + {OPD(1, 0b00, 0x5B), 1, &OpDispatchBuilder::Vector_CVT_Int_To_Float}, + {OPD(1, 0b01, 0x5B), 1, &OpDispatchBuilder::Vector_CVT_Float_To_Int}, + {OPD(1, 0b10, 0x5B), 1, &OpDispatchBuilder::Vector_CVT_Float_To_Int}, {OPD(1, 0b00, 0x5C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VFSUB, OpSize::i32Bit>}, {OPD(1, 0b01, 0x5C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVXVectorALUOp, IR::OP_VFSUB, OpSize::i64Bit>}, @@ -5191,9 +5191,9 @@ void OpDispatchBuilder::InstallHostSpecificOpcodeHandlers() { {OPD(1, 0b01, 0xE4), 1, &OpDispatchBuilder::VPMULHWOp}, {OPD(1, 0b01, 0xE5), 1, &OpDispatchBuilder::VPMULHWOp}, - {OPD(1, 0b01, 0xE6), 1, &OpDispatchBuilder::AVXVector_CVT_Float_To_Int}, - {OPD(1, 0b10, 0xE6), 1, &OpDispatchBuilder::AVXVector_CVT_Int_To_Float}, - {OPD(1, 0b11, 0xE6), 1, &OpDispatchBuilder::AVXVector_CVT_Float_To_Int}, + {OPD(1, 0b01, 0xE6), 1, &OpDispatchBuilder::Vector_CVT_Float_To_Int}, + {OPD(1, 0b10, 0xE6), 1, &OpDispatchBuilder::Vector_CVT_Int_To_Float}, + {OPD(1, 0b11, 0xE6), 1, &OpDispatchBuilder::Vector_CVT_Float_To_Int}, {OPD(1, 0b01, 0xE7), 1, &OpDispatchBuilder::MOVVectorNTOp}, diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index 2fee162807..4c88a19f85 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -469,10 +469,10 @@ class OpDispatchBuilder final : public IREmitter { template void Scalar_CVT_Float_To_Float(OpcodeArgs); void Vector_CVT_Float_To_Float(OpcodeArgs, IR::OpSize DstElementSize, IR::OpSize SrcElementSize, bool IsAVX); - template + template void Vector_CVT_Float_To_Int(OpcodeArgs); void MMX_To_XMM_Vector_CVT_Int_To_Float(OpcodeArgs); - template + template void XMM_To_MMX_Vector_CVT_Float_To_Int(OpcodeArgs); void MASKMOVOp(OpcodeArgs); void MOVBetweenGPR_FPR(OpcodeArgs, VectorOpType VectorType); @@ -518,12 +518,6 @@ class OpDispatchBuilder final : public IREmitter { template void AVXScalar_CVT_Float_To_Float(OpcodeArgs); - template - void AVXVector_CVT_Float_To_Int(OpcodeArgs); - - template - void AVXVector_CVT_Int_To_Float(OpcodeArgs); - template void VectorScalarInsertALUOp(OpcodeArgs); template @@ -1032,7 +1026,7 @@ class OpDispatchBuilder final : public IREmitter { template void AVX128_Vector_CVT_Float_To_Float(OpcodeArgs); - template + template void AVX128_Vector_CVT_Float_To_Int(OpcodeArgs); template @@ -1471,7 +1465,10 @@ class OpDispatchBuilder final : public IREmitter { Ref Scalar_CVT_Float_To_FloatImpl(OpcodeArgs, IR::OpSize DstElementSize, IR::OpSize SrcElementSize, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op); - Ref Vector_CVT_Float_To_IntImpl(OpcodeArgs, IR::OpSize SrcElementSize, bool Narrow, bool HostRoundingMode); + Ref CVTFPR_To_GPRImpl(OpcodeArgs, Ref Src, IR::OpSize SrcElementSize, bool HostRoundingMode); + + Ref Vector_CVT_Float_To_Int32Impl(OpcodeArgs, IR::OpSize DstSize, Ref Src, IR::OpSize SrcSize, IR::OpSize SrcElementSize, + bool HostRoundingMode, bool ZeroUpperHalf); Ref Vector_CVT_Int_To_FloatImpl(OpcodeArgs, IR::OpSize SrcElementSize, bool Widen); diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp index 5bcf58d12a..f8f32c62a3 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp @@ -116,8 +116,8 @@ void OpDispatchBuilder::InstallAVX128Handlers() { {OPD(1, 0b11, 0x5A), 1, &OpDispatchBuilder::AVX128_InsertScalar_CVT_Float_To_Float}, {OPD(1, 0b00, 0x5B), 1, &OpDispatchBuilder::AVX128_Vector_CVT_Int_To_Float}, - {OPD(1, 0b01, 0x5B), 1, &OpDispatchBuilder::AVX128_Vector_CVT_Float_To_Int}, - {OPD(1, 0b10, 0x5B), 1, &OpDispatchBuilder::AVX128_Vector_CVT_Float_To_Int}, + {OPD(1, 0b01, 0x5B), 1, &OpDispatchBuilder::AVX128_Vector_CVT_Float_To_Int}, + {OPD(1, 0b10, 0x5B), 1, &OpDispatchBuilder::AVX128_Vector_CVT_Float_To_Int}, {OPD(1, 0b00, 0x5C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VFSUB, OpSize::i32Bit>}, {OPD(1, 0b01, 0x5C), 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::AVX128_VectorALU, IR::OP_VFSUB, OpSize::i64Bit>}, @@ -217,9 +217,9 @@ void OpDispatchBuilder::InstallAVX128Handlers() { {OPD(1, 0b01, 0xE4), 1, &OpDispatchBuilder::AVX128_VPMULHW}, {OPD(1, 0b01, 0xE5), 1, &OpDispatchBuilder::AVX128_VPMULHW}, - {OPD(1, 0b01, 0xE6), 1, &OpDispatchBuilder::AVX128_Vector_CVT_Float_To_Int}, + {OPD(1, 0b01, 0xE6), 1, &OpDispatchBuilder::AVX128_Vector_CVT_Float_To_Int}, {OPD(1, 0b10, 0xE6), 1, &OpDispatchBuilder::AVX128_Vector_CVT_Int_To_Float}, - {OPD(1, 0b11, 0xE6), 1, &OpDispatchBuilder::AVX128_Vector_CVT_Float_To_Int}, + {OPD(1, 0b11, 0xE6), 1, &OpDispatchBuilder::AVX128_Vector_CVT_Float_To_Int}, {OPD(1, 0b01, 0xE7), 1, &OpDispatchBuilder::AVX128_MOVVectorNT}, @@ -1058,18 +1058,8 @@ void OpDispatchBuilder::AVX128_CVTFPR_To_GPR(OpcodeArgs) { Src.Low = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], OpSizeFromSrc(Op), Op->Flags); } - // GPR size is determined by REX.W - // Source Element size is determined by instruction - const auto GPRSize = OpSizeFromDst(Op); - - Ref Result {}; - if constexpr (HostRoundingMode) { - Result = _Float_ToGPR_S(GPRSize, SrcElementSize, Src.Low); - } else { - Result = _Float_ToGPR_ZS(GPRSize, SrcElementSize, Src.Low); - } - - StoreResult_WithOpSize(GPRClass, Op, Op->Dest, Result, GPRSize, OpSize::iInvalid); + Ref Result = CVTFPR_To_GPRImpl(Op, Src.Low, SrcElementSize, HostRoundingMode); + StoreResult(GPRClass, Op, Result, OpSize::iInvalid); } void OpDispatchBuilder::AVX128_VANDN(OpcodeArgs) { @@ -1604,7 +1594,7 @@ void OpDispatchBuilder::AVX128_Vector_CVT_Float_To_Float(OpcodeArgs) { AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } -template +template void OpDispatchBuilder::AVX128_Vector_CVT_Float_To_Int(OpcodeArgs) { const auto SrcSize = GetSrcSize(Op); @@ -1614,48 +1604,22 @@ void OpDispatchBuilder::AVX128_Vector_CVT_Float_To_Int(OpcodeArgs) { auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128BitSrc); RefPair Result {}; - if (SrcElementSize == OpSize::i64Bit && Narrow) { - ///< Special case for VCVTPD2DQ/CVTTPD2DQ because it has weird rounding requirements. - Result.Low = _Vector_F64ToI32(OpSize::i128Bit, Src.Low, HostRoundingMode ? Round_Host : Round_Towards_Zero, Is128BitSrc); - - if (!Is128BitSrc) { - // Also convert the upper 128-bit lane - auto ResultHigh = _Vector_F64ToI32(OpSize::i128Bit, Src.High, HostRoundingMode ? Round_Host : Round_Towards_Zero, false); - - // Zip the two halves together in to the lower 128-bits - Result.Low = _VZip(OpSize::i128Bit, OpSize::i64Bit, Result.Low, ResultHigh); - } + Result.Low = Vector_CVT_Float_To_Int32Impl(Op, OpSize::i128Bit, Src.Low, OpSize::i128Bit, SrcElementSize, HostRoundingMode, Is128BitSrc); + if (Is128BitSrc) { + // Zero the upper 128-bit lane of the result. + Result = AVX128_Zext(Result.Low); } else { - auto Convert = [this](Ref Src) -> Ref { - auto ElementSize = SrcElementSize; - if (Narrow) { - ElementSize = ElementSize >> 1; - Src = _Vector_FToF(OpSize::i128Bit, ElementSize, Src, SrcElementSize); - } - - if (HostRoundingMode) { - return _Vector_FToS(OpSize::i128Bit, ElementSize, Src); - } else { - return _Vector_FToZS(OpSize::i128Bit, ElementSize, Src); - } - }; - - Result.Low = Convert(Src.Low); + Result.High = Vector_CVT_Float_To_Int32Impl(Op, OpSize::i128Bit, Src.High, OpSize::i128Bit, SrcElementSize, HostRoundingMode, false); + // Also convert the upper 128-bit lane + if (SrcElementSize == OpSize::i64Bit) { + // Zip the two halves together in to the lower 128-bits + Result.Low = _VZip(OpSize::i128Bit, OpSize::i64Bit, Result.Low, Result.High); - if (!Is128BitSrc) { - if (!Narrow) { - Result.High = Convert(Src.High); - } else { - Result.Low = _VInsElement(OpSize::i128Bit, OpSize::i64Bit, 1, 0, Result.Low, Convert(Src.High)); - } + // Zero the upper 128-bit lane of the result. + Result = AVX128_Zext(Result.Low); } } - if (Narrow || Is128BitSrc) { - // Zero the upper 128-bit lane of the result. - Result = AVX128_Zext(Result.Low); - } - AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result); } diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/DDDTables.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher/DDDTables.h index 7f986b177f..044d434037 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/DDDTables.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/DDDTables.h @@ -7,7 +7,7 @@ constexpr std::tuple OpDisp {0x0C, 1, &OpDispatchBuilder::PI2FWOp}, {0x0D, 1, &OpDispatchBuilder::Vector_CVT_Int_To_Float}, {0x1C, 1, &OpDispatchBuilder::PF2IWOp}, - {0x1D, 1, &OpDispatchBuilder::Vector_CVT_Float_To_Int}, + {0x1D, 1, &OpDispatchBuilder::Vector_CVT_Float_To_Int}, {0x86, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorUnaryOp, IR::OP_VFRECP, OpSize::i32Bit>}, {0x87, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorUnaryOp, IR::OP_VFRSQRT, OpSize::i32Bit>}, diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/SecondaryTables.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher/SecondaryTables.h index 7cb9978ffb..ce0eb83dcd 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/SecondaryTables.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/SecondaryTables.h @@ -57,8 +57,8 @@ constexpr std::tuple OpDisp {0x28, 2, &OpDispatchBuilder::MOVVectorAlignedOp}, {0x2A, 1, &OpDispatchBuilder::InsertMMX_To_XMM_Vector_CVT_Int_To_Float}, {0x2B, 1, &OpDispatchBuilder::MOVVectorNTOp}, - {0x2C, 1, &OpDispatchBuilder::XMM_To_MMX_Vector_CVT_Float_To_Int}, - {0x2D, 1, &OpDispatchBuilder::XMM_To_MMX_Vector_CVT_Float_To_Int}, + {0x2C, 1, &OpDispatchBuilder::XMM_To_MMX_Vector_CVT_Float_To_Int}, + {0x2D, 1, &OpDispatchBuilder::XMM_To_MMX_Vector_CVT_Float_To_Int}, {0x2E, 2, &OpDispatchBuilder::UCOMISxOp}, {0x50, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVMSKOp, OpSize::i32Bit>}, {0x51, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorUnaryOp, IR::OP_VFSQRT, OpSize::i32Bit>}, @@ -161,7 +161,7 @@ constexpr std::tuple OpDisp {0x58, 1, &OpDispatchBuilder::VectorScalarInsertALUOp}, {0x59, 1, &OpDispatchBuilder::VectorScalarInsertALUOp}, {0x5A, 1, &OpDispatchBuilder::InsertScalar_CVT_Float_To_Float}, - {0x5B, 1, &OpDispatchBuilder::Vector_CVT_Float_To_Int}, + {0x5B, 1, &OpDispatchBuilder::Vector_CVT_Float_To_Int}, {0x5C, 1, &OpDispatchBuilder::VectorScalarInsertALUOp}, {0x5D, 1, &OpDispatchBuilder::VectorScalarInsertALUOp}, {0x5E, 1, &OpDispatchBuilder::VectorScalarInsertALUOp}, @@ -200,7 +200,7 @@ constexpr std::tuple OpDisp {0xD0, 1, &OpDispatchBuilder::ADDSUBPOp}, {0xD6, 1, &OpDispatchBuilder::MOVQ2DQ}, {0xC2, 1, &OpDispatchBuilder::InsertScalarFCMPOp}, - {0xE6, 1, &OpDispatchBuilder::Vector_CVT_Float_To_Int}, + {0xE6, 1, &OpDispatchBuilder::Vector_CVT_Float_To_Int}, {0xF0, 1, &OpDispatchBuilder::MOVVectorUnalignedOp}, }; @@ -213,8 +213,8 @@ constexpr std::tuple OpDisp {0x28, 2, &OpDispatchBuilder::MOVVectorAlignedOp}, {0x2A, 1, &OpDispatchBuilder::MMX_To_XMM_Vector_CVT_Int_To_Float}, {0x2B, 1, &OpDispatchBuilder::MOVVectorNTOp}, - {0x2C, 1, &OpDispatchBuilder::XMM_To_MMX_Vector_CVT_Float_To_Int}, - {0x2D, 1, &OpDispatchBuilder::XMM_To_MMX_Vector_CVT_Float_To_Int}, + {0x2C, 1, &OpDispatchBuilder::XMM_To_MMX_Vector_CVT_Float_To_Int}, + {0x2D, 1, &OpDispatchBuilder::XMM_To_MMX_Vector_CVT_Float_To_Int}, {0x2E, 2, &OpDispatchBuilder::UCOMISxOp}, {0x50, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::MOVMSKOp, OpSize::i64Bit>}, @@ -226,7 +226,7 @@ constexpr std::tuple OpDisp {0x58, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFADD, OpSize::i64Bit>}, {0x59, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFMUL, OpSize::i64Bit>}, {0x5A, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::Vector_CVT_Float_To_Float, OpSize::i32Bit, OpSize::i64Bit, false>}, - {0x5B, 1, &OpDispatchBuilder::Vector_CVT_Float_To_Int}, + {0x5B, 1, &OpDispatchBuilder::Vector_CVT_Float_To_Int}, {0x5C, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFSUB, OpSize::i64Bit>}, {0x5D, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFMIN, OpSize::i64Bit>}, {0x5E, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VFDIV, OpSize::i64Bit>}, @@ -284,7 +284,7 @@ constexpr std::tuple OpDisp {0xE3, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VURAVG, OpSize::i16Bit>}, {0xE4, 1, &OpDispatchBuilder::PMULHW}, {0xE5, 1, &OpDispatchBuilder::PMULHW}, - {0xE6, 1, &OpDispatchBuilder::Vector_CVT_Float_To_Int}, + {0xE6, 1, &OpDispatchBuilder::Vector_CVT_Float_To_Int}, {0xE7, 1, &OpDispatchBuilder::MOVVectorNTOp}, {0xE8, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VSQSUB, OpSize::i8Bit>}, {0xE9, 1, &OpDispatchBuilder::Bind<&OpDispatchBuilder::VectorALUOp, IR::OP_VSQSUB, OpSize::i16Bit>}, diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp index b0a2ea53fc..b2bdcdf9a0 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp @@ -2067,6 +2067,24 @@ void OpDispatchBuilder::AVXCVTGPR_To_FPR(OpcodeArgs) { template void OpDispatchBuilder::AVXCVTGPR_To_FPR(OpcodeArgs); template void OpDispatchBuilder::AVXCVTGPR_To_FPR(OpcodeArgs); +Ref OpDispatchBuilder::CVTFPR_To_GPRImpl(OpcodeArgs, Ref Src, IR::OpSize SrcElementSize, bool HostRoundingMode) { + // GPR size is determined by REX.W + // Source Element size is determined by instruction + const auto GPRSize = OpSizeFromDst(Op); + + if (HostRoundingMode) { + Src = _Vector_FToI(SrcElementSize, SrcElementSize, Src, Round_Host); + } + Ref Converted = _Float_ToGPR_ZS(GPRSize, SrcElementSize, Src); + + bool Dst32 = GPRSize == OpSize::i32Bit; + Ref MaxI = Dst32 ? _Constant(0x80000000) : _Constant(0x8000000000000000); + Ref MaxF = LoadAndCacheNamedVectorConstant(SrcElementSize, (SrcElementSize == OpSize::i32Bit) ? + (Dst32 ? NAMED_VECTOR_CVTMAX_F32_I32 : NAMED_VECTOR_CVTMAX_F32_I64) : + (Dst32 ? NAMED_VECTOR_CVTMAX_F64_I32 : NAMED_VECTOR_CVTMAX_F64_I64)); + return _Select(GPRSize, SrcElementSize, CondClassType {FEXCore::IR::COND_FGT}, MaxF, Src, Converted, MaxI); +} + template void OpDispatchBuilder::CVTFPR_To_GPR(OpcodeArgs) { // If loading a vector, use the full size, so we don't @@ -2074,18 +2092,8 @@ void OpDispatchBuilder::CVTFPR_To_GPR(OpcodeArgs) { // memory, then we want to load the element size exactly. const auto SrcSize = Op->Src[0].IsGPR() ? OpSize::i128Bit : OpSizeFromSrc(Op); Ref Src = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], SrcSize, Op->Flags); - - // GPR size is determined by REX.W - // Source Element size is determined by instruction - const auto GPRSize = OpSizeFromDst(Op); - - if constexpr (HostRoundingMode) { - Src = _Float_ToGPR_S(GPRSize, SrcElementSize, Src); - } else { - Src = _Float_ToGPR_ZS(GPRSize, SrcElementSize, Src); - } - - StoreResult_WithOpSize(GPRClass, Op, Op->Dest, Src, GPRSize, OpSize::iInvalid); + Ref Result = CVTFPR_To_GPRImpl(Op, Src, SrcElementSize, HostRoundingMode); + StoreResult(GPRClass, Op, Result, OpSize::iInvalid); } template void OpDispatchBuilder::CVTFPR_To_GPR(OpcodeArgs); @@ -2127,77 +2135,43 @@ void OpDispatchBuilder::Vector_CVT_Int_To_Float(OpcodeArgs) { template void OpDispatchBuilder::Vector_CVT_Int_To_Float(OpcodeArgs); template void OpDispatchBuilder::Vector_CVT_Int_To_Float(OpcodeArgs); -template -void OpDispatchBuilder::AVXVector_CVT_Int_To_Float(OpcodeArgs) { - Ref Result = Vector_CVT_Int_To_FloatImpl(Op, SrcElementSize, Widen); - StoreResult(FPRClass, Op, Result, OpSize::iInvalid); -} - -template void OpDispatchBuilder::AVXVector_CVT_Int_To_Float(OpcodeArgs); -template void OpDispatchBuilder::AVXVector_CVT_Int_To_Float(OpcodeArgs); - -Ref OpDispatchBuilder::Vector_CVT_Float_To_IntImpl(OpcodeArgs, IR::OpSize SrcElementSize, bool Narrow, bool HostRoundingMode) { - const auto DstSize = OpSizeFromDst(Op); - auto ElementSize = SrcElementSize; - - Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); - - if (Narrow) { - Src = _Vector_FToF(DstSize, SrcElementSize >> 1, Src, SrcElementSize); - ElementSize = ElementSize >> 1; - } - +Ref OpDispatchBuilder::Vector_CVT_Float_To_Int32Impl(OpcodeArgs, IR::OpSize DstSize, Ref Src, IR::OpSize SrcSize, IR::OpSize SrcElementSize, + bool HostRoundingMode, bool ZeroUpperHalf) { if (HostRoundingMode) { - return _Vector_FToS(DstSize, ElementSize, Src); - } else { - return _Vector_FToZS(DstSize, ElementSize, Src); + Src = _Vector_FToI(SrcSize, SrcElementSize, Src, Round_Host); } -} -template -void OpDispatchBuilder::Vector_CVT_Float_To_Int(OpcodeArgs) { - const auto DstSize = OpSizeFromDst(Op); + OpSize OverflowConstSize = ZeroUpperHalf && SrcElementSize == OpSize::i64Bit ? DstSize / 2 : DstSize; + Ref MaxI = LoadAndCacheNamedVectorConstant(OverflowConstSize, NAMED_VECTOR_CVTMAX_I32); + Ref Converted {}, Cmp {}; + if (SrcElementSize == OpSize::i64Bit) { + Ref MaxF = LoadAndCacheNamedVectorConstant(SrcSize, NAMED_VECTOR_CVTMAX_F64_I32); + Converted = _Vector_F64ToI32(DstSize, Src, Round_Towards_Zero, ZeroUpperHalf); - Ref Result {}; - if (SrcElementSize == OpSize::i64Bit && Narrow) { - ///< Special case for CVTTPD2DQ because it has weird rounding requirements. - Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); - Result = _Vector_F64ToI32(DstSize, Src, HostRoundingMode ? Round_Host : Round_Towards_Zero, true); + Cmp = _VFCMPGT(SrcSize, OpSize::i64Bit, MaxF, Src); + Cmp = _VUShrNI(DstSize, OpSize::i64Bit, Cmp, 32); } else { - Result = Vector_CVT_Float_To_IntImpl(Op, SrcElementSize, Narrow, HostRoundingMode); + Ref MaxF = LoadAndCacheNamedVectorConstant(DstSize, NAMED_VECTOR_CVTMAX_F32_I32); + Converted = _Vector_FToZS(DstSize, OpSize::i32Bit, Src); + Cmp = _VFCMPGT(DstSize, OpSize::i32Bit, MaxF, Src); } - - StoreResult_WithOpSize(FPRClass, Op, Op->Dest, Result, DstSize, OpSize::iInvalid); + return _VBSL(DstSize, Cmp, Converted, MaxI); } -template void OpDispatchBuilder::Vector_CVT_Float_To_Int(OpcodeArgs); -template void OpDispatchBuilder::Vector_CVT_Float_To_Int(OpcodeArgs); -template void OpDispatchBuilder::Vector_CVT_Float_To_Int(OpcodeArgs); - -template void OpDispatchBuilder::Vector_CVT_Float_To_Int(OpcodeArgs); -template void OpDispatchBuilder::Vector_CVT_Float_To_Int(OpcodeArgs); - -template -void OpDispatchBuilder::AVXVector_CVT_Float_To_Int(OpcodeArgs) { +template +void OpDispatchBuilder::Vector_CVT_Float_To_Int(OpcodeArgs) { const auto DstSize = OpSizeFromDst(Op); - Ref Result {}; - if (SrcElementSize == OpSize::i64Bit && Narrow) { - ///< Special case for CVTPD2DQ/CVTTPD2DQ because it has weird rounding requirements. - Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); - Result = _Vector_F64ToI32(DstSize, Src, HostRoundingMode ? Round_Host : Round_Towards_Zero, true); - } else { - Result = Vector_CVT_Float_To_IntImpl(Op, SrcElementSize, Narrow, HostRoundingMode); - } - + Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); + Ref Result = Vector_CVT_Float_To_Int32Impl(Op, DstSize, Src, OpSizeFromSrc(Op), SrcElementSize, HostRoundingMode, true); StoreResult_WithOpSize(FPRClass, Op, Op->Dest, Result, DstSize, OpSize::iInvalid); } -template void OpDispatchBuilder::AVXVector_CVT_Float_To_Int(OpcodeArgs); -template void OpDispatchBuilder::AVXVector_CVT_Float_To_Int(OpcodeArgs); +template void OpDispatchBuilder::Vector_CVT_Float_To_Int(OpcodeArgs); +template void OpDispatchBuilder::Vector_CVT_Float_To_Int(OpcodeArgs); -template void OpDispatchBuilder::AVXVector_CVT_Float_To_Int(OpcodeArgs); -template void OpDispatchBuilder::AVXVector_CVT_Float_To_Int(OpcodeArgs); +template void OpDispatchBuilder::Vector_CVT_Float_To_Int(OpcodeArgs); +template void OpDispatchBuilder::Vector_CVT_Float_To_Int(OpcodeArgs); Ref OpDispatchBuilder::Scalar_CVT_Float_To_FloatImpl(OpcodeArgs, IR::OpSize DstElementSize, IR::OpSize SrcElementSize, const X86Tables::DecodedOperand& Src1Op, const X86Tables::DecodedOperand& Src2Op) { @@ -2277,7 +2251,7 @@ void OpDispatchBuilder::MMX_To_XMM_Vector_CVT_Int_To_Float(OpcodeArgs) { StoreResult(FPRClass, Op, Src, OpSize::iInvalid); } -template +template void OpDispatchBuilder::XMM_To_MMX_Vector_CVT_Float_To_Int(OpcodeArgs) { // This function causes a change in MMX state from X87 to MMX if (MMXState == MMXState_X87) { @@ -2288,29 +2262,16 @@ void OpDispatchBuilder::XMM_To_MMX_Vector_CVT_Float_To_Int(OpcodeArgs) { // unnecessarily zero extend the vector. Otherwise, if // memory, then we want to load the element size exactly. const auto SrcSize = Op->Src[0].IsGPR() ? OpSize::i128Bit : OpSizeFromSrc(Op); + const auto DstSize = OpSizeFromDst(Op); Ref Src = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], SrcSize, Op->Flags); - - auto ElementSize = SrcElementSize; - const auto Size = OpSizeFromDst(Op); - - if (Narrow) { - Src = _Vector_FToF(Size, SrcElementSize >> 1, Src, SrcElementSize); - ElementSize = ElementSize >> 1; - } - - if constexpr (HostRoundingMode) { - Src = _Vector_FToS(Size, ElementSize, Src); - } else { - Src = _Vector_FToZS(Size, ElementSize, Src); - } - - StoreResult_WithOpSize(FPRClass, Op, Op->Dest, Src, Size, OpSize::iInvalid); + Ref Result = Vector_CVT_Float_To_Int32Impl(Op, DstSize, Src, SrcSize, SrcElementSize, HostRoundingMode, false /* TODO? */); + StoreResult_WithOpSize(FPRClass, Op, Op->Dest, Result, DstSize, OpSize::iInvalid); } -template void OpDispatchBuilder::XMM_To_MMX_Vector_CVT_Float_To_Int(OpcodeArgs); -template void OpDispatchBuilder::XMM_To_MMX_Vector_CVT_Float_To_Int(OpcodeArgs); -template void OpDispatchBuilder::XMM_To_MMX_Vector_CVT_Float_To_Int(OpcodeArgs); -template void OpDispatchBuilder::XMM_To_MMX_Vector_CVT_Float_To_Int(OpcodeArgs); +template void OpDispatchBuilder::XMM_To_MMX_Vector_CVT_Float_To_Int(OpcodeArgs); +template void OpDispatchBuilder::XMM_To_MMX_Vector_CVT_Float_To_Int(OpcodeArgs); +template void OpDispatchBuilder::XMM_To_MMX_Vector_CVT_Float_To_Int(OpcodeArgs); +template void OpDispatchBuilder::XMM_To_MMX_Vector_CVT_Float_To_Int(OpcodeArgs); void OpDispatchBuilder::MASKMOVOp(OpcodeArgs) { const auto Size = OpSizeFromSrc(Op); diff --git a/FEXCore/Source/Interface/IR/IRDumper.cpp b/FEXCore/Source/Interface/IR/IRDumper.cpp index b4a94aa1b8..c1fdfa5e0d 100644 --- a/FEXCore/Source/Interface/IR/IRDumper.cpp +++ b/FEXCore/Source/Interface/IR/IRDumper.cpp @@ -209,6 +209,18 @@ static void PrintArg(fextl::stringstream* out, [[maybe_unused]] const IRListView return "x87_log10_2"; case NamedVectorConstant::NAMED_VECTOR_X87_LOG_2: return "x87_log2"; + case NamedVectorConstant::NAMED_VECTOR_CVTMAX_F32_I32: + return "cvtmax_f32_i32"; + case NamedVectorConstant::NAMED_VECTOR_CVTMAX_F32_I64: + return "cvtmax_f32_i64"; + case NamedVectorConstant::NAMED_VECTOR_CVTMAX_F64_I32: + return "cvtmax_f64_i32"; + case NamedVectorConstant::NAMED_VECTOR_CVTMAX_F64_I64: + return "cvtmax_f64_i64"; + case NamedVectorConstant::NAMED_VECTOR_CVTMAX_I32: + return "cvtmax_i32"; + case NamedVectorConstant::NAMED_VECTOR_CVTMAX_I64: + return "cvtmax_i64"; default: return ""; } diff --git a/FEXCore/include/FEXCore/IR/IR.h b/FEXCore/include/FEXCore/IR/IR.h index 005b41883c..635d62c3a4 100644 --- a/FEXCore/include/FEXCore/IR/IR.h +++ b/FEXCore/include/FEXCore/IR/IR.h @@ -71,6 +71,13 @@ enum NamedVectorConstant : uint8_t { NAMED_VECTOR_X87_LOG10_2, NAMED_VECTOR_X87_LOG_2, + NAMED_VECTOR_CVTMAX_F32_I32, + NAMED_VECTOR_CVTMAX_F32_I64, + NAMED_VECTOR_CVTMAX_F64_I32, + NAMED_VECTOR_CVTMAX_F64_I64, + NAMED_VECTOR_CVTMAX_I32, + NAMED_VECTOR_CVTMAX_I64, + NAMED_VECTOR_CONST_POOL_MAX, // Beginning of named constants that don't have a constant pool backing. NAMED_VECTOR_ZERO = NAMED_VECTOR_CONST_POOL_MAX, diff --git a/unittests/gcc-target-tests-64/Known_Failures b/unittests/gcc-target-tests-64/Known_Failures index 46d1bc6c35..d7b0bdcd09 100644 --- a/unittests/gcc-target-tests-64/Known_Failures +++ b/unittests/gcc-target-tests-64/Known_Failures @@ -12,13 +12,3 @@ asm-5.c.gcc-target-test-64 # Which turns the value in to 0xfffff2f5 # This causes its comparison to fail sse2-mmx-pextrw.c.gcc-target-test-64 - -# These tests fail because of things unrelated to the sse4.1 instructions -sse4_1-ceil-sfix-vec.c.gcc-target-test-64 -sse4_1-ceilf-sfix-vec.c.gcc-target-test-64 -sse4_1-floor-sfix-vec.c.gcc-target-test-64 -sse4_1-floorf-sfix-vec.c.gcc-target-test-64 -sse4_1-rint-sfix-vec.c.gcc-target-test-64 -sse4_1-rintf-sfix-vec.c.gcc-target-test-64 -sse4_1-round-sfix-vec.c.gcc-target-test-64 -sse4_1-roundf-sfix-vec.c.gcc-target-test-64