diff --git a/FEXCore/Scripts/json_ir_generator.py b/FEXCore/Scripts/json_ir_generator.py index cda8d9c509..fed9ad9915 100755 --- a/FEXCore/Scripts/json_ir_generator.py +++ b/FEXCore/Scripts/json_ir_generator.py @@ -55,6 +55,7 @@ class OpDefinition: NonSSAArgNum: int DynamicDispatch: bool LoweredX87: bool + MaybeClobbersPredRegs: bool JITDispatch: bool JITDispatchOverride: str TiedSource: int @@ -79,6 +80,7 @@ def __init__(self): self.NonSSAArgNum = 0 self.DynamicDispatch = False self.LoweredX87 = False + self.MaybeClobbersPredRegs = False self.JITDispatch = True self.JITDispatchOverride = None self.TiedSource = -1 @@ -223,7 +225,7 @@ def parse_ops(ops): (OpArg.Type == "GPR" or OpArg.Type == "GPRPair" or OpArg.Type == "FPR" or - OpArg.Type == "PR")): + OpArg.Type == "PRED")): OpDef.EmitValidation.append(f"GetOpRegClass({ArgName}) == InvalidClass || WalkFindRegClass({ArgName}) == {OpArg.Type}Class") OpArg.Name = ArgName @@ -277,6 +279,9 @@ def parse_ops(ops): assert("JITDispatch" not in op_val) OpDef.JITDispatch = False + if "MaybeClobbersPredRegs" in op_val: + OpDef.MaybeClobbersPredRegs = op_val["MaybeClobbersPredRegs"] + if "TiedSource" in op_val: OpDef.TiedSource = op_val["TiedSource"] @@ -506,6 +511,7 @@ def print_ir_hassideeffects(): ("HasSideEffects", "bool"), ("ImplicitFlagClobber", "bool"), ("LoweredX87", "bool"), + ("MaybeClobbersPredRegs", "bool"), ("TiedSource", "int8_t"), ]: output_file.write( @@ -707,6 +713,9 @@ def print_ir_allocator_helpers(): "\t\tif(MMXState == MMXState_MMX) ChgStateMMX_X87();\n" ) + if op.MaybeClobbersPredRegs: + output_file.write("\t\tResetInitPredicateCache();\n") + output_file.write("\t\tauto _Op = AllocateOp();\n".format(op.Name, op.Name.upper())) if op.SSAArgNum != 0: diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp index 6dd2d8b108..f729823364 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp @@ -4314,7 +4314,7 @@ Ref OpDispatchBuilder::LoadSource_WithOpSize(RegisterClassType Class, const X86T Ref MemSrc = LoadEffectiveAddress(A, true); if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) { // Using SVE we can load this with a single instruction. - auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); + auto PReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5); return _LoadMemPredicate(OpSize::i128Bit, OpSize::i16Bit, PReg, MemSrc); } else { // For X87 extended doubles, Split the load. 
@@ -4448,7 +4448,7 @@ void OpDispatchBuilder::StoreResult_WithOpSize(FEXCore::IR::RegisterClassType Cl if (OpSize == OpSize::f80Bit) { Ref MemStoreDst = LoadEffectiveAddress(A, true); if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) { - auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); + auto PReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5); _StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, Src, PReg, MemStoreDst); } else { // For X87 extended doubles, split before storing diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index 9545e87612..c4c01c21f9 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -119,6 +119,7 @@ class OpDispatchBuilder final : public IREmitter { CachedNZCV = nullptr; CFInverted = CFInvertedABI; FlushRegisterCache(); + ResetInitPredicateCache(); // New block needs to reset segment telemetry. SegmentsNeedReadCheck = ~0U; @@ -718,7 +719,6 @@ class OpDispatchBuilder final : public IREmitter { void FNINIT(OpcodeArgs); void X87ModifySTP(OpcodeArgs, bool Inc); - void X87SinCos(OpcodeArgs); void X87FYL2X(OpcodeArgs, bool IsFYL2XP1); void X87LDENV(OpcodeArgs); void X87FLDCW(OpcodeArgs); @@ -764,9 +764,6 @@ class OpDispatchBuilder final : public IREmitter { void FTSTF64(OpcodeArgs); void FRNDINTF64(OpcodeArgs); void FSQRTF64(OpcodeArgs); - void X87UnaryOpF64(OpcodeArgs, FEXCore::IR::IROps IROp); - void X87BinaryOpF64(OpcodeArgs, FEXCore::IR::IROps IROp); - void X87SinCosF64(OpcodeArgs); void X87FLDCWF64(OpcodeArgs); void X87TANF64(OpcodeArgs); void X87ATANF64(OpcodeArgs); diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp index 1470768f8f..3eed79f50d 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp @@ -130,7 +130,11 @@ void OpDispatchBuilder::FILD(OpcodeArgs) { void OpDispatchBuilder::FST(OpcodeArgs, IR::OpSize Width) { Ref Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false}); - _StoreStackMemory(Mem, OpSize::i128Bit, true, Width); + Ref PredReg = Invalid(); + if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) { + PredReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5); + } + _StoreStackMemory(PredReg, Mem, OpSize::i128Bit, true, Width); if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) { _PopStackDestroy(); } @@ -267,9 +271,9 @@ void OpDispatchBuilder::FDIV(OpcodeArgs, IR::OpSize Width, bool Integer, bool Re void OpDispatchBuilder::FSUB(OpcodeArgs, IR::OpSize Width, bool Integer, bool Reverse, OpDispatchBuilder::OpResult ResInST0) { if (Op->Src[0].IsNone()) { - const auto Offset = Op->OP & 7; - const auto St0 = 0; - const auto Result = (ResInST0 == OpResult::RES_STI) ? Offset : St0; + const uint8_t Offset = Op->OP & 7; + const uint8_t St0 = 0; + const uint8_t Result = (ResInST0 == OpResult::RES_STI) ? 
Offset : St0; if (Reverse ^ (ResInST0 == OpResult::RES_STI)) { _F80SubStack(Result, Offset, St0); @@ -751,13 +755,11 @@ void OpDispatchBuilder::FNINIT(OpcodeArgs) { } void OpDispatchBuilder::X87FFREE(OpcodeArgs) { - _InvalidateStack(Op->OP & 7); } void OpDispatchBuilder::X87EMMS(OpcodeArgs) { // Tags all get set to 0b11 - _InvalidateStack(0xff); } diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp index ca4e91f0b5..7313125185 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp @@ -105,7 +105,7 @@ void OpDispatchBuilder::FILDF64(OpcodeArgs) { void OpDispatchBuilder::FSTF64(OpcodeArgs, IR::OpSize Width) { Ref Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false}); - _StoreStackMemory(Mem, OpSize::i64Bit, true, Width); + _StoreStackMemory(Invalid(), Mem, OpSize::i64Bit, true, Width); if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) { _PopStackDestroy(); diff --git a/FEXCore/Source/Interface/IR/IR.json b/FEXCore/Source/Interface/IR/IR.json index 1cb3e38690..4147c6884f 100644 --- a/FEXCore/Source/Interface/IR/IR.json +++ b/FEXCore/Source/Interface/IR/IR.json @@ -211,7 +211,8 @@ }, "ThreadRemoveCodeEntry": { - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "GPR = ProcessorID": { @@ -248,7 +249,8 @@ "Print SSA:$Value": { "HasSideEffects": true, "Desc": ["Debug operation that prints an SSA value to the console", - "May only print 64bits of the value"] + "May only print 64bits of the value"], + "MaybeClobbersPredRegs": true }, "GPR = AllocateGPR i1:$ForPair": { "Desc": ["Silly pseudo-instruction to allocate a register for a future destination", @@ -312,7 +314,8 @@ "HasSideEffects": true, "Desc": ["Dispatches a guest syscall through to the SyscallHandler class" ], - "DestSize": "OpSize::i64Bit" + "DestSize": "OpSize::i64Bit", + "MaybeClobbersPredRegs": true }, "GPR = InlineSyscall GPR:$Arg0, GPR:$Arg1, GPR:$Arg2, GPR:$Arg3, GPR:$Arg4, GPR:$Arg5, i32:$HostSyscallNumber, SyscallFlags:$Flags": { @@ -328,18 +331,21 @@ }, "Thunk GPR:$ArgPtr, SHA256Sum:$ThunkNameHash": { - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "GPR:$EAX, GPR:$EBX, GPR:$ECX, GPR:$EDX = CPUID GPR:$Function, GPR:$Leaf": { "Desc": ["Calls in to the CPUID handler function to return emulated CPUID"], "DestSize": "OpSize::i32Bit", - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "GPR:$EAX, GPR:$EDX = XGetBV GPR:$Function": { "Desc": ["Calls in to the XCR handler function to return emulated XCR"], "DestSize": "OpSize::i32Bit", - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true } }, "Moves": { @@ -578,6 +584,7 @@ "HasSideEffects": true, "ElementSize": "ElementSize" }, + "FPR = LoadMemPredicate OpSize:#RegisterSize, OpSize:#ElementSize, PRED:$Mask, GPR:$Addr": { "Desc": [ "Loads a value to memory using SVE predicate mask." 
], "DestSize": "RegisterSize", @@ -1631,7 +1638,8 @@ "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" - ] + ], + "MaybeClobbersPredRegs": true }, "GPR = LUDiv OpSize:#Size, GPR:$Lower, GPR:$Upper, GPR:$Divisor": { "Desc": ["Integer long unsigned division returning lower bits", @@ -1641,7 +1649,8 @@ "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" - ] + ], + "MaybeClobbersPredRegs": true }, "GPR = LRem OpSize:#Size, GPR:$Lower, GPR:$Upper, GPR:$Divisor": { "Desc": ["Integer long signed remainder returning lower bits", @@ -1651,7 +1660,8 @@ "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" - ] + ], + "MaybeClobbersPredRegs": true }, "GPR = LURem OpSize:#Size, GPR:$Lower, GPR:$Upper, GPR:$Divisor": { "Desc": ["Integer long unsigned remainder returning lower bits", @@ -1661,7 +1671,8 @@ "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" - ] + ], + "MaybeClobbersPredRegs": true }, "Float to GPR": {"Ignore": 1}, @@ -2487,7 +2498,8 @@ "course of creating the intermediate result" ], "DestSize": "OpSize::i32Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "GPR = VPCMPISTRX FPR:$LHS, FPR:$RHS, u8:$Control": { "Desc": ["Performs intermediate behavior analogous to the x86 PCMPISTRI/PCMPISTRM instruction", @@ -2788,13 +2800,14 @@ "HasSideEffects": true, "X87": true }, - "StoreStackMemory GPR:$Addr, OpSize:$SourceSize, i1:$Float, OpSize:$StoreSize": { + "StoreStackMemory PRED:$PredReg, GPR:$Addr, OpSize:$SourceSize, i1:$Float, OpSize:$StoreSize": { "Desc": [ "Takes the top value off the x87 stack and stores it to memory.", "SourceSize is 128bit for F80 values, 64-bit for low precision.", "StoreSize is the store size for conversion:", "Float: 80-bit, 64-bit, or 32-bit", - "Int: 64-bit, 32-bit, 16-bit" + "Int: 64-bit, 32-bit, 16-bit", + "If possible, it will use the PredReg for an SVE store." ], "HasSideEffects": true, "X87": true @@ -2834,18 +2847,21 @@ "Adds two stack locations together, storing the result in to the first stack location" ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "F80AddValue u8:$SrcStack, FPR:$X80Src": { "Desc": [ "Adds a operand value to a stack location. The result stored in to the stack location provided." ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "FPR = F80Add FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80SubStack u8:$DstStack, u8:$SrcStack1, u8:$SrcStack2": { "Desc": [ @@ -2853,7 +2869,8 @@ "The result is stored in stack location TOP+$DstStack." ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "F80SubValue u8:$SrcStack, FPR:$X80Src": { "Desc": [ @@ -2861,7 +2878,8 @@ "The result is stored in stack location TOP." ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "F80SubRValue FPR:$X80Src, u8:$SrcStack": { "Desc": [ @@ -2869,7 +2887,8 @@ "The result is stored in stack location TOP." 
], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "FPR = F80Sub FPR:$X80Src1, FPR:$X80Src2": { "Desc": [ @@ -2878,25 +2897,29 @@ "`FPR = X80Src2 - X80Src1`" ], "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80MulStack u8:$SrcStack1, u8:$SrcStack2": { "Desc": [ "Multiplies two stack locations together, storing the result in to the first stack location" ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "F80MulValue u8:$SrcStack, FPR:$X80Src": { "Desc": [ "Multiplies a operand value to a stack location. The result stored in to the stack location provided." ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "FPR = F80Mul FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80DivStack u8:$DstStack, u8:$SrcStack1, u8:$SrcStack2": { "Desc": [ @@ -2905,7 +2928,8 @@ "`FPR|Stack[TOP+DstStack] = Stack[TOP+SrcStack1] / Stack[TOP+SrcStack2]`" ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "F80DivValue u8:$SrcStack, FPR:$X80Src": { "Desc": [ @@ -2914,7 +2938,8 @@ "`FPR|Stack[TOP] = Stack[TOP+SrcStack] / X80Src`" ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "F80DivRValue FPR:$X80Src, u8:$SrcStack": { "Desc": [ @@ -2923,7 +2948,8 @@ "`FPR|Stack[TOP] = X80Src / Stack[TOP+SrcStack]`" ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "FPR = F80Div FPR:$X80Src1, FPR:$X80Src2": { "Desc": [ @@ -2932,7 +2958,8 @@ "`FPR = X80Src1 / X80Src2`" ], "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80StackXchange u8:$SrcStack": { "Desc": [ @@ -2957,14 +2984,16 @@ ], "HasSideEffects": true, "DestSize": "OpSize::i128Bit", - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "F80PTANStack": { "Desc": [ "Computes the approximate tangent of the source operand in register ST(0), stores the result in ST(0), and pushes a 1.0 onto the FPU register stack." 
], "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80ATANStack": { "Desc": [ @@ -2972,51 +3001,63 @@ ], "DestSize": "OpSize::i128Bit", "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80ATAN FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80FPREMStack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80FPREM FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80FPREM1Stack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80FPREM1 FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80SCALEStack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80SCALE FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "FPR = F80CVT OpSize:#Size, FPR:$X80Src": { "DestSize": "Size", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "GPR = F80CVTInt OpSize:#Size, FPR:$X80Src, i1:$Truncate": { "DestSize": "Size", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "FPR = F80CVTTo FPR:$X80Src, OpSize:$SrcSize": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "FPR = F80CVTToInt GPR:$Src, OpSize:$SrcSize": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80RoundStack": { "Desc": [ @@ -3027,62 +3068,76 @@ }, "FPR = F80Round FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80F2XM1Stack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80F2XM1 FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "FPR = F80TAN FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80SINStack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80SIN FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80COSStack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80COS FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80SINCOSStack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "F80SQRTStack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80SQRT FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "FPR = F80XTRACT_EXP FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + 
"MaybeClobbersPredRegs": true }, "FPR = F80XTRACT_SIG FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "GPR = F80StackTest u8:$SrcStack": { "Desc": [ "Does comparison between value in stack at TOP + SrcStack" ], "DestSize": "OpSize::i32Bit", - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "GPR = F80CmpStack u8:$SrcStack": { "Desc": [ @@ -3090,7 +3145,8 @@ "Ordering flag result is true if either float input is NaN" ], "DestSize": "OpSize::i32Bit", - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "GPR = F80CmpValue FPR:$X80Src": { "Desc": [ @@ -3099,14 +3155,16 @@ ], "DestSize": "OpSize::i32Bit", "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "GPR = F80Cmp FPR:$X80Src1, FPR:$X80Src2": { "Desc": ["Does a scalar unordered compare and stores the flags in to a GPR", "Ordering flag result is true if either float input is NaN" ], "DestSize": "OpSize::i32Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "FPR = F80BCDLoad FPR:$X80Src": { "DestSize": "OpSize::i128Bit", @@ -3124,11 +3182,13 @@ ], "HasSideEffects": true, "DestSize": "OpSize::i128Bit", - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "FPR = F80FYL2X FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80VBSLStack OpSize:#RegisterSize, FPR:$VectorMask, u8:$SrcStack1, u8:$SrcStack2": { "Desc": [ @@ -3138,7 +3198,8 @@ "Writes the result to the top of the stack." ], "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true } }, "Backend": { diff --git a/FEXCore/Source/Interface/IR/IREmitter.cpp b/FEXCore/Source/Interface/IR/IREmitter.cpp index 0850187b1c..95cb2e73dd 100644 --- a/FEXCore/Source/Interface/IR/IREmitter.cpp +++ b/FEXCore/Source/Interface/IR/IREmitter.cpp @@ -41,6 +41,7 @@ FEXCore::IR::RegisterClassType IREmitter::WalkFindRegClass(Ref Node) { case FPRClass: case GPRFixedClass: case FPRFixedClass: + case PREDClass: case InvalidClass: return Class; default: break; } diff --git a/FEXCore/Source/Interface/IR/IREmitter.h b/FEXCore/Source/Interface/IR/IREmitter.h index 0cfc4027be..c5af4efdd3 100644 --- a/FEXCore/Source/Interface/IR/IREmitter.h +++ b/FEXCore/Source/Interface/IR/IREmitter.h @@ -1,6 +1,7 @@ // SPDX-License-Identifier: MIT #pragma once +#include "CodeEmitter/Emitter.h" #include "Interface/IR/IR.h" #include "Interface/IR/IntrusiveIRList.h" @@ -9,9 +10,9 @@ #include #include +#include #include -#include #include #include @@ -45,6 +46,37 @@ class IREmitter { } void ResetWorkingList(); + // Predicate Cache Implementation + // This lives here rather than OpcodeDispatcher because x87StackOptimization Pass + // also needs it. 
+ struct PredicateKey { + ARMEmitter::PredicatePattern Pattern; + OpSize Size; + bool operator==(const PredicateKey& rhs) const = default; + }; + + struct PredicateKeyHash { + size_t operator()(const PredicateKey& key) const { + return FEXCore::ToUnderlying(key.Pattern) + (FEXCore::ToUnderlying(key.Size) * FEXCore::ToUnderlying(OpSize::iInvalid)); + } + }; + fextl::unordered_map<PredicateKey, Ref, PredicateKeyHash> InitPredicateCache; + + Ref InitPredicateCached(OpSize Size, ARMEmitter::PredicatePattern Pattern) { + PredicateKey Key {Pattern, Size}; + auto ValIt = InitPredicateCache.find(Key); + if (ValIt == InitPredicateCache.end()) { + auto Predicate = _InitPredicate(Size, static_cast<uint8_t>(FEXCore::ToUnderlying(Pattern))); + InitPredicateCache[Key] = Predicate; + return Predicate; + } + return ValIt->second; + } + + void ResetInitPredicateCache() { + InitPredicateCache.clear(); + } + /** * @name IR allocation routines * diff --git a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp index 247b54633e..476d9f9eab 100644 --- a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp +++ b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp @@ -822,10 +822,11 @@ void X87StackOptimization::Run(IREmitter* Emit) { if (Op->StoreSize != OpSize::f80Bit) { // if it's not 80bits then convert StackNode = IREmit->_F80CVT(Op->StoreSize, StackNode); } - if (Op->StoreSize == OpSize::f80Bit) { // Part of code from StoreResult_WithOpSize() - if (Features.SupportsSVE128 || Features.SupportsSVE256) { - auto PReg = IREmit->_InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); - IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PReg, AddrNode); + if (Op->StoreSize == OpSize::f80Bit) { + Ref PredReg = CurrentIR.GetNode(Op->PredReg); + bool CanUsePredicateStore = (Features.SupportsSVE128 || Features.SupportsSVE256) && PredReg; + if (CanUsePredicateStore) { + IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PredReg, AddrNode); } else { // For X87 extended doubles, split before storing IREmit->_StoreMem(FPRClass, OpSize::i64Bit, AddrNode, StackNode); diff --git a/unittests/ASM/X87/MemcopyWithCPUID.asm b/unittests/ASM/X87/MemcopyWithCPUID.asm new file mode 100644 index 0000000000..62e7558747 --- /dev/null +++ b/unittests/ASM/X87/MemcopyWithCPUID.asm @@ -0,0 +1,36 @@ +%ifdef CONFIG +{ + "RegData": { + "RBX": "0x8000000000000000", + "RCX": "0x3fff" + } +} +%endif + +; Related to #4274 - ensures that if cpuid clobbers the predicate register, +; we reset the predicate cache. + +section .data +align 8 + +data: + dt 1.0 + +section .bss +align 8 + +data2: + resb 10 + +section .text +lea r8, [rel data] +fld tword [r8] + +mov rax, 0x0 +cpuid ; Will this instruction clobber the predicate register?
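+; CPUID is marked MaybeClobbersPredRegs in IR.json, so emitting it resets the predicate cache; +; on SVE hosts the fstp below then materializes a fresh ptrue for its predicated store instead of reusing the one created by the fld above.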
+ +fstp tword [rel data2] + +mov rbx, [rel data2] +mov rcx, [rel data2+8] +hlt diff --git a/unittests/InstructionCountCI/X87ldst-SVE.json b/unittests/InstructionCountCI/X87ldst-SVE.json index d82b68d9b1..2050491143 100644 --- a/unittests/InstructionCountCI/X87ldst-SVE.json +++ b/unittests/InstructionCountCI/X87ldst-SVE.json @@ -17,10 +17,10 @@ "ExpectedInstructionCount": 13, "Comment": "Single 80-bit store.", "ExpectedArm64ASM": [ + "ptrue p2.h, vl5", "ldrb w20, [x28, #1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x4]", "ldrb w21, [x28, #1298]", "mov w22, #0x1", @@ -34,16 +34,16 @@ }, "2-store 80bit": { "x86InstructionCount": 2, - "ExpectedInstructionCount": 25, + "ExpectedInstructionCount": 24, "x86Insts": [ "fstp tword [rax]", "fstp tword [rax+10]" ], "ExpectedArm64ASM": [ + "ptrue p2.h, vl5", "ldrb w20, [x28, #1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x4]", "ldrb w21, [x28, #1298]", "mov w22, #0x1", @@ -56,7 +56,6 @@ "add x21, x4, #0xa (10)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w22, w22, w20", @@ -69,7 +68,7 @@ }, "8-store 80bit": { "x86InstructionCount": 8, - "ExpectedInstructionCount": 97, + "ExpectedInstructionCount": 90, "x86Insts": [ "fstp tword [rax]", "fstp tword [rax+10]", @@ -81,10 +80,10 @@ "fstp tword [rax+70]" ], "ExpectedArm64ASM": [ + "ptrue p2.h, vl5", "ldrb w20, [x28, #1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x4]", "ldrb w21, [x28, #1298]", "mov w22, #0x1", @@ -97,7 +96,6 @@ "add x21, x4, #0xa (10)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w23, w22, w20", @@ -109,7 +107,6 @@ "add x21, x4, #0x14 (20)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w23, w22, w20", @@ -121,7 +118,6 @@ "add x21, x4, #0x1e (30)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w23, w22, w20", @@ -133,7 +129,6 @@ "add x21, x4, #0x28 (40)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w23, w22, w20", @@ -145,7 +140,6 @@ "add x21, x4, #0x32 (50)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w23, w22, w20", @@ -157,7 +151,6 @@ "add x21, x4, #0x3c (60)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w23, w22, w20", @@ -169,7 +162,6 @@ "add x21, x4, #0x46 (70)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w22, w22, w20", @@ -201,7 +193,7 @@ }, "2-load 80bit": { "x86InstructionCount": 2, - "ExpectedInstructionCount": 22, + "ExpectedInstructionCount": 21, "x86Insts": [ "fld tword [rax]", "fld tword [rax+10]" @@ -210,7 +202,6 @@ "ptrue p2.h, vl5", "ld1h {z2.h}, p2/z, [x4]", "add x20, x4, #0xa (10)", - "ptrue p2.h, vl5", "ld1h {z3.h}, p2/z, [x20]", "ldrb w20, [x28, #1019]", "sub w20, w20, #0x2 (2)", @@ -233,7 +224,7 @@ }, "8-load 80bit": { "x86InstructionCount": 8, - "ExpectedInstructionCount": 59, + "ExpectedInstructionCount": 52, "x86Insts": [ "fld tword [rax]", 
"fld tword [rax+10]", @@ -248,25 +239,18 @@ "ptrue p2.h, vl5", "ld1h {z2.h}, p2/z, [x4]", "add x20, x4, #0xa (10)", - "ptrue p2.h, vl5", "ld1h {z3.h}, p2/z, [x20]", "add x20, x4, #0x14 (20)", - "ptrue p2.h, vl5", "ld1h {z4.h}, p2/z, [x20]", "add x20, x4, #0x1e (30)", - "ptrue p2.h, vl5", "ld1h {z5.h}, p2/z, [x20]", "add x20, x4, #0x28 (40)", - "ptrue p2.h, vl5", "ld1h {z6.h}, p2/z, [x20]", "add x20, x4, #0x32 (50)", - "ptrue p2.h, vl5", "ld1h {z7.h}, p2/z, [x20]", "add x20, x4, #0x3c (60)", - "ptrue p2.h, vl5", "ld1h {z8.h}, p2/z, [x20]", "add x20, x4, #0x46 (70)", - "ptrue p2.h, vl5", "ld1h {z9.h}, p2/z, [x20]", "ldrb w20, [x28, #1019]", "sub w20, w20, #0x8 (8)",