From 36a0414dad1816b8a7bd1987e07fbd9266fee6bd Mon Sep 17 00:00:00 2001 From: Paulo Matos Date: Tue, 3 Dec 2024 17:40:25 +0100 Subject: [PATCH 1/3] Ensure predicate cache is reset when control flow leaves block
Whenever control flow leaves the block, it might clobber the predicate register, so we reset the cache whenever that happens. The difficulty here is that the cache is valid only during IR generation, so we need to make sure this pass catches all the cases where execution might leave the block.
Fixes #4264 --- FEXCore/Scripts/json_ir_generator.py | 11 +- .../Interface/Core/OpcodeDispatcher.cpp | 4 +- .../Source/Interface/Core/OpcodeDispatcher.h | 5 +- .../Interface/Core/OpcodeDispatcher/X87.cpp | 14 +- .../Core/OpcodeDispatcher/X87F64.cpp | 2 +- FEXCore/Source/Interface/IR/IR.json | 183 ++++++++++++------ FEXCore/Source/Interface/IR/IREmitter.cpp | 1 + FEXCore/Source/Interface/IR/IREmitter.h | 34 +++- .../IR/Passes/x87StackOptimizationPass.cpp | 9 +- 9 files changed, 183 insertions(+), 80 deletions(-)
diff --git a/FEXCore/Scripts/json_ir_generator.py b/FEXCore/Scripts/json_ir_generator.py index cda8d9c509..fed9ad9915 100755 --- a/FEXCore/Scripts/json_ir_generator.py +++ b/FEXCore/Scripts/json_ir_generator.py @@ -55,6 +55,7 @@ class OpDefinition: NonSSAArgNum: int DynamicDispatch: bool LoweredX87: bool + MaybeClobbersPredRegs: bool JITDispatch: bool JITDispatchOverride: str TiedSource: int @@ -79,6 +80,7 @@ def __init__(self): self.NonSSAArgNum = 0 self.DynamicDispatch = False self.LoweredX87 = False + self.MaybeClobbersPredRegs = False self.JITDispatch = True self.JITDispatchOverride = None self.TiedSource = -1 @@ -223,7 +225,7 @@ def parse_ops(ops): (OpArg.Type == "GPR" or OpArg.Type == "GPRPair" or OpArg.Type == "FPR" or - OpArg.Type == "PR")): + OpArg.Type == "PRED")): OpDef.EmitValidation.append(f"GetOpRegClass({ArgName}) == InvalidClass || WalkFindRegClass({ArgName}) == {OpArg.Type}Class") OpArg.Name = ArgName @@ -277,6 +279,9 @@ def parse_ops(ops): assert("JITDispatch" not in op_val) OpDef.JITDispatch = False + if "MaybeClobbersPredRegs" in op_val: + OpDef.MaybeClobbersPredRegs = op_val["MaybeClobbersPredRegs"] + if "TiedSource" in op_val: OpDef.TiedSource = op_val["TiedSource"] @@ -506,6 +511,7 @@ def print_ir_hassideeffects(): ("HasSideEffects", "bool"), ("ImplicitFlagClobber", "bool"), ("LoweredX87", "bool"), + ("MaybeClobbersPredRegs", "bool"), ("TiedSource", "int8_t"), ]: output_file.write( @@ -707,6 +713,9 @@ def print_ir_allocator_helpers(): "\t\tif(MMXState == MMXState_MMX) ChgStateMMX_X87();\n" ) + if op.MaybeClobbersPredRegs: + output_file.write("\t\tResetInitPredicateCache();\n") + output_file.write("\t\tauto _Op = AllocateOp();\n".format(op.Name, op.Name.upper())) if op.SSAArgNum != 0:
diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp index 6dd2d8b108..f729823364 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp @@ -4314,7 +4314,7 @@ Ref OpDispatchBuilder::LoadSource_WithOpSize(RegisterClassType Class, const X86T Ref MemSrc = LoadEffectiveAddress(A, true); if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) { // Using SVE we can load this with a single instruction. 
- auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); + auto PReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5); return _LoadMemPredicate(OpSize::i128Bit, OpSize::i16Bit, PReg, MemSrc); } else { // For X87 extended doubles, Split the load. @@ -4448,7 +4448,7 @@ void OpDispatchBuilder::StoreResult_WithOpSize(FEXCore::IR::RegisterClassType Cl if (OpSize == OpSize::f80Bit) { Ref MemStoreDst = LoadEffectiveAddress(A, true); if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) { - auto PReg = _InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); + auto PReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5); _StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, Src, PReg, MemStoreDst); } else { // For X87 extended doubles, split before storing diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index 9545e87612..c4c01c21f9 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -119,6 +119,7 @@ class OpDispatchBuilder final : public IREmitter { CachedNZCV = nullptr; CFInverted = CFInvertedABI; FlushRegisterCache(); + ResetInitPredicateCache(); // New block needs to reset segment telemetry. SegmentsNeedReadCheck = ~0U; @@ -718,7 +719,6 @@ class OpDispatchBuilder final : public IREmitter { void FNINIT(OpcodeArgs); void X87ModifySTP(OpcodeArgs, bool Inc); - void X87SinCos(OpcodeArgs); void X87FYL2X(OpcodeArgs, bool IsFYL2XP1); void X87LDENV(OpcodeArgs); void X87FLDCW(OpcodeArgs); @@ -764,9 +764,6 @@ class OpDispatchBuilder final : public IREmitter { void FTSTF64(OpcodeArgs); void FRNDINTF64(OpcodeArgs); void FSQRTF64(OpcodeArgs); - void X87UnaryOpF64(OpcodeArgs, FEXCore::IR::IROps IROp); - void X87BinaryOpF64(OpcodeArgs, FEXCore::IR::IROps IROp); - void X87SinCosF64(OpcodeArgs); void X87FLDCWF64(OpcodeArgs); void X87TANF64(OpcodeArgs); void X87ATANF64(OpcodeArgs); diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp index 1470768f8f..3eed79f50d 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp @@ -130,7 +130,11 @@ void OpDispatchBuilder::FILD(OpcodeArgs) { void OpDispatchBuilder::FST(OpcodeArgs, IR::OpSize Width) { Ref Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false}); - _StoreStackMemory(Mem, OpSize::i128Bit, true, Width); + Ref PredReg = Invalid(); + if (CTX->HostFeatures.SupportsSVE128 || CTX->HostFeatures.SupportsSVE256) { + PredReg = InitPredicateCached(OpSize::i16Bit, ARMEmitter::PredicatePattern::SVE_VL5); + } + _StoreStackMemory(PredReg, Mem, OpSize::i128Bit, true, Width); if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) { _PopStackDestroy(); } @@ -267,9 +271,9 @@ void OpDispatchBuilder::FDIV(OpcodeArgs, IR::OpSize Width, bool Integer, bool Re void OpDispatchBuilder::FSUB(OpcodeArgs, IR::OpSize Width, bool Integer, bool Reverse, OpDispatchBuilder::OpResult ResInST0) { if (Op->Src[0].IsNone()) { - const auto Offset = Op->OP & 7; - const auto St0 = 0; - const auto Result = (ResInST0 == OpResult::RES_STI) ? Offset : St0; + const uint8_t Offset = Op->OP & 7; + const uint8_t St0 = 0; + const uint8_t Result = (ResInST0 == OpResult::RES_STI) ? 
Offset : St0; if (Reverse ^ (ResInST0 == OpResult::RES_STI)) { _F80SubStack(Result, Offset, St0); @@ -751,13 +755,11 @@ void OpDispatchBuilder::FNINIT(OpcodeArgs) { } void OpDispatchBuilder::X87FFREE(OpcodeArgs) { - _InvalidateStack(Op->OP & 7); } void OpDispatchBuilder::X87EMMS(OpcodeArgs) { // Tags all get set to 0b11 - _InvalidateStack(0xff); } diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp index ca4e91f0b5..7313125185 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp @@ -105,7 +105,7 @@ void OpDispatchBuilder::FILDF64(OpcodeArgs) { void OpDispatchBuilder::FSTF64(OpcodeArgs, IR::OpSize Width) { Ref Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false}); - _StoreStackMemory(Mem, OpSize::i64Bit, true, Width); + _StoreStackMemory(Invalid(), Mem, OpSize::i64Bit, true, Width); if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) { _PopStackDestroy(); diff --git a/FEXCore/Source/Interface/IR/IR.json b/FEXCore/Source/Interface/IR/IR.json index 1cb3e38690..4147c6884f 100644 --- a/FEXCore/Source/Interface/IR/IR.json +++ b/FEXCore/Source/Interface/IR/IR.json @@ -211,7 +211,8 @@ }, "ThreadRemoveCodeEntry": { - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "GPR = ProcessorID": { @@ -248,7 +249,8 @@ "Print SSA:$Value": { "HasSideEffects": true, "Desc": ["Debug operation that prints an SSA value to the console", - "May only print 64bits of the value"] + "May only print 64bits of the value"], + "MaybeClobbersPredRegs": true }, "GPR = AllocateGPR i1:$ForPair": { "Desc": ["Silly pseudo-instruction to allocate a register for a future destination", @@ -312,7 +314,8 @@ "HasSideEffects": true, "Desc": ["Dispatches a guest syscall through to the SyscallHandler class" ], - "DestSize": "OpSize::i64Bit" + "DestSize": "OpSize::i64Bit", + "MaybeClobbersPredRegs": true }, "GPR = InlineSyscall GPR:$Arg0, GPR:$Arg1, GPR:$Arg2, GPR:$Arg3, GPR:$Arg4, GPR:$Arg5, i32:$HostSyscallNumber, SyscallFlags:$Flags": { @@ -328,18 +331,21 @@ }, "Thunk GPR:$ArgPtr, SHA256Sum:$ThunkNameHash": { - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "GPR:$EAX, GPR:$EBX, GPR:$ECX, GPR:$EDX = CPUID GPR:$Function, GPR:$Leaf": { "Desc": ["Calls in to the CPUID handler function to return emulated CPUID"], "DestSize": "OpSize::i32Bit", - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "GPR:$EAX, GPR:$EDX = XGetBV GPR:$Function": { "Desc": ["Calls in to the XCR handler function to return emulated XCR"], "DestSize": "OpSize::i32Bit", - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true } }, "Moves": { @@ -578,6 +584,7 @@ "HasSideEffects": true, "ElementSize": "ElementSize" }, + "FPR = LoadMemPredicate OpSize:#RegisterSize, OpSize:#ElementSize, PRED:$Mask, GPR:$Addr": { "Desc": [ "Loads a value to memory using SVE predicate mask." 
], "DestSize": "RegisterSize", @@ -1631,7 +1638,8 @@ "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" - ] + ], + "MaybeClobbersPredRegs": true }, "GPR = LUDiv OpSize:#Size, GPR:$Lower, GPR:$Upper, GPR:$Divisor": { "Desc": ["Integer long unsigned division returning lower bits", @@ -1641,7 +1649,8 @@ "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" - ] + ], + "MaybeClobbersPredRegs": true }, "GPR = LRem OpSize:#Size, GPR:$Lower, GPR:$Upper, GPR:$Divisor": { "Desc": ["Integer long signed remainder returning lower bits", @@ -1651,7 +1660,8 @@ "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" - ] + ], + "MaybeClobbersPredRegs": true }, "GPR = LURem OpSize:#Size, GPR:$Lower, GPR:$Upper, GPR:$Divisor": { "Desc": ["Integer long unsigned remainder returning lower bits", @@ -1661,7 +1671,8 @@ "DestSize": "Size", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i16Bit || Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" - ] + ], + "MaybeClobbersPredRegs": true }, "Float to GPR": {"Ignore": 1}, @@ -2487,7 +2498,8 @@ "course of creating the intermediate result" ], "DestSize": "OpSize::i32Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "GPR = VPCMPISTRX FPR:$LHS, FPR:$RHS, u8:$Control": { "Desc": ["Performs intermediate behavior analogous to the x86 PCMPISTRI/PCMPISTRM instruction", @@ -2788,13 +2800,14 @@ "HasSideEffects": true, "X87": true }, - "StoreStackMemory GPR:$Addr, OpSize:$SourceSize, i1:$Float, OpSize:$StoreSize": { + "StoreStackMemory PRED:$PredReg, GPR:$Addr, OpSize:$SourceSize, i1:$Float, OpSize:$StoreSize": { "Desc": [ "Takes the top value off the x87 stack and stores it to memory.", "SourceSize is 128bit for F80 values, 64-bit for low precision.", "StoreSize is the store size for conversion:", "Float: 80-bit, 64-bit, or 32-bit", - "Int: 64-bit, 32-bit, 16-bit" + "Int: 64-bit, 32-bit, 16-bit", + "If possible, it will use the PredReg for an SVE store." ], "HasSideEffects": true, "X87": true @@ -2834,18 +2847,21 @@ "Adds two stack locations together, storing the result in to the first stack location" ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "F80AddValue u8:$SrcStack, FPR:$X80Src": { "Desc": [ "Adds a operand value to a stack location. The result stored in to the stack location provided." ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "FPR = F80Add FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80SubStack u8:$DstStack, u8:$SrcStack1, u8:$SrcStack2": { "Desc": [ @@ -2853,7 +2869,8 @@ "The result is stored in stack location TOP+$DstStack." ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "F80SubValue u8:$SrcStack, FPR:$X80Src": { "Desc": [ @@ -2861,7 +2878,8 @@ "The result is stored in stack location TOP." ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "F80SubRValue FPR:$X80Src, u8:$SrcStack": { "Desc": [ @@ -2869,7 +2887,8 @@ "The result is stored in stack location TOP." 
], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "FPR = F80Sub FPR:$X80Src1, FPR:$X80Src2": { "Desc": [ @@ -2878,25 +2897,29 @@ "`FPR = X80Src2 - X80Src1`" ], "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80MulStack u8:$SrcStack1, u8:$SrcStack2": { "Desc": [ "Multiplies two stack locations together, storing the result in to the first stack location" ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "F80MulValue u8:$SrcStack, FPR:$X80Src": { "Desc": [ "Multiplies a operand value to a stack location. The result stored in to the stack location provided." ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "FPR = F80Mul FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80DivStack u8:$DstStack, u8:$SrcStack1, u8:$SrcStack2": { "Desc": [ @@ -2905,7 +2928,8 @@ "`FPR|Stack[TOP+DstStack] = Stack[TOP+SrcStack1] / Stack[TOP+SrcStack2]`" ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "F80DivValue u8:$SrcStack, FPR:$X80Src": { "Desc": [ @@ -2914,7 +2938,8 @@ "`FPR|Stack[TOP] = Stack[TOP+SrcStack] / X80Src`" ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "F80DivRValue FPR:$X80Src, u8:$SrcStack": { "Desc": [ @@ -2923,7 +2948,8 @@ "`FPR|Stack[TOP] = X80Src / Stack[TOP+SrcStack]`" ], "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "FPR = F80Div FPR:$X80Src1, FPR:$X80Src2": { "Desc": [ @@ -2932,7 +2958,8 @@ "`FPR = X80Src1 / X80Src2`" ], "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80StackXchange u8:$SrcStack": { "Desc": [ @@ -2957,14 +2984,16 @@ ], "HasSideEffects": true, "DestSize": "OpSize::i128Bit", - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "F80PTANStack": { "Desc": [ "Computes the approximate tangent of the source operand in register ST(0), stores the result in ST(0), and pushes a 1.0 onto the FPU register stack." 
], "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80ATANStack": { "Desc": [ @@ -2972,51 +3001,63 @@ ], "DestSize": "OpSize::i128Bit", "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80ATAN FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80FPREMStack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80FPREM FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80FPREM1Stack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80FPREM1 FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80SCALEStack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80SCALE FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "FPR = F80CVT OpSize:#Size, FPR:$X80Src": { "DestSize": "Size", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "GPR = F80CVTInt OpSize:#Size, FPR:$X80Src, i1:$Truncate": { "DestSize": "Size", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "FPR = F80CVTTo FPR:$X80Src, OpSize:$SrcSize": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "FPR = F80CVTToInt GPR:$Src, OpSize:$SrcSize": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80RoundStack": { "Desc": [ @@ -3027,62 +3068,76 @@ }, "FPR = F80Round FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80F2XM1Stack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80F2XM1 FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "FPR = F80TAN FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80SINStack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80SIN FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80COSStack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80COS FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80SINCOSStack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "F80SQRTStack": { "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true }, "FPR = F80SQRT FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "FPR = F80XTRACT_EXP FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + 
"MaybeClobbersPredRegs": true }, "FPR = F80XTRACT_SIG FPR:$X80Src": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "GPR = F80StackTest u8:$SrcStack": { "Desc": [ "Does comparison between value in stack at TOP + SrcStack" ], "DestSize": "OpSize::i32Bit", - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "GPR = F80CmpStack u8:$SrcStack": { "Desc": [ @@ -3090,7 +3145,8 @@ "Ordering flag result is true if either float input is NaN" ], "DestSize": "OpSize::i32Bit", - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "GPR = F80CmpValue FPR:$X80Src": { "Desc": [ @@ -3099,14 +3155,16 @@ ], "DestSize": "OpSize::i32Bit", "HasSideEffects": true, - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "GPR = F80Cmp FPR:$X80Src1, FPR:$X80Src2": { "Desc": ["Does a scalar unordered compare and stores the flags in to a GPR", "Ordering flag result is true if either float input is NaN" ], "DestSize": "OpSize::i32Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "FPR = F80BCDLoad FPR:$X80Src": { "DestSize": "OpSize::i128Bit", @@ -3124,11 +3182,13 @@ ], "HasSideEffects": true, "DestSize": "OpSize::i128Bit", - "X87": true + "X87": true, + "MaybeClobbersPredRegs": true }, "FPR = F80FYL2X FPR:$X80Src1, FPR:$X80Src2": { "DestSize": "OpSize::i128Bit", - "JITDispatch": false + "JITDispatch": false, + "MaybeClobbersPredRegs": true }, "F80VBSLStack OpSize:#RegisterSize, FPR:$VectorMask, u8:$SrcStack1, u8:$SrcStack2": { "Desc": [ @@ -3138,7 +3198,8 @@ "Writes the result to the top of the stack." ], "X87": true, - "HasSideEffects": true + "HasSideEffects": true, + "MaybeClobbersPredRegs": true } }, "Backend": { diff --git a/FEXCore/Source/Interface/IR/IREmitter.cpp b/FEXCore/Source/Interface/IR/IREmitter.cpp index 0850187b1c..95cb2e73dd 100644 --- a/FEXCore/Source/Interface/IR/IREmitter.cpp +++ b/FEXCore/Source/Interface/IR/IREmitter.cpp @@ -41,6 +41,7 @@ FEXCore::IR::RegisterClassType IREmitter::WalkFindRegClass(Ref Node) { case FPRClass: case GPRFixedClass: case FPRFixedClass: + case PREDClass: case InvalidClass: return Class; default: break; } diff --git a/FEXCore/Source/Interface/IR/IREmitter.h b/FEXCore/Source/Interface/IR/IREmitter.h index 0cfc4027be..c5af4efdd3 100644 --- a/FEXCore/Source/Interface/IR/IREmitter.h +++ b/FEXCore/Source/Interface/IR/IREmitter.h @@ -1,6 +1,7 @@ // SPDX-License-Identifier: MIT #pragma once +#include "CodeEmitter/Emitter.h" #include "Interface/IR/IR.h" #include "Interface/IR/IntrusiveIRList.h" @@ -9,9 +10,9 @@ #include #include +#include #include -#include #include #include @@ -45,6 +46,37 @@ class IREmitter { } void ResetWorkingList(); + // Predicate Cache Implementation + // This lives here rather than OpcodeDispatcher because x87StackOptimization Pass + // also needs it. 
+ struct PredicateKey { + ARMEmitter::PredicatePattern Pattern; + OpSize Size; + bool operator==(const PredicateKey& rhs) const = default; + }; + + struct PredicateKeyHash { + size_t operator()(const PredicateKey& key) const { + return FEXCore::ToUnderlying(key.Pattern) + (FEXCore::ToUnderlying(key.Size) * FEXCore::ToUnderlying(OpSize::iInvalid)); + } + }; + fextl::unordered_map InitPredicateCache; + + Ref InitPredicateCached(OpSize Size, ARMEmitter::PredicatePattern Pattern) { + PredicateKey Key {Pattern, Size}; + auto ValIt = InitPredicateCache.find(Key); + if (ValIt == InitPredicateCache.end()) { + auto Predicate = _InitPredicate(Size, static_cast(FEXCore::ToUnderlying(Pattern))); + InitPredicateCache[Key] = Predicate; + return Predicate; + } + return ValIt->second; + } + + void ResetInitPredicateCache() { + InitPredicateCache.clear(); + } + /** * @name IR allocation routines * diff --git a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp index 247b54633e..476d9f9eab 100644 --- a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp +++ b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp @@ -822,10 +822,11 @@ void X87StackOptimization::Run(IREmitter* Emit) { if (Op->StoreSize != OpSize::f80Bit) { // if it's not 80bits then convert StackNode = IREmit->_F80CVT(Op->StoreSize, StackNode); } - if (Op->StoreSize == OpSize::f80Bit) { // Part of code from StoreResult_WithOpSize() - if (Features.SupportsSVE128 || Features.SupportsSVE256) { - auto PReg = IREmit->_InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5)); - IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PReg, AddrNode); + if (Op->StoreSize == OpSize::f80Bit) { + Ref PredReg = CurrentIR.GetNode(Op->PredReg); + bool CanUsePredicateStore = (Features.SupportsSVE128 || Features.SupportsSVE256) && PredReg; + if (CanUsePredicateStore) { + IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PredReg, AddrNode); } else { // For X87 extended doubles, split before storing IREmit->_StoreMem(FPRClass, OpSize::i64Bit, AddrNode, StackNode); From 173e007140257af68018617d4d037dae65fd2427 Mon Sep 17 00:00:00 2001 From: Paulo Matos Date: Mon, 13 Jan 2025 15:31:02 +0100 Subject: [PATCH 2/3] instcountci: Ensure predicate cache is reset when control flow leaves block --- unittests/InstructionCountCI/X87ldst-SVE.json | 30 +++++-------------- 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/unittests/InstructionCountCI/X87ldst-SVE.json b/unittests/InstructionCountCI/X87ldst-SVE.json index d82b68d9b1..2050491143 100644 --- a/unittests/InstructionCountCI/X87ldst-SVE.json +++ b/unittests/InstructionCountCI/X87ldst-SVE.json @@ -17,10 +17,10 @@ "ExpectedInstructionCount": 13, "Comment": "Single 80-bit store.", "ExpectedArm64ASM": [ + "ptrue p2.h, vl5", "ldrb w20, [x28, #1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x4]", "ldrb w21, [x28, #1298]", "mov w22, #0x1", @@ -34,16 +34,16 @@ }, "2-store 80bit": { "x86InstructionCount": 2, - "ExpectedInstructionCount": 25, + "ExpectedInstructionCount": 24, "x86Insts": [ "fstp tword [rax]", "fstp tword [rax+10]" ], "ExpectedArm64ASM": [ + "ptrue p2.h, vl5", "ldrb w20, [x28, #1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x4]", "ldrb w21, [x28, #1298]", "mov w22, #0x1", @@ -56,7 +56,6 @@ "add x21, x4, #0xa (10)", "add x0, x28, 
x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w22, w22, w20", @@ -69,7 +68,7 @@ }, "8-store 80bit": { "x86InstructionCount": 8, - "ExpectedInstructionCount": 97, + "ExpectedInstructionCount": 90, "x86Insts": [ "fstp tword [rax]", "fstp tword [rax+10]", @@ -81,10 +80,10 @@ "fstp tword [rax+70]" ], "ExpectedArm64ASM": [ + "ptrue p2.h, vl5", "ldrb w20, [x28, #1019]", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x4]", "ldrb w21, [x28, #1298]", "mov w22, #0x1", @@ -97,7 +96,6 @@ "add x21, x4, #0xa (10)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w23, w22, w20", @@ -109,7 +107,6 @@ "add x21, x4, #0x14 (20)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w23, w22, w20", @@ -121,7 +118,6 @@ "add x21, x4, #0x1e (30)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w23, w22, w20", @@ -133,7 +129,6 @@ "add x21, x4, #0x28 (40)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w23, w22, w20", @@ -145,7 +140,6 @@ "add x21, x4, #0x32 (50)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w23, w22, w20", @@ -157,7 +151,6 @@ "add x21, x4, #0x3c (60)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w23, w22, w20", @@ -169,7 +162,6 @@ "add x21, x4, #0x46 (70)", "add x0, x28, x20, lsl #4", "ldr q2, [x0, #1040]", - "ptrue p2.h, vl5", "st1h {z2.h}, p2, [x21]", "ldrb w21, [x28, #1298]", "lsl w22, w22, w20", @@ -201,7 +193,7 @@ }, "2-load 80bit": { "x86InstructionCount": 2, - "ExpectedInstructionCount": 22, + "ExpectedInstructionCount": 21, "x86Insts": [ "fld tword [rax]", "fld tword [rax+10]" @@ -210,7 +202,6 @@ "ptrue p2.h, vl5", "ld1h {z2.h}, p2/z, [x4]", "add x20, x4, #0xa (10)", - "ptrue p2.h, vl5", "ld1h {z3.h}, p2/z, [x20]", "ldrb w20, [x28, #1019]", "sub w20, w20, #0x2 (2)", @@ -233,7 +224,7 @@ }, "8-load 80bit": { "x86InstructionCount": 8, - "ExpectedInstructionCount": 59, + "ExpectedInstructionCount": 52, "x86Insts": [ "fld tword [rax]", "fld tword [rax+10]", @@ -248,25 +239,18 @@ "ptrue p2.h, vl5", "ld1h {z2.h}, p2/z, [x4]", "add x20, x4, #0xa (10)", - "ptrue p2.h, vl5", "ld1h {z3.h}, p2/z, [x20]", "add x20, x4, #0x14 (20)", - "ptrue p2.h, vl5", "ld1h {z4.h}, p2/z, [x20]", "add x20, x4, #0x1e (30)", - "ptrue p2.h, vl5", "ld1h {z5.h}, p2/z, [x20]", "add x20, x4, #0x28 (40)", - "ptrue p2.h, vl5", "ld1h {z6.h}, p2/z, [x20]", "add x20, x4, #0x32 (50)", - "ptrue p2.h, vl5", "ld1h {z7.h}, p2/z, [x20]", "add x20, x4, #0x3c (60)", - "ptrue p2.h, vl5", "ld1h {z8.h}, p2/z, [x20]", "add x20, x4, #0x46 (70)", - "ptrue p2.h, vl5", "ld1h {z9.h}, p2/z, [x20]", "ldrb w20, [x28, #1019]", "sub w20, w20, #0x8 (8)", From 9aea249b5370b780e83d1bc68e753ea830fc4918 Mon Sep 17 00:00:00 2001 From: Paulo Matos Date: Thu, 16 Jan 2025 09:49:01 +0100 Subject: [PATCH 3/3] asm test: Ensure predicate cache is reset when control flow leaves block --- unittests/ASM/X87/MemcopyWithCPUID.asm | 36 ++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 unittests/ASM/X87/MemcopyWithCPUID.asm diff --git 
a/unittests/ASM/X87/MemcopyWithCPUID.asm b/unittests/ASM/X87/MemcopyWithCPUID.asm new file mode 100644 index 0000000000..62e7558747 --- /dev/null +++ b/unittests/ASM/X87/MemcopyWithCPUID.asm @@ -0,0 +1,36 @@ +%ifdef CONFIG +{ + "RegData": { + "RBX": "0x8000000000000000", + "RCX": "0x3fff" + } +} +%endif + +; Related to #4274 - ensures that if cpuid clobbers the predicate register, +; we reset the predicate cache. + +section .data +align 8 + +data: + dt 1.0 + +section .bss +align 8 + +data2: + resb 10 + +section .text +lea r8, [rel data] +fld tword [r8] + +mov rax, 0x0 +cpuid ; Will this instruction clobber the predicate register? + +fstp tword [rel data2] + +mov rbx, [rel data2] +mov rcx, [rel data2+8] +hlt
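
For reference, the caching scheme these patches add boils down to a small map from (predicate pattern, size) to the SSA ref of an already-emitted init op, cleared at the start of every block and whenever an op tagged MaybeClobbersPredRegs is emitted. Below is a minimal standalone sketch of that idea; the types (Ref, OpSize, PredicatePattern) and the Emitter struct are simplified stand-ins for illustration only, not FEXCore's real emitter API.

#include <cstddef>
#include <cstdint>
#include <unordered_map>

// Simplified stand-ins for the FEXCore types (assumption: not the real API).
enum class OpSize : uint8_t { i16Bit };
enum class PredicatePattern : uint8_t { SVE_VL5 };
using Ref = int; // stands in for an SSA node reference

struct PredicateKey {
  PredicatePattern Pattern;
  OpSize Size;
  bool operator==(const PredicateKey&) const = default; // needs C++20
};

struct PredicateKeyHash {
  size_t operator()(const PredicateKey& Key) const {
    // Any cheap combine works; this mirrors the idea, not the patch's exact hash.
    return (static_cast<size_t>(Key.Pattern) << 8) | static_cast<size_t>(Key.Size);
  }
};

struct Emitter {
  std::unordered_map<PredicateKey, Ref, PredicateKeyHash> InitPredicateCache;
  Ref NextRef = 0;

  // Stands in for emitting an InitPredicate (ptrue) op and returning its ref.
  Ref EmitInitPredicate(OpSize, PredicatePattern) { return NextRef++; }

  // Reuse a previously initialized predicate while it is known to still hold.
  Ref InitPredicateCached(OpSize Size, PredicatePattern Pattern) {
    PredicateKey Key {Pattern, Size};
    auto It = InitPredicateCache.find(Key);
    if (It != InitPredicateCache.end()) {
      return It->second;
    }
    Ref Predicate = EmitInitPredicate(Size, Pattern);
    InitPredicateCache[Key] = Predicate;
    return Predicate;
  }

  // Called at the start of every block and whenever an op flagged
  // MaybeClobbersPredRegs (CPUID, syscalls, x87 library fallbacks, ...) is emitted.
  void ResetInitPredicateCache() { InitPredicateCache.clear(); }
};

int main() {
  Emitter E;
  Ref A = E.InitPredicateCached(OpSize::i16Bit, PredicatePattern::SVE_VL5);
  Ref B = E.InitPredicateCached(OpSize::i16Bit, PredicatePattern::SVE_VL5); // cache hit, same ref
  E.ResetInitPredicateCache(); // e.g. after emitting CPUID
  Ref C = E.InitPredicateCached(OpSize::i16Bit, PredicatePattern::SVE_VL5); // re-emitted
  return (A == B && B != C) ? 0 : 1;
}

This is why the instcountci diffs above drop the repeated ptrue instructions within a block, and why the MemcopyWithCPUID test expects the fstp after cpuid to still store the correct 80-bit value.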