From db49d4e84209696d720b90bfb8f6cdd92b19cdbc Mon Sep 17 00:00:00 2001 From: Paulo Matos Date: Tue, 2 Apr 2024 15:03:34 +0200 Subject: [PATCH] D8_00 passes Generating prologue and epilogue properly. --- .../IR/Passes/x87StackOptimizationPass.cpp | 316 +++++++++++++----- 1 file changed, 235 insertions(+), 81 deletions(-) diff --git a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp index a2713c12b3..88773f90f6 100644 --- a/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp +++ b/FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp @@ -1,7 +1,6 @@ #include "Interface/IR/IREmitter.h" #include "Interface/IR/PassManager.h" #include -#include #include #include @@ -10,16 +9,126 @@ namespace FEXCore::IR { +template +class CircularBuffer { +private: + using StorageType = typename std::aligned_storage::type; + fextl::vector buffer; + fextl::vector constructed; // TODO(pmatos): We probably can use something better here + int index; // Current insertion index + +public: + CircularBuffer(std::size_t size) + : buffer(size) + , constructed(size, false) + , index(0) {} + + ~CircularBuffer() { + for (std::size_t i = 0; i < size(); ++i) { + if (constructed[i]) { + reinterpret_cast(&buffer[i])->~T(); + } + } + } + + template + void push(Args&&... args) { + index = (index - 1 + size()) % size(); + std::size_t pos = index; + if (constructed[pos]) { + reinterpret_cast(&buffer[pos])->~T(); + } + LogMan::Msg::DFmt("Push to {}\n", index); + new (&buffer[pos]) T(std::forward(args)...); + constructed[pos] = true; + } + + template + void setTop(Args&&... args) { + std::size_t pos = index; + if (constructed[pos]) { + reinterpret_cast(&buffer[pos])->~T(); + } + LogMan::Msg::DFmt("SetTop to {}\n", index); + new (&buffer[pos]) T(std::forward(args)...); + constructed[pos] = true; + } + + void pop() { + if (!constructed.empty() && constructed[index]) { + LogMan::Msg::DFmt("Pop\n"); + std::size_t popIndex = (index + 1) % size(); + reinterpret_cast(&buffer[popIndex])->~T(); + constructed[popIndex] = false; + index = popIndex; + } + } + + T& top() { + LogMan::Msg::DFmt("Top\n"); + std::size_t pos = index; + return *reinterpret_cast(&buffer[pos]); + } + + const T& top(size_t offset = 0) const { + size_t pos = index; + return *reinterpret_cast(&buffer[(pos + offset) % size()]); + } + + inline size_t count() const { + size_t sz = 0; + for (size_t i = 0; i < constructed.size(); ++i) { + if (constructed[i]) { + sz++; + } + } + LogMan::Msg::DFmt("Count: {}\n", sz); + return sz; + } + + inline size_t size() const { + return constructed.size(); + } + + inline T& operator[](size_t i) { + return *reinterpret_cast(&buffer[i]); + } + + inline const T& operator[](size_t i) const { + return *reinterpret_cast(&buffer[i]); + } + + inline void clear() { + for (std::size_t i = 0; i < size(); ++i) { + if (constructed[i]) { + reinterpret_cast(&buffer[i])->~T(); + constructed[i] = false; + } + } + index = 0; + } + + inline bool valid(size_t i) const { + return constructed[i]; + } +}; + class X87StackOptimization final : public FEXCore::IR::Pass { public: - bool Run(IREmitter *IREmit) override; + bool Run(IREmitter* IREmit) override; private: - // FIXME: copy from OpcodeDispatcher.h - [[nodiscard]] uint32_t MMBaseOffset() { + // FIXME(pmatos): copy from OpcodeDispatcher.h + [[nodiscard]] + uint32_t MMBaseOffset() { return static_cast(offsetof(Core::CPUState, mm[0][0])); } + // Top Management Helpers + OrderedNode* GetX87Top(IREmitter* IREmit); + void SetX87Top(IREmitter* IREmit, OrderedNode* Value); + void SetX87ValidTag(IREmitter* IREmit, OrderedNode* Value, bool Valid); + struct StackMemberInfo { IR::OpSize SourceDataSize; // Size of SourceDataNode IR::OpSize StackDataSize; // Size of the loaded data (??? FIXME) @@ -27,18 +136,37 @@ class X87StackOptimization final : public FEXCore::IR::Pass { IR::OrderedNode* SourceDataNode; // Reference to the value pushed to stack IR::OrderedNode* DataLoadNode; // Reference to the IR node that loaded the data bool InterpretAsFloat {}; // True if this is a floating point value, false if integer + + }; - fextl::vector StackData{8}; + // Index on vector is offset to top value at start of block + CircularBuffer StackData {8}; }; -bool X87StackOptimization::Run(IREmitter *IREmit) { +OrderedNode* X87StackOptimization::GetX87Top(IREmitter* IREmit) { + return IREmit->_LoadContext(1, GPRClass, offsetof(FEXCore::Core::CPUState, flags) + FEXCore::X86State::X87FLAG_TOP_LOC); +} + +void X87StackOptimization::SetX87Top(IREmitter* IREmit, OrderedNode* Value) { + IREmit->_StoreContext(1, GPRClass, Value, offsetof(FEXCore::Core::CPUState, flags) + FEXCore::X86State::X87FLAG_TOP_LOC); +} + +void X87StackOptimization::SetX87ValidTag(IREmitter* IREmit, OrderedNode* Value, bool Valid) { + // if we are popping then we must first mark this location as empty + OrderedNode* AbridgedFTW = IREmit->_LoadContext(1, GPRClass, offsetof(FEXCore::Core::CPUState, AbridgedFTW)); + OrderedNode* RegMask = IREmit->_Lshl(OpSize::i32Bit, IREmit->_Constant(1), Value); + OrderedNode* NewAbridgedFTW = Valid ? IREmit->_Or(OpSize::i32Bit, AbridgedFTW, RegMask) : IREmit->_Andn(OpSize::i32Bit, AbridgedFTW, RegMask); + IREmit->_StoreContext(1, GPRClass, NewAbridgedFTW, offsetof(FEXCore::Core::CPUState, AbridgedFTW)); +} + +bool X87StackOptimization::Run(IREmitter* IREmit) { FEXCORE_PROFILE_SCOPED("PassManager::x87StackOpt"); bool Changed = false; auto CurrentIR = IREmit->ViewIR(); - auto *OriginalWriteCursor = IREmit->GetWriteCursor(); + auto* OriginalWriteCursor = IREmit->GetWriteCursor(); - auto *HeaderOp = CurrentIR.GetHeader(); + auto* HeaderOp = CurrentIR.GetHeader(); LOGMAN_THROW_AA_FMT(HeaderOp->Header.Op == OP_IRHEADER, "First op wasn't IRHeader"); if (!HeaderOp->HasX87) { @@ -54,86 +182,102 @@ bool X87StackOptimization::Run(IREmitter *IREmit) { // through the x87 tag register. // TODO(pmatos) + // Get beginning of block + // FIXME(pmatos): there must be a better way to do this. + auto [BlockBegin, BlockBeginHeader] = *CurrentIR.GetBlocks().begin(); + auto [CodeBegin, IROpBegin] = *CurrentIR.GetCode(BlockBegin).begin(); + + // Get Top at beginning of block + IREmit->SetWriteCursor(CodeBegin); + auto* orig_top = GetX87Top(IREmit); + // Run optimization proper for (auto [BlockNode, BlockHeader] : CurrentIR.GetBlocks()) { for (auto [CodeNode, IROp] : CurrentIR.GetCode(BlockNode)) { switch (IROp->Op) { - case IR::OP_PUSHSTACK: { - LogMan::Msg::DFmt("OP_PUSHSTACK\n"); - const auto *Op = IROp->C(); - auto SourceNodeID = Op->X80Src.ID(); - auto *SourceNode = CurrentIR.GetNode(Op->X80Src); - auto *SourceNodeOp = CurrentIR.GetOp(SourceNode); - auto SourceNodeSize = SourceNodeOp->Size; - StackData.emplace_back(StackMemberInfo { - .SourceDataSize = IR::SizeToOpSize(SourceNodeSize), - .StackDataSize = IR::SizeToOpSize(Op->LoadSize), - .SourceDataNodeID = SourceNodeID, - .SourceDataNode = SourceNode, - .DataLoadNode = CodeNode, - .InterpretAsFloat = Op->Float, - }); - - LogMan::Msg::DFmt("Stack depth at: {}", StackData.size()); - IREmit->SetWriteCursor(CodeNode); - IREmit->Remove(CodeNode); // Remove PushStack - it's a nop, we just need to track the stack - Changed = true; - break; + case IR::OP_PUSHSTACK: { + LogMan::Msg::DFmt("OP_PUSHSTACK\n"); + const auto* Op = IROp->C(); + auto SourceNodeID = Op->X80Src.ID(); + auto* SourceNode = CurrentIR.GetNode(Op->X80Src); + auto* SourceNodeOp = CurrentIR.GetOp(SourceNode); + auto SourceNodeSize = SourceNodeOp->Size; + StackData.push(StackMemberInfo { + .SourceDataSize = IR::SizeToOpSize(SourceNodeSize), + .StackDataSize = IR::SizeToOpSize(Op->LoadSize), + .SourceDataNodeID = SourceNodeID, + .SourceDataNode = SourceNode, + .DataLoadNode = CodeNode, + .InterpretAsFloat = Op->Float, + }); + + LogMan::Msg::DFmt("Stack depth at: {}", StackData.count()); + IREmit->SetWriteCursor(CodeNode); + IREmit->Remove(CodeNode); // Remove PushStack - it's a nop, we just need to track the stack + Changed = true; + break; + } + case IR::OP_POPSTACKMEMORY: { + LogMan::Msg::DFmt("OP_POPSTACKMEMORY\n"); + const auto* Op = IROp->C(); + const auto& StackMember = StackData.top(); + if (Op->Float == StackMember.InterpretAsFloat && Op->StoreSize == StackMember.StackDataSize && Op->StoreSize == StackMember.SourceDataSize) { + LogMan::Msg::DFmt("Could optimize memcpy!"); } - case IR::OP_POPSTACKMEMORY: { - LogMan::Msg::DFmt("OP_POPSTACKMEMORY\n"); - const auto *Op = IROp->C(); - const auto& StackMember = StackData.back(); - if (Op->Float == StackMember.InterpretAsFloat && - Op->StoreSize == StackMember.StackDataSize && - Op->StoreSize == StackMember.SourceDataSize) { - LogMan::Msg::DFmt("Could optimize memcpy!"); - } - IREmit->SetWriteCursor(CodeNode); + IREmit->SetWriteCursor(CodeNode); - auto *AddrNode = CurrentIR.GetNode(Op->Addr); - if (StackMember.SourceDataSize == OpSize::i128Bit) { - IREmit->_StoreMem(FPRClass, OpSize::i64Bit, AddrNode, StackMember.SourceDataNode, 1); - auto NewLocation = IREmit->_Add(OpSize::i64Bit, AddrNode, IREmit->_Constant(8)); - IREmit->_VStoreVectorElement(OpSize::i128Bit, OpSize::i16Bit, StackMember.SourceDataNode, 4, NewLocation); - } - else { - IREmit->_StoreMem(FPRClass, StackMember.SourceDataSize, AddrNode, StackMember.SourceDataNode, 1); - } + auto* AddrNode = CurrentIR.GetNode(Op->Addr); + if (StackMember.SourceDataSize == OpSize::i128Bit) { + IREmit->_StoreMem(FPRClass, OpSize::i64Bit, AddrNode, StackMember.SourceDataNode, 1); + auto NewLocation = IREmit->_Add(OpSize::i64Bit, AddrNode, IREmit->_Constant(8)); + IREmit->_VStoreVectorElement(OpSize::i128Bit, OpSize::i16Bit, StackMember.SourceDataNode, 4, NewLocation); + } else { + IREmit->_StoreMem(FPRClass, StackMember.SourceDataSize, AddrNode, StackMember.SourceDataNode, 1); + } - IREmit->Remove(StackMember.DataLoadNode); - IREmit->Remove(CodeNode); - Changed = true; + IREmit->Remove(StackMember.DataLoadNode); + IREmit->Remove(CodeNode); + Changed = true; - StackData.pop_back(); - LogMan::Msg::DFmt("Stack depth at: {}", StackData.size()); - break; - } - case IR::OP_F80ADDSTACK: { - LogMan::Msg::DFmt("OP_F80ADDSTACK\n"); - const auto* Op = IROp->C(); - (void)Op; // avoid warning for now - LogMan::Msg::DFmt("Stack depth at: {}", StackData.size()); - break; - } - case IR::OP_F80ADDVALUE: { - LogMan::Msg::DFmt("F80ADDVALUE\n"); - const auto* Op = IROp->C(); - auto* ValueNode = CurrentIR.GetNode(Op->X80Src); - - auto StackOffset = Op->SrcStack1; - const auto& StackMember = StackData[StackData.size() - StackOffset - 1]; - auto* StackNode = StackMember.SourceDataNode; - - IREmit->SetWriteCursor(CodeNode); - IREmit->_F80Add(ValueNode, StackNode); - IREmit->Remove(CodeNode); - Changed = true; - LogMan::Msg::DFmt("Stack depth at: {}", StackData.size()); - break; - } - default: break; + StackData.pop(); + LogMan::Msg::DFmt("Stack depth at: {}", StackData.count()); + break; + } + case IR::OP_F80ADDSTACK: { + LogMan::Msg::DFmt("OP_F80ADDSTACK\n"); + const auto* Op = IROp->C(); + (void)Op; // avoid warning for now + LogMan::Msg::DFmt("Stack depth at: {}", StackData.count()); + break; + } + case IR::OP_F80ADDVALUE: { + LogMan::Msg::DFmt("F80ADDVALUE\n"); + const auto* Op = IROp->C(); + auto SourceNodeID = Op->X80Src.ID(); + auto* ValueNode = CurrentIR.GetNode(Op->X80Src); + + auto StackOffset = Op->SrcStack1; + const auto& StackMember = StackData.top(StackOffset); + auto* StackNode = StackMember.SourceDataNode; + + IREmit->SetWriteCursor(CodeNode); + + auto AddNode = IREmit->_F80Add(ValueNode, StackNode); + // Store it in the stack + StackData.setTop(StackMemberInfo {.SourceDataSize = StackMember.SourceDataSize, + .StackDataSize = StackMember.StackDataSize, + .SourceDataNodeID = SourceNodeID, + .SourceDataNode = AddNode, + .DataLoadNode = CodeNode, + .InterpretAsFloat = StackMember.InterpretAsFloat}); + + IREmit->Remove(CodeNode); + Changed = true; + LogMan::Msg::DFmt("Stack depth at: {}", StackData.count()); + break; + } + default: break; } } } @@ -155,13 +299,23 @@ bool X87StackOptimization::Run(IREmitter *IREmit) { // context so that the values are correct. Copy SourceDataNode in the // stack to the respective mmX register. for (size_t i = 0; i < StackData.size(); ++i) { + if (!StackData.valid(i)) { + continue; + } LogMan::Msg::DFmt("Writing stack member {} to context", i); Changed = true; auto &StackMember = StackData[i]; auto *Node = StackMember.SourceDataNode; - IREmit->_StoreContextIndexed(Node, IREmit->_Constant(i), 16, - MMBaseOffset(), 16, FPRClass); + IREmit->_StoreContextIndexed(Node, IREmit->_Add(OpSize::i32Bit, orig_top, IREmit->_Constant(i)), 16, MMBaseOffset(), 16, FPRClass); } + + // Store new top which is now the original top - the number of elements in stack. + // Careful with underflow wraparound. + auto mask = IREmit->_Constant(0x7); + auto new_top = IREmit->_And(OpSize::i32Bit, IREmit->_Sub(OpSize::i32Bit, orig_top, IREmit->_Constant(1)), mask); + SetX87ValidTag(IREmit, new_top, true); + SetX87Top(IREmit, new_top); + break; } }