diff --git a/FEXCore/include/FEXCore/Debug/InternalThreadState.h b/FEXCore/include/FEXCore/Debug/InternalThreadState.h index 5eb185e54a..5c8bc424fb 100644 --- a/FEXCore/include/FEXCore/Debug/InternalThreadState.h +++ b/FEXCore/include/FEXCore/Debug/InternalThreadState.h @@ -99,6 +99,7 @@ struct InternalThreadState : public FEXCore::Allocator::FEXAllocOperators { std::shared_mutex ObjectCacheRefCounter {}; + // This pointer is owned by the frontend. FEXCore::Profiler::ThreadStats* ThreadStats {}; ///< Data pointer for exclusive use by the frontend diff --git a/FEXCore/include/FEXCore/Utils/Profiler.h b/FEXCore/include/FEXCore/Utils/Profiler.h index 3653dafc45..f9653f6ac8 100644 --- a/FEXCore/include/FEXCore/Utils/Profiler.h +++ b/FEXCore/include/FEXCore/Utils/Profiler.h @@ -46,7 +46,7 @@ struct ThreadStats { #ifdef _M_ARM_64 /** - * @brief Get the raw cycle counter which is synchronizing. + * @brief Get the raw cycle counter with synchronizing isb. * * `CNTVCTSS_EL0` also does the same thing, but requires the FEAT_ECV feature. 
*/ diff --git a/Source/Common/Profiler.cpp b/Source/Common/Profiler.cpp index 70001a7621..c5650bd7be 100644 --- a/Source/Common/Profiler.cpp +++ b/Source/Common/Profiler.cpp @@ -14,7 +14,7 @@ void StatAllocBase::SaveHeader(FEXCore::Profiler::AppType AppType) { Head->Size.store(CurrentSize, std::memory_order_relaxed); Head->Version = FEXCore::Profiler::STATS_VERSION; - constexpr std::array::length(GIT_DESCRIBE_STRING) + 1> GitString = {GIT_DESCRIBE_STRING}; + std::string_view GitString = GIT_DESCRIBE_STRING; strncpy(Head->fex_version, GitString.data(), std::min(GitString.size(), sizeof(Head->fex_version))); Head->app_type = AppType; @@ -26,7 +26,7 @@ void StatAllocBase::SaveHeader(FEXCore::Profiler::AppType AppType) { bool StatAllocBase::AllocateMoreSlots() { const auto OriginalSlotCount = TotalSlotsFromSize(); - uint64_t NewSize = AllocateMoreSlots(CurrentSize * 2); + uint64_t NewSize = FrontendAllocateSlots(CurrentSize * 2); if (NewSize == CurrentSize) { return false; @@ -39,7 +39,7 @@ bool StatAllocBase::AllocateMoreSlots() { return true; } -FEXCore::Profiler::ThreadStats* StatAllocBase::AllocateBaseSlot(uint32_t TID) { +FEXCore::Profiler::ThreadStats* StatAllocBase::AllocateSlot(uint32_t TID) { if (!RemainingSlots) { if (!AllocateMoreSlots()) { return nullptr; @@ -76,7 +76,7 @@ FEXCore::Profiler::ThreadStats* StatAllocBase::AllocateBaseSlot(uint32_t TID) { return AllocatedSlot; } -void StatAllocBase::DeallocateBaseSlot(FEXCore::Profiler::ThreadStats* AllocatedSlot) { +void StatAllocBase::DeallocateSlot(FEXCore::Profiler::ThreadStats* AllocatedSlot) { if (!AllocatedSlot) { return; } diff --git a/Source/Common/Profiler.h b/Source/Common/Profiler.h index 023678ce8b..c4063934a8 100644 --- a/Source/Common/Profiler.h +++ b/Source/Common/Profiler.h @@ -20,17 +20,15 @@ static inline void memory_barrier() { #else static inline void memory_barrier() { // Intentionally empty. + // x86 is strongly memory ordered with regular loadstores. No need for barrier. 
} #endif namespace FEX::Profiler { class StatAllocBase { -public: - virtual ~StatAllocBase() = default; - protected: - FEXCore::Profiler::ThreadStats* AllocateBaseSlot(uint32_t TID); - void DeallocateBaseSlot(FEXCore::Profiler::ThreadStats* AllocatedSlot); + FEXCore::Profiler::ThreadStats* AllocateSlot(uint32_t TID); + void DeallocateSlot(FEXCore::Profiler::ThreadStats* AllocatedSlot); uint32_t OffsetFromStat(FEXCore::Profiler::ThreadStats* Stat) const { return reinterpret_cast(Stat) - reinterpret_cast(Base); @@ -38,6 +36,10 @@ class StatAllocBase { size_t TotalSlotsFromSize() const { return (CurrentSize - sizeof(FEXCore::Profiler::ThreadStatsHeader)) / sizeof(FEXCore::Profiler::ThreadStats) - 1; } + size_t TotalSlotsFromSize(uint64_t Size) const { + return (Size - sizeof(FEXCore::Profiler::ThreadStatsHeader)) / sizeof(FEXCore::Profiler::ThreadStats) - 1; + } + size_t SlotIndexFromOffset(uint32_t Offset) { return (Offset - sizeof(FEXCore::Profiler::ThreadStatsHeader)) / sizeof(FEXCore::Profiler::ThreadStats); } @@ -60,7 +62,7 @@ class StatAllocBase { constexpr static size_t MAX_STATS_SIZE = 4 * 1024 * 1024; private: - virtual uint64_t AllocateMoreSlots(uint64_t NewSize) = 0; + virtual uint64_t FrontendAllocateSlots(uint64_t NewSize) = 0; bool AllocateMoreSlots(); }; diff --git a/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.cpp b/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.cpp index 3a1d156bc0..b3fc48af6f 100644 --- a/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.cpp +++ b/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.cpp @@ -37,8 +37,12 @@ void ThreadManager::StatAlloc::Initialize() { LogMan::Msg::EFmt("[StatAlloc] ftruncate failed"); goto err; } - // 128MB ought to be enough for anyone. + // Reserve a region of MAX_STATS_SIZE so we can grow the allocation buffer. 
+ // Number of thread slots when ThreadStatsHeader == 64bytes and ThreadStats == 40bytes: + // 1 page: 99 slots + // 1 MB: 26211 slots + // 128 MB: 3355440 slots Base = ::mmap(nullptr, MAX_STATS_SIZE, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0); if (Base == MAP_FAILED) { LogMan::Msg::EFmt("[StatAlloc] mmap base failed"); @@ -61,9 +65,10 @@ void ThreadManager::StatAlloc::Initialize() { close(fd); } -uint64_t ThreadManager::StatAlloc::AllocateMoreSlots(uint64_t NewSize) { +uint64_t ThreadManager::StatAlloc::FrontendAllocateSlots(uint64_t NewSize) { if (CurrentSize == MAX_STATS_SIZE) { - // Nope. + // Allocator has reached maximum slots. We can't allocate anymore. + // New threads won't get stats. return CurrentSize; } - NewSize = std::max(MAX_STATS_SIZE, NewSize); + NewSize = std::min(MAX_STATS_SIZE, NewSize); @@ -86,14 +91,6 @@ uint64_t ThreadManager::StatAlloc::AllocateMoreSlots(uint64_t NewSize) { LogMan::Msg::EFmt("[StatAlloc] allocate more mmap shm failed"); goto err; } - - // TODO: Just a sanity check. - const char* SharedTest = (const char*)Base; - for (size_t i = CurrentSize; i < NewSize; ++i) { - if (SharedTest[i] != 0) { - LogMan::Msg::EFmt("truncate and map shared resulted in not zero'd memory!"); - } - } } err: @@ -103,7 +100,7 @@ uint64_t ThreadManager::StatAlloc::AllocateMoreSlots(uint64_t NewSize) { FEXCore::Profiler::ThreadStats* ThreadManager::StatAlloc::AllocateSlot(uint32_t TID) { std::scoped_lock lk(StatMutex); - return AllocateBaseSlot(TID); + return StatAllocBase::AllocateSlot(TID); } void ThreadManager::StatAlloc::DeallocateSlot(FEXCore::Profiler::ThreadStats* AllocatedSlot) { @@ -112,7 +109,7 @@ void ThreadManager::StatAlloc::DeallocateSlot(FEXCore::Profiler::ThreadStats* Al } std::scoped_lock lk(StatMutex); - DeallocateBaseSlot(AllocatedSlot); + StatAllocBase::DeallocateSlot(AllocatedSlot); } void ThreadManager::StatAlloc::CleanupForExit() { @@ -138,8 +135,8 @@ void ThreadManager::StatAlloc::UnlockAfterFork(FEXCore::Core::InternalThreadStat 
StatMutex.StealAndDropActiveLocks(); - // shm_memory tied to this process is now not owned by this process. - // Replace the shm region! Otherwise this process will keep reporting time in the original parent thread's stats region! + // shm_memory ownership is retained by the parent process, so the child must replace it with its own one. + // Otherwise this process will keep reporting in the original parent thread's stats region. munmap(Base, MAX_STATS_SIZE); Base = nullptr; CurrentSize = 0; @@ -380,7 +377,7 @@ void ThreadManager::UnlockAfterFork(FEXCore::Core::InternalThreadState* LiveThre // This function is called after fork // We need to cleanup some of the thread data that is dead for (auto& DeadThread : Threads) { - // This is not owned by the child after fork. + // The fork parent retains ownership of ThreadStats DeadThread->Thread->ThreadStats = nullptr; if (DeadThread->Thread == LiveThread) { diff --git a/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.h b/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.h index 949088853e..570d4b4ecf 100644 --- a/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.h +++ b/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.h @@ -123,7 +123,7 @@ class ThreadManager final { private: void Initialize(); - uint64_t AllocateMoreSlots(uint64_t NewSize) override; + uint64_t FrontendAllocateSlots(uint64_t NewSize) override; FEX_CONFIG_OPT(ProfileStats, PROFILESTATS); FEX_CONFIG_OPT(Is64BitMode, IS64BIT_MODE); diff --git a/Source/Windows/Common/Profiler.cpp b/Source/Windows/Common/Profiler.cpp index 2dedde586e..8e0f546389 100644 --- a/Source/Windows/Common/Profiler.cpp +++ b/Source/Windows/Common/Profiler.cpp @@ -20,7 +20,7 @@ __attribute__((naked)) uint64_t linux_getpid() { : "r0", "r8"); } -uint64_t StatAlloc::AllocateMoreSlots(uint64_t NewSize) { +uint64_t StatAlloc::FrontendAllocateSlots(uint64_t NewSize) { LogMan::Msg::DFmt("Ran out of slots. 
Can't allocate more"); return CurrentSize; } diff --git a/Source/Windows/Common/Profiler.h b/Source/Windows/Common/Profiler.h index b12ad9631a..a3262cb648 100644 --- a/Source/Windows/Common/Profiler.h +++ b/Source/Windows/Common/Profiler.h @@ -10,7 +10,7 @@ class StatAlloc final : public FEX::Profiler::StatAllocBase { virtual ~StatAlloc(); FEXCore::Profiler::ThreadStats* AllocateSlot(uint32_t TID) { - return AllocateBaseSlot(TID); + return StatAllocBase::AllocateSlot(TID); } void DeallocateSlot(FEXCore::Profiler::ThreadStats* AllocatedSlot) { @@ -18,11 +18,11 @@ class StatAlloc final : public FEX::Profiler::StatAllocBase { return; } - DeallocateBaseSlot(AllocatedSlot); + StatAllocBase::DeallocateSlot(AllocatedSlot); } private: - uint64_t AllocateMoreSlots(uint64_t NewSize) override; + uint64_t FrontendAllocateSlots(uint64_t NewSize) override; }; } // namespace FEX::Windows