Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add simde #22

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,6 @@
[submodule "thirdparty/tiny-AES-c"]
path = thirdparty/tiny-AES-c
url = https://github.com/kokke/tiny-AES-c.git
[submodule "thirdparty/simde"]
path = thirdparty/simde
url = https://github.com/simd-everywhere/simde-no-tests.git
2 changes: 1 addition & 1 deletion XenonRecomp/pch.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@
#include <xbox.h>
#include <xxhash.h>
#include <fmt/core.h>
#include <xmmintrin.h>
#include <x86/sse.h>
178 changes: 89 additions & 89 deletions XenonRecomp/recompiler.cpp

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion XenonUtils/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@ target_compile_definitions(XenonUtils
)

target_include_directories(XenonUtils
PUBLIC
PUBLIC
.
"${THIRDPARTY_ROOT}/simde"
PRIVATE
"${THIRDPARTY_ROOT}/libmspack/libmspack/mspack"
"${THIRDPARTY_ROOT}/tiny-AES-c"
Expand Down
165 changes: 107 additions & 58 deletions XenonUtils/ppc_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@
#include <cstdlib>
#include <cstring>

#include <x86intrin.h>
#include <x86/avx.h>
#include <x86/sse.h>
#include <x86/sse4.1.h>

#ifdef _WIN32
#include <intrin.h>
#else
#include <xmmintrin.h>
#include <smmintrin.h>
// SSE3 constants are missing from simde
#ifndef _MM_DENORMALS_ZERO_MASK
#define _MM_DENORMALS_ZERO_MASK 0x0040
#endif

#define PPC_JOIN(x, y) x##y
Expand Down Expand Up @@ -175,18 +175,18 @@ struct PPCCRRegister
eq = !un && (left == right);
}

inline void setFromMask(__m128 mask, int imm) noexcept
inline void setFromMask(simde__m128 mask, int imm) noexcept
{
int m = _mm_movemask_ps(mask);
int m = simde_mm_movemask_ps(mask);
lt = m == imm; // all equal
gt = 0;
eq = m == 0; // none equal
so = 0;
}

inline void setFromMask(__m128i mask, int imm) noexcept
inline void setFromMask(simde__m128i mask, int imm) noexcept
{
int m = _mm_movemask_epi8(mask);
int m = simde_mm_movemask_epi8(mask);
lt = m == imm; // all equal
gt = 0;
eq = m == 0; // none equal
Expand Down Expand Up @@ -221,42 +221,79 @@ struct PPCFPSCRRegister
{
uint32_t csr;

static constexpr size_t GuestToHost[] = { _MM_ROUND_NEAREST, _MM_ROUND_TOWARD_ZERO, _MM_ROUND_UP, _MM_ROUND_DOWN };
static constexpr size_t HostToGuest[] = { PPC_ROUND_NEAREST, PPC_ROUND_DOWN, PPC_ROUND_UP, PPC_ROUND_TOWARD_ZERO };

// simde does not handle denormal flags, so we need to implement per-arch.
#if defined(__x86_64__) || defined(_M_X64)
static constexpr size_t RoundShift = 13;
static constexpr size_t RoundMask = SIMDE_MM_ROUND_MASK;
static constexpr size_t FlushMask = SIMDE_MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK;
static constexpr size_t GuestToHost[] = { SIMDE_MM_ROUND_NEAREST, SIMDE_MM_ROUND_TOWARD_ZERO, SIMDE_MM_ROUND_UP, SIMDE_MM_ROUND_DOWN };

inline uint32_t getcsr() noexcept
{
return simde_mm_getcsr();
}

inline void setcsr(uint32_t csr) noexcept
{
simde_mm_setcsr(csr);
}
#elif defined(__aarch64__) || defined(_M_ARM64)
// RMode
static constexpr size_t RoundShift = 22;
static constexpr size_t RoundMask = 3 << RoundShift;
// FZ and FZ16
static constexpr size_t FlushMask = (1 << 19) | (1 << 24);
// Nearest, Zero, -Infinity, -Infinity
static constexpr size_t GuestToHost[] = { 0 << RoundShift, 3 << RoundShift, 1 << RoundShift, 2 << RoundShift };

inline uint32_t getcsr() noexcept
{
uint64_t csr;
__asm__ __volatile__("mrs %0, fpcr" : "=r"(csr));
return csr;
}

inline void setcsr(uint32_t csr) noexcept
{
__asm__ __volatile__("msr fpcr, %0" : : "r"(csr));
}
#else
# error "Missing implementation for FPSCR."
#endif

inline uint32_t loadFromHost() noexcept
{
csr = _mm_getcsr();
return HostToGuest[(csr & _MM_ROUND_MASK) >> 13];
csr = getcsr();
return HostToGuest[(csr & RoundMask) >> RoundShift];
}

inline void storeFromGuest(uint32_t value) noexcept
{
csr &= ~_MM_ROUND_MASK;
csr &= ~RoundMask;
csr |= GuestToHost[value & PPC_ROUND_MASK];
_mm_setcsr(csr);
setcsr(csr);
}

static constexpr size_t FlushMask = _MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK;

inline void enableFlushModeUnconditional() noexcept
{
csr |= FlushMask;
_mm_setcsr(csr);
setcsr(csr);
}

inline void disableFlushModeUnconditional() noexcept
{
csr &= ~FlushMask;
_mm_setcsr(csr);
setcsr(csr);
}

inline void enableFlushMode() noexcept
{
if ((csr & FlushMask) != FlushMask) [[unlikely]]
{
csr |= FlushMask;
_mm_setcsr(csr);
setcsr(csr);
}
}

Expand All @@ -265,7 +302,7 @@ struct PPCFPSCRRegister
if ((csr & FlushMask) != 0) [[unlikely]]
{
csr &= ~FlushMask;
_mm_setcsr(csr);
setcsr(csr);
}
}
};
Expand Down Expand Up @@ -593,68 +630,80 @@ inline uint8_t VectorShiftTableR[] =
0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01,
};

inline __m128i _mm_adds_epu32(__m128i a, __m128i b)
inline simde__m128i simde_mm_adds_epu32(simde__m128i a, simde__m128i b)
{
return _mm_add_epi32(a, _mm_min_epu32(_mm_xor_si128(a, _mm_cmpeq_epi32(a, a)), b));
return simde_mm_add_epi32(a, simde_mm_min_epu32(simde_mm_xor_si128(a, simde_mm_cmpeq_epi32(a, a)), b));
}

inline __m128i _mm_avg_epi8(__m128i a, __m128i b)
inline simde__m128i simde_mm_avg_epi8(simde__m128i a, simde__m128i b)
{
__m128i c = _mm_set1_epi8(char(128));
return _mm_xor_si128(c, _mm_avg_epu8(_mm_xor_si128(c, a), _mm_xor_si128(c, b)));
simde__m128i c = simde_mm_set1_epi8(char(128));
return simde_mm_xor_si128(c, simde_mm_avg_epu8(simde_mm_xor_si128(c, a), simde_mm_xor_si128(c, b)));
}

inline __m128i _mm_avg_epi16(__m128i a, __m128i b)
inline simde__m128i simde_mm_avg_epi16(simde__m128i a, simde__m128i b)
{
__m128i c = _mm_set1_epi16(short(32768));
return _mm_xor_si128(c, _mm_avg_epu16(_mm_xor_si128(c, a), _mm_xor_si128(c, b)));
simde__m128i c = simde_mm_set1_epi16(short(32768));
return simde_mm_xor_si128(c, simde_mm_avg_epu16(simde_mm_xor_si128(c, a), simde_mm_xor_si128(c, b)));
}

inline __m128 _mm_cvtepu32_ps_(__m128i src1)
inline simde__m128 simde_mm_cvtepu32_ps_(simde__m128i src1)
{
__m128i xmm1 = _mm_add_epi32(src1, _mm_set1_epi32(127));
__m128i xmm0 = _mm_slli_epi32(src1, 31 - 8);
xmm0 = _mm_srli_epi32(xmm0, 31);
xmm0 = _mm_add_epi32(xmm0, xmm1);
xmm0 = _mm_srai_epi32(xmm0, 8);
xmm0 = _mm_add_epi32(xmm0, _mm_set1_epi32(0x4F800000));
__m128 xmm2 = _mm_cvtepi32_ps(src1);
return _mm_blendv_ps(xmm2, _mm_castsi128_ps(xmm0), _mm_castsi128_ps(src1));
simde__m128i xmm1 = simde_mm_add_epi32(src1, simde_mm_set1_epi32(127));
simde__m128i xmm0 = simde_mm_slli_epi32(src1, 31 - 8);
xmm0 = simde_mm_srli_epi32(xmm0, 31);
xmm0 = simde_mm_add_epi32(xmm0, xmm1);
xmm0 = simde_mm_srai_epi32(xmm0, 8);
xmm0 = simde_mm_add_epi32(xmm0, simde_mm_set1_epi32(0x4F800000));
simde__m128 xmm2 = simde_mm_cvtepi32_ps(src1);
return simde_mm_blendv_ps(xmm2, simde_mm_castsi128_ps(xmm0), simde_mm_castsi128_ps(src1));
}

inline __m128i _mm_perm_epi8_(__m128i a, __m128i b, __m128i c)
inline simde__m128i simde_mm_perm_epi8_(simde__m128i a, simde__m128i b, simde__m128i c)
{
__m128i d = _mm_set1_epi8(0xF);
__m128i e = _mm_sub_epi8(d, _mm_and_si128(c, d));
return _mm_blendv_epi8(_mm_shuffle_epi8(a, e), _mm_shuffle_epi8(b, e), _mm_slli_epi32(c, 3));
simde__m128i d = simde_mm_set1_epi8(0xF);
simde__m128i e = simde_mm_sub_epi8(d, simde_mm_and_si128(c, d));
return simde_mm_blendv_epi8(simde_mm_shuffle_epi8(a, e), simde_mm_shuffle_epi8(b, e), simde_mm_slli_epi32(c, 3));
}

inline __m128i _mm_cmpgt_epu8(__m128i a, __m128i b)
inline simde__m128i simde_mm_cmpgt_epu8(simde__m128i a, simde__m128i b)
{
__m128i c = _mm_set1_epi8(char(128));
return _mm_cmpgt_epi8(_mm_xor_si128(a, c), _mm_xor_si128(b, c));
simde__m128i c = simde_mm_set1_epi8(char(128));
return simde_mm_cmpgt_epi8(simde_mm_xor_si128(a, c), simde_mm_xor_si128(b, c));
}

inline __m128i _mm_cmpgt_epu16(__m128i a, __m128i b)
inline simde__m128i simde_mm_cmpgt_epu16(simde__m128i a, simde__m128i b)
{
__m128i c = _mm_set1_epi16(short(32768));
return _mm_cmpgt_epi16(_mm_xor_si128(a, c), _mm_xor_si128(b, c));
simde__m128i c = simde_mm_set1_epi16(short(32768));
return simde_mm_cmpgt_epi16(simde_mm_xor_si128(a, c), simde_mm_xor_si128(b, c));
}

inline __m128i _mm_vctsxs(__m128 src1)
inline simde__m128i simde_mm_vctsxs(simde__m128 src1)
{
__m128 xmm2 = _mm_cmpunord_ps(src1, src1);
__m128i xmm0 = _mm_cvttps_epi32(src1);
__m128i xmm1 = _mm_cmpeq_epi32(xmm0, _mm_set1_epi32(INT_MIN));
xmm1 = _mm_andnot_si128(_mm_castps_si128(src1), xmm1);
__m128 dest = _mm_blendv_ps(_mm_castsi128_ps(xmm0), _mm_castsi128_ps(_mm_set1_epi32(INT_MAX)), _mm_castsi128_ps(xmm1));
return _mm_andnot_si128(_mm_castps_si128(xmm2), _mm_castps_si128(dest));
simde__m128 xmm2 = simde_mm_cmpunord_ps(src1, src1);
simde__m128i xmm0 = simde_mm_cvttps_epi32(src1);
simde__m128i xmm1 = simde_mm_cmpeq_epi32(xmm0, simde_mm_set1_epi32(INT_MIN));
xmm1 = simde_mm_andnot_si128(simde_mm_castps_si128(src1), xmm1);
simde__m128 dest = simde_mm_blendv_ps(simde_mm_castsi128_ps(xmm0), simde_mm_castsi128_ps(simde_mm_set1_epi32(INT_MAX)), simde_mm_castsi128_ps(xmm1));
return simde_mm_andnot_si128(simde_mm_castps_si128(xmm2), simde_mm_castps_si128(dest));
}

inline __m128i _mm_vsr(__m128i a, __m128i b)
inline simde__m128i simde_mm_vsr(simde__m128i a, simde__m128i b)
{
b = _mm_srli_epi64(_mm_slli_epi64(b, 61), 61);
return _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(_mm_srl_epi64(a, b)), _mm_castsi128_ps(_mm_srl_epi64(_mm_srli_si128(a, 4), b)), 0x10));
b = simde_mm_srli_epi64(simde_mm_slli_epi64(b, 61), 61);
return simde_mm_castps_si128(simde_mm_insert_ps(simde_mm_castsi128_ps(simde_mm_srl_epi64(a, b)), simde_mm_castsi128_ps(simde_mm_srl_epi64(simde_mm_srli_si128(a, 4), b)), 0x10));
}

#if defined(__aarch64__) || defined(_M_ARM64)
inline uint64_t __rdtsc()
{
uint64_t ret;
asm volatile("mrs %0, cntvct_el0\n\t"
: "=r"(ret)::"memory");
return ret;
}
#elif !defined(__x86_64__) && !defined(_M_X64)
# error "Missing implementation for __rdtsc()"
#endif

#endif
1 change: 1 addition & 0 deletions thirdparty/simde
Submodule simde added at a532a1