diff --git a/arch/x86/chunkset_avx2.c b/arch/x86/chunkset_avx2.c index b9051bb90e..c7f336fde7 100644 --- a/arch/x86/chunkset_avx2.c +++ b/arch/x86/chunkset_avx2.c @@ -32,7 +32,13 @@ static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { } static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) { + /* See explanation in chunkset_avx512.c */ +#if defined(_MSC_VER) && _MSC_VER <= 1900 + halfchunk_t half = _mm_loadu_si128((__m128i*)from); + *chunk = _mm256_inserti128_si256(_mm256_castsi128_si256(half), half, 1); +#else *chunk = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)from)); +#endif } static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { diff --git a/arch/x86/chunkset_avx512.c b/arch/x86/chunkset_avx512.c index 929d04cdd9..9d28d33d5e 100644 --- a/arch/x86/chunkset_avx512.c +++ b/arch/x86/chunkset_avx512.c @@ -46,7 +46,17 @@ static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { } static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) { + /* Unfortunately there seems to be a compiler bug in Visual Studio 2015 where + * the load is dumped to the stack with an aligned move for this memory-register + * broadcast. The vbroadcasti128 instruction is 2 fewer cycles and this dump to + * stack doesn't exist if compiled with optimizations. For the sake of working + * properly in a debugger, let's take the 2 cycle penalty */ +#if defined(_MSC_VER) && _MSC_VER <= 1900 + halfchunk_t half = _mm_loadu_si128((__m128i*)from); + *chunk = _mm256_inserti128_si256(_mm256_castsi128_si256(half), half, 1); +#else *chunk = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)from)); +#endif } static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {