-
Notifications
You must be signed in to change notification settings - Fork 205
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Windows: ARM64/NEON Support #769
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -284,7 +284,11 @@ volk_32f_index_min_32u_neon(uint32_t* target, const float* source, uint32_t num_ | |
if (minValuesBuffer[number] < min) { | ||
index = minIndexesBuffer[number]; | ||
min = minValuesBuffer[number]; | ||
#ifdef _MSC_VER | ||
} else if (minValues.n128_f32[number] == min) { | ||
#else | ||
} else if (minValues[number] == min) { | ||
#endif | ||
Comment on lines
+287
to
+291
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems like smth very special. Could you add a comment with some context? Maybe a link that explains the necessary change? |
||
if (index > minIndexesBuffer[number]) | ||
index = minIndexesBuffer[number]; | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -229,10 +229,10 @@ static inline void volk_32fc_accumulator_s32fc_neon(lv_32fc_t* result, | |
lv_32fc_t returnValue = lv_cmake(0.f, 0.f); | ||
unsigned int eighthPoints = num_points / 8; | ||
float32x4_t in_vec; | ||
float32x4_t out_vec0 = { 0.f, 0.f, 0.f, 0.f }; | ||
float32x4_t out_vec1 = { 0.f, 0.f, 0.f, 0.f }; | ||
float32x4_t out_vec2 = { 0.f, 0.f, 0.f, 0.f }; | ||
float32x4_t out_vec3 = { 0.f, 0.f, 0.f, 0.f }; | ||
float32x4_t out_vec0 = { 0.f }; | ||
float32x4_t out_vec1 = { 0.f }; | ||
float32x4_t out_vec2 = { 0.f }; | ||
float32x4_t out_vec3 = { 0.f }; | ||
Comment on lines
231
to
+235
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I fear this affects initialization. Are we sure all values are initialized correctly for all compilers? |
||
__VOLK_ATTR_ALIGNED(32) float tempBuffer[4]; | ||
|
||
for (; number < eighthPoints; number++) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -236,7 +236,7 @@ static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector, | |
const float32x4_t max_val = vmovq_n_f32(max_val_f); | ||
float32x4_t ret1, ret2, a, b; | ||
|
||
int32x4_t toint_a = { 0, 0, 0, 0 }, toint_b = { 0, 0, 0, 0 }; | ||
int32x4_t toint_a = { 0 }, toint_b = { 0 }; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. While you're at it, could you change this to give every variable their own statement? |
||
int16x4_t intInputVal1, intInputVal2; | ||
int16x8_t res; | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -262,7 +262,9 @@ volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in, unsigned int num_ | |
const uint32_t* in_ptr = in; | ||
uint32_t* out_ptr = out; | ||
|
||
const uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }; | ||
uint8x16_t idx; | ||
const uint8_t idx_data[] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }; | ||
idx = vld1q_u8(idx_data); | ||
Comment on lines
-265
to
+267
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why? The data type implies 16 values of the same type. Does MSVC fail to perform the correct aggregate initialization? This indirection may negatively impact optimizations. |
||
|
||
const unsigned int quarterPoints = num_points / 4; | ||
unsigned int number = 0; | ||
|
@@ -290,8 +292,15 @@ volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in, unsigned int num_ | |
|
||
#ifdef LV_HAVE_NEON | ||
#include <arm_neon.h> | ||
|
||
#if defined(__aarch64__) | ||
#ifdef _MSC_VER | ||
#define DO_RBIT \ | ||
*out_ptr = _byteswap_ulong(*in_ptr); \ | ||
*out_ptr = ((*out_ptr & 0x55555555) << 1) | ((*out_ptr & 0xAAAAAAAA) >> 1); \ | ||
*out_ptr = ((*out_ptr & 0x33333333) << 2) | ((*out_ptr & 0xCCCCCCCC) >> 2); \ | ||
*out_ptr = ((*out_ptr & 0x0F0F0F0F) << 4) | ((*out_ptr & 0xF0F0F0F0) >> 4); \ | ||
in_ptr++; \ | ||
out_ptr++; | ||
#elif defined(__aarch64__) | ||
Comment on lines
-294
to
+303
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This section indicates that we already had special treatment for some conditions. Now, it looks like we exchange them for another set of conditions. This change would warrant a comment on the specifics of different platforms. This will help anyone looking at this in the future. |
||
#define DO_RBIT \ | ||
__VOLK_ASM("rbit %w[result], %w[value]" \ | ||
: [result] "=r"(*out_ptr) \ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -221,20 +221,24 @@ check_c_source_compiles( | |
|
||
if(neon_compile_result) | ||
set(CMAKE_REQUIRED_INCLUDES ${PROJECT_SOURCE_DIR}/include) | ||
if(MSVC) | ||
if(CMAKE_SYSTEM_PROCESSOR STREQUAL "ARM") | ||
overrule_arch(neonv8 "Compiler doesn't support neonv8") | ||
endif() | ||
Comment on lines
+224
to
+227
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 😢 |
||
else(MSVC) | ||
check_c_source_compiles( | ||
"#include <volk/volk_common.h>\n int main(){__VOLK_ASM(\"sub v1.4s,v1.4s,v1.4s\");}" | ||
have_neonv8_result) | ||
if(NOT have_neonv8_result) | ||
overrule_arch(neonv8 "Compiler doesn't support neonv8") | ||
endif() | ||
endif(MSVC) | ||
check_c_source_compiles( | ||
"#include <volk/volk_common.h>\n int main(){__VOLK_ASM(\"vrev32.8 q0, q0\");}" | ||
have_neonv7_result) | ||
check_c_source_compiles( | ||
"#include <volk/volk_common.h>\n int main(){__VOLK_ASM(\"sub v1.4s,v1.4s,v1.4s\");}" | ||
have_neonv8_result) | ||
|
||
if(NOT have_neonv7_result) | ||
overrule_arch(neonv7 "Compiler doesn't support neonv7") | ||
endif() | ||
|
||
if(NOT have_neonv8_result) | ||
overrule_arch(neonv8 "Compiler doesn't support neonv8") | ||
endif() | ||
else(neon_compile_result) | ||
Comment on lines
238
to
242
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why would you want to remove the non-MSVC checks here? Looking at this section makes me think, it might be a good time to unify this section. Just a thought. |
||
overrule_arch(neon "Compiler doesn't support NEON") | ||
overrule_arch(neonv7 "Compiler doesn't support NEON") | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I must admit I don't quite understand that; what type is
float32x4_t
on MSVC/aarch64/neon?