Skip to content

Commit

Permalink
naive neon implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
MikkelSchubert committed Feb 4, 2025
1 parent 9635be5 commit f5288e0
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 0 deletions.
4 changes: 4 additions & 0 deletions src/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ config = configuration_data(
'HAVE_SSE2': '0',
'HAVE_AVX2': '0',
'HAVE_AVX512': '0',
'HAVE_NEON': '0',
},
)

Expand All @@ -40,13 +41,16 @@ elif target_machine.cpu_family() == 'x86_64'
else
warning('AVX512 support requires GCC>=11 or clang>=8')
endif
elif target_machine.cpu_family() == 'aarch64'
config.set('HAVE_NEON', '1')
endif

libsimd = []
libsimd_config = {
'HAVE_SSE2': ['sse2', 'simd_sse2.cpp', '-msse2'],
'HAVE_AVX2': ['avx2', 'simd_avx2.cpp', '-mavx2'],
'HAVE_AVX512': ['avx512', 'simd_avx512bw.cpp', '-mavx512bw'],
'HAVE_NEON': ['neon', 'simd_neon.cpp', []],
}

foreach key, simd_config : libsimd_config
Expand Down
27 changes: 27 additions & 0 deletions src/simd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,14 @@ compare_subsequences_avx512(size_t& n_mismatches,
size_t length,
size_t max_penalty);

bool
compare_subsequences_neon(size_t& n_mismatches,
size_t& n_ambiguous,
const char* seq_1,
const char* seq_2,
size_t length,
size_t max_penalty);

std::vector<instruction_set>
supported()
{
Expand All @@ -92,6 +100,10 @@ supported()
}
#endif

#if HAVE_NEON
choices.push_back(instruction_set::neon);
#endif

return choices;
}

Expand All @@ -107,6 +119,8 @@ name(instruction_set value)
return "AVX2";
case instruction_set::avx512:
return "AVX512";
case instruction_set::neon:
return "NEON";
default:
AR_FAIL("SIMD function not implemented!");
}
Expand Down Expand Up @@ -136,6 +150,13 @@ padding(instruction_set value)
#else
AR_FAIL("AVX512 not supported!");
#endif
case instruction_set::neon:
#if HAVE_NEON
return 15;
#else
AR_FAIL("NEON not supported!");
#endif

default:
AR_FAIL("SIMD function not implemented!");
}
Expand Down Expand Up @@ -164,6 +185,12 @@ get_compare_subsequences_func(instruction_set is)
return &compare_subsequences_avx512;
#else
AR_FAIL("AVX512 not supported!");
#endif
case instruction_set::neon:
#if HAVE_NEON
return &compare_subsequences_neon;
#else
AR_FAIL("NEON not supported!");
#endif
default:
AR_FAIL("SIMD function not implemented!");
Expand Down
1 change: 1 addition & 0 deletions src/simd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ enum class instruction_set
sse2,
avx2,
avx512,
neon,
};

/** Returns vector of supports instruction sets */
Expand Down
84 changes: 84 additions & 0 deletions src/simd_neon.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/*************************************************************************\
* AdapterRemoval - cleaning next-generation sequencing reads *
* *
* Copyright (C) 2022 by Mikkel Schubert - [email protected] *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
\*************************************************************************/
// #include "simd.hpp" // declarations
#include <algorithm> // for min
#include <arm_neon.h> // for vdupq_n_u8, vld1q_u8, vorrq_u8, ...
#include <cstddef> // for size_t

namespace adapterremoval {

namespace simd {

namespace {

/** Counts the number of masked bytes **/
inline auto
count_masked_neon(uint8x16_t value)
{
// Extract one bit per uint8_t and sum across vector
return vaddvq_u8(vandq_u8(vdupq_n_u8(0x01), value));
}

} // namespace

bool
compare_subsequences_neon(size_t& n_mismatches,
size_t& n_ambiguous,
const char* seq_1,
const char* seq_2,
size_t length,
size_t max_penalty)
{
//! Mask of all Ns
const auto n_mask = vdupq_n_u8('N');

// The sequence is assumed to be padded with 15 'N's not included in `length`.
while (length) {
auto s1 = vld1q_u8(reinterpret_cast<const uint8_t*>(seq_1));
auto s2 = vld1q_u8(reinterpret_cast<const uint8_t*>(seq_2));

// Sets 0xFF for every byte where one or both nts is N
const auto ns_mask = vorrq_u8(vceqq_u8(s1, n_mask), vceqq_u8(s2, n_mask));

// Sets 0xFF for every byte where bytes are equal or N
const auto eq_mask = vorrq_u8(vceqq_u8(s1, s2), ns_mask);

n_mismatches += 16 - count_masked_neon(eq_mask);
if (2 * n_mismatches + n_ambiguous > max_penalty) {
return false;
}

// Fragment length without 'N' padding
size_t unpadded_length = std::min<size_t>(16, length);

// Early termination is almost always due to mismatches, so updating the
// number of Ns after the above check saves time in the common case.
n_ambiguous += count_masked_neon(ns_mask) - (16 - unpadded_length);

seq_1 += 16;
seq_2 += 16;
length -= unpadded_length;
}

return true;
}

} // namespace simd

} // namespace adapterremoval

0 comments on commit f5288e0

Please sign in to comment.