diff --git a/README.md b/README.md index b7862cf..a8ae16b 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,6 @@ due to the fact that a memory allocator has to rely on inherently unsafe operati The codebase is extremely compact (<500 LoC) and is therefore trivial to validate. The allocator is designed to be portable across all conventional architectures, from 8-bit to 64-bit systems. -Multi-threaded environments are supported with the help of external synchronization hooks provided by the application. ## Design @@ -199,6 +198,18 @@ If not specified, the macro expands as follows: For example, for GCC, Clang, and ARM Compiler, it expands into `__builtin_expect((x), 1)`. - For other (unknown) compilers it expands into the original expression with no modifications: `(x)`. +#### O1HEAP_CLZ(x) + +The count leading zeros (CLZ) function is used for fast binary logarithm computation (which has to be done +multiple times per allocation, so its performance is critical). +Most of the modern processors implement dedicated hardware support for fast CLZ computation, +which is available via compiler intrinsics. + +If not overridden by the user, for some compilers `O1HEAP_CLZ(x)` will expand to the appropriate intrinsic +(e.g., `__builtin_clzl(x)` for GCC/Clang). +For other compilers it will default to a slow software implementation, +which is likely to significantly degrade the performance of the library. + ## Development ### Dependencies @@ -257,6 +268,13 @@ An exception applies for the case of false-positive (invalid) warnings -- those ## Changelog +### v2.1 + +- Significantly accelerate (de-)allocation by replacing the naïve log2 implementation with fast CLZ intrinsics; + see `O1HEAP_CLZ(x)`. +- Do not require char to be 8-bit wide: replace `uint8_t` with `uint_fast8_t`. + This is to enhance compatibility with odd embedded platforms where `CHAR_BIT!=8` (e.g., ADSP TS-201, TMS320C2804). + ### v2.0 - Remove critical section hooks to enhance MISRA conformance [#4](https://github.com/pavel-kirienko/o1heap/issues/4) diff --git a/o1heap/o1heap.c b/o1heap/o1heap.c index 37b5f7e..61e6e39 100644 --- a/o1heap/o1heap.c +++ b/o1heap/o1heap.c @@ -16,6 +16,7 @@ #include "o1heap.h" #include +#include // ---------------------------------------- BUILD CONFIGURATION OPTIONS ---------------------------------------- @@ -32,21 +33,52 @@ # define O1HEAP_ASSERT(x) assert(x) // NOSONAR #endif +/// Allow usage of compiler intrinsics for branch annotation and CLZ. +#ifndef O1HEAP_USE_INTRINSICS +# define O1HEAP_USE_INTRINSICS 1 +#endif + /// Branch probability annotations are used to improve the worst case execution time (WCET). They are entirely optional. -#ifndef O1HEAP_LIKELY +#if O1HEAP_USE_INTRINSICS && !defined(O1HEAP_LIKELY) # if defined(__GNUC__) || defined(__clang__) || defined(__CC_ARM) // Intentional violation of MISRA: branch hinting macro cannot be replaced with a function definition. # define O1HEAP_LIKELY(x) __builtin_expect((x), 1) // NOSONAR -# else -# define O1HEAP_LIKELY(x) x # endif #endif +#ifndef O1HEAP_LIKELY +# define O1HEAP_LIKELY(x) x +#endif /// This option is used for testing only. Do not use in production. #ifndef O1HEAP_PRIVATE # define O1HEAP_PRIVATE static inline #endif +/// Count leading zeros (CLZ) is used for fast computation of binary logarithm (which needs to be done very often). +/// Most of the modern processors (including the embedded ones) implement dedicated hardware support for fast CLZ +/// computation, which is available via compiler intrinsics. The default implementation will automatically use +/// the intrinsics for some of the compilers; for others it will default to the slow software emulation, +/// which can be overridden by the user via O1HEAP_CONFIG_HEADER. The library guarantees that the argument is positive. +#if O1HEAP_USE_INTRINSICS && !defined(O1HEAP_CLZ) +# if defined(__GNUC__) || defined(__clang__) || defined(__CC_ARM) +# define O1HEAP_CLZ __builtin_clzl +# endif +#endif +#ifndef O1HEAP_CLZ +O1HEAP_PRIVATE uint_fast8_t O1HEAP_CLZ(const size_t x) +{ + O1HEAP_ASSERT(x > 0); + size_t t = ((size_t) 1U) << ((sizeof(size_t) * CHAR_BIT) - 1U); + uint_fast8_t r = 0; + while ((x & t) == 0) + { + t >>= 1U; + r++; + } + return r; +} +#endif + // ---------------------------------------- INTERNAL DEFINITIONS ---------------------------------------- #if !defined(__STDC_VERSION__) || (__STDC_VERSION__ < 199901L) @@ -72,7 +104,7 @@ /// Normally we should subtract log2(FRAGMENT_SIZE_MIN) but log2 is bulky to compute using the preprocessor only. /// We will certainly end up with unused bins this way, but it is cheap to ignore. -#define NUM_BINS_MAX (sizeof(size_t) * 8U) +#define NUM_BINS_MAX (sizeof(size_t) * CHAR_BIT) static_assert((O1HEAP_ALIGNMENT & (O1HEAP_ALIGNMENT - 1U)) == 0U, "Not a power of 2"); static_assert((FRAGMENT_SIZE_MIN & (FRAGMENT_SIZE_MIN - 1U)) == 0U, "Not a power of 2"); @@ -113,41 +145,37 @@ struct O1HeapInstance static_assert(INSTANCE_SIZE_PADDED >= sizeof(O1HeapInstance), "Invalid instance footprint computation"); static_assert((INSTANCE_SIZE_PADDED % O1HEAP_ALIGNMENT) == 0U, "Invalid instance footprint computation"); -/// True if the argument is an integer power of two or zero. -O1HEAP_PRIVATE bool isPowerOf2(const size_t x) +/// Undefined for zero argument. +O1HEAP_PRIVATE uint_fast8_t log2Floor(const size_t x) { - return (x & (x - 1U)) == 0U; + O1HEAP_ASSERT(x > 0); + // NOLINTNEXTLINE redundant cast to the same type. + return (uint_fast8_t) (((sizeof(x) * CHAR_BIT) - 1U) - ((uint_fast8_t) O1HEAP_CLZ(x))); } /// Special case: if the argument is zero, returns zero. -O1HEAP_PRIVATE uint8_t log2Floor(const size_t x) +O1HEAP_PRIVATE uint_fast8_t log2Ceil(const size_t x) { - size_t tmp = x; - uint8_t y = 0; - // This is currently the only exception to the statement "routines contain neither loops nor recursion". - // It is unclear if there is a better way to compute the binary logarithm than this. - while (tmp > 1U) - { - tmp >>= 1U; - y++; - } - return y; -} - -/// Special case: if the argument is zero, returns zero. -O1HEAP_PRIVATE uint8_t log2Ceil(const size_t x) -{ - return (uint8_t) (log2Floor(x) + (isPowerOf2(x) ? 0U : 1U)); + // NOLINTNEXTLINE redundant cast to the same type. + return (x <= 1U) ? 0U : (uint_fast8_t) ((sizeof(x) * CHAR_BIT) - ((uint_fast8_t) O1HEAP_CLZ(x - 1U))); } /// Raise 2 into the specified power. /// You might be tempted to do something like (1U << power). WRONG! We humans are prone to forgetting things. /// If you forget to cast your 1U to size_t or ULL, you may end up with undefined behavior. -O1HEAP_PRIVATE size_t pow2(const uint8_t power) +O1HEAP_PRIVATE size_t pow2(const uint_fast8_t power) { return ((size_t) 1U) << power; } +/// This is equivalent to pow2(log2Ceil(x)). Undefined for x<2. +O1HEAP_PRIVATE size_t roundUpToPowerOf2(const size_t x) +{ + O1HEAP_ASSERT(x >= 2U); + // NOLINTNEXTLINE redundant cast to the same type. + return ((size_t) 1U) << ((sizeof(x) * CHAR_BIT) - ((uint_fast8_t) O1HEAP_CLZ(x - 1U))); +} + /// Links two fragments so that their next/prev pointers point to each other; left goes before right. O1HEAP_PRIVATE void interlink(Fragment* const left, Fragment* const right) { @@ -168,7 +196,7 @@ O1HEAP_PRIVATE void rebin(O1HeapInstance* const handle, Fragment* const fragment O1HEAP_ASSERT(fragment != NULL); O1HEAP_ASSERT(fragment->header.size >= FRAGMENT_SIZE_MIN); O1HEAP_ASSERT((fragment->header.size % FRAGMENT_SIZE_MIN) == 0U); - const uint8_t idx = log2Floor(fragment->header.size / FRAGMENT_SIZE_MIN); // Round DOWN when inserting. + const uint_fast8_t idx = log2Floor(fragment->header.size / FRAGMENT_SIZE_MIN); // Round DOWN when inserting. O1HEAP_ASSERT(idx < NUM_BINS_MAX); // Add the new fragment to the beginning of the bin list. // I.e., each allocation will be returning the most-recently-used fragment -- good for caching. @@ -189,7 +217,7 @@ O1HEAP_PRIVATE void unbin(O1HeapInstance* const handle, const Fragment* const fr O1HEAP_ASSERT(fragment != NULL); O1HEAP_ASSERT(fragment->header.size >= FRAGMENT_SIZE_MIN); O1HEAP_ASSERT((fragment->header.size % FRAGMENT_SIZE_MIN) == 0U); - const uint8_t idx = log2Floor(fragment->header.size / FRAGMENT_SIZE_MIN); // Round DOWN when removing. + const uint_fast8_t idx = log2Floor(fragment->header.size / FRAGMENT_SIZE_MIN); // Round DOWN when removing. O1HEAP_ASSERT(idx < NUM_BINS_MAX); // Remove the bin from the free fragment list. if (O1HEAP_LIKELY(fragment->next_free != NULL)) @@ -244,7 +272,7 @@ O1HeapInstance* o1heapInit(void* const base, const size_t size) O1HEAP_ASSERT((capacity >= FRAGMENT_SIZE_MIN) && (capacity <= FRAGMENT_SIZE_MAX)); // Initialize the root fragment. - Fragment* const frag = (Fragment*) (void*) (((uint8_t*) base) + INSTANCE_SIZE_PADDED); + Fragment* const frag = (Fragment*) (void*) (((char*) base) + INSTANCE_SIZE_PADDED); O1HEAP_ASSERT((((size_t) frag) % O1HEAP_ALIGNMENT) == 0U); frag->header.next = NULL; frag->header.prev = NULL; @@ -279,13 +307,13 @@ void* o1heapAllocate(O1HeapInstance* const handle, const size_t amount) { // Add the header size and align the allocation size to the power of 2. // See "Timing-Predictable Memory Allocation In Hard Real-Time Systems", Herter, page 27. - const size_t fragment_size = pow2(log2Ceil(amount + O1HEAP_ALIGNMENT)); + const size_t fragment_size = roundUpToPowerOf2(amount + O1HEAP_ALIGNMENT); O1HEAP_ASSERT(fragment_size <= FRAGMENT_SIZE_MAX); O1HEAP_ASSERT(fragment_size >= FRAGMENT_SIZE_MIN); O1HEAP_ASSERT(fragment_size >= amount + O1HEAP_ALIGNMENT); - O1HEAP_ASSERT(isPowerOf2(fragment_size)); + O1HEAP_ASSERT((fragment_size & (fragment_size - 1U)) == 0U); // Is power of 2. - const uint8_t optimal_bin_index = log2Ceil(fragment_size / FRAGMENT_SIZE_MIN); // Use CEIL when fetching. + const uint_fast8_t optimal_bin_index = log2Ceil(fragment_size / FRAGMENT_SIZE_MIN); // Use CEIL when fetching. O1HEAP_ASSERT(optimal_bin_index < NUM_BINS_MAX); const size_t candidate_bin_mask = ~(pow2(optimal_bin_index) - 1U); @@ -294,8 +322,8 @@ void* o1heapAllocate(O1HeapInstance* const handle, const size_t amount) const size_t smallest_bin_mask = suitable_bins & ~(suitable_bins - 1U); // Clear all bits but the lowest. if (O1HEAP_LIKELY(smallest_bin_mask != 0)) { - O1HEAP_ASSERT(isPowerOf2(smallest_bin_mask)); - const uint8_t bin_index = log2Floor(smallest_bin_mask); + O1HEAP_ASSERT((smallest_bin_mask & (smallest_bin_mask - 1U)) == 0U); // Is power of 2. + const uint_fast8_t bin_index = log2Floor(smallest_bin_mask); O1HEAP_ASSERT(bin_index >= optimal_bin_index); O1HEAP_ASSERT(bin_index < NUM_BINS_MAX); @@ -314,7 +342,7 @@ void* o1heapAllocate(O1HeapInstance* const handle, const size_t amount) O1HEAP_ASSERT(leftover % FRAGMENT_SIZE_MIN == 0U); // Alignment check. if (O1HEAP_LIKELY(leftover >= FRAGMENT_SIZE_MIN)) { - Fragment* const new_frag = (Fragment*) (void*) (((uint8_t*) frag) + fragment_size); + Fragment* const new_frag = (Fragment*) (void*) (((char*) frag) + fragment_size); O1HEAP_ASSERT(((size_t) new_frag) % O1HEAP_ALIGNMENT == 0U); new_frag->header.size = leftover; new_frag->header.used = false; @@ -336,7 +364,7 @@ void* o1heapAllocate(O1HeapInstance* const handle, const size_t amount) O1HEAP_ASSERT(frag->header.size >= amount + O1HEAP_ALIGNMENT); frag->header.used = true; - out = ((uint8_t*) frag) + O1HEAP_ALIGNMENT; + out = ((char*) frag) + O1HEAP_ALIGNMENT; } } @@ -359,7 +387,7 @@ void o1heapFree(O1HeapInstance* const handle, void* const pointer) O1HEAP_ASSERT(handle->diagnostics.capacity <= FRAGMENT_SIZE_MAX); if (O1HEAP_LIKELY(pointer != NULL)) // NULL pointer is a no-op. { - Fragment* const frag = (Fragment*) (void*) (((uint8_t*) pointer) - O1HEAP_ALIGNMENT); + Fragment* const frag = (Fragment*) (void*) (((char*) pointer) - O1HEAP_ALIGNMENT); // Check for heap corruption in debug builds. O1HEAP_ASSERT(((size_t) frag) % sizeof(Fragment*) == 0U); @@ -429,7 +457,7 @@ bool o1heapDoInvariantsHold(const O1HeapInstance* const handle) // Check the bin mask consistency. for (size_t i = 0; i < NUM_BINS_MAX; i++) // Dear compiler, feel free to unroll this loop. { - const bool mask_bit_set = (handle->nonempty_bin_mask & pow2((uint8_t) i)) != 0U; + const bool mask_bit_set = (handle->nonempty_bin_mask & pow2((uint_fast8_t) i)) != 0U; const bool bin_nonempty = handle->bins[i] != NULL; valid = valid && (mask_bit_set == bin_nonempty); } diff --git a/tests/.idea/dictionaries/pavel.xml b/tests/.idea/dictionaries/pavel.xml index f32082d..1c92cdc 100644 --- a/tests/.idea/dictionaries/pavel.xml +++ b/tests/.idea/dictionaries/pavel.xml @@ -1,9 +1,11 @@ + adsp deallocation herter kirienko + naïve noninfringement nosonar ogasawara diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 0827fe5..46cf3f6 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -78,14 +78,16 @@ function(gen_test name files compile_definitions compile_features compile_flags add_test("run_${name}" "${name}" --rng-seed time) endfunction() -function(gen_test_matrix name files compile_definitions) - gen_test("${name}_c99_x64" "${files}" "${compile_definitions}" c_std_99 "-m64" "-m64") - gen_test("${name}_c99_x32" "${files}" "${compile_definitions}" c_std_99 "-m32" "-m32") - gen_test("${name}_c11_x64" "${files}" "${compile_definitions}" c_std_11 "-m64" "-m64") - gen_test("${name}_c11_x32" "${files}" "${compile_definitions}" c_std_11 "-m32" "-m32") +function(gen_test_matrix name files defs) + gen_test("${name}_c99_x64" "${files}" "${defs}" c_std_99 "-m64" "-m64") + gen_test("${name}_c99_x32" "${files}" "${defs}" c_std_99 "-m32" "-m32") + gen_test("${name}_c11_x64" "${files}" "${defs}" c_std_11 "-m64" "-m64") + gen_test("${name}_c11_x32" "${files}" "${defs}" c_std_11 "-m32" "-m32") + gen_test("${name}_c11_x64_ni" "${files}" "${defs};O1HEAP_USE_INTRINSICS=0" c_std_11 "-m64" "-m64") + gen_test("${name}_c11_x32_ni" "${files}" "${defs};O1HEAP_USE_INTRINSICS=0" c_std_11 "-m32" "-m32") # Coverage is only available for GCC builds. if ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND (CMAKE_BUILD_TYPE STREQUAL "Debug")) - gen_test("${name}_cov" "${files}" "${compile_definitions}" c_std_11 "-g -O0 --coverage" "--coverage") + gen_test("${name}_cov" "${files}" "${defs}" c_std_11 "-g -O0 --coverage" "--coverage") endif () endfunction() diff --git a/tests/internal.hpp b/tests/internal.hpp index 45c4fba..f329aec 100644 --- a/tests/internal.hpp +++ b/tests/internal.hpp @@ -34,10 +34,10 @@ namespace internal { extern "C" { -auto isPowerOf2(const std::size_t x) -> bool; auto log2Floor(const std::size_t x) -> std::uint8_t; auto log2Ceil(const std::size_t x) -> std::uint8_t; auto pow2(const std::uint8_t power) -> std::size_t; +auto roundUpToPowerOf2(const std::size_t x) -> std::size_t; } struct Fragment; diff --git a/tests/test_private.cpp b/tests/test_private.cpp index ff0d138..0070ab2 100644 --- a/tests/test_private.cpp +++ b/tests/test_private.cpp @@ -16,26 +16,11 @@ #include "internal.hpp" -TEST_CASE("Private: isPowerOf2") -{ - using internal::isPowerOf2; - REQUIRE(isPowerOf2(0)); // Special case. - REQUIRE(isPowerOf2(1)); // 2**0 - REQUIRE(isPowerOf2(2)); // 2**1 - REQUIRE(!isPowerOf2(3)); - REQUIRE(isPowerOf2(4)); - REQUIRE(!isPowerOf2(5)); - REQUIRE(!isPowerOf2(6)); - REQUIRE(!isPowerOf2(7)); - REQUIRE(isPowerOf2(8)); - REQUIRE(!isPowerOf2(9)); -} - TEST_CASE("Private: log2") { using internal::log2Floor; using internal::log2Ceil; - REQUIRE(log2Floor(0) == 0); + // The function is only defined for x>=1. REQUIRE(log2Floor(1) == 0); REQUIRE(log2Floor(2) == 1); REQUIRE(log2Floor(3) == 1); @@ -44,7 +29,7 @@ TEST_CASE("Private: log2") REQUIRE(log2Floor(60) == 5); REQUIRE(log2Floor(64) == 6); - REQUIRE(log2Ceil(0) == 0); + REQUIRE(log2Ceil(0) == 0); // Special case. REQUIRE(log2Ceil(1) == 0); REQUIRE(log2Ceil(2) == 1); REQUIRE(log2Ceil(3) == 2); @@ -68,3 +53,34 @@ TEST_CASE("Private: pow2") REQUIRE(pow2(8) == 256); REQUIRE(pow2(9) == 512); } + +TEST_CASE("Private: roundUpToPowerOf2") +{ + using internal::log2Ceil; + using internal::pow2; + using internal::roundUpToPowerOf2; + // The function is only defined for x>=2. + REQUIRE(roundUpToPowerOf2(2) == 2); + REQUIRE(roundUpToPowerOf2(3) == 4); + REQUIRE(roundUpToPowerOf2(4) == 4); + REQUIRE(roundUpToPowerOf2(5) == 8); + REQUIRE(roundUpToPowerOf2(6) == 8); + REQUIRE(roundUpToPowerOf2(7) == 8); + REQUIRE(roundUpToPowerOf2(8) == 8); + REQUIRE(roundUpToPowerOf2(9) == 16); + REQUIRE(roundUpToPowerOf2(10) == 16); + REQUIRE(roundUpToPowerOf2(11) == 16); + REQUIRE(roundUpToPowerOf2(12) == 16); + REQUIRE(roundUpToPowerOf2(13) == 16); + REQUIRE(roundUpToPowerOf2(14) == 16); + REQUIRE(roundUpToPowerOf2(15) == 16); + REQUIRE(roundUpToPowerOf2(16) == 16); + REQUIRE(roundUpToPowerOf2(17) == 32); + REQUIRE(roundUpToPowerOf2(32) == 32); + REQUIRE(roundUpToPowerOf2(2147483647U) == 2147483648U); + REQUIRE(roundUpToPowerOf2(2147483648U) == 2147483648U); + for (auto i = 2U; i < 1'000'000; i++) + { + REQUIRE(pow2(log2Ceil(i)) == roundUpToPowerOf2(i)); + } +}