Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Enhancement] Optimize code in arm (backport #55072) #55487

Open
wants to merge 1 commit into
base: branch-3.4
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion be/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1016,7 +1016,7 @@ endif()

set(STARROCKS_LINK_LIBS ${STARROCKS_LINK_LIBS}
${WL_LINK_STATIC} -lbfd
${WL_LINK_DYNAMIC} -lresolv -liberty -lc -lm -ldl -rdynamic -pthread -Wl,-wrap=__cxa_throw
${WL_LINK_DYNAMIC} -lresolv -liberty -lc -lm -ldl -rdynamic -pthread -Wl,-wrap,__cxa_throw -Wl,-wrap,__floattidf
)

# link gcov if WITH_GCOV is on
Expand Down
9 changes: 9 additions & 0 deletions be/src/exprs/runtime_filter.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,15 @@ class SimdBlockFilter {
const __m256i mask = make_mask(hash >> _log_num_buckets);
__m256i* const bucket = &reinterpret_cast<__m256i*>(_directory)[bucket_idx];
_mm256_store_si256(bucket, _mm256_or_si256(*bucket, mask));
#elif defined(__ARM_NEON)
uint32x4_t masks[2];
make_mask(hash >> _log_num_buckets, masks);
uint32x4_t directory_1 = vld1q_u32(&_directory[bucket_idx][0]);
uint32x4_t directory_2 = vld1q_u32(&_directory[bucket_idx][4]);
directory_1 = vorrq_u32(directory_1, masks[0]);
directory_2 = vorrq_u32(directory_2, masks[1]);
vst1q_u32(&_directory[bucket_idx][0], directory_1);
vst1q_u32(&_directory[bucket_idx][4], directory_2);
#else
uint32_t masks[BITS_SET_PER_BLOCK];
make_mask(hash >> _log_num_buckets, masks);
Expand Down
1 change: 1 addition & 0 deletions be/src/runtime/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ set(RUNTIME_FILES
dictionary_cache_sink.cpp
type_pack.cpp
customized_result_writer.cpp
int128_to_double.cpp
)

set(RUNTIME_FILES ${RUNTIME_FILES}
Expand Down
94 changes: 94 additions & 0 deletions be/src/runtime/int128_to_double.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "runtime/int128_to_double.h"

#include <glog/logging.h>

#include <climits>
#include <cstdint>

#include "integer_overflow_arithmetics.h"
namespace starrocks {
double __wrap___floattidf(__int128 a) {
typedef double dst_t;
typedef uint64_t dst_rep_t;
typedef __uint128_t usrc_t;
#define DST_REP_C UINT64_C

enum {
dstSigBits = 52,
};

if (a == 0) return 0.0;

enum {
dstMantDig = dstSigBits + 1,
srcBits = sizeof(__int128) * CHAR_BIT,
srcIsSigned = ((__int128)-1) < 0,
};

const __int128 s = srcIsSigned ? a >> (srcBits - 1) : 0;

a = (usrc_t)(a ^ s) - s;
int sd = srcBits - clz128(a); // number of significant digits
int e = sd - 1; // exponent
if (sd > dstMantDig) {
// start: 0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx
// finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR
// 12345678901234567890123456
// 1 = msb 1 bit
// P = bit dstMantDig-1 bits to the right of 1
// Q = bit dstMantDig bits to the right of 1
// R = "or" of all bits to the right of Q
if (sd == dstMantDig + 1) {
a <<= 1;
} else if (sd == dstMantDig + 2) {
// Do nothing.
} else {
a = ((usrc_t)a >> (sd - (dstMantDig + 2))) |
((a & ((usrc_t)(-1) >> ((srcBits + dstMantDig + 2) - sd))) != 0);
}
// finish:
a |= (a & 4) != 0; // Or P into R
++a; // round - this step may add a significant bit
a >>= 2; // dump Q and R
// a is now rounded to dstMantDig or dstMantDig+1 bits
if (a & ((usrc_t)1 << dstMantDig)) {
a >>= 1;
++e;
}
// a is now rounded to dstMantDig bits
} else {
a <<= (dstMantDig - sd);
// a is now rounded to dstMantDig bits
}
const int dstBits = sizeof(dst_t) * CHAR_BIT;
const dst_rep_t dstSignMask = DST_REP_C(1) << (dstBits - 1);
const int dstExpBits = dstBits - dstSigBits - 1;
const int dstExpBias = (1 << (dstExpBits - 1)) - 1;
const dst_rep_t dstSignificandMask = (DST_REP_C(1) << dstSigBits) - 1;
// Combine sign, exponent, and mantissa.
const dst_rep_t result = ((dst_rep_t)s & dstSignMask) | ((dst_rep_t)(e + dstExpBias) << dstSigBits) |
((dst_rep_t)(a)&dstSignificandMask);

const union {
dst_t f;
dst_rep_t i;
} rep = {.i = result};

DCHECK(std::abs(rep.f - __real___floattidf(a)) < 0.001);
return rep.f;
}
} // namespace starrocks
22 changes: 22 additions & 0 deletions be/src/runtime/int128_to_double.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
namespace starrocks {
extern "C" {
// origin from llvm-project
// https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/int_to_fp_impl.inc
// this implementation is 20x faster than gcc
double __wrap___floattidf(__int128 a);
double __real___floattidf(__int128 a);
}
} // namespace starrocks
20 changes: 1 addition & 19 deletions be/src/runtime/integer_overflow_arithmetics.h
Original file line number Diff line number Diff line change
Expand Up @@ -113,25 +113,7 @@ inline int clz128(unsigned __int128 v) {
}

inline bool int128_mul_overflow(int128_t a, int128_t b, int128_t* c) {
if (a == 0 || b == 0) {
*c = 0;
return false;
}

// sgn(x)
auto sa = a >> 127;
// sgn(y)
auto sb = b >> 127;
// abx(x), abs(y)
a = (a ^ sa) - sa;
b = (b ^ sb) - sb;
// sgn(x * y)
sa ^= sb;
*c = a * b;
// sgn(x * y) and abs(x) * abs(y) produces x * y;
*c = (*c ^ sa) - sa;
static constexpr auto int128_max = get_max<int128_t>();
return clz128(a) + clz128(b) < sizeof(int128_t) || int128_max / a < b;
return __builtin_mul_overflow(a, b, c);
}

template <>
Expand Down
32 changes: 31 additions & 1 deletion thirdparty/patches/streamvbyte.patch
Original file line number Diff line number Diff line change
@@ -1,13 +1,43 @@
From 676d0175085a7996f909d9d2e63ab7b4683ef475 Mon Sep 17 00:00:00 2001
From: before-Sunrise <[email protected]>
Date: Tue, 14 Jan 2025 18:41:46 +0800
Subject: [PATCH] patch

Signed-off-by: before-Sunrise <[email protected]>
---
CMakeLists.txt | 5 +----
include/streamvbyte.h | 2 +-
2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 39df85d..1e32b0c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,10 +32,7 @@ if (MSVC)
endif()
# test for arm
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
- set(BASE_FLAGS
- ${BASE_FLAGS}
- "-D__ARM_NEON__"
- )
+ add_compile_options(-D__ARM_NEON__)
endif()
set(STREAMVBYTE_SRCS
${PROJECT_SOURCE_DIR}/src/streamvbyte_encode.c
diff --git a/include/streamvbyte.h b/include/streamvbyte.h
index bc9533c..a6cbb1a 100644
--- a/include/streamvbyte.h
+++ b/include/streamvbyte.h
@@ -1,7 +1,7 @@

#ifndef INCLUDE_STREAMVBYTE_H_
#define INCLUDE_STREAMVBYTE_H_
-#define __STDC_FORMAT_MACROS
+// #define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include <stdint.h>// please use a C99-compatible compiler
#include <stddef.h>
--
2.34.1

Loading