From af9eec14c5026a20826129159d4d7fade3b0fc72 Mon Sep 17 00:00:00 2001 From: "zilong.dai" <2840478265@qq.com> Date: Mon, 4 Nov 2024 12:09:31 +0800 Subject: [PATCH] optimize non-asm code through flto and inline --- src/bls12-381/Makefile.am | 10 +-- src/bls12-381/configure.ac | 65 +++++++++---------- .../include/bls12-381/arithmetic.hpp | 24 +++---- src/bls12-381/libbls.pc.in | 2 +- src/bls12-381/src/arithmetic.cpp | 20 +++--- 5 files changed, 55 insertions(+), 66 deletions(-) diff --git a/src/bls12-381/Makefile.am b/src/bls12-381/Makefile.am index bbb854ad1a4..2b472aa336a 100644 --- a/src/bls12-381/Makefile.am +++ b/src/bls12-381/Makefile.am @@ -1,12 +1,8 @@ -ACLOCAL_AMFLAGS = -I build-aux/m4 -AM_CXXFLAGS = -std=c++11 -fpic -O2 -AM_CPPFLAGS = -std=c++11 -fpic -O2 +AM_CXXFLAGS = $(CXXFLAGS) -AM_CFLAGS = --fpic -O2 +pkgconfigdir = $(libdir)/pkgconfig +pkgconfig_DATA = libbls.pc lib_LTLIBRARIES = libbls.la libbls_la_SOURCES = src/groth16.cpp src/arithmetic.cpp src/fp.cpp src/g.cpp src/pairing.cpp src/scalar.cpp src/utils.cpp - -pkgconfigdir = $(libdir)/pkgconfig -pkgconfig_DATA = libbls.pc diff --git a/src/bls12-381/configure.ac b/src/bls12-381/configure.ac index 9642d712f55..8da7ba750fc 100644 --- a/src/bls12-381/configure.ac +++ b/src/bls12-381/configure.ac @@ -13,55 +13,47 @@ AC_INIT([libbls], m4_join([.], _PKG_VERSION_MAJOR, _PKG_VERSION_MINOR, _PKG_VERS AC_SUBST(LIB_VERSION_CURRENT, _LIB_VERSION_CURRENT) AC_SUBST(LIB_VERSION_REVISION, _LIB_VERSION_REVISION) AC_SUBST(LIB_VERSION_AGE, _LIB_VERSION_AGE) -AC_CONFIG_AUX_DIR([build-aux]) -AC_CONFIG_MACRO_DIR([build-aux/m4]) -AC_CANONICAL_HOST -AH_TOP([#ifndef LIBMCL_CONFIG_H]) -AH_TOP([#define LIBMCL_CONFIG_H]) -AH_BOTTOM([#endif /*LIBMCL_CONFIG_H*/]) AM_INIT_AUTOMAKE([1.11.2 foreign subdir-objects]) -m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) +AC_ARG_ENABLE([blsasm], [AS_HELP_STRING([--enable-blsasm=yes|no], [Enable blsasm option])], [], [enable_blsasm=no]) AC_PROG_CXX -AC_PROG_CC AM_PROG_AR 
LT_INIT([win32-dll]) -AC_CANONICAL_HOST -case "$host_os" in - *mingw64*) - CXXFLAGS="$CXXFLAGS -D__USE_MINGW_ANSI_STDIO=1" - ;; - *cygwin*) - ;; - *darwin*) - AC_DEFINE([DARWIN], [1], [Define if on Darwin/Mac OS X]) - ;; - *openbsd*) - CXXFLAGS="$CXXFLAGS -I/usr/local/include" - LDFLAGS="$LDFLAGS -L/usr/local/lib" - ;; - *freebsd*) - CXXFLAGS="$CXXFLAGS -I/usr/local/include" - LDFLAGS="$LDFLAGS -L/usr/local/lib" - ;; - *linux*) - CXXFLAGS="$CXXFLAGS -I/usr/local/include" - LDFLAGS="$LDFLAGS -L/usr/local/lib" - ;; -esac +CXXFLAGS="-O3 -I include" + +AC_CANONICAL_TARGET +case "$target_cpu" in + "x86_64") + AC_MSG_NOTICE(["The target platform is $target_cpu"]) + if test "x$enable_blsasm" = "xyes"; then + AC_MSG_NOTICE(["Enable blsasm"]) + CXXFLAGS="$CXXFLAGS -D__x86_64_asm__" + else + AC_MSG_NOTICE(["Disable blsasm"]) + case "$host_os" in + *darwin*) + CXXFLAGS="$CXXFLAGS" + ;; + *) + CXXFLAGS="$CXXFLAGS -flto" + ;; + esac + fi + ;; + *) + AC_MSG_NOTICE([The target platform is not x86_64]) + CXXFLAGS="$CXXFLAGS -flto" + ;; +esac -CXXFLAGS_WARN="-Wall -Wextra -Wformat=2 -Wcast-qual -Wcast-align -Wwrite-strings -Wfloat-equal -Wpointer-arith -Wundef" -CXXFLAGS="$CXXFLAGS $CXXFLAGS_WARN -I include -I src" -AC_SUBST(CXXFLAGS) -AC_SUBST(LDFLAGS) AC_SUBST(CXX) -AC_SUBST(CC) +AC_SUBST(CXXFLAGS) AC_CONFIG_FILES([Makefile libbls.pc]) @@ -76,4 +68,5 @@ echo " CPPFLAGS = $CPPFLAGS" echo " CXX = $CXX" echo " CXXFLAGS = $CXXFLAGS" echo " LDFLAGS = $LDFLAGS" +echo " ENABLE_BLSASM = $enable_blsasm" echo diff --git a/src/bls12-381/include/bls12-381/arithmetic.hpp b/src/bls12-381/include/bls12-381/arithmetic.hpp index ae7a3e1a604..eecbf02b451 100644 --- a/src/bls12-381/include/bls12-381/arithmetic.hpp +++ b/src/bls12-381/include/bls12-381/arithmetic.hpp @@ -30,7 +30,7 @@ void _lsubtract(fp* z, const fp* x, const fp* y); // The "smaller than 4p" here means the montgomery form itself as number is less than 4p. 
// Therefore, at most ONE _ladd/_lsubstract/_ldouble is allowed before passing the result to _multiply, // unless the algorithm makes sure the number is small. -#if defined(__x86_64__) +#if defined(__x86_64_asm__) extern void (*_multiply)(fp*, const fp*, const fp*); #else void _multiply(fp*, const fp*, const fp*); @@ -41,7 +41,7 @@ void _multiply(fp*, const fp*, const fp*); // The carryOut output is guaranteed to be 0 or 1. // // This function's execution time does not depend on the inputs. -std::tuple Add64( +inline std::tuple Add64( const uint64_t& x, const uint64_t& y, const uint64_t& carry @@ -63,12 +63,12 @@ std::tuple Sub64( // half returned in lo. // // This function's execution time does not depend on the inputs. -std::tuple Mul64( +inline std::tuple Mul64( const uint64_t& x, const uint64_t& y ); -std::tuple madd( +inline std::tuple madd( const uint64_t& a, const uint64_t& b, const uint64_t& t, @@ -77,21 +77,21 @@ std::tuple madd( ); // madd0 hi = a*b + c (discards lo bits) -uint64_t madd0( +inline uint64_t madd0( const uint64_t& a, const uint64_t& b, const uint64_t& c ); // madd1 hi, lo = a*b + c -std::tuple madd1( +inline std::tuple madd1( const uint64_t& a, const uint64_t& b, const uint64_t& c ); // madd2 hi, lo = a*b + c + d -std::tuple madd2( +inline std::tuple madd2( const uint64_t& a, const uint64_t& b, const uint64_t& c, @@ -99,7 +99,7 @@ std::tuple madd2( ); // madd2s superhi, hi, lo = 2*a*b + c + d + e -std::tuple madd2s( +inline std::tuple madd2s( const uint64_t& a, const uint64_t& b, const uint64_t& c, @@ -107,27 +107,27 @@ std::tuple madd2s( const uint64_t& e ); -std::tuple madd1s( +inline std::tuple madd1s( const uint64_t& a, const uint64_t& b, const uint64_t& d, const uint64_t& e ); -std::tuple madd2sb( +inline std::tuple madd2sb( const uint64_t& a, const uint64_t& b, const uint64_t& c, const uint64_t& e ); -std::tuple madd1sb( +inline std::tuple madd1sb( const uint64_t& a, const uint64_t& b, const uint64_t& e ); -std::tuple madd3( +inline 
std::tuple madd3( const uint64_t& a, const uint64_t& b, const uint64_t& c, diff --git a/src/bls12-381/libbls.pc.in b/src/bls12-381/libbls.pc.in index e8cf3caa83e..0a672b5b23c 100644 --- a/src/bls12-381/libbls.pc.in +++ b/src/bls12-381/libbls.pc.in @@ -3,7 +3,7 @@ exec_prefix=@exec_prefix@ libdir=@libdir@ includedir=@includedir@ -Name: libmcl +Name: libbls Description: Pairing Library URL: https://github.com/QEDProtocol/bls12-381 Version: @PACKAGE_VERSION@ diff --git a/src/bls12-381/src/arithmetic.cpp b/src/bls12-381/src/arithmetic.cpp index 8e9d61a762f..0a9d19c3d06 100644 --- a/src/bls12-381/src/arithmetic.cpp +++ b/src/bls12-381/src/arithmetic.cpp @@ -1,5 +1,5 @@ #include -#ifdef __x86_64__ +#ifdef __x86_64_asm__ #include #endif @@ -8,7 +8,7 @@ using namespace std; namespace bls12_381 { -#ifdef __x86_64__ +#ifdef __x86_64_asm__ void _add(fp* z, const fp* x, const fp* y) { // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI): @@ -96,7 +96,7 @@ void _add(fp* z, const fp* x, const fp* y) } #endif -#ifdef __x86_64__ +#ifdef __x86_64_asm__ void _ladd(fp* z, const fp* x, const fp* y) { // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI): @@ -136,7 +136,7 @@ void _ladd(fp* z, const fp* x, const fp* y) } #endif -#ifdef __x86_64__ +#ifdef __x86_64_asm__ void _double(fp* z, const fp* x) { // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI): @@ -223,7 +223,7 @@ void _double(fp* z, const fp* x) } #endif -#ifdef __x86_64__ +#ifdef __x86_64_asm__ void _ldouble(fp* z, const fp* x) { // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI): @@ -263,7 +263,7 @@ void _ldouble(fp* z, const fp* x) } #endif -#ifdef __x86_64__ +#ifdef __x86_64_asm__ void _subtract(fp* z, const fp* x, const fp* y) { // x86_64 calling convention 
(https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI): @@ -342,7 +342,7 @@ void _subtract(fp* z, const fp* x, const fp* y) } #endif -#ifdef __x86_64__ +#ifdef __x86_64_asm__ void _lsubtract(fp* z, const fp* x, const fp* y) { // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI): @@ -382,7 +382,7 @@ void _lsubtract(fp* z, const fp* x, const fp* y) } #endif -#ifdef __x86_64__ +#ifdef __x86_64_asm__ void __negate(fp* z, const fp* x) { // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI): @@ -436,7 +436,7 @@ void _negate(fp* z, const fp* x) } #endif -#ifdef __x86_64__ +#ifdef __x86_64_asm__ void __multiply(fp* z, const fp* x, const fp* y) { // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI): @@ -1624,7 +1624,7 @@ void _multiply(fp* z, const fp* x, const fp* y) } #endif -#ifdef __x86_64__ +#ifdef __x86_64_asm__ void _square(fp* z, const fp* x) { #ifdef __clang__