From af9eec14c5026a20826129159d4d7fade3b0fc72 Mon Sep 17 00:00:00 2001
From: "zilong.dai" <2840478265@qq.com>
Date: Mon, 4 Nov 2024 12:09:31 +0800
Subject: [PATCH] optimize non-asm code through flto and inline

---
 src/bls12-381/Makefile.am                     | 10 +--
 src/bls12-381/configure.ac                    | 65 +++++++++----------
 .../include/bls12-381/arithmetic.hpp          | 24 +++----
 src/bls12-381/libbls.pc.in                    |  2 +-
 src/bls12-381/src/arithmetic.cpp              | 20 +++---
 5 files changed, 55 insertions(+), 66 deletions(-)

diff --git a/src/bls12-381/Makefile.am b/src/bls12-381/Makefile.am
index bbb854ad1a4..2b472aa336a 100644
--- a/src/bls12-381/Makefile.am
+++ b/src/bls12-381/Makefile.am
@@ -1,12 +1,8 @@
-ACLOCAL_AMFLAGS = -I build-aux/m4
 
-AM_CXXFLAGS = -std=c++11 -fpic -O2
-AM_CPPFLAGS = -std=c++11 -fpic -O2
+AM_CXXFLAGS = $(CXXFLAGS)
 
-AM_CFLAGS = --fpic -O2
+pkgconfigdir = $(libdir)/pkgconfig
+pkgconfig_DATA = libbls.pc
 
 lib_LTLIBRARIES = libbls.la
 libbls_la_SOURCES = src/groth16.cpp src/arithmetic.cpp src/fp.cpp src/g.cpp src/pairing.cpp src/scalar.cpp src/utils.cpp
-
-pkgconfigdir = $(libdir)/pkgconfig
-pkgconfig_DATA = libbls.pc
diff --git a/src/bls12-381/configure.ac b/src/bls12-381/configure.ac
index 9642d712f55..8da7ba750fc 100644
--- a/src/bls12-381/configure.ac
+++ b/src/bls12-381/configure.ac
@@ -13,55 +13,47 @@ AC_INIT([libbls], m4_join([.], _PKG_VERSION_MAJOR, _PKG_VERSION_MINOR, _PKG_VERS
 AC_SUBST(LIB_VERSION_CURRENT, _LIB_VERSION_CURRENT)
 AC_SUBST(LIB_VERSION_REVISION, _LIB_VERSION_REVISION)
 AC_SUBST(LIB_VERSION_AGE, _LIB_VERSION_AGE)
-AC_CONFIG_AUX_DIR([build-aux])
-AC_CONFIG_MACRO_DIR([build-aux/m4])
-AC_CANONICAL_HOST
-AH_TOP([#ifndef LIBMCL_CONFIG_H])
-AH_TOP([#define LIBMCL_CONFIG_H])
-AH_BOTTOM([#endif /*LIBMCL_CONFIG_H*/])
 
 AM_INIT_AUTOMAKE([1.11.2 foreign subdir-objects])
 
-m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
+AC_ARG_ENABLE([blsasm], [AS_HELP_STRING([--enable-blsasm=yes|no], [Build the hand-written x86_64 assembly routines by defining __x86_64_asm__; ignored on other CPUs (default: no)])], [], [enable_blsasm=no])
 
 AC_PROG_CXX
-AC_PROG_CC
 AM_PROG_AR
 
 LT_INIT([win32-dll])
 
-AC_CANONICAL_HOST
-case "$host_os" in
-    *mingw64*)
-        CXXFLAGS="$CXXFLAGS -D__USE_MINGW_ANSI_STDIO=1"
-        ;;
-    *cygwin*)
-        ;;
-    *darwin*)
-        AC_DEFINE([DARWIN], [1], [Define if on Darwin/Mac OS X])
-        ;;
-    *openbsd*)
-        CXXFLAGS="$CXXFLAGS -I/usr/local/include"
-        LDFLAGS="$LDFLAGS -L/usr/local/lib"
-        ;;
-    *freebsd*)
-        CXXFLAGS="$CXXFLAGS -I/usr/local/include"
-        LDFLAGS="$LDFLAGS -L/usr/local/lib"
-        ;;
-    *linux*)
-        CXXFLAGS="$CXXFLAGS -I/usr/local/include"
-        LDFLAGS="$LDFLAGS -L/usr/local/lib"
-        ;;
-esac
 
+CXXFLAGS="-O3 -I include"
+
+AC_CANONICAL_TARGET
+case "$target_cpu" in
+    x86_64)              # only this CPU can opt in to the assembly code path
+    AC_MSG_NOTICE(["The target platform is $target_cpu"])
+    if test "x$enable_blsasm" = "xyes"; then
+        AC_MSG_NOTICE(["Enable blsasm"])
+        CXXFLAGS="$CXXFLAGS -D__x86_64_asm__"
+    else
+        AC_MSG_NOTICE(["Disable blsasm"])
+        case "$host_os" in
+            *darwin*)    # unquoted so it is a glob; Darwin hosts get no -flto
+            CXXFLAGS="$CXXFLAGS"
+            ;;
+            *)           # unquoted catch-all: every other host OS gets -flto
+            CXXFLAGS="$CXXFLAGS -flto"
+            ;;
+        esac
+    fi
+    ;;
+    *)                   # unquoted catch-all: any CPU other than x86_64
+    AC_MSG_NOTICE([The target platform is not x86_64])
+    CXXFLAGS="$CXXFLAGS -flto"
+    ;;
+esac
 
-CXXFLAGS_WARN="-Wall -Wextra -Wformat=2 -Wcast-qual -Wcast-align -Wwrite-strings -Wfloat-equal -Wpointer-arith -Wundef"
-CXXFLAGS="$CXXFLAGS $CXXFLAGS_WARN -I include -I src"
 
-AC_SUBST(CXXFLAGS)
-AC_SUBST(LDFLAGS)
 AC_SUBST(CXX)
-AC_SUBST(CC)
+AC_SUBST(CXXFLAGS)
 
 AC_CONFIG_FILES([Makefile libbls.pc])
 
@@ -76,4 +68,5 @@ echo "  CPPFLAGS      = $CPPFLAGS"
 echo "  CXX           = $CXX"
 echo "  CXXFLAGS      = $CXXFLAGS"
 echo "  LDFLAGS       = $LDFLAGS"
+echo "  ENABLE_BLSASM = $enable_blsasm"
 echo
diff --git a/src/bls12-381/include/bls12-381/arithmetic.hpp b/src/bls12-381/include/bls12-381/arithmetic.hpp
index ae7a3e1a604..eecbf02b451 100644
--- a/src/bls12-381/include/bls12-381/arithmetic.hpp
+++ b/src/bls12-381/include/bls12-381/arithmetic.hpp
@@ -30,7 +30,7 @@ void _lsubtract(fp* z, const fp* x, const fp* y);
 // The "smaller than 4p" here means the montgomery form itself as number is less than 4p.
 // Therefore, at most ONE _ladd/_lsubstract/_ldouble is allowed before passing the result to _multiply,
 // unless the algorithm makes sure the number is small.
-#if defined(__x86_64__)
+#if defined(__x86_64_asm__)
 extern void (*_multiply)(fp*, const fp*, const fp*);
 #else
 void _multiply(fp*, const fp*, const fp*);
@@ -41,7 +41,7 @@ void _multiply(fp*, const fp*, const fp*);
 // The carryOut output is guaranteed to be 0 or 1.
 //
 // This function's execution time does not depend on the inputs.
-std::tuple<uint64_t, uint64_t> Add64(
+inline std::tuple<uint64_t, uint64_t> Add64(
     const uint64_t& x,
     const uint64_t& y,
     const uint64_t& carry
@@ -63,12 +63,12 @@ std::tuple<uint64_t, uint64_t> Sub64(
 // half returned in lo.
 //
 // This function's execution time does not depend on the inputs.
-std::tuple<uint64_t, uint64_t> Mul64(
+inline std::tuple<uint64_t, uint64_t> Mul64(
     const uint64_t& x,
     const uint64_t& y
 );
 
-std::tuple<uint64_t, uint64_t, uint64_t> madd(
+inline std::tuple<uint64_t, uint64_t, uint64_t> madd(
     const uint64_t& a,
     const uint64_t& b,
     const uint64_t& t,
@@ -77,21 +77,21 @@ std::tuple<uint64_t, uint64_t, uint64_t> madd(
 );
 
 // madd0 hi = a*b + c (discards lo bits)
-uint64_t madd0(
+inline uint64_t madd0(
     const uint64_t& a,
     const uint64_t& b,
     const uint64_t& c
 );
 
 // madd1 hi, lo = a*b + c
-std::tuple<uint64_t, uint64_t> madd1(
+inline std::tuple<uint64_t, uint64_t> madd1(
     const uint64_t& a,
     const uint64_t& b,
     const uint64_t& c
 );
 
 // madd2 hi, lo = a*b + c + d
-std::tuple<uint64_t, uint64_t> madd2(
+inline std::tuple<uint64_t, uint64_t> madd2(
     const uint64_t& a,
     const uint64_t& b,
     const uint64_t& c,
@@ -99,7 +99,7 @@ std::tuple<uint64_t, uint64_t> madd2(
 );
 
 // madd2s superhi, hi, lo = 2*a*b + c + d + e
-std::tuple<uint64_t, uint64_t, uint64_t> madd2s(
+inline std::tuple<uint64_t, uint64_t, uint64_t> madd2s(
     const uint64_t& a,
     const uint64_t& b,
     const uint64_t& c,
@@ -107,27 +107,27 @@ std::tuple<uint64_t, uint64_t, uint64_t> madd2s(
     const uint64_t& e
 );
 
-std::tuple<uint64_t, uint64_t, uint64_t> madd1s(
+inline std::tuple<uint64_t, uint64_t, uint64_t> madd1s(
     const uint64_t& a,
     const uint64_t& b,
     const uint64_t& d,
     const uint64_t& e
 );
 
-std::tuple<uint64_t, uint64_t, uint64_t> madd2sb(
+inline std::tuple<uint64_t, uint64_t, uint64_t> madd2sb(
     const uint64_t& a,
     const uint64_t& b,
     const uint64_t& c,
     const uint64_t& e
 );
 
-std::tuple<uint64_t, uint64_t, uint64_t> madd1sb(
+inline std::tuple<uint64_t, uint64_t, uint64_t> madd1sb(
     const uint64_t& a,
     const uint64_t& b,
     const uint64_t& e
 );
 
-std::tuple<uint64_t, uint64_t> madd3(
+inline std::tuple<uint64_t, uint64_t> madd3(
     const uint64_t& a,
     const uint64_t& b,
     const uint64_t& c,
diff --git a/src/bls12-381/libbls.pc.in b/src/bls12-381/libbls.pc.in
index e8cf3caa83e..0a672b5b23c 100644
--- a/src/bls12-381/libbls.pc.in
+++ b/src/bls12-381/libbls.pc.in
@@ -3,7 +3,7 @@ exec_prefix=@exec_prefix@
 libdir=@libdir@
 includedir=@includedir@
 
-Name: libmcl
+Name: libbls
 Description: Pairing Library
 URL: https://github.com/QEDProtocol/bls12-381
 Version: @PACKAGE_VERSION@
diff --git a/src/bls12-381/src/arithmetic.cpp b/src/bls12-381/src/arithmetic.cpp
index 8e9d61a762f..0a9d19c3d06 100644
--- a/src/bls12-381/src/arithmetic.cpp
+++ b/src/bls12-381/src/arithmetic.cpp
@@ -1,5 +1,5 @@
 #include <bls12-381/bls12-381.hpp>
-#ifdef __x86_64__
+#ifdef __x86_64_asm__
 #include <cpuid.h>
 #endif
 
@@ -8,7 +8,7 @@ using namespace std;
 namespace bls12_381
 {
 
-#ifdef __x86_64__
+#ifdef __x86_64_asm__
 void _add(fp* z, const fp* x, const fp* y)
 {
     // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
@@ -96,7 +96,7 @@ void _add(fp* z, const fp* x, const fp* y)
 }
 #endif
 
-#ifdef __x86_64__
+#ifdef __x86_64_asm__
 void _ladd(fp* z, const fp* x, const fp* y)
 {
     // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
@@ -136,7 +136,7 @@ void _ladd(fp* z, const fp* x, const fp* y)
 }
 #endif
 
-#ifdef __x86_64__
+#ifdef __x86_64_asm__
 void _double(fp* z, const fp* x)
 {
     // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
@@ -223,7 +223,7 @@ void _double(fp* z, const fp* x)
 }
 #endif
 
-#ifdef __x86_64__
+#ifdef __x86_64_asm__
 void _ldouble(fp* z, const fp* x)
 {
     // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
@@ -263,7 +263,7 @@ void _ldouble(fp* z, const fp* x)
 }
 #endif
 
-#ifdef __x86_64__
+#ifdef __x86_64_asm__
 void _subtract(fp* z, const fp* x, const fp* y)
 {
     // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
@@ -342,7 +342,7 @@ void _subtract(fp* z, const fp* x, const fp* y)
 }
 #endif
 
-#ifdef __x86_64__
+#ifdef __x86_64_asm__
 void _lsubtract(fp* z, const fp* x, const fp* y)
 {
     // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
@@ -382,7 +382,7 @@ void _lsubtract(fp* z, const fp* x, const fp* y)
 }
 #endif
 
-#ifdef __x86_64__
+#ifdef __x86_64_asm__
 void __negate(fp* z, const fp* x)
 {
     // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
@@ -436,7 +436,7 @@ void _negate(fp* z, const fp* x)
 }
 #endif
 
-#ifdef __x86_64__
+#ifdef __x86_64_asm__
 void __multiply(fp* z, const fp* x, const fp* y)
 {
     // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
@@ -1624,7 +1624,7 @@ void _multiply(fp* z, const fp* x, const fp* y)
 }
 #endif
 
-#ifdef __x86_64__
+#ifdef __x86_64_asm__
 void _square(fp* z, const fp* x)
 {
     #ifdef __clang__