diff --git a/src/bls12-381/configure.ac b/src/bls12-381/configure.ac
index e24f854b1c3..6300e48082c 100644
--- a/src/bls12-381/configure.ac
+++ b/src/bls12-381/configure.ac
@@ -16,6 +16,7 @@ AC_SUBST(LIB_VERSION_AGE, _LIB_VERSION_AGE)
 
 AM_INIT_AUTOMAKE([1.11.2 foreign subdir-objects])
 
+AC_ARG_ENABLE([blsasm], [AS_HELP_STRING([--enable-blsasm=yes|no], [Enable blsasm option])], [], [enable_blsasm=no])
 
 AC_PROG_CXX
 AM_PROG_AR
@@ -23,7 +24,26 @@ AM_PROG_AR
 LT_INIT([win32-dll])
 
 
-CXXFLAGS="-O3 -flto -I include"
+CXXFLAGS="-O3 -I include"
+
+# Pick per-CPU flags: x86_64 with --enable-blsasm=yes defines __x86_64_asm__; every other build gets -flto.
+AC_CANONICAL_TARGET
+case "$target_cpu" in
+    x86_64)
+    AC_MSG_NOTICE([The target platform is $target_cpu])
+    if test "x$enable_blsasm" = "xyes"; then
+        AC_MSG_NOTICE([Enable blsasm])
+        CXXFLAGS="$CXXFLAGS -D__x86_64_asm__"
+    else
+        AC_MSG_NOTICE([Disable blsasm])
+        CXXFLAGS="$CXXFLAGS -flto"
+    fi
+    ;;
+    *) # keep this pattern unquoted: a quoted "*" matches only a literal asterisk, never "any CPU"
+    AC_MSG_NOTICE([The target platform is not x86_64])
+    CXXFLAGS="$CXXFLAGS -flto"
+    ;;
+esac
 
 AC_SUBST(CXX)
 AC_SUBST(CXXFLAGS)
@@ -41,4 +61,5 @@ echo "  CPPFLAGS      = $CPPFLAGS"
 echo "  CXX           = $CXX"
 echo "  CXXFLAGS      = $CXXFLAGS"
 echo "  LDFLAGS       = $LDFLAGS"
+echo "  ENABLE_BLSASM = $enable_blsasm"
 echo
diff --git a/src/bls12-381/include/bls12-381/arithmetic.hpp b/src/bls12-381/include/bls12-381/arithmetic.hpp
index a98f3f50937..bf5339561e1 100644
--- a/src/bls12-381/include/bls12-381/arithmetic.hpp
+++ b/src/bls12-381/include/bls12-381/arithmetic.hpp
@@ -1,7 +1,6 @@
 #include <cstdint>
 #include <tuple>
 
-#undef __x86_64__
 
 #if defined(UINT128_MAX) || defined(__SIZEOF_INT128__)
 #define USE_INT128
@@ -32,7 +31,7 @@ void _lsubtract(fp* z, const fp* x, const fp* y);
 // The "smaller than 4p" here means the montgomery form itself as number is less than 4p.
 // Therefore, at most ONE _ladd/_lsubstract/_ldouble is allowed before passing the result to _multiply,
 // unless the algorithm makes sure the number is small.
-#if defined(__x86_64__)
+#if defined(__x86_64_asm__)
 extern void (*_multiply)(fp*, const fp*, const fp*);
 #else
 void _multiply(fp*, const fp*, const fp*);
diff --git a/src/bls12-381/src/arithmetic.cpp b/src/bls12-381/src/arithmetic.cpp
index 8e9d61a762f..0a9d19c3d06 100644
--- a/src/bls12-381/src/arithmetic.cpp
+++ b/src/bls12-381/src/arithmetic.cpp
@@ -1,5 +1,5 @@
 #include <bls12-381/bls12-381.hpp>
-#ifdef __x86_64__
+#ifdef __x86_64_asm__
 #include <cpuid.h>
 #endif
 
@@ -8,7 +8,7 @@ using namespace std;
 namespace bls12_381
 {
 
-#ifdef __x86_64__
+#ifdef __x86_64_asm__
 void _add(fp* z, const fp* x, const fp* y)
 {
     // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
@@ -96,7 +96,7 @@ void _add(fp* z, const fp* x, const fp* y)
 }
 #endif
 
-#ifdef __x86_64__
+#ifdef __x86_64_asm__
 void _ladd(fp* z, const fp* x, const fp* y)
 {
     // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
@@ -136,7 +136,7 @@ void _ladd(fp* z, const fp* x, const fp* y)
 }
 #endif
 
-#ifdef __x86_64__
+#ifdef __x86_64_asm__
 void _double(fp* z, const fp* x)
 {
     // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
@@ -223,7 +223,7 @@ void _double(fp* z, const fp* x)
 }
 #endif
 
-#ifdef __x86_64__
+#ifdef __x86_64_asm__
 void _ldouble(fp* z, const fp* x)
 {
     // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
@@ -263,7 +263,7 @@ void _ldouble(fp* z, const fp* x)
 }
 #endif
 
-#ifdef __x86_64__
+#ifdef __x86_64_asm__
 void _subtract(fp* z, const fp* x, const fp* y)
 {
     // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
@@ -342,7 +342,7 @@ void _subtract(fp* z, const fp* x, const fp* y)
 }
 #endif
 
-#ifdef __x86_64__
+#ifdef __x86_64_asm__
 void _lsubtract(fp* z, const fp* x, const fp* y)
 {
     // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
@@ -382,7 +382,7 @@ void _lsubtract(fp* z, const fp* x, const fp* y)
 }
 #endif
 
-#ifdef __x86_64__
+#ifdef __x86_64_asm__
 void __negate(fp* z, const fp* x)
 {
     // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
@@ -436,7 +436,7 @@ void _negate(fp* z, const fp* x)
 }
 #endif
 
-#ifdef __x86_64__
+#ifdef __x86_64_asm__
 void __multiply(fp* z, const fp* x, const fp* y)
 {
     // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
@@ -1624,7 +1624,7 @@ void _multiply(fp* z, const fp* x, const fp* y)
 }
 #endif
 
-#ifdef __x86_64__
+#ifdef __x86_64_asm__
 void _square(fp* z, const fp* x)
 {
     #ifdef __clang__