diff --git a/internal/sm2ec/p256_asm_arm64.s b/internal/sm2ec/p256_asm_arm64.s
index 8353e66e..e1a5f153 100644
--- a/internal/sm2ec/p256_asm_arm64.s
+++ b/internal/sm2ec/p256_asm_arm64.s
@@ -99,7 +99,7 @@ TEXT ·p256MovCond(SB),NOSPLIT,$0
 	MOVD	cond+24(FP), R3
 
 	VEOR V0.B16, V0.B16, V0.B16
-	VMOV R3, V1.S4
+	VDUP R3, V1.S4
 	VCMEQ V0.S4, V1.S4, V2.S4
 
 	VLD1.P (48)(a_ptr), [V3.B16, V4.B16, V5.B16]
@@ -278,7 +278,7 @@ TEXT ·p256Select(SB),NOSPLIT,$0
 	MOVD	table+8(FP), b_ptr
 	MOVD	res+0(FP), res_ptr
 
-	VMOV const0, V0.S4                  // will use VDUP after upgrade go to 1.17+
+	VDUP const0, V0.S4
 
 	VEOR V2.B16, V2.B16, V2.B16
 	VEOR V3.B16, V3.B16, V3.B16
@@ -291,7 +291,7 @@ TEXT ·p256Select(SB),NOSPLIT,$0
 
 loop_select:
 		ADD	$1, const1
-		VMOV const1, V1.S4             // will use VDUP after upgrade go to 1.17+
+		VDUP const1, V1.S4
 		VCMEQ V0.S4, V1.S4, V14.S4
 		VLD1.P (48)(b_ptr), [V8.B16, V9.B16, V10.B16]
 		VLD1.P (48)(b_ptr), [V11.B16, V12.B16, V13.B16]
@@ -314,7 +314,7 @@ TEXT ·p256SelectAffine(SB),NOSPLIT,$0
 	MOVD	table+8(FP), t1
 	MOVD	res+0(FP), res_ptr
 
-	VMOV t0, V0.S4                  // will use VDUP after upgrade go to 1.17+
+	VDUP t0, V0.S4
 
 	VEOR V2.B16, V2.B16, V2.B16
 	VEOR V3.B16, V3.B16, V3.B16
@@ -325,7 +325,7 @@ TEXT ·p256SelectAffine(SB),NOSPLIT,$0
 
 loop_select:
 		ADD	$1, t2
-		VMOV t2, V1.S4             // will use VDUP after upgrade go to 1.17+
+		VDUP t2, V1.S4
 		VCMEQ V0.S4, V1.S4, V10.S4
 		VLD1.P (64)(t1), [V6.B16, V7.B16, V8.B16, V9.B16]
 		VBIT V10.B16, V6.B16, V2.B16
@@ -1153,9 +1153,9 @@ TEXT ·p256PointAddAffineAsm(SB),0,$264-48
 	MOVD	zero+40(FP), t1
 
 	VEOR V12.B16, V12.B16, V12.B16
-	VMOV hlp1, V13.S4  // will use VDUP after go 1.17
+	VDUP hlp1, V13.S4
 	VCMEQ V12.S4, V13.S4, V13.S4
-	VMOV t1, V14.S4    // will use VDUP after go 1.17
+	VDUP t1, V14.S4
 	VCMEQ V12.S4, V14.S4, V14.S4	
 
 	LDP	p256p<>+0x00(SB), (const0, const1)
diff --git a/sm4/aesni_macros_arm64.s b/sm4/aesni_macros_arm64.s
index 40842956..7a31165b 100644
--- a/sm4/aesni_macros_arm64.s
+++ b/sm4/aesni_macros_arm64.s
@@ -139,7 +139,7 @@ GLOBL fk_mask<>(SB), (16+8), $16
 // - t3: 128 bits register for data
 #define SM4_ROUND(RK, tmp32, x, y, z, t0, t1, t2, t3) \ 
 	MOVW.P 4(RK), tmp32;                              \
-	VMOV tmp32, x.S4;                                 \
+	VDUP tmp32, x.S4;                                 \
 	VEOR t1.B16, x.B16, x.B16;                        \
 	VEOR t2.B16, x.B16, x.B16;                        \
 	VEOR t3.B16, x.B16, x.B16;                        \
@@ -160,7 +160,7 @@ GLOBL fk_mask<>(SB), (16+8), $16
 // - t3: 128 bits register for data
 #define SM4_8BLOCKS_ROUND(RK, tmp32, x, y, z, tmp, t0, t1, t2, t3, t4, t5, t6, t7) \ 
 	MOVW.P 4(RK), tmp32;                              \
-	VMOV tmp32, tmp.S4;                               \
+	VDUP tmp32, tmp.S4;                               \
 	VEOR t1.B16, tmp.B16, x.B16;                      \
 	VEOR t2.B16, x.B16, x.B16;                        \
 	VEOR t3.B16, x.B16, x.B16;                        \
diff --git a/zuc/asm_arm64.s b/zuc/asm_arm64.s
index 63844beb..560a41b9 100644
--- a/zuc/asm_arm64.s
+++ b/zuc/asm_arm64.s
@@ -243,7 +243,7 @@ GLOBL mask_S1<>(SB), RODATA, $16
 	EORW R9, R11                             \ // V = L2(Q) = R11D, hi(R11)=0
 	LSL $32, R11                             \
 	EOR R11, DX                              \
-	VMOV DX, V0.D2                           \
+	VDUP DX, V0.D2                           \
 	VMOV V0.B16, V1.B16                      \ 
 	S0_comput(V1, V2, V3)                    \
 	S1_comput(V0, V2, V3)                    \
diff --git a/zuc/eia256_asm_arm64.s b/zuc/eia256_asm_arm64.s
index 6f17690b..0598862d 100644
--- a/zuc/eia256_asm_arm64.s
+++ b/zuc/eia256_asm_arm64.s
@@ -46,21 +46,16 @@ GLOBL shuf_mask_dw2_0_dw3_0<>(SB), RODATA, $16
 #define SHUF_MASK_DW2_DW3 V24
 
 #define LOAD_GLOBAL_DATA() \
-	LDP bit_reverse_table_l<>(SB), (R0, R1)                   \
-	VMOV R0, BIT_REV_TAB_L.D[0]                               \
-	VMOV R1, BIT_REV_TAB_L.D[1]                               \
-	LDP bit_reverse_table_h<>(SB), (R0, R1)                   \
-	VMOV R0, BIT_REV_TAB_H.D[0]                               \
-	VMOV R1, BIT_REV_TAB_H.D[1]                               \	
-	LDP bit_reverse_and_table<>(SB), (R0, R1)                 \
-	VMOV R0, BIT_REV_AND_TAB.D[0]                             \
-	VMOV R1, BIT_REV_AND_TAB.D[1]                             \
-	LDP shuf_mask_dw0_0_dw1_0<>(SB), (R0, R1)                 \
-	VMOV R0, SHUF_MASK_DW0_DW1.D[0]                           \
-	VMOV R1, SHUF_MASK_DW0_DW1.D[1]                           \
-	LDP shuf_mask_dw2_0_dw3_0<>(SB), (R0, R1)                 \
-	VMOV R0, SHUF_MASK_DW2_DW3.D[0]                           \
-	VMOV R1, SHUF_MASK_DW2_DW3.D[1]
+	MOVD $bit_reverse_table_l<>(SB), R0                   \
+	VLD1 (R0), [BIT_REV_TAB_L.B16]                        \
+	MOVD $bit_reverse_table_h<>(SB), R0                   \
+	VLD1 (R0), [BIT_REV_TAB_H.B16]                        \
+	MOVD $bit_reverse_and_table<>(SB), R0                 \
+	VLD1 (R0), [BIT_REV_AND_TAB.B16]                      \
+	MOVD $shuf_mask_dw0_0_dw1_0<>(SB), R0                 \
+	VLD1 (R0), [SHUF_MASK_DW0_DW1.B16]                    \
+	MOVD $shuf_mask_dw2_0_dw3_0<>(SB), R0                 \
+	VLD1 (R0), [SHUF_MASK_DW2_DW3.B16]
 
 // func eia256RoundTag8(t *uint32, keyStream *uint32, p *byte)
 TEXT ·eia256RoundTag8(SB),NOSPLIT,$0