// Patch summary: consolidate the constant tables used by the arm64 assembly in
// zuc/asm_arm64.s, zuc/eia256_asm_arm64.s and zuc/eia_asm_arm64.s.
//
// zuc/asm_arm64.s
//   * The per-constant 16-byte RODATA symbols are folded into combined tables;
//     the 64-bit words and their order are the same as in the removed symbols:
//       Top3_bits_of_the_byte, Bottom5_bits_of_the_byte -> Top3_Bottom5_bits_of_the_byte ($32)
//       P1_data, P2_data, P3_data                       -> P123_data ($48)
//       Aes_to_Zuc_mul_low_nibble, Aes_to_Zuc_mul_high_nibble,
//       Comb_matrix_mul_low_nibble, Comb_matrix_mul_high_nibble -> m1_2 ($64)
//       mask_S0, mask_S1                                -> mask_S01 ($32)
//     Shuf_mask appears only as unchanged context.
//   * nibble_mask is removed. LOAD_GLOBAL_DATA now builds NIBBLE_MASK with
//     MOVW $0x0F0F0F0F, R0 / VDUP R0, NIBBLE_MASK.S4, i.e. the 32-bit pattern
//     copied into the four S lanes, which is the same 16 bytes of 0x0F that
//     the removed table held.
//   * LOAD_GLOBAL_DATA issues one multi-register VLD1 per combined table
//     instead of one MOVD/VLD1 pair per vector register. The register order in
//     each list follows the order of the words in the table it reads.
//
// zuc/eia256_asm_arm64.s and zuc/eia_asm_arm64.s (the same edit in both files)
//   * bit_reverse_table_l, bit_reverse_table_h       -> bit_reverse_table ($32)
//   * shuf_mask_dw0_0_dw1_0, shuf_mask_dw2_0_dw3_0   -> shuf_mask_dw ($32)
//   * bit_reverse_and_table is removed; BIT_REV_AND_TAB is built with the same
//     MOVW $0x0F0F0F0F / VDUP ... .S4 pair.
//   * The final VLD1 line of LOAD_GLOBAL_DATA no longer ends with a
//     continuation backslash, so the macro body ends on that line.
//
// NOTE(review): a multi-register VLD1 list is expected to name consecutively
// numbered V registers. Apart from SHUF_MASK_DW2_DW3 (V24), the #defines for
// the register names used in these lists are outside the diff context -
// confirm that TOP3_BITS/BOTTOM5_BITS, M1L/M1H/M2L/M2H, P1/P2/P3,
// S0_MASK/S1_MASK, BIT_REV_TAB_L/BIT_REV_TAB_H and
// SHUF_MASK_DW0_DW1/SHUF_MASK_DW2_DW3 are each contiguous and in this order.
//
// NOTE(review): in this copy the patch's line breaks appear to have been
// folded into spaces (file headers, hunk headers and hunk lines share physical
// lines), so blank context lines may be missing - restore the original line
// structure before handing this text to a patch tool.
diff --git a/zuc/asm_arm64.s b/zuc/asm_arm64.s index 560a41b9..a4711e86 100644 --- a/zuc/asm_arm64.s +++ b/zuc/asm_arm64.s @@ -2,57 +2,40 @@ #include "textflag.h" -DATA Top3_bits_of_the_byte<>+0x00(SB)/8, $0xe0e0e0e0e0e0e0e0 -DATA Top3_bits_of_the_byte<>+0x08(SB)/8, $0xe0e0e0e0e0e0e0e0 -GLOBL Top3_bits_of_the_byte<>(SB), RODATA, $16 - -DATA Bottom5_bits_of_the_byte<>+0x00(SB)/8, $0x1f1f1f1f1f1f1f1f -DATA Bottom5_bits_of_the_byte<>+0x08(SB)/8, $0x1f1f1f1f1f1f1f1f -GLOBL Bottom5_bits_of_the_byte<>(SB), RODATA, $16 - -DATA nibble_mask<>+0x00(SB)/8, $0x0F0F0F0F0F0F0F0F -DATA nibble_mask<>+0x08(SB)/8, $0x0F0F0F0F0F0F0F0F -GLOBL nibble_mask<>(SB), RODATA, $16 - -DATA P1_data<>+0x00(SB)/8, $0x0A020F0F0E000F09 -DATA P1_data<>+0x08(SB)/8, $0x090305070C000400 -GLOBL P1_data<>(SB), RODATA, $16 - -DATA P2_data<>+0x00(SB)/8, $0x040C000705060D08 -DATA P2_data<>+0x08(SB)/8, $0x0209030F0A0E010B -GLOBL P2_data<>(SB), RODATA, $16 - -DATA P3_data<>+0x00(SB)/8, $0x0F0A0D00060A0602 -DATA P3_data<>+0x08(SB)/8, $0x0D0C0900050D0303 -GLOBL P3_data<>(SB), RODATA, $16 - -DATA Aes_to_Zuc_mul_low_nibble<>+0x00(SB)/8, $0x1D1C9F9E83820100 -DATA Aes_to_Zuc_mul_low_nibble<>+0x08(SB)/8, $0x3938BBBAA7A62524 -GLOBL Aes_to_Zuc_mul_low_nibble<>(SB), RODATA, $16 - -DATA Aes_to_Zuc_mul_high_nibble<>+0x00(SB)/8, $0xA174A97CDD08D500 -DATA Aes_to_Zuc_mul_high_nibble<>+0x08(SB)/8, $0x3DE835E04194499C -GLOBL Aes_to_Zuc_mul_high_nibble<>(SB), RODATA, $16 - -DATA Comb_matrix_mul_low_nibble<>+0x00(SB)/8, $0xA8BC0216D9CD7367 -DATA Comb_matrix_mul_low_nibble<>+0x08(SB)/8, $0x1F0BB5A16E7AC4D0 -GLOBL Comb_matrix_mul_low_nibble<>(SB), RODATA, $16 - -DATA Comb_matrix_mul_high_nibble<>+0x00(SB)/8, $0x638CFA1523CCBA55 -DATA Comb_matrix_mul_high_nibble<>+0x08(SB)/8, $0x3FD0A6497F90E609 -GLOBL Comb_matrix_mul_high_nibble<>(SB), RODATA, $16 +DATA Top3_Bottom5_bits_of_the_byte<>+0x00(SB)/8, $0xe0e0e0e0e0e0e0e0 +DATA Top3_Bottom5_bits_of_the_byte<>+0x08(SB)/8, $0xe0e0e0e0e0e0e0e0 +DATA 
Top3_Bottom5_bits_of_the_byte<>+0x10(SB)/8, $0x1f1f1f1f1f1f1f1f +DATA Top3_Bottom5_bits_of_the_byte<>+0x18(SB)/8, $0x1f1f1f1f1f1f1f1f +GLOBL Top3_Bottom5_bits_of_the_byte<>(SB), RODATA, $32 + +DATA P123_data<>+0x00(SB)/8, $0x0A020F0F0E000F09 +DATA P123_data<>+0x08(SB)/8, $0x090305070C000400 +DATA P123_data<>+0x10(SB)/8, $0x040C000705060D08 +DATA P123_data<>+0x18(SB)/8, $0x0209030F0A0E010B +DATA P123_data<>+0x20(SB)/8, $0x0F0A0D00060A0602 +DATA P123_data<>+0x28(SB)/8, $0x0D0C0900050D0303 +GLOBL P123_data<>(SB), RODATA, $48 + +// Affine transform 1 & 2 (low and high nibbles) +DATA m1_2<>+0x00(SB)/8, $0x1D1C9F9E83820100 +DATA m1_2<>+0x08(SB)/8, $0x3938BBBAA7A62524 +DATA m1_2<>+0x10(SB)/8, $0xA174A97CDD08D500 +DATA m1_2<>+0x18(SB)/8, $0x3DE835E04194499C +DATA m1_2<>+0x20(SB)/8, $0xA8BC0216D9CD7367 +DATA m1_2<>+0x28(SB)/8, $0x1F0BB5A16E7AC4D0 +DATA m1_2<>+0x30(SB)/8, $0x638CFA1523CCBA55 +DATA m1_2<>+0x38(SB)/8, $0x3FD0A6497F90E609 +GLOBL m1_2<>(SB), RODATA, $64 DATA Shuf_mask<>+0x00(SB)/8, $0x0B0E0104070A0D00 DATA Shuf_mask<>+0x08(SB)/8, $0x0306090C0F020508 GLOBL Shuf_mask<>(SB), RODATA, $16 -DATA mask_S0<>+0x00(SB)/8, $0xff00ff00ff00ff00 -DATA mask_S0<>+0x08(SB)/8, $0xff00ff00ff00ff00 -GLOBL mask_S0<>(SB), RODATA, $16 - -DATA mask_S1<>+0x00(SB)/8, $0x00ff00ff00ff00ff -DATA mask_S1<>+0x08(SB)/8, $0x00ff00ff00ff00ff -GLOBL mask_S1<>(SB), RODATA, $16 +DATA mask_S01<>+0x00(SB)/8, $0xff00ff00ff00ff00 +DATA mask_S01<>+0x08(SB)/8, $0xff00ff00ff00ff00 +DATA mask_S01<>+0x10(SB)/8, $0x00ff00ff00ff00ff +DATA mask_S01<>+0x18(SB)/8, $0x00ff00ff00ff00ff +GLOBL mask_S01<>(SB), RODATA, $32 #define SI R0 #define DI R1 @@ -85,30 +68,16 @@ GLOBL mask_S1<>(SB), RODATA, $16 #define OFFSET_BRC_X3 (21*4) #define LOAD_GLOBAL_DATA() \ - MOVD $nibble_mask<>(SB), R0 \ - VLD1 (R0), [NIBBLE_MASK.B16] \ - MOVD $Top3_bits_of_the_byte<>(SB), R0 \ - VLD1 (R0), [TOP3_BITS.B16] \ - MOVD $Bottom5_bits_of_the_byte<>(SB), R0 \ - VLD1 (R0), [BOTTOM5_BITS.B16] \ - MOVD $Aes_to_Zuc_mul_low_nibble<>(SB), R0 \ 
- VLD1 (R0), [M1L.B16] \ - MOVD $Aes_to_Zuc_mul_high_nibble<>(SB), R0 \ - VLD1 (R0), [M1H.B16] \ - MOVD $Comb_matrix_mul_low_nibble<>(SB), R0 \ - VLD1 (R0), [M2L.B16] \ - MOVD $Comb_matrix_mul_high_nibble<>(SB), R0 \ - VLD1 (R0), [M2H.B16] \ - MOVD $P1_data<>(SB), R0 \ - VLD1 (R0), [P1.B16] \ - MOVD $P2_data<>(SB), R0 \ - VLD1 (R0), [P2.B16] \ - MOVD $P3_data<>(SB), R0 \ - VLD1 (R0), [P3.B16] \ - MOVD $mask_S0<>(SB), R0 \ - VLD1 (R0), [S0_MASK.B16] \ - MOVD $mask_S1<>(SB), R0 \ - VLD1 (R0), [S1_MASK.B16] \ + MOVW $0x0F0F0F0F, R0 \ + VDUP R0, NIBBLE_MASK.S4 \ + MOVD $Top3_Bottom5_bits_of_the_byte<>(SB), R0 \ + VLD1 (R0), [TOP3_BITS.B16, BOTTOM5_BITS.B16] \ + MOVD $m1_2<>(SB), R0 \ + VLD1 (R0), [M1L.B16, M1H.B16, M2L.B16, M2H.B16] \ + MOVD $P123_data<>(SB), R0 \ + VLD1 (R0), [P1.B16, P2.B16, P3.B16] \ + MOVD $mask_S01<>(SB), R0 \ + VLD1 (R0), [S0_MASK.B16, S1_MASK.B16] \ MOVD $Shuf_mask<>(SB), R0 \ VLD1 (R0), [INVERSE_SHIFT_ROWS.B16] \ diff --git a/zuc/eia256_asm_arm64.s b/zuc/eia256_asm_arm64.s index 7a3f61ae..7fa6bf9e 100644 --- a/zuc/eia256_asm_arm64.s +++ b/zuc/eia256_asm_arm64.s @@ -2,25 +2,17 @@ #include "textflag.h" -DATA bit_reverse_table_l<>+0x00(SB)/8, $0x0e060a020c040800 -DATA bit_reverse_table_l<>+0x08(SB)/8, $0x0f070b030d050901 -GLOBL bit_reverse_table_l<>(SB), RODATA, $16 - -DATA bit_reverse_table_h<>+0x00(SB)/8, $0xe060a020c0408000 -DATA bit_reverse_table_h<>+0x08(SB)/8, $0xf070b030d0509010 -GLOBL bit_reverse_table_h<>(SB), RODATA, $16 - -DATA bit_reverse_and_table<>+0x00(SB)/8, $0x0f0f0f0f0f0f0f0f -DATA bit_reverse_and_table<>+0x08(SB)/8, $0x0f0f0f0f0f0f0f0f -GLOBL bit_reverse_and_table<>(SB), RODATA, $16 - -DATA shuf_mask_dw0_0_dw1_0<>+0x00(SB)/8, $0xffffffff03020100 -DATA shuf_mask_dw0_0_dw1_0<>+0x08(SB)/8, $0xffffffff07060504 -GLOBL shuf_mask_dw0_0_dw1_0<>(SB), RODATA, $16 - -DATA shuf_mask_dw2_0_dw3_0<>+0x00(SB)/8, $0xffffffff0b0a0908 -DATA shuf_mask_dw2_0_dw3_0<>+0x08(SB)/8, $0xffffffff0f0e0d0c -GLOBL shuf_mask_dw2_0_dw3_0<>(SB), RODATA, $16 
+DATA bit_reverse_table<>+0x00(SB)/8, $0x0e060a020c040800 +DATA bit_reverse_table<>+0x08(SB)/8, $0x0f070b030d050901 +DATA bit_reverse_table<>+0x10(SB)/8, $0xe060a020c0408000 +DATA bit_reverse_table<>+0x18(SB)/8, $0xf070b030d0509010 +GLOBL bit_reverse_table<>(SB), RODATA, $32 + +DATA shuf_mask_dw<>+0x00(SB)/8, $0xffffffff03020100 +DATA shuf_mask_dw<>+0x08(SB)/8, $0xffffffff07060504 +DATA shuf_mask_dw<>+0x10(SB)/8, $0xffffffff0b0a0908 +DATA shuf_mask_dw<>+0x18(SB)/8, $0xffffffff0f0e0d0c +GLOBL shuf_mask_dw<>(SB), RODATA, $32 #define AX R2 #define BX R3 @@ -46,16 +38,12 @@ GLOBL shuf_mask_dw2_0_dw3_0<>(SB), RODATA, $16 #define SHUF_MASK_DW2_DW3 V24 #define LOAD_GLOBAL_DATA() \ - MOVD $bit_reverse_table_l<>(SB), R0 \ - VLD1 (R0), [BIT_REV_TAB_L.B16] \ - MOVD $bit_reverse_table_h<>(SB), R0 \ - VLD1 (R0), [BIT_REV_TAB_H.B16] \ - MOVD $bit_reverse_and_table<>(SB), R0 \ - VLD1 (R0), [BIT_REV_AND_TAB.B16] \ - MOVD $shuf_mask_dw0_0_dw1_0<>(SB), R0 \ - VLD1 (R0), [SHUF_MASK_DW0_DW1.B16] \ - MOVD $shuf_mask_dw2_0_dw3_0<>(SB), R0 \ - VLD1 (R0), [SHUF_MASK_DW2_DW3.B16] \ + MOVD $bit_reverse_table<>(SB), R0 \ + VLD1 (R0), [BIT_REV_TAB_L.B16, BIT_REV_TAB_H.B16] \ + MOVW $0x0F0F0F0F, R0 \ + VDUP R0, BIT_REV_AND_TAB.S4 \ + MOVD $shuf_mask_dw<>(SB), R0 \ + VLD1 (R0), [SHUF_MASK_DW0_DW1.B16, SHUF_MASK_DW2_DW3.B16] // func eia256RoundTag8(t *uint32, keyStream *uint32, p *byte) TEXT ·eia256RoundTag8(SB),NOSPLIT,$0 diff --git a/zuc/eia_asm_arm64.s b/zuc/eia_asm_arm64.s index ea8826c2..e421f8d1 100644 --- a/zuc/eia_asm_arm64.s +++ b/zuc/eia_asm_arm64.s @@ -2,25 +2,17 @@ #include "textflag.h" -DATA bit_reverse_table_l<>+0x00(SB)/8, $0x0e060a020c040800 -DATA bit_reverse_table_l<>+0x08(SB)/8, $0x0f070b030d050901 -GLOBL bit_reverse_table_l<>(SB), RODATA, $16 - -DATA bit_reverse_table_h<>+0x00(SB)/8, $0xe060a020c0408000 -DATA bit_reverse_table_h<>+0x08(SB)/8, $0xf070b030d0509010 -GLOBL bit_reverse_table_h<>(SB), RODATA, $16 - -DATA bit_reverse_and_table<>+0x00(SB)/8, $0x0f0f0f0f0f0f0f0f -DATA 
bit_reverse_and_table<>+0x08(SB)/8, $0x0f0f0f0f0f0f0f0f -GLOBL bit_reverse_and_table<>(SB), RODATA, $16 - -DATA shuf_mask_dw0_0_dw1_0<>+0x00(SB)/8, $0xffffffff03020100 -DATA shuf_mask_dw0_0_dw1_0<>+0x08(SB)/8, $0xffffffff07060504 -GLOBL shuf_mask_dw0_0_dw1_0<>(SB), RODATA, $16 - -DATA shuf_mask_dw2_0_dw3_0<>+0x00(SB)/8, $0xffffffff0b0a0908 -DATA shuf_mask_dw2_0_dw3_0<>+0x08(SB)/8, $0xffffffff0f0e0d0c -GLOBL shuf_mask_dw2_0_dw3_0<>(SB), RODATA, $16 +DATA bit_reverse_table<>+0x00(SB)/8, $0x0e060a020c040800 +DATA bit_reverse_table<>+0x08(SB)/8, $0x0f070b030d050901 +DATA bit_reverse_table<>+0x10(SB)/8, $0xe060a020c0408000 +DATA bit_reverse_table<>+0x18(SB)/8, $0xf070b030d0509010 +GLOBL bit_reverse_table<>(SB), RODATA, $32 + +DATA shuf_mask_dw<>+0x00(SB)/8, $0xffffffff03020100 +DATA shuf_mask_dw<>+0x08(SB)/8, $0xffffffff07060504 +DATA shuf_mask_dw<>+0x10(SB)/8, $0xffffffff0b0a0908 +DATA shuf_mask_dw<>+0x18(SB)/8, $0xffffffff0f0e0d0c +GLOBL shuf_mask_dw<>(SB), RODATA, $32 #define AX R2 #define BX R3 @@ -46,16 +38,12 @@ GLOBL shuf_mask_dw2_0_dw3_0<>(SB), RODATA, $16 #define SHUF_MASK_DW2_DW3 V24 #define LOAD_GLOBAL_DATA() \ - MOVD $bit_reverse_table_l<>(SB), R0 \ - VLD1 (R0), [BIT_REV_TAB_L.B16] \ - MOVD $bit_reverse_table_h<>(SB), R0 \ - VLD1 (R0), [BIT_REV_TAB_H.B16] \ - MOVD $bit_reverse_and_table<>(SB), R0 \ - VLD1 (R0), [BIT_REV_AND_TAB.B16] \ - MOVD $shuf_mask_dw0_0_dw1_0<>(SB), R0 \ - VLD1 (R0), [SHUF_MASK_DW0_DW1.B16] \ - MOVD $shuf_mask_dw2_0_dw3_0<>(SB), R0 \ - VLD1 (R0), [SHUF_MASK_DW2_DW3.B16] \ + MOVD $bit_reverse_table<>(SB), R0 \ + VLD1 (R0), [BIT_REV_TAB_L.B16, BIT_REV_TAB_H.B16] \ + MOVW $0x0F0F0F0F, R0 \ + VDUP R0, BIT_REV_AND_TAB.S4 \ + MOVD $shuf_mask_dw<>(SB), R0 \ + VLD1 (R0), [SHUF_MASK_DW0_DW1.B16, SHUF_MASK_DW2_DW3.B16] // func eia256RoundTag8(t *uint32, keyStream *uint32, p *byte) TEXT ·eia3Round16B(SB),NOSPLIT,$0