From 999312e28172acf43fd78ac37e5ee74194829f72 Mon Sep 17 00:00:00 2001
From: Jack O'Connor
Date: Thu, 18 Jan 2024 14:35:40 -0800
Subject: [PATCH] s/riscv64gcv/riscv_rva23u64/ everywhere

---
 rust/guts/build.rs                            |  8 +-
 rust/guts/src/lib.rs                          |  6 +-
 rust/guts/src/riscv_rva23u64.S                | 96 +++++++++----------
 .../src/{riscv64gcv.rs => riscv_rva23u64.rs}  | 30 +++---
 4 files changed, 70 insertions(+), 70 deletions(-)
 rename rust/guts/src/{riscv64gcv.rs => riscv_rva23u64.rs} (79%)

diff --git a/rust/guts/build.rs b/rust/guts/build.rs
index f68d10330..2708632ab 100644
--- a/rust/guts/build.rs
+++ b/rust/guts/build.rs
@@ -229,15 +229,15 @@ fn build_neon_c_intrinsics() {
     build.compile("blake3_neon");
 }
 
-fn build_riscv64gcv_assembly() {
-    println!("cargo:rustc-cfg=blake3_riscv64gcv_ffi");
+fn build_riscv_rva23u64_assembly() {
+    println!("cargo:rustc-cfg=blake3_riscv_rva23u64_ffi");
     let mut build = new_build();
     let asm_path = "src/riscv_rva23u64.S";
     build.file(asm_path);
     build.flag("--target=riscv64");
     build.flag("-march=rv64gcv_zbb_zvbb1p0");
     build.flag("-menable-experimental-extensions");
-    build.compile("blake3_riscv64gcv_assembly");
+    build.compile("blake3_riscv_rva23u64_assembly");
     println!("cargo:rerun-if-changed={asm_path}");
 }
 
@@ -277,7 +277,7 @@ fn main() {
     // TODO: This implementation assumes some bleeding-edge extensions, and it should probably be
     // gated by a Cargo feature.
     if is_riscv64gc() && !is_pure() {
-        build_riscv64gcv_assembly();
+        build_riscv_rva23u64_assembly();
     }
 
     // The `cc` crate doesn't automatically emit rerun-if directives for the
diff --git a/rust/guts/src/lib.rs b/rust/guts/src/lib.rs
index 4c4c9d5bd..cb85091bf 100644
--- a/rust/guts/src/lib.rs
+++ b/rust/guts/src/lib.rs
@@ -8,7 +8,7 @@ use core::sync::atomic::{AtomicPtr, Ordering::Relaxed};
 pub mod avx512;
 pub mod portable;
 #[cfg(any(target_arch = "riscv64"))]
-pub mod riscv64gcv;
+pub mod riscv_rva23u64;
 
 #[cfg(test)]
 mod test;
@@ -46,7 +46,7 @@ cfg_if::cfg_if! {
     if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
         pub const MAX_SIMD_DEGREE: usize = avx512::DEGREE;
     } else if #[cfg(target_arch = "riscv64")] {
-        pub const MAX_SIMD_DEGREE: usize = riscv64gcv::MAX_SIMD_DEGREE;
+        pub const MAX_SIMD_DEGREE: usize = riscv_rva23u64::MAX_SIMD_DEGREE;
     } else if #[cfg(blake3_neon)] {
         pub const MAX_SIMD_DEGREE: usize = 4;
     } else {
@@ -79,7 +79,7 @@ fn detect() -> Implementation {
     }
     #[cfg(target_arch = "riscv64")]
     {
-        return riscv64gcv::implementation();
+        return riscv_rva23u64::implementation();
     }
     #[allow(unreachable_code)]
     portable::implementation()
diff --git a/rust/guts/src/riscv_rva23u64.S b/rust/guts/src/riscv_rva23u64.S
index 7758ff4b6..d672a30d7 100644
--- a/rust/guts/src/riscv_rva23u64.S
+++ b/rust/guts/src/riscv_rva23u64.S
@@ -61,8 +61,8 @@ MSG_PERMUTE:
 // a3: counter
 // a4: flags
 // a5: out_ptr
-.global blake3_guts_riscv64gcv_compress
-blake3_guts_riscv64gcv_compress:
+.global blake3_guts_riscv_rva23u64_compress
+blake3_guts_riscv_rva23u64_compress:
         // Load the message load and message permutation indexes.
         vsetivli zero, 16, e16, m2, ta, ma
         la t0, MSG_LOAD
         vle16.v v8, (t0)
         la t0, MSG_PERMUTE
         vle16.v v10, (t0)
@@ -110,7 +110,7 @@ blake3_guts_riscv64gcv_compress:
         vslide1down.vx v3, v3, a1
         vslide1down.vx v3, v3, a4
         li t0, 7 // round counter
-blake3_guts_riscv64gcv_compress_round_loop:
+blake3_guts_riscv_rva23u64_compress_round_loop:
         vadd.vv v0, v0, v4
         vadd.vv v0, v0, v1
         vxor.vv v3, v3, v0
@@ -148,7 +148,7 @@ blake3_guts_riscv64gcv_compress_round_loop:
         vrgather.vv v3, v23, v13
         vrgather.vv v2, v22, v12
         addi t0, t0, -1
-        beqz t0, blake3_guts_riscv64gcv_compress_end
+        beqz t0, blake3_guts_riscv_rva23u64_compress_end
         // Shuffle message words.
         // TODO: Find a way to do this without so much movement?
         vmv.v.v v16, v4
@@ -169,8 +169,8 @@ blake3_guts_riscv64gcv_compress_round_loop:
         vmv.v.v v5, v16
         vmv.v.v v6, v20
         vmv.v.v v7, v24
-        j blake3_guts_riscv64gcv_compress_round_loop
-blake3_guts_riscv64gcv_compress_end:
+        j blake3_guts_riscv_rva23u64_compress_round_loop
+blake3_guts_riscv_rva23u64_compress_end:
         vxor.vv v0, v0, v2
         vxor.vv v1, v1, v3
         vsetivli zero, 16, e8, m1, ta, ma
@@ -180,8 +180,8 @@ blake3_guts_riscv64gcv_compress_end:
         ret
 
-.global blake3_guts_riscv64gcv_degree
-blake3_guts_riscv64gcv_degree:
+.global blake3_guts_riscv_rva23u64_degree
+blake3_guts_riscv_rva23u64_degree:
         csrr t0, vlenb
         srli t0, t0, 2
         li t1, MAX_SIMD_DEGREE
@@ -189,7 +189,7 @@
         ret
 
 // clobbers: t0
-blake3_guts_riscv64gcv_kernel:
+blake3_guts_riscv_rva23u64_kernel:
         li t0, IV0
         vmv.v.x v8, t0
         li t0, IV1
@@ -993,7 +993,7 @@ blake3_guts_riscv64gcv_kernel:
 // a5: aligned+transposed output [unused]
 // a6: total chunks [unused]
 // a7: remaining_bytes_in_last_chunk
-blake3_guts_riscv64gcv_hash_blocks:
+blake3_guts_riscv_rva23u64_hash_blocks:
         // t0 := full_blocks := (input_len + 1024 - 64) / 1024
         addi t0, a1, 1024 - 64
         srli t0, t0, 10
@@ -1056,7 +1056,7 @@ partial_block_finished:
         vslide1down.vx v15, v15, t1
         // execute the kernel
         mv t6, ra
-        call blake3_guts_riscv64gcv_kernel
+        call blake3_guts_riscv_rva23u64_kernel
         mv ra, t6
         // xor the two halves of the state
         vxor.vv v0, v0, v8
@@ -1148,15 +1148,15 @@ vlenb_less_than_32:
 // a3: counter
 // a4: flags
 // a5: aligned+transposed output
-.global blake3_guts_riscv64gcv_hash_chunks
-blake3_guts_riscv64gcv_hash_chunks:
+.global blake3_guts_riscv_rva23u64_hash_chunks
+blake3_guts_riscv_rva23u64_hash_chunks:
         // Save the original num_chunks = (input_len+1023)/1024 in a6.
         addi a6, a1, 1023
         srli a6, a6, 10
         // Track the bytes remaining in the last chunk in a7. The initial value
         // of this is ((input_len - 1) % 1024) + 1. (The input to this function
         // is never empty.) It decrements by 64 with each call to
-        // blake3_guts_riscv64gcv_hash_blocks, but not below 0.
+        // blake3_guts_riscv_rva23u64_hash_blocks, but not below 0.
         addi a7, a1, -1
         andi a7, a7, 1023
         addi a7, a7, 1
@@ -1183,24 +1183,24 @@ blake3_guts_riscv64gcv_hash_chunks:
         // the input length.
         mv t5, ra
         ori a4, a4, 1 // set CHUNK_START
-        call blake3_guts_riscv64gcv_hash_blocks
+        call blake3_guts_riscv_rva23u64_hash_blocks
         andi a4, a4, -2 // unset CHUNK_START
-        call blake3_guts_riscv64gcv_hash_blocks
-        call blake3_guts_riscv64gcv_hash_blocks
-        call blake3_guts_riscv64gcv_hash_blocks
-        call blake3_guts_riscv64gcv_hash_blocks
-        call blake3_guts_riscv64gcv_hash_blocks
-        call blake3_guts_riscv64gcv_hash_blocks
-        call blake3_guts_riscv64gcv_hash_blocks
-        call blake3_guts_riscv64gcv_hash_blocks
-        call blake3_guts_riscv64gcv_hash_blocks
-        call blake3_guts_riscv64gcv_hash_blocks
-        call blake3_guts_riscv64gcv_hash_blocks
-        call blake3_guts_riscv64gcv_hash_blocks
-        call blake3_guts_riscv64gcv_hash_blocks
-        call blake3_guts_riscv64gcv_hash_blocks
+        call blake3_guts_riscv_rva23u64_hash_blocks
+        call blake3_guts_riscv_rva23u64_hash_blocks
+        call blake3_guts_riscv_rva23u64_hash_blocks
+        call blake3_guts_riscv_rva23u64_hash_blocks
+        call blake3_guts_riscv_rva23u64_hash_blocks
+        call blake3_guts_riscv_rva23u64_hash_blocks
+        call blake3_guts_riscv_rva23u64_hash_blocks
+        call blake3_guts_riscv_rva23u64_hash_blocks
+        call blake3_guts_riscv_rva23u64_hash_blocks
+        call blake3_guts_riscv_rva23u64_hash_blocks
+        call blake3_guts_riscv_rva23u64_hash_blocks
+        call blake3_guts_riscv_rva23u64_hash_blocks
+        call blake3_guts_riscv_rva23u64_hash_blocks
+        call blake3_guts_riscv_rva23u64_hash_blocks
         ori a4, a4, 2 // set CHUNK_END
-        call blake3_guts_riscv64gcv_hash_blocks
+        call blake3_guts_riscv_rva23u64_hash_blocks
         mv ra, t5
         // If the final chunk is short, we need to set vl back to the total
         // number of chunks.
@@ -1228,8 +1228,8 @@
 // a2: key
 // a3: flags
 // a4: out pointer
-.global blake3_guts_riscv64gcv_hash_parents
-blake3_guts_riscv64gcv_hash_parents:
+.global blake3_guts_riscv_rva23u64_hash_parents
+blake3_guts_riscv_rva23u64_hash_parents:
         // load the transposed CVs and split alternating words into the low and
         // high halves of the input vectors
         vsetvli zero, a1, e32, m1, ta, ma
@@ -1285,7 +1285,7 @@ blake3_guts_riscv64gcv_hash_parents:
 
         // execute the kernel
         mv t6, ra
-        call blake3_guts_riscv64gcv_kernel
+        call blake3_guts_riscv_rva23u64_kernel
         mv ra, t6
 
         // xor the two halves of the state
@@ -1322,7 +1322,7 @@
 // a4: flags
 // a5: out_ptr
 // a6: out_len
-blake3_guts_riscv64gcv_xof_inner:
+blake3_guts_riscv_rva23u64_xof_inner:
         // t1 := total_blocks := (out_len + 63) / 64
         addi t1, a6, 63
         srli t1, t1, 6
@@ -1395,7 +1395,7 @@ blake3_guts_riscv64gcv_xof_inner:
 
         // execute the kernel
         mv t6, ra
-        call blake3_guts_riscv64gcv_kernel
+        call blake3_guts_riscv_rva23u64_kernel
         mv ra, t6
 
         // reload the CV, this time into v16-23
@@ -1441,10 +1441,10 @@ blake3_guts_riscv64gcv_xof_inner:
 // a4: flags
 // a5: out_ptr
 // a6: out_len
-.global blake3_guts_riscv64gcv_xof
-blake3_guts_riscv64gcv_xof:
+.global blake3_guts_riscv_rva23u64_xof
+blake3_guts_riscv_rva23u64_xof:
         mv t5, ra
-        call blake3_guts_riscv64gcv_xof_inner
+        call blake3_guts_riscv_rva23u64_xof_inner
         mv ra, t5
 
         // t1 is now total_blocks, and t2 is full_blocks. Set vl to t2 and the
@@ -1466,9 +1466,9 @@ blake3_guts_riscv64gcv_xof:
 
         // If full_blocks != partial_blocks, we need to handle the final
         // partial block. Otherwise, we're done.
-        bne t1, t2, blake3_guts_riscv64gcv_xof_partial_block
+        bne t1, t2, blake3_guts_riscv_rva23u64_xof_partial_block
         ret
-blake3_guts_riscv64gcv_xof_partial_block:
+blake3_guts_riscv_rva23u64_xof_partial_block:
         // Collect groups of 4 words in v0, v4, v8, and v12.
         vsetivli zero, 4, e32, m1, ta, ma
         vslidedown.vx v0, v0, t2
@@ -1520,10 +1520,10 @@ blake3_guts_riscv64gcv_xof_partial_block:
 // a4: flags
 // a5: out_ptr
 // a6: out_len
-.global blake3_guts_riscv64gcv_xof_xor
-blake3_guts_riscv64gcv_xof_xor:
+.global blake3_guts_riscv_rva23u64_xof_xor
+blake3_guts_riscv_rva23u64_xof_xor:
         mv t5, ra
-        call blake3_guts_riscv64gcv_xof_inner
+        call blake3_guts_riscv_rva23u64_xof_inner
         mv ra, t5
 
         // t1 is now total_blocks, and t2 is full_blocks. Set vl to t2 and the
@@ -1564,9 +1564,9 @@ blake3_guts_riscv64gcv_xof_xor:
 
         // If full_blocks != partial_blocks, we need to handle the final
         // partial block. Otherwise, we're done.
-        bne t1, t2, blake3_guts_riscv64gcv_xof_xor_partial_block
+        bne t1, t2, blake3_guts_riscv_rva23u64_xof_xor_partial_block
         ret
-blake3_guts_riscv64gcv_xof_xor_partial_block:
+blake3_guts_riscv_rva23u64_xof_xor_partial_block:
         // Collect groups of 4 words in v0, v4, v8, and v12.
         vsetivli zero, 4, e32, m1, ta, ma
         vslidedown.vx v0, v0, t2
@@ -1618,8 +1618,8 @@ blake3_guts_riscv64gcv_xof_xor_partial_block:
 // a2: key
 // a3: counter
 // a4: out_ptr
-.global blake3_guts_riscv64gcv_universal_hash
-blake3_guts_riscv64gcv_universal_hash:
+.global blake3_guts_riscv_rva23u64_universal_hash
+blake3_guts_riscv_rva23u64_universal_hash:
         // t0 := full_blocks := input_len / 64
         srli t0, a1, 6
         // Load and transpose full message blocks. These are "strided segment
@@ -1675,7 +1675,7 @@ universal_hash_partial_block_finished:
         vmv.v.x v15, t1
         // Execute the kernel.
         mv t6, ra
-        call blake3_guts_riscv64gcv_kernel
+        call blake3_guts_riscv_rva23u64_kernel
         mv ra, t6
         // Finish the first four state vectors. The rest are dropped.
         vxor.vv v0, v0, v8
diff --git a/rust/guts/src/riscv64gcv.rs b/rust/guts/src/riscv_rva23u64.rs
similarity index 79%
rename from rust/guts/src/riscv64gcv.rs
rename to rust/guts/src/riscv_rva23u64.rs
index b350660c0..7f2a7abb8 100644
--- a/rust/guts/src/riscv64gcv.rs
+++ b/rust/guts/src/riscv_rva23u64.rs
@@ -1,4 +1,4 @@
-//! This implementation currently assumes riscv64gcv_zbb_zvbb. Zvbb in particular ("Vector
+//! This implementation currently assumes riscv_rva23u64_zbb_zvbb. Zvbb in particular ("Vector
 //! Bit-manipulation used in Cryptography") is a bleeding-edge extension that was only frozen a few
 //! weeks ago at the time I'm writing this comment. Compiling and testing this code currently
 //! requires quite a lot of effort, including building Clang from master and building QEMU from a
@@ -10,8 +10,8 @@ use crate::{BlockBytes, CVBytes, Implementation};
 pub(crate) const MAX_SIMD_DEGREE: usize = 16;
 
 extern "C" {
-    fn blake3_guts_riscv64gcv_degree() -> usize;
-    fn blake3_guts_riscv64gcv_compress(
+    fn blake3_guts_riscv_rva23u64_degree() -> usize;
+    fn blake3_guts_riscv_rva23u64_compress(
         block: *const BlockBytes,
         block_len: u32,
         cv: *const CVBytes,
@@ -19,7 +19,7 @@ extern "C" {
         flags: u32,
         out: *mut CVBytes,
     );
-    fn blake3_guts_riscv64gcv_hash_chunks(
+    fn blake3_guts_riscv_rva23u64_hash_chunks(
         input: *const u8,
         input_len: usize,
         key: *const CVBytes,
@@ -27,14 +27,14 @@ extern "C" {
         flags: u32,
         transposed_output: *mut u32,
     );
-    fn blake3_guts_riscv64gcv_hash_parents(
+    fn blake3_guts_riscv_rva23u64_hash_parents(
         transposed_input: *const u32,
         num_parents: usize,
         key: *const CVBytes,
         flags: u32,
         transposed_output: *mut u32,
     );
-    fn blake3_guts_riscv64gcv_xof(
+    fn blake3_guts_riscv_rva23u64_xof(
         block: *const BlockBytes,
         block_len: u32,
         cv: *const CVBytes,
@@ -43,7 +43,7 @@ extern "C" {
         out: *mut u8,
         out_len: usize,
     );
-    fn blake3_guts_riscv64gcv_xof_xor(
+    fn blake3_guts_riscv_rva23u64_xof_xor(
         block: *const BlockBytes,
         block_len: u32,
         cv: *const CVBytes,
@@ -52,7 +52,7 @@ extern "C" {
         out: *mut u8,
         out_len: usize,
     );
-    fn blake3_guts_riscv64gcv_universal_hash(
+    fn blake3_guts_riscv_rva23u64_universal_hash(
         input: *const u8,
         input_len: usize,
         key: *const CVBytes,
@@ -63,13 +63,13 @@ extern "C" {
 
 pub fn implementation() -> Implementation {
     Implementation::new(
-        blake3_guts_riscv64gcv_degree,
-        blake3_guts_riscv64gcv_compress,
-        blake3_guts_riscv64gcv_hash_chunks,
-        blake3_guts_riscv64gcv_hash_parents,
-        blake3_guts_riscv64gcv_xof,
-        blake3_guts_riscv64gcv_xof_xor,
-        blake3_guts_riscv64gcv_universal_hash,
+        blake3_guts_riscv_rva23u64_degree,
+        blake3_guts_riscv_rva23u64_compress,
+        blake3_guts_riscv_rva23u64_hash_chunks,
+        blake3_guts_riscv_rva23u64_hash_parents,
+        blake3_guts_riscv_rva23u64_xof,
+        blake3_guts_riscv_rva23u64_xof_xor,
+        blake3_guts_riscv_rva23u64_universal_hash,
     )
 }
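
A note on how the renamed surface hangs together: implementation() hands the seven
extern symbols to Implementation::new, so the commit changes symbol names only, never a
signature or calling convention. Every exported symbol is spelled
blake3_guts_<module>_<operation>, which is why one s/riscv64gcv/riscv_rva23u64/ pass
covers the build script, the assembly labels, and the Rust bindings alike. As a minimal
sketch of a direct caller on a riscv64 build (not part of the patch: the CVBytes and
BlockBytes aliases mirror rust/guts/src/lib.rs, the counter parameter is elided by the
hunk above and assumed to be u64, and the safe wrapper is hypothetical):

    // Illustrative only. The extern signature is copied from the patch; the
    // type aliases mirror rust/guts/src/lib.rs.
    pub type CVBytes = [u8; 32];
    pub type BlockBytes = [u8; 64];

    extern "C" {
        // Renamed in this patch from blake3_guts_riscv64gcv_compress.
        fn blake3_guts_riscv_rva23u64_compress(
            block: *const BlockBytes,
            block_len: u32,
            cv: *const CVBytes,
            counter: u64, // assumed type; this parameter falls between the hunks above
            flags: u32,
            out: *mut CVBytes,
        );
    }

    /// Hypothetical safe wrapper, not an API in this crate: compress one block
    /// and return the new chaining value. This links only when riscv_rva23u64.S
    /// has been assembled into the build (see build_riscv_rva23u64_assembly).
    pub fn compress(block: &BlockBytes, block_len: u32, cv: &CVBytes, counter: u64, flags: u32) -> CVBytes {
        let mut out: CVBytes = [0; 32];
        // SAFETY: all pointers are valid for the duration of the call, and the
        // symbol is provided by the assembly object on riscv64 targets.
        unsafe { blake3_guts_riscv_rva23u64_compress(block, block_len, cv, counter, flags, &mut out) };
        out
    }

Because callers only ever reach these symbols through the function-pointer table built by
Implementation::new (selected at runtime by detect() in lib.rs), nothing outside rust/guts
needs to change when the module name does.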