From 8ccc92de71ed8ec70dad914ffc96a045aa2d27ca Mon Sep 17 00:00:00 2001 From: Hanting Zhang Date: Mon, 11 Dec 2023 20:56:53 +0000 Subject: [PATCH] initial commit, not passing tests --- Cargo.toml | 47 ++++++++++++++ benches/msm.rs | 74 ++++++++++++++++++++++ build.rs | 102 ++++++++++++++++++++++++++++++ cuda/bn254.cu | 24 +++++++ cuda/grumpkin.cu | 24 +++++++ examples/msm.rs | 158 ++++++++++++++++++++++++++++++++++++++++++++++ rustfmt.toml | 1 + src/lib.rs | 106 +++++++++++++++++++++++++++++++ src/pippenger.cpp | 26 ++++++++ src/tests.rs | 150 +++++++++++++++++++++++++++++++++++++++++++ 10 files changed, 712 insertions(+) create mode 100644 Cargo.toml create mode 100644 benches/msm.rs create mode 100644 build.rs create mode 100644 cuda/bn254.cu create mode 100644 cuda/grumpkin.cu create mode 100644 examples/msm.rs create mode 100644 rustfmt.toml create mode 100644 src/lib.rs create mode 100644 src/pippenger.cpp create mode 100644 src/tests.rs diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..5d3d11c --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,47 @@ +[package] +name = "grumpkin-msm" +version = "0.1.0" +edition = "2021" +license = "Apache-2.0" +description = "Optimized multiscalar multiplication for the Grumpkin curve cycle" +repository = "https://github.com/lurk-lab/grumpkin-msm" +readme = "README.md" +include = [ + "/benches/**", + "/cuda/**", + "/src/**", + "/Cargo.toml", + "/build.rs", + "/README.md", +] + +[features] +# By default, compile with ADX extension if the host supports it. +# Binary can be executed on systems similar to the host. +default = [] +# Compile in portable mode, without ISA extensions. +# Binary can be executed on all systems. +portable = [ "blst/portable" ] +# Enable ADX even if the host CPU doesn't support it. +# Binary can be executed on Broadwell+ and Ryzen+ systems. 
+force-adx = [ "blst/force-adx" ] +cuda-mobile = [] + +[dependencies] +blst = "~0.3.11" +sppark = "~0.1.2" +halo2curves = { version = "0.4.0" } + +[build-dependencies] +cc = "^1.0.70" +which = "^4.0" + +[dev-dependencies] +criterion = { version = "0.3", features = [ "html_reports" ] } +rand = "^0" +rand_chacha = "^0" +rayon = "1.5" + +[[bench]] +name = "msm" +harness = false diff --git a/benches/msm.rs b/benches/msm.rs new file mode 100644 index 0000000..d93a817 --- /dev/null +++ b/benches/msm.rs @@ -0,0 +1,74 @@ +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#![allow(dead_code)] +#![allow(unused_imports)] +#![allow(unused_mut)] + +use criterion::{criterion_group, criterion_main, Criterion}; + +use grumpkin_msm; + +#[cfg(feature = "cuda")] +extern "C" { + fn cuda_available() -> bool; +} + +include!("../src/tests.rs"); + +fn criterion_benchmark(c: &mut Criterion) { + let bench_npow: usize = std::env::var("BENCH_NPOW") + .unwrap_or("17".to_string()) + .parse() + .unwrap(); + let npoints: usize = 1 << bench_npow; + + //println!("generating {} random points, just hang on...", npoints); + let mut points = crate::tests::gen_points(npoints); + let mut scalars = crate::tests::gen_scalars(npoints); + + #[cfg(feature = "cuda")] + { + unsafe { grumpkin_msm::CUDA_OFF = true }; + } + + let mut group = c.benchmark_group("CPU"); + group.sample_size(10); + + group.bench_function(format!("2**{} points", bench_npow), |b| { + b.iter(|| { + let _ = grumpkin_msm::bn256(&points, &scalars); + }) + }); + + group.finish(); + + #[cfg(feature = "cuda")] + if unsafe { cuda_available() } { + unsafe { grumpkin_msm::CUDA_OFF = false }; + + const EXTRA: usize = 5; + let bench_npow = bench_npow + EXTRA; + let npoints: usize = 1 << bench_npow; + + while points.len() < npoints { + points.append(&mut points.clone()); + } + scalars.append(&mut crate::tests::gen_scalars(npoints - scalars.len())); + 
+ let mut group = c.benchmark_group("GPU"); + group.sample_size(20); + + group.bench_function(format!("2**{} points", bench_npow), |b| { + b.iter(|| { + let _ = grumpkin_msm::bn256(&points, &scalars); + }) + }); + + group.finish(); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/build.rs b/build.rs new file mode 100644 index 0000000..c38148a --- /dev/null +++ b/build.rs @@ -0,0 +1,102 @@ +use std::env; +use std::path::PathBuf; + +fn main() { + // account for cross-compilation [by examining environment variable] + let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap(); + + // Set CXX environment variable to choose alternative C compiler. + // Optimization level depends on whether or not --release is passed + // or implied. + let mut cc = cc::Build::new(); + cc.cpp(true); + + let c_src_dir = PathBuf::from("src"); + let files = vec![c_src_dir.join("pippenger.cpp")]; + let mut cc_def = None; + + match (cfg!(feature = "portable"), cfg!(feature = "force-adx")) { + (true, false) => { + println!("Compiling in portable mode without ISA extensions"); + cc_def = Some("__PASTA_PORTABLE__"); + } + (false, true) => { + if target_arch.eq("x86_64") { + println!("Enabling ADX support via `force-adx` feature"); + cc_def = Some("__ADX__"); + } else { + println!("`force-adx` is ignored for non-x86_64 targets"); + } + } + (false, false) => { + #[cfg(target_arch = "x86_64")] + if target_arch.eq("x86_64") && std::is_x86_feature_detected!("adx") + { + println!("Enabling ADX because it was detected on the host"); + cc_def = Some("__ADX__"); + } + } + (true, true) => panic!( + "Cannot compile with both `portable` and `force-adx` features" + ), + } + + cc.flag_if_supported("-mno-avx") // avoid costly transitions + .flag_if_supported("-fno-builtin") + .flag_if_supported("-std=c++11") + .flag_if_supported("-Wno-unused-command-line-argument"); + if !cfg!(debug_assertions) { + cc.define("NDEBUG", None); + } + if let Some(def) = cc_def { + 
cc.define(def, None); + } + if let Some(include) = env::var_os("DEP_BLST_C_SRC") { + cc.include(include); + } + if let Some(include) = env::var_os("DEP_SPPARK_ROOT") { + cc.include(include); + } + cc.files(&files).compile("pasta_msm"); + + if cfg!(target_os = "windows") && !cfg!(target_env = "msvc") { + return; + } + // Detect if there is CUDA compiler and engage "cuda" feature accordingly + let nvcc = match env::var("NVCC") { + Ok(var) => which::which(var), + Err(_) => which::which("nvcc"), + }; + if nvcc.is_ok() { + let mut nvcc = cc::Build::new(); + nvcc.cuda(true); + nvcc.flag("-arch=sm_80"); + nvcc.flag("-gencode").flag("arch=compute_70,code=sm_70"); + nvcc.flag("-t0"); + #[cfg(not(target_env = "msvc"))] + nvcc.flag("-Xcompiler").flag("-Wno-unused-function"); + nvcc.define("TAKE_RESPONSIBILITY_FOR_ERROR_MESSAGE", None); + #[cfg(feature = "cuda-mobile")] + nvcc.define("NTHREADS", "128"); + if let Some(def) = cc_def { + nvcc.define(def, None); + } + if let Some(include) = env::var_os("DEP_BLST_C_SRC") { + nvcc.include(include); + } + if let Some(include) = env::var_os("DEP_SPPARK_ROOT") { + nvcc.include(include); + } + nvcc.clone() + .file("cuda/bn254.cu") + .compile("pallas_msm_cuda"); + nvcc.define("__MSM_SORT_DONT_IMPLEMENT__", None) + .file("cuda/grumpkin.cu") + .compile("vesta_msm_cuda"); + + println!("cargo:rerun-if-changed=cuda"); + println!("cargo:rerun-if-env-changed=CXXFLAGS"); + println!("cargo:rustc-cfg=feature=\"cuda\""); + } + println!("cargo:rerun-if-env-changed=NVCC"); +} diff --git a/cuda/bn254.cu b/cuda/bn254.cu new file mode 100644 index 0000000..3c0c1c9 --- /dev/null +++ b/cuda/bn254.cu @@ -0,0 +1,24 @@ +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +#include + +#include +#include + +#include + +typedef jacobian_t point_t; +typedef xyzz_t bucket_t; +typedef bucket_t::affine_t affine_t; +typedef fr_t scalar_t; + +#include + +#ifndef __CUDA_ARCH__ +extern "C" +RustError cuda_pippenger_bn254(point_t *out, const affine_t points[], size_t npoints, + const scalar_t scalars[]) +{ return mult_pippenger(out, points, npoints, scalars); } +#endif diff --git a/cuda/grumpkin.cu b/cuda/grumpkin.cu new file mode 100644 index 0000000..861610d --- /dev/null +++ b/cuda/grumpkin.cu @@ -0,0 +1,24 @@ +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include +#include + +#include + +typedef jacobian_t point_t; +typedef xyzz_t bucket_t; +typedef bucket_t::affine_t affine_t; +typedef fp_t scalar_t; + +#include + +#ifndef __CUDA_ARCH__ +extern "C" +RustError cuda_pippenger_grumpkin(point_t *out, const affine_t points[], size_t npoints, + const scalar_t scalars[]) +{ return mult_pippenger(out, points, npoints, scalars); } +#endif diff --git a/examples/msm.rs b/examples/msm.rs new file mode 100644 index 0000000..65a70e1 --- /dev/null +++ b/examples/msm.rs @@ -0,0 +1,158 @@ +use core::cell::UnsafeCell; +use core::mem::transmute; +use core::sync::atomic::*; +use rand::{RngCore, SeedableRng}; +use rand_chacha::ChaCha20Rng; +use halo2curves::bn256; +use halo2curves::ff::Field; +use halo2curves::CurveExt; +use halo2curves::group::{Curve, Group}; + +#[cfg(feature = "cuda")] +extern "C" { + fn cuda_available() -> bool; +} + +pub fn gen_points(npoints: usize) -> Vec { + let mut ret: Vec = Vec::with_capacity(npoints); + unsafe { ret.set_len(npoints) }; + + let mut rnd: Vec = Vec::with_capacity(32 * npoints); + unsafe { rnd.set_len(32 * npoints) }; + ChaCha20Rng::from_entropy().fill_bytes(&mut rnd); + + let n_workers = rayon::current_num_threads(); + let work = AtomicUsize::new(0); + 
rayon::scope(|s| { + for _ in 0..n_workers { + s.spawn(|_| { + let hash = bn256::G1::hash_to_curve("foobar"); + + let mut stride = 1024; + let mut tmp: Vec = Vec::with_capacity(stride); + unsafe { tmp.set_len(stride) }; + + loop { + let work = work.fetch_add(stride, Ordering::Relaxed); + if work >= npoints { + break; + } + if work + stride > npoints { + stride = npoints - work; + unsafe { tmp.set_len(stride) }; + } + for i in 0..stride { + let off = (work + i) * 32; + tmp[i] = hash(&rnd[off..off + 32]); + } + #[allow(mutable_transmutes)] + bn256::G1::batch_normalize(&tmp, unsafe { + transmute::<&[bn256::G1Affine], &mut [bn256::G1Affine]>( + &ret[work..work + stride], + ) + }); + } + }) + } + }); + + ret +} + +fn as_mut(x: &T) -> &mut T { + unsafe { &mut *UnsafeCell::raw_get(x as *const _ as *const _) } +} + +pub fn gen_scalars(npoints: usize) -> Vec { + let mut ret: Vec = Vec::with_capacity(npoints); + unsafe { ret.set_len(npoints) }; + + let n_workers = rayon::current_num_threads(); + let work = AtomicUsize::new(0); + + rayon::scope(|s| { + for _ in 0..n_workers { + s.spawn(|_| { + let mut rng = ChaCha20Rng::from_entropy(); + loop { + let work = work.fetch_add(1, Ordering::Relaxed); + if work >= npoints { + break; + } + *as_mut(&ret[work]) = bn256::Fr::random(&mut rng); + } + }) + } + }); + + ret +} + +pub fn naive_multiscalar_mul( + points: &[bn256::G1Affine], + scalars: &[bn256::Fr], +) -> bn256::G1Affine { + let n_workers = rayon::current_num_threads(); + + let mut rets: Vec = Vec::with_capacity(n_workers); + unsafe { rets.set_len(n_workers) }; + + let npoints = points.len(); + let work = AtomicUsize::new(0); + let tid = AtomicUsize::new(0); + rayon::scope(|s| { + for _ in 0..n_workers { + s.spawn(|_| { + let mut ret = bn256::G1::default(); + + loop { + let work = work.fetch_add(1, Ordering::Relaxed); + if work >= npoints { + break; + } + ret += points[work] * scalars[work]; + } + + *as_mut(&rets[tid.fetch_add(1, Ordering::Relaxed)]) = ret; + }) + } + }); + + 
let mut ret = bn256::G1::default(); + for i in 0..n_workers { + ret += rets[i]; + } + + ret.to_affine() +} + +fn main() { + let bench_npow: usize = std::env::var("BENCH_NPOW") + .unwrap_or("0".to_string()) + .parse() + .unwrap(); + let npoints: usize = 1 << bench_npow; + + let two = bn256::G1::generator(); + // println!("generating {} random points, just hang on...", npoints); + // let points = gen_points(npoints); + let mut points: Vec = Vec::with_capacity(npoints); + unsafe { points.set_len(npoints) }; + bn256::G1::batch_normalize(&vec![two; npoints], &mut points); + let scalars = vec![bn256::Fr::from(2); npoints]; + + println!("points: {:?}", points); + println!("scalars: {:?}", scalars); + + #[cfg(feature = "cuda")] + { + unsafe { grumpkin_msm::CUDA_OFF = true }; + } + + let res = grumpkin_msm::bn256(&points, &scalars).to_affine(); + let native = naive_multiscalar_mul(&points, &scalars); + let hi = (points[0] * scalars[0]).to_affine(); + println!("hi: {:?}", hi); + assert_eq!(res, native); + println!("success!") +} diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 0000000..df99c69 --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1 @@ +max_width = 80 diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..bec4922 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,106 @@ +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 +#![allow(improper_ctypes)] + +extern crate blst; + +#[cfg(feature = "cuda")] +sppark::cuda_error!(); +#[cfg(feature = "cuda")] +extern "C" { + fn cuda_available() -> bool; +} +#[cfg(feature = "cuda")] +pub static mut CUDA_OFF: bool = false; + +use halo2curves::bn256; + +extern "C" { + fn mult_pippenger_bn254( + out: *mut bn256::G1, + points: *const bn256::G1Affine, + npoints: usize, + scalars: *const bn256::Fr, + ); + +} + +pub fn bn256(points: &[bn256::G1Affine], scalars: &[bn256::Fr]) -> bn256::G1 { + let npoints = points.len(); + if npoints != scalars.len() { + panic!("length mismatch") + } + #[cfg(feature = "cuda")] + if npoints >= 1 << 16 && unsafe { !CUDA_OFF && cuda_available() } { + extern "C" { + fn cuda_pippenger_bn254( + out: *mut bn256::G1, + points: *const bn256::G1Affine, + npoints: usize, + scalars: *const bn256::Fr, + ) -> cuda::Error; + + } + let mut ret = bn256::G1::default(); + let err = unsafe { + cuda_pippenger_bn254(&mut ret, &points[0], npoints, &scalars[0]) + }; + if err.code != 0 { + panic!("{}", String::from(err)); + } + return ret; + } + let mut ret = bn256::G1::default(); + unsafe { mult_pippenger_bn254(&mut ret, &points[0], npoints, &scalars[0]) }; + ret +} + +use halo2curves::grumpkin; + +extern "C" { + fn mult_pippenger_grumpkin( + out: *mut grumpkin::G1, + points: *const grumpkin::G1Affine, + npoints: usize, + scalars: *const grumpkin::Fr, + ); + +} + +pub fn grumpkin( + points: &[grumpkin::G1Affine], + scalars: &[grumpkin::Fr], +) -> grumpkin::G1 { + let npoints = points.len(); + if npoints != scalars.len() { + panic!("length mismatch") + } + #[cfg(feature = "cuda")] + if npoints >= 1 << 16 && unsafe { !CUDA_OFF && cuda_available() } { + extern "C" { + fn cuda_pippenger_grumpkin( + out: *mut grumpkin::G1, + points: *const grumpkin::G1Affine, + npoints: usize, + scalars: *const grumpkin::Fr, + ) -> cuda::Error; + + } + let mut ret = grumpkin::G1::default(); + let err = unsafe { + 
cuda_pippenger_grumpkin(&mut ret, &points[0], npoints, &scalars[0]) + }; + if err.code != 0 { + panic!("{}", String::from(err)); + } + return ret; + } + let mut ret = grumpkin::G1::default(); + unsafe { + mult_pippenger_grumpkin(&mut ret, &points[0], npoints, &scalars[0]) + }; + ret +} + +include!("tests.rs"); diff --git a/src/pippenger.cpp b/src/pippenger.cpp new file mode 100644 index 0000000..f6f7a11 --- /dev/null +++ b/src/pippenger.cpp @@ -0,0 +1,26 @@ +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include + +static thread_pool_t da_pool; + +extern "C" +void mult_pippenger_bn254(jacobian_t& ret, + const xyzz_t::affine_t points[], + size_t npoints, const fr_t scalars[]) +{ mult_pippenger>(ret, points, npoints, scalars, true, + &da_pool); +} + +extern "C" +void mult_pippenger_grumpkin(jacobian_t& ret, + const xyzz_t::affine_t points[], + size_t npoints, const fp_t scalars[]) +{ mult_pippenger>(ret, points, npoints, scalars, true, + &da_pool); +} diff --git a/src/tests.rs b/src/tests.rs new file mode 100644 index 0000000..3a1f0c2 --- /dev/null +++ b/src/tests.rs @@ -0,0 +1,150 @@ +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +#[cfg(test)] +mod tests { + use crate as grumpkin_msm; + use core::cell::UnsafeCell; + use core::mem::transmute; + use core::sync::atomic::*; + use halo2curves::{ + CurveExt, + group::{ff::Field, Curve}, + bn256, + }; + use rand::{RngCore, SeedableRng}; + use rand_chacha::ChaCha20Rng; + + pub fn gen_points(npoints: usize) -> Vec { + let mut ret: Vec = Vec::with_capacity(npoints); + unsafe { ret.set_len(npoints) }; + + let mut rnd: Vec = Vec::with_capacity(32 * npoints); + unsafe { rnd.set_len(32 * npoints) }; + ChaCha20Rng::from_entropy().fill_bytes(&mut rnd); + + let n_workers = rayon::current_num_threads(); + let work = AtomicUsize::new(0); + rayon::scope(|s| { + for _ in 0..n_workers { + s.spawn(|_| { + let hash = bn256::G1::hash_to_curve("foobar"); + + let mut stride = 1024; + let mut tmp: Vec = Vec::with_capacity(stride); + unsafe { tmp.set_len(stride) }; + + loop { + let work = work.fetch_add(stride, Ordering::Relaxed); + if work >= npoints { + break; + } + if work + stride > npoints { + stride = npoints - work; + unsafe { tmp.set_len(stride) }; + } + for i in 0..stride { + let off = (work + i) * 32; + tmp[i] = hash(&rnd[off..off + 32]); + } + #[allow(mutable_transmutes)] + bn256::G1::batch_normalize(&tmp, unsafe { + transmute::<&[bn256::G1Affine], &mut [bn256::G1Affine]>( + &ret[work..work + stride], + ) + }); + } + }) + } + }); + + ret + } + + fn as_mut(x: &T) -> &mut T { + unsafe { &mut *UnsafeCell::raw_get(x as *const _ as *const _) } + } + + pub fn gen_scalars(npoints: usize) -> Vec { + let mut ret: Vec = Vec::with_capacity(npoints); + unsafe { ret.set_len(npoints) }; + + let n_workers = rayon::current_num_threads(); + let work = AtomicUsize::new(0); + + rayon::scope(|s| { + for _ in 0..n_workers { + s.spawn(|_| { + let mut rng = ChaCha20Rng::from_entropy(); + loop { + let work = work.fetch_add(1, Ordering::Relaxed); + if work >= npoints { + break; + } + *as_mut(&ret[work]) = bn256::Fr::random(&mut rng); + } + 
}) + } + }); + + ret + } + + pub fn naive_multiscalar_mul( + points: &[bn256::G1Affine], + scalars: &[bn256::Fr], + ) -> bn256::G1Affine { + let n_workers = rayon::current_num_threads(); + + let mut rets: Vec = Vec::with_capacity(n_workers); + unsafe { rets.set_len(n_workers) }; + + let npoints = points.len(); + let work = AtomicUsize::new(0); + let tid = AtomicUsize::new(0); + rayon::scope(|s| { + for _ in 0..n_workers { + s.spawn(|_| { + let mut ret = bn256::G1::default(); + + loop { + let work = work.fetch_add(1, Ordering::Relaxed); + if work >= npoints { + break; + } + ret += points[work] * scalars[work]; + } + + *as_mut(&rets[tid.fetch_add(1, Ordering::Relaxed)]) = ret; + }) + } + }); + + let mut ret = bn256::G1::default(); + for i in 0..n_workers { + ret += rets[i]; + } + + ret.to_affine() + } + + #[test] + fn it_works() { + #[cfg(not(debug_assertions))] + const NPOINTS: usize = 128 * 1024; + #[cfg(debug_assertions)] + const NPOINTS: usize = 8 * 1024; + + let points = gen_points(NPOINTS); + let scalars = gen_scalars(NPOINTS); + + let naive = naive_multiscalar_mul(&points, &scalars); + println!("{:?}", naive); + + let ret = grumpkin_msm::bn256(&points, &scalars).to_affine(); + println!("{:?}", ret); + + assert_eq!(ret, naive); + } +}