From 8ccc92de71ed8ec70dad914ffc96a045aa2d27ca Mon Sep 17 00:00:00 2001
From: Hanting Zhang <winston@lurk-lab.com>
Date: Mon, 11 Dec 2023 20:56:53 +0000
Subject: [PATCH] initial commit, not passing tests

---
 Cargo.toml        |  47 ++++++++++++++
 benches/msm.rs    |  74 ++++++++++++++++++++++
 build.rs          | 102 ++++++++++++++++++++++++++++++
 cuda/bn254.cu     |  24 +++++++
 cuda/grumpkin.cu  |  24 +++++++
 examples/msm.rs   | 158 ++++++++++++++++++++++++++++++++++++++++++++++
 rustfmt.toml      |   1 +
 src/lib.rs        | 106 +++++++++++++++++++++++++++++++
 src/pippenger.cpp |  26 ++++++++
 src/tests.rs      | 150 +++++++++++++++++++++++++++++++++++++++++++
 10 files changed, 712 insertions(+)
 create mode 100644 Cargo.toml
 create mode 100644 benches/msm.rs
 create mode 100644 build.rs
 create mode 100644 cuda/bn254.cu
 create mode 100644 cuda/grumpkin.cu
 create mode 100644 examples/msm.rs
 create mode 100644 rustfmt.toml
 create mode 100644 src/lib.rs
 create mode 100644 src/pippenger.cpp
 create mode 100644 src/tests.rs

diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..5d3d11c
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,47 @@
+[package]
+name = "grumpkin-msm"
+version = "0.1.0"
+edition = "2021"
+license = "Apache-2.0"
+description = "Optimized multiscalar multiplication for the Grumpkin curve cycle"
+repository = "https://github.com/lurk-lab/grumpkin-msm"
+readme = "README.md"
+include = [
+    "/benches/**",
+    "/cuda/**",
+    "/src/**",
+    "/Cargo.toml",
+    "/build.rs",
+    "/README.md",
+]
+
+[features]
+# By default, compile with ADX extension if the host supports it.
+# Binary can be executed on systems similar to the host.
+default = []
+# Compile in portable mode, without ISA extensions.
+# Binary can be executed on all systems.
+portable = [ "blst/portable" ]
+# Enable ADX even if the host CPU doesn't support it.
+# Binary can be executed on Broadwell+ and Ryzen+ systems.
+force-adx = [ "blst/force-adx" ]
+cuda-mobile = []
+
+[dependencies]
+blst = "~0.3.11"
+sppark = "~0.1.2"
+halo2curves = { version = "0.4.0" }
+
+[build-dependencies]
+cc = "^1.0.70"
+which = "^4.0"
+
+[dev-dependencies]
+criterion = { version = "0.3", features = [ "html_reports" ] }
+rand = "^0"
+rand_chacha = "^0"
+rayon = "1.5"
+
+[[bench]]
+name = "msm"
+harness = false
diff --git a/benches/msm.rs b/benches/msm.rs
new file mode 100644
index 0000000..d93a817
--- /dev/null
+++ b/benches/msm.rs
@@ -0,0 +1,74 @@
+// Copyright Supranational LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#![allow(dead_code)]
+#![allow(unused_imports)]
+#![allow(unused_mut)]
+
+use criterion::{criterion_group, criterion_main, Criterion};
+
+use grumpkin_msm;
+
+#[cfg(feature = "cuda")]
+extern "C" {
+    fn cuda_available() -> bool;
+}
+
+include!("../src/tests.rs");
+
+/// Benchmarks `grumpkin_msm::bn256` on `2**BENCH_NPOW` points (default 17).
+fn criterion_benchmark(c: &mut Criterion) {
+    let bench_npow: usize = std::env::var("BENCH_NPOW")
+        .unwrap_or_else(|_| "17".to_string())
+        .parse()
+        .expect("BENCH_NPOW must be a non-negative integer");
+    let npoints: usize = 1 << bench_npow;
+
+    let mut points = crate::tests::gen_points(npoints);
+    let mut scalars = crate::tests::gen_scalars(npoints);
+
+    // Disable the CUDA dispatch so the first group measures the CPU path.
+    #[cfg(feature = "cuda")]
+    {
+        unsafe { grumpkin_msm::CUDA_OFF = true };
+    }
+
+    let mut group = c.benchmark_group("CPU");
+    group.sample_size(10);
+    group.bench_function(format!("2**{} points", bench_npow), |b| {
+        b.iter(|| {
+            let _ = grumpkin_msm::bn256(&points, &scalars);
+        })
+    });
+    group.finish();
+
+    #[cfg(feature = "cuda")]
+    if unsafe { cuda_available() } {
+        unsafe { grumpkin_msm::CUDA_OFF = false };
+
+        // The GPU group runs on 2**EXTRA times as many points.
+        const EXTRA: usize = 5;
+        let bench_npow = bench_npow + EXTRA;
+        let npoints: usize = 1 << bench_npow;
+
+        // Grow the inputs in place: the existing points are duplicated by
+        // doubling, while the additional scalars are freshly generated.
+        while points.len() < npoints {
+            points.extend_from_within(..);
+        }
+        scalars.extend(crate::tests::gen_scalars(npoints - scalars.len()));
+
+        let mut group = c.benchmark_group("GPU");
+        group.sample_size(20);
+        group.bench_function(format!("2**{} points", bench_npow), |b| {
+            b.iter(|| {
+                let _ = grumpkin_msm::bn256(&points, &scalars);
+            })
+        });
+        group.finish();
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/build.rs b/build.rs
new file mode 100644
index 0000000..c38148a
--- /dev/null
+++ b/build.rs
@@ -0,0 +1,102 @@
+use std::env;
+use std::path::PathBuf;
+
+/// Compiles the C++ CPU backend and, when `nvcc` is found, the CUDA kernels.
+fn main() {
+    // account for cross-compilation [by examining environment variable]
+    let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap();
+
+    // Set CXX environment variable to choose alternative C compiler.
+    // Optimization level depends on whether --release is passed or implied.
+    let mut cc = cc::Build::new();
+    cc.cpp(true);
+
+    let cpp_src = PathBuf::from("src").join("pippenger.cpp");
+    let mut cc_def = None;
+
+    match (cfg!(feature = "portable"), cfg!(feature = "force-adx")) {
+        (true, false) => {
+            println!("Compiling in portable mode without ISA extensions");
+            cc_def = Some("__PASTA_PORTABLE__");
+        }
+        (false, true) => {
+            if target_arch.eq("x86_64") {
+                println!("Enabling ADX support via `force-adx` feature");
+                cc_def = Some("__ADX__");
+            } else {
+                println!("`force-adx` is ignored for non-x86_64 targets");
+            }
+        }
+        (false, false) => {
+            #[cfg(target_arch = "x86_64")]
+            if target_arch.eq("x86_64") && std::is_x86_feature_detected!("adx")
+            {
+                println!("Enabling ADX because it was detected on the host");
+                cc_def = Some("__ADX__");
+            }
+        }
+        (true, true) => panic!(
+            "Cannot compile with both `portable` and `force-adx` features"
+        ),
+    }
+    cc.flag_if_supported("-mno-avx") // avoid costly transitions
+        .flag_if_supported("-fno-builtin")
+        .flag_if_supported("-std=c++11")
+        .flag_if_supported("-Wno-unused-command-line-argument");
+    if !cfg!(debug_assertions) {
+        cc.define("NDEBUG", None);
+    }
+    if let Some(def) = cc_def {
+        cc.define(def, None);
+    }
+    if let Some(include) = env::var_os("DEP_BLST_C_SRC") {
+        cc.include(include);
+    }
+    if let Some(include) = env::var_os("DEP_SPPARK_ROOT") {
+        cc.include(include);
+    }
+    cc.file(&cpp_src).compile("pasta_msm");
+    // Any `rerun-if-*` directive disables Cargo's default re-run on package
+    // changes, so the C++ source has to be tracked explicitly.
+    println!("cargo:rerun-if-changed={}", cpp_src.display());
+
+    if cfg!(target_os = "windows") && !cfg!(target_env = "msvc") {
+        return;
+    }
+    // Detect if there is CUDA compiler and engage "cuda" feature accordingly
+    let nvcc = match env::var("NVCC") {
+        Ok(var) => which::which(var),
+        Err(_) => which::which("nvcc"),
+    };
+    if nvcc.is_ok() {
+        let mut nvcc = cc::Build::new();
+        nvcc.cuda(true);
+        nvcc.flag("-arch=sm_80");
+        nvcc.flag("-gencode").flag("arch=compute_70,code=sm_70");
+        nvcc.flag("-t0");
+        #[cfg(not(target_env = "msvc"))]
+        nvcc.flag("-Xcompiler").flag("-Wno-unused-function");
+        nvcc.define("TAKE_RESPONSIBILITY_FOR_ERROR_MESSAGE", None);
+        #[cfg(feature = "cuda-mobile")]
+        nvcc.define("NTHREADS", "128");
+        if let Some(def) = cc_def {
+            nvcc.define(def, None);
+        }
+        if let Some(include) = env::var_os("DEP_BLST_C_SRC") {
+            nvcc.include(include);
+        }
+        if let Some(include) = env::var_os("DEP_SPPARK_ROOT") {
+            nvcc.include(include);
+        }
+        nvcc.clone()
+            .file("cuda/bn254.cu")
+            .compile("pallas_msm_cuda");
+        nvcc.define("__MSM_SORT_DONT_IMPLEMENT__", None)
+            .file("cuda/grumpkin.cu")
+            .compile("vesta_msm_cuda");
+        println!("cargo:rerun-if-changed=cuda");
+        println!("cargo:rerun-if-env-changed=CXXFLAGS");
+        println!("cargo:rustc-cfg=feature=\"cuda\"");
+    }
+    println!("cargo:rerun-if-env-changed=NVCC");
+}
diff --git a/cuda/bn254.cu b/cuda/bn254.cu
new file mode 100644
index 0000000..3c0c1c9
--- /dev/null
+++ b/cuda/bn254.cu
@@ -0,0 +1,24 @@
+// Copyright Supranational LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include <cuda.h>
+
+#include <ec/jacobian_t.hpp>
+#include <ec/xyzz_t.hpp>
+
+#include <ff/alt_bn128.hpp>
+
+typedef jacobian_t<fp_t> point_t;
+typedef xyzz_t<fp_t> bucket_t;
+typedef bucket_t::affine_t affine_t;
+typedef fr_t scalar_t;
+
+#include <msm/pippenger.cuh>
+
+#ifndef __CUDA_ARCH__
+// C entry point for the BN254 MSM; forwards to mult_pippenger<bucket_t>.
+extern "C" RustError cuda_pippenger_bn254(point_t *out, const affine_t points[],
+                                          size_t npoints, const scalar_t scalars[])
+{   return mult_pippenger<bucket_t>(out, points, npoints, scalars);   }
+#endif
diff --git a/cuda/grumpkin.cu b/cuda/grumpkin.cu
new file mode 100644
index 0000000..861610d
--- /dev/null
+++ b/cuda/grumpkin.cu
@@ -0,0 +1,24 @@
+// Copyright Supranational LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include <cuda.h>
+
+#include <ec/jacobian_t.hpp>
+#include <ec/xyzz_t.hpp>
+
+#include <ff/alt_bn128.hpp>
+
+typedef jacobian_t<fr_t> point_t;
+typedef xyzz_t<fr_t> bucket_t;
+typedef bucket_t::affine_t affine_t;
+typedef fp_t scalar_t;
+
+#include <msm/pippenger.cuh>
+
+#ifndef __CUDA_ARCH__
+// C entry point for the Grumpkin MSM; forwards to mult_pippenger<bucket_t>.
+extern "C" RustError cuda_pippenger_grumpkin(point_t *out, const affine_t points[],
+                                             size_t npoints, const scalar_t scalars[])
+{   return mult_pippenger<bucket_t>(out, points, npoints, scalars);   }
+#endif
diff --git a/examples/msm.rs b/examples/msm.rs
new file mode 100644
index 0000000..65a70e1
--- /dev/null
+++ b/examples/msm.rs
@@ -0,0 +1,158 @@
+use core::cell::UnsafeCell;
+use core::mem::transmute;
+use core::sync::atomic::*;
+use rand::{RngCore, SeedableRng};
+use rand_chacha::ChaCha20Rng;
+use halo2curves::bn256;
+use halo2curves::ff::Field;
+use halo2curves::CurveExt;
+use halo2curves::group::{Curve, Group};
+
+#[cfg(feature = "cuda")]
+extern "C" {
+    fn cuda_available() -> bool;
+}
+
+/// Generates `npoints` pseudo-random BN256 G1 points in affine form.
+///
+/// Fresh entropy supplies 32 seed bytes per point. Every 32-byte block
+/// is hashed to the curve with the domain prefix `"foobar"`, and the
+/// results are converted to affine form in batches of up to 1024.
+///
+/// The output vector is fully initialised before the parallel phase and
+/// handed to the workers as disjoint mutable chunks. This avoids both
+/// a `Vec` whose length covers uninitialised memory and a transmute
+/// from a shared slice to a mutable one.
+///
+/// Returns an empty vector when `npoints` is zero.
+pub fn gen_points(npoints: usize) -> Vec<bn256::G1Affine> {
+    use rayon::prelude::*;
+
+    // Number of points converted to affine form per batch.
+    const STRIDE: usize = 1024;
+
+    // Number of random seed bytes consumed per point.
+    const SEED_LEN: usize = 32;
+
+    let mut rnd = vec![0u8; SEED_LEN * npoints];
+    ChaCha20Rng::from_entropy().fill_bytes(&mut rnd);
+
+    let mut ret = vec![bn256::G1Affine::default(); npoints];
+
+    // Chunk `i` of the output pairs with chunk `i` of the seed bytes,
+    // so every point still consumes its own 32-byte seed.
+    ret.par_chunks_mut(STRIDE)
+        .zip(rnd.par_chunks(SEED_LEN * STRIDE))
+        .for_each(|(out, seed)| {
+            // Build the hash-to-curve function once per chunk.
+            let hash = bn256::G1::hash_to_curve("foobar");
+
+            let tmp: Vec<bn256::G1> = seed
+                .chunks_exact(SEED_LEN)
+                .map(|bytes| hash(bytes))
+                .collect();
+
+            // `tmp` and `out` have equal lengths: both span one chunk.
+            bn256::G1::batch_normalize(&tmp, out);
+        });
+
+    ret
+}
+
+fn as_mut<T>(x: &T) -> &mut T { // NOTE(review): `&T` -> `&mut T`, caller ensures exclusivity
+    unsafe { &mut *UnsafeCell::raw_get(x as *const _ as *const _) }
+}
+
+/// Generates `npoints` random BN256 scalars in parallel.
+///
+/// The random generators are seeded from fresh entropy, so the output
+/// differs from run to run. rayon decides how many generators are
+/// created; one generator may serve several consecutive elements.
+///
+/// Collecting from the parallel iterator builds the vector without a
+/// `Vec` whose length covers uninitialised memory and without writing
+/// through shared references.
+///
+/// Returns an empty vector when `npoints` is zero.
+pub fn gen_scalars(npoints: usize) -> Vec<bn256::Fr> {
+    use rayon::prelude::*;
+
+    // One value is drawn per index; the index itself is not used, and
+    // `collect` keeps the results in index order.
+    (0..npoints)
+        .into_par_iter()
+        // `map_init` lends a generator to every call of the closure.
+        .map_init(ChaCha20Rng::from_entropy, |rng, _index| {
+            bn256::Fr::random(&mut *rng)
+        })
+        .collect()
+}
+
+/// Reference multi-scalar multiplication: `sum(points[i] * scalars[i])`.
+///
+/// Every product is computed independently with the curve's scalar
+/// multiplication, and the partial sums are combined by a parallel
+/// reduction that starts from `bn256::G1::default()`. The total is
+/// returned in affine form.
+///
+/// Using a reduction avoids per-thread result slots that would need
+/// uninitialised storage or writes through shared references.
+///
+/// An empty `points` slice yields the affine form of
+/// `bn256::G1::default()`.
+///
+/// # Panics
+///
+/// Panics if `scalars` is shorter than `points`. Scalars beyond
+/// `points.len()` are ignored.
+pub fn naive_multiscalar_mul(
+    points: &[bn256::G1Affine],
+    scalars: &[bn256::Fr],
+) -> bn256::G1Affine {
+    use rayon::prelude::*;
+
+    // Pair exactly one scalar with each point. `zip` alone would
+    // silently truncate to the shorter input instead of panicking.
+    let scalars = &scalars[..points.len()];
+
+    // Point addition is associative, so the order in which rayon
+    // combines the partial sums does not affect the total.
+    let total = points
+        .par_iter()
+        .zip(scalars.par_iter())
+        .map(|(point, scalar)| *point * *scalar)
+        .reduce(bn256::G1::default, |acc, term| acc + term);
+
+    total.to_affine()
+}
+
+/// Cross-checks `grumpkin_msm::bn256` against the naive MSM on `2**BENCH_NPOW`
+/// copies of the generator, each scaled by 2 (`BENCH_NPOW` defaults to 0).
+fn main() {
+    let bench_npow: usize = std::env::var("BENCH_NPOW")
+        .unwrap_or_else(|_| "0".to_string())
+        .parse()
+        .expect("BENCH_NPOW must be a non-negative integer");
+    let npoints: usize = 1 << bench_npow;
+
+    // Fill the input directly with the affine generator rather than writing
+    // into a `Vec` whose length was set over uninitialised memory.
+    let generator = bn256::G1::generator().to_affine();
+    let points = vec![generator; npoints];
+    let scalars = vec![bn256::Fr::from(2); npoints];
+
+    println!("points: {:?}", points);
+    println!("scalars: {:?}", scalars);
+
+    // Disable the CUDA dispatch so the CPU path is exercised.
+    #[cfg(feature = "cuda")]
+    {
+        unsafe { grumpkin_msm::CUDA_OFF = true };
+    }
+
+    let res = grumpkin_msm::bn256(&points, &scalars).to_affine();
+    let native = naive_multiscalar_mul(&points, &scalars);
+    let hi = (points[0] * scalars[0]).to_affine();
+    println!("hi: {:?}", hi);
+    assert_eq!(res, native);
+    println!("success!")
+}
diff --git a/rustfmt.toml b/rustfmt.toml
new file mode 100644
index 0000000..df99c69
--- /dev/null
+++ b/rustfmt.toml
@@ -0,0 +1 @@
+max_width = 80
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..bec4922
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,106 @@
+// Copyright Supranational LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+#![allow(improper_ctypes)]
+
+extern crate blst;
+
+#[cfg(feature = "cuda")]
+sppark::cuda_error!();
+#[cfg(feature = "cuda")]
+extern "C" {
+    fn cuda_available() -> bool;
+}
+#[cfg(feature = "cuda")]
+pub static mut CUDA_OFF: bool = false;
+
+use halo2curves::bn256;
+
+extern "C" {
+    /// CPU Pippenger MSM for BN254 G1, defined in `src/pippenger.cpp`.
+    fn mult_pippenger_bn254(
+        out: *mut bn256::G1,
+        points: *const bn256::G1Affine,
+        npoints: usize,
+        scalars: *const bn256::Fr,
+    );
+}
+
+/// MSM `sum(scalars[i] * points[i])` on BN256 G1; panics on length mismatch.
+pub fn bn256(points: &[bn256::G1Affine], scalars: &[bn256::Fr]) -> bn256::G1 {
+    let npoints = points.len();
+    assert!(npoints == scalars.len(), "length mismatch");
+    let mut ret = bn256::G1::default();
+    if npoints == 0 {
+        return ret; // empty input: `points[0]` below would panic
+    }
+    #[cfg(feature = "cuda")]
+    if npoints >= 1 << 16 && unsafe { !CUDA_OFF && cuda_available() } {
+        extern "C" {
+            fn cuda_pippenger_bn254(
+                out: *mut bn256::G1,
+                points: *const bn256::G1Affine,
+                npoints: usize,
+                scalars: *const bn256::Fr,
+            ) -> cuda::Error;
+        }
+        let err = unsafe {
+            cuda_pippenger_bn254(&mut ret, &points[0], npoints, &scalars[0])
+        }; // SAFETY: both slices hold `npoints` elements (checked above)
+        if err.code != 0 {
+            panic!("{}", String::from(err));
+        }
+        return ret;
+    }
+    unsafe { mult_pippenger_bn254(&mut ret, &points[0], npoints, &scalars[0]) };
+    ret
+}
+
+use halo2curves::grumpkin;
+
+extern "C" {
+    /// CPU Pippenger MSM for Grumpkin, defined in `src/pippenger.cpp`.
+    fn mult_pippenger_grumpkin(
+        out: *mut grumpkin::G1,
+        points: *const grumpkin::G1Affine,
+        npoints: usize,
+        scalars: *const grumpkin::Fr,
+    );
+}
+
+/// MSM `sum(scalars[i] * points[i])` on Grumpkin; panics on length mismatch.
+pub fn grumpkin(
+    points: &[grumpkin::G1Affine],
+    scalars: &[grumpkin::Fr],
+) -> grumpkin::G1 {
+    let npoints = points.len();
+    assert!(npoints == scalars.len(), "length mismatch");
+    let mut ret = grumpkin::G1::default();
+    if npoints == 0 {
+        return ret; // empty input: `points[0]` below would panic
+    }
+    #[cfg(feature = "cuda")]
+    if npoints >= 1 << 16 && unsafe { !CUDA_OFF && cuda_available() } {
+        extern "C" {
+            fn cuda_pippenger_grumpkin(
+                out: *mut grumpkin::G1,
+                points: *const grumpkin::G1Affine,
+                npoints: usize,
+                scalars: *const grumpkin::Fr,
+            ) -> cuda::Error;
+        }
+        let err = unsafe {
+            cuda_pippenger_grumpkin(&mut ret, &points[0], npoints, &scalars[0])
+        }; // SAFETY: both slices hold `npoints` elements (checked above)
+        if err.code != 0 {
+            panic!("{}", String::from(err));
+        }
+        return ret;
+    }
+    unsafe {
+        mult_pippenger_grumpkin(&mut ret, &points[0], npoints, &scalars[0])
+    }; // SAFETY: both slices hold `npoints` elements (checked above)
+    ret
+}
+
+include!("tests.rs");
diff --git a/src/pippenger.cpp b/src/pippenger.cpp
new file mode 100644
index 0000000..f6f7a11
--- /dev/null
+++ b/src/pippenger.cpp
@@ -0,0 +1,26 @@
+// Copyright Supranational LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include <msm/pippenger.hpp>
+#include <ec/jacobian_t.hpp>
+#include <ec/xyzz_t.hpp>
+#include <ff/alt_bn128.hpp>
+
+static thread_pool_t da_pool;
+
+// C ABI wrapper: BN254 MSM on the CPU using the file-level thread pool.
+extern "C" void mult_pippenger_bn254(jacobian_t<fp_t>& ret,
+                                     const xyzz_t<fp_t>::affine_t points[],
+                                     size_t npoints, const fr_t scalars[])
+{
+    mult_pippenger<xyzz_t<fp_t>>(ret, points, npoints, scalars, true, &da_pool);
+}
+
+// C ABI wrapper: Grumpkin MSM on the CPU using the file-level thread pool.
+extern "C" void mult_pippenger_grumpkin(jacobian_t<fr_t>& ret,
+                                        const xyzz_t<fr_t>::affine_t points[],
+                                        size_t npoints, const fp_t scalars[])
+{
+    mult_pippenger<xyzz_t<fr_t>>(ret, points, npoints, scalars, true, &da_pool);
+}
diff --git a/src/tests.rs b/src/tests.rs
new file mode 100644
index 0000000..3a1f0c2
--- /dev/null
+++ b/src/tests.rs
@@ -0,0 +1,150 @@
+// Copyright Supranational LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#[cfg(test)]
+mod tests {
+    use crate as grumpkin_msm;
+    use core::cell::UnsafeCell;
+    use core::mem::transmute;
+    use core::sync::atomic::*;
+    use halo2curves::{
+        CurveExt,
+        group::{ff::Field, Curve},
+        bn256,
+    };
+    use rand::{RngCore, SeedableRng};
+    use rand_chacha::ChaCha20Rng;
+
+    /// Generates `npoints` pseudo-random BN256 G1 points in affine form.
+    ///
+    /// Fresh entropy supplies 32 seed bytes per point. Every 32-byte
+    /// block is hashed to the curve with the domain prefix `"foobar"`,
+    /// and the results are converted to affine form in batches of up to
+    /// 1024.
+    ///
+    /// The output vector is fully initialised before the parallel phase
+    /// and handed to the workers as disjoint mutable chunks. This avoids
+    /// both a `Vec` whose length covers uninitialised memory and a
+    /// transmute from a shared slice to a mutable one.
+    /// Returns an empty vector when `npoints` is zero.
+    pub fn gen_points(npoints: usize) -> Vec<bn256::G1Affine> {
+        use rayon::prelude::*;
+
+        // Number of points converted to affine form per batch.
+        const STRIDE: usize = 1024;
+
+        // Number of random seed bytes consumed per point.
+        const SEED_LEN: usize = 32;
+
+        let mut rnd = vec![0u8; SEED_LEN * npoints];
+        ChaCha20Rng::from_entropy().fill_bytes(&mut rnd);
+
+        let mut ret = vec![bn256::G1Affine::default(); npoints];
+
+        // Chunk `i` of the output pairs with chunk `i` of the seed
+        // bytes, so every point still consumes its own 32-byte seed.
+        ret.par_chunks_mut(STRIDE)
+            .zip(rnd.par_chunks(SEED_LEN * STRIDE))
+            .for_each(|(out, seed)| {
+                // Build the hash-to-curve function once per chunk.
+                let hash = bn256::G1::hash_to_curve("foobar");
+
+                let tmp: Vec<bn256::G1> = seed
+                    .chunks_exact(SEED_LEN)
+                    .map(|bytes| hash(bytes))
+                    .collect();
+
+                // `tmp` and `out` have equal lengths: one chunk each.
+                bn256::G1::batch_normalize(&tmp, out);
+            });
+
+        ret
+    }
+
+    fn as_mut<T>(x: &T) -> &mut T { // NOTE(review): `&T` -> `&mut T`, caller ensures exclusivity
+        unsafe { &mut *UnsafeCell::raw_get(x as *const _ as *const _) }
+    }
+
+    /// Generates `npoints` random BN256 scalars in parallel.
+    ///
+    /// The random generators are seeded from fresh entropy, so the
+    /// output differs from run to run. rayon decides how many generators
+    /// are created; one generator may serve several consecutive elements.
+    ///
+    /// Collecting from the parallel iterator builds the vector without a
+    /// `Vec` whose length covers uninitialised memory and without
+    /// writing through shared references.
+    ///
+    /// Returns an empty vector when `npoints` is zero.
+    pub fn gen_scalars(npoints: usize) -> Vec<bn256::Fr> {
+        use rayon::prelude::*;
+
+        // One value is drawn per index; the index itself is not used,
+        // and `collect` keeps the results in index order.
+        (0..npoints)
+            .into_par_iter()
+            // `map_init` lends a generator to every call of the closure.
+            .map_init(ChaCha20Rng::from_entropy, |rng, _index| {
+                bn256::Fr::random(&mut *rng)
+            })
+            .collect()
+    }
+
+    /// Reference multi-scalar multiplication:
+    /// `sum(points[i] * scalars[i])`.
+    ///
+    /// Every product is computed independently with the curve's scalar
+    /// multiplication, and the partial sums are combined by a parallel
+    /// reduction that starts from `bn256::G1::default()`. The total is
+    /// returned in affine form.
+    ///
+    /// Using a reduction avoids per-thread result slots that would need
+    /// uninitialised storage or writes through shared references.
+    ///
+    /// An empty `points` slice yields the affine form of
+    /// `bn256::G1::default()`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `scalars` is shorter than `points`. Scalars beyond
+    /// `points.len()` are ignored.
+    pub fn naive_multiscalar_mul(
+        points: &[bn256::G1Affine],
+        scalars: &[bn256::Fr],
+    ) -> bn256::G1Affine {
+        use rayon::prelude::*;
+
+        // Pair exactly one scalar with each point. `zip` alone would
+        // silently truncate to the shorter input instead of panicking.
+        let scalars = &scalars[..points.len()];
+
+        // Point addition is associative, so the order in which rayon
+        // combines the partial sums does not affect the total.
+        let total = points
+            .par_iter()
+            .zip(scalars.par_iter())
+            .map(|(point, scalar)| *point * *scalar)
+            .reduce(bn256::G1::default, |acc, term| acc + term);
+        total.to_affine()
+    }
+
+    #[test]
+    fn it_works() {
+        // Debug builds use a smaller input than release builds.
+        #[cfg(not(debug_assertions))]
+        const NPOINTS: usize = 128 * 1024;
+        #[cfg(debug_assertions)]
+        const NPOINTS: usize = 8 * 1024;
+        let points = gen_points(NPOINTS);
+        let scalars = gen_scalars(NPOINTS);
+        // Reference result from the naive implementation.
+        let naive = naive_multiscalar_mul(&points, &scalars);
+        println!("{:?}", naive);
+        // Result from the library entry point under test.
+        let ret = grumpkin_msm::bn256(&points, &scalars).to_affine();
+        println!("{:?}", ret);
+
+        assert_eq!(ret, naive);
+    }
+}