lurk-lang · samuelburnham · Jan 9, 2024 · Jan 5, 2024 · Jan 6, 2024 · Jan 8, 2024
diff --git a/.github/PERF_REGRESSION.md b/.github/PERF_REGRESSION.md
@@ -0,0 +1,7 @@
+---
+title: ":rotating_light: Performance regression in #{{ env.PR_NUMBER }}"
+labels: P-Performance, automated issue
+---
+Regression >= {{ env.NOISE_THRESHOLD }} found during merge of: #{{ env.PR_NUMBER }}
+Commit: {{ env.GIT_SHA }}
+Triggered by: {{ env.WORKFLOW_URL }}
diff --git a/.github/tables.toml b/.github/tables.toml
@@ -0,0 +1,6 @@
+[table_comments]
+
+[top_comments]
+Overview = """
+This benchmark report shows the Arecibo GPU benchmarks.
+"""
diff --git a/.github/workflows/gpu-bench.yml b/.github/workflows/gpu-bench.yml
@@ -0,0 +1,131 @@
+# Run the GPU benchmarks only in the merge queue; on pull requests the job is skipped and shows up as a skipped status check
+name: GPU benchmark regression test
+
+on:
+  pull_request:
+    types: [opened, synchronize, reopened, ready_for_review]
+    branches: [dev]
+  merge_group:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  # Run comparative benchmark against dev, open issue on regression
+  gpu-benchmark:
+    if: github.event_name != 'pull_request' || github.event.action == 'enqueued'
+    name: Run benchmarks on GPU
+    runs-on: [self-hosted, gpu-bench]
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          repository: lurk-lab/ci-workflows
+      - uses: ./.github/actions/gpu-setup
+        with:
+          gpu-framework: 'cuda'
+      - uses: ./.github/actions/ci-env
+      - uses: actions/checkout@v4
+      # Install dependencies
+      - uses: dtolnay/rust-toolchain@stable
+      - uses: Swatinem/rust-cache@v2
+      # `just` runs the benchmark recipes defined in benches/justfile.
+      # NOTE(review): the version pin was garbled in this copy (`just@<version>`); re-pin once the intended version is confirmed.
+      - uses: taiki-e/install-action@v2
+        with:
+          tool: just
+      - name: Install criterion
+        run: |
+          cargo install cargo-criterion
+          cargo install criterion-table
+      - name: Set bench output format and base SHA
+        run: |
+          echo "ARECIBO_BENCH_OUTPUT=commit-comment" | tee -a $GITHUB_ENV
+          echo "BASE_COMMIT=${{ github.event.merge_group.base_sha }}" | tee -a $GITHUB_ENV
+          GPU_NAME=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader,nounits | tail -n1)
+          echo "GPU_ID=$(echo $GPU_NAME | awk '{ print $NF }')" | tee -a $GITHUB_ENV
+          echo "GPU_NAME=$GPU_NAME" | tee -a $GITHUB_ENV
+      # Checkout base branch for comparative bench
+      - uses: actions/checkout@v4
+        with:
+          ref: dev
+          path: dev
+      # Copy the script so the base can bench with the same parameters
+      # Benchmarks the `dev` checkout and copies the resulting JSON files up into the workspace root.
+      # NOTE(review): the justfile names its output after `git rev-parse HEAD` of this checkout, while the
+      # copy below globs on BASE_COMMIT (`merge_group.base_sha`); the copy only matches when the two are equal.
+      # Confirm this holds when several entries are queued.
+      - name: Run GPU bench on base branch
+        run: |
+          # Copy justfile to dev, overwriting existing config with that of PR branch
+          cp ../benches/justfile .
+          # Run benchmark
+          just gpu-bench-ci recursive-snark recursive-snark-supernova compressed-snark compressed-snark-supernova
+          # Copy bench output to PR branch
+          cp *-${{ env.BASE_COMMIT }}.json ..
+        working-directory: ${{ github.workspace }}/dev
+      - name: Run GPU bench on PR branch
+        run: |
+          just gpu-bench-ci recursive-snark recursive-snark-supernova compressed-snark compressed-snark-supernova
+          cp *-${{ github.sha }}.json ..
+        working-directory: ${{ github.workspace }}/benches
+      # Copies tables.toml to the workspace root and appends machine details plus the workflow URL to it
+      - name: copy the benchmark template and prepare it with data
+        run: |
+          cp .github/tables.toml .
+          # Get CPU model
+          CPU_MODEL=$(grep '^model name' /proc/cpuinfo | head -1 | awk -F ': ' '{ print $2 }')
+          # Get vCPU count
+          NUM_VCPUS=$(nproc --all)
+          # Get total RAM in GB
+          TOTAL_RAM=$(grep MemTotal /proc/meminfo | awk '{$2=$2/(1024^2); print int($2), "GB RAM";}')
+          WORKFLOW_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+
+          # Each `sed` inserts one line directly above the line consisting solely of `"""` in tables.toml
+          # Use conditionals to ensure that only non-empty variables are inserted
+          [[ -n "${{ env.GPU_NAME }}" ]] && sed -i "/^\"\"\"$/i ${{ env.GPU_NAME }}" tables.toml
+          [[ -n "$CPU_MODEL" ]] && sed -i "/^\"\"\"$/i $CPU_MODEL" tables.toml
+          # Label the count so it is not rendered as a bare number in the report
+          [[ -n "$NUM_VCPUS" ]] && sed -i "/^\"\"\"$/i $NUM_VCPUS vCPUs" tables.toml
+          [[ -n "$TOTAL_RAM" ]] && sed -i "/^\"\"\"$/i $TOTAL_RAM" tables.toml
+          sed -i "/^\"\"\"$/i Workflow run: $WORKFLOW_URL" tables.toml
+          # Later steps read WORKFLOW_URL from the job environment
+          echo "WORKFLOW_URL=$WORKFLOW_URL" | tee -a $GITHUB_ENV
+        working-directory: ${{ github.workspace }}
+      # Create a `criterion-table` and write in commit comment
+      # Concatenates the base and PR JSON output of every benchmark and renders it as a Markdown table.
+      # File names must not contain spaces: each one is `<bench>-<commit>.json`.
+      - name: Run `criterion-table`
+        run: |
+          cat recursive-snark-${{ env.BASE_COMMIT }}.json recursive-snark-${{ github.sha }}.json \
+          recursive-snark-supernova-${{ env.BASE_COMMIT }}.json recursive-snark-supernova-${{ github.sha }}.json \
+          compressed-snark-${{ env.BASE_COMMIT }}.json compressed-snark-${{ github.sha }}.json \
+          compressed-snark-supernova-${{ env.BASE_COMMIT }}.json compressed-snark-supernova-${{ github.sha }}.json \
+          | criterion-table > BENCHMARKS.md
+      - name: Write bench on commit comment
+        uses: peter-evans/commit-comment@v3
+        with:
+          body-path: BENCHMARKS.md
+      # Check for a slowdown >= `$ARECIBO_BENCH_NOISE_THRESHOLD` (fallback is 5%). If so, open an issue but don't block merge
+      # The step fails (exit 1) on a regression; `continue-on-error` keeps that failure from failing the job.
+      - name: Check for perf regression
+        id: regression-check
+        run: |
+          # Field 12 of every BENCHMARKS.md line mentioning "slower", split on `*` and `x`
+          # (presumably the slowdown factor of that row -- confirm against the `criterion-table` output format)
+          REGRESSIONS=$(awk -F'[*x]' '/slower/{print $12}' BENCHMARKS.md)
+          echo "$REGRESSIONS"
+
+          # Same variable name the benches read, so both sides share one threshold; stored here as a ratio (1 + fraction)
+          if [ -n "${{ env.ARECIBO_BENCH_NOISE_THRESHOLD }}" ]; then
+            NOISE_THRESHOLD=$(echo "1+${{ env.ARECIBO_BENCH_NOISE_THRESHOLD }}" | bc)
+          else
+            NOISE_THRESHOLD=1.05
+          fi
+          # Export before the loop: `exit 1` below would otherwise skip this line and leave later steps without the value
+          echo "NOISE_THRESHOLD=$NOISE_THRESHOLD" | tee -a $GITHUB_ENV
+
+          for r in $REGRESSIONS
+          do
+            if (( $(echo "$r >= $NOISE_THRESHOLD" | bc -l) ))
+            then
+              exit 1
+            fi
+          done
+        continue-on-error: true
+      # Not possible to use ${{ github.event.number }} with the `merge_group` trigger
+      # Instead, take the text between the last `pr-` and the last `-` of the merge group's head ref
+      # and export it as PR_NUMBER for the following steps.
+      # NOTE(review): assumes the head ref ends in `pr-<number>-<suffix>` -- confirm against GitHub's merge queue ref naming.
+      - name: Get PR number from merge branch
+        run: |
+          echo "PR_NUMBER=$(echo ${{ github.event.merge_group.head_ref }} | sed -e 's/.*pr-\(.*\)-.*/\1/')" | tee -a $GITHUB_ENV
+      # Open an issue from the PERF_REGRESSION.md template, but only when the regression check step failed.
+      # `outcome` is used because `continue-on-error` turns that step's `conclusion` into `success`.
+      - uses: JasonEtco/create-an-issue@v2
+        if: steps.regression-check.outcome == 'failure'
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PR_NUMBER: ${{ env.PR_NUMBER }}
+          GIT_SHA: ${{ github.sha }}
+          WORKFLOW_URL: ${{ env.WORKFLOW_URL }}
+          NOISE_THRESHOLD: ${{ env.NOISE_THRESHOLD }}
+        with:
+          filename: .github/PERF_REGRESSION.md
diff --git a/Cargo.toml b/Cargo.toml
@@ -74,6 +74,10 @@ hex = "0.4.3"
 sha2 = "0.10.7"
 tracing-test = "0.2.4"
 expect-test = "1.4.1"
+anyhow = "1.0.72"
+
+[build-dependencies]
+vergen = { version = "8", features = ["build", "git", "gitcl"] }
 
 [[bench]]
 name = "recursive-snark"

diff --git a/benches/common/mod.rs b/benches/common/mod.rs
@@ -0,0 +1,43 @@
+use anyhow::anyhow;
+use criterion::BenchmarkId;
+
+/// Values used to build the Criterion benchmark ID of a single bench.
+// TODO: Why Copy and &'static str over String?
+#[derive(Clone, Debug, Copy)]
+pub(crate) struct BenchParams {
+  /// Step circuit size, rendered as `StepCircuitSize-<n>` in every ID
+  pub step_size: usize,
+  /// Date string, only rendered in the fallback ID format
+  pub date: &'static str,
+  /// Commit SHA string, rendered in the `commit-comment` and fallback ID formats
+  pub sha: &'static str,
+}
+impl BenchParams {
+  /// Builds the `BenchmarkId` for the bench called `name`.
+  ///
+  /// The layout depends on `ARECIBO_BENCH_OUTPUT`; any value other than `pr-comment` or
+  /// `commit-comment` (including an unset variable, treated as `stdout`) yields the fallback layout.
+  pub(crate) fn bench_id(&self, name: &str) -> BenchmarkId {
+    let step = format!("StepCircuitSize-{}", self.step_size);
+    let mode = bench_output_env().unwrap_or_else(|_| "stdout".into());
+
+    if mode == "pr-comment" {
+      return BenchmarkId::new(name, step);
+    }
+    if mode == "commit-comment" {
+      // The commit becomes the function name, so the bench name moves into the parameter
+      return BenchmarkId::new(format!("ref={}", self.sha), format!("{name}-{step}"));
+    }
+    // TODO: refine "gh-pages"
+    BenchmarkId::new(name, format!("{step}-{}-{}", self.sha, self.date))
+  }
+}
+
+fn bench_output_env() -> anyhow::Result<String> {
+  std::env::var("ARECIBO_BENCH_OUTPUT").map_err(|e| anyhow!("Bench output env var isn't set: {e}"))
+}
+
+/// Returns `ARECIBO_BENCH_NOISE_THRESHOLD` parsed as an `f64`.
+///
+/// Errors if the variable cannot be read or if its value does not parse as a float.
+pub(crate) fn noise_threshold_env() -> anyhow::Result<f64> {
+  let raw = std::env::var("ARECIBO_BENCH_NOISE_THRESHOLD")
+    .map_err(|e| anyhow!("Noise threshold env var isn't set: {e}"))?;
+  raw
+    .parse::<f64>()
+    .map_err(|e| anyhow!("Failed to parse noise threshold: {e}"))
+}
diff --git a/benches/compressed-snark-supernova.rs b/benches/compressed-snark-supernova.rs
@@ -11,14 +11,17 @@ use criterion::{measurement::WallTime, *};
 use ff::PrimeField;
 use std::time::Duration;
 
+mod common;
+use common::{noise_threshold_env, BenchParams};
+
 type E1 = arecibo::provider::PallasEngine;
 type E2 = arecibo::provider::VestaEngine;
 type EE1 = arecibo::provider::ipa_pc::EvaluationEngine<E1>;
 type EE2 = arecibo::provider::ipa_pc::EvaluationEngine<E2>;
-// SNARKs without computation commitmnets
+// SNARKs without computation commitments
 type S1 = arecibo::spartan::batched::BatchedRelaxedR1CSSNARK<E1, EE1>;
 type S2 = arecibo::spartan::snark::RelaxedR1CSSNARK<E2, EE2>;
-// SNARKs with computation commitmnets
+// SNARKs with computation commitments
 type SS1 = arecibo::spartan::batched_ppsnark::BatchedRelaxedR1CSSNARK<E1, EE1>;
 type SS2 = arecibo::spartan::ppsnark::RelaxedR1CSSNARK<E2, EE2>;
 
@@ -162,8 +165,14 @@ fn bench_compressed_snark_internal_with_arity<
 
   let (prover_key, verifier_key) = CompressedSNARK::<_, _, _, _, S1, S2>::setup(&pp).unwrap();
 
+  let bench_params = BenchParams {
+    step_size: num_cons,
+    date: env!("VERGEN_GIT_COMMIT_DATE"),
+    sha: env!("VERGEN_GIT_SHA"),
+  };
+
   // Benchmark the prove time
-  group.bench_function("Prove", |b| {
+  group.bench_function(bench_params.bench_id("Prove"), |b| {
     b.iter(|| {
       assert!(CompressedSNARK::<_, _, _, _, S1, S2>::prove(
         black_box(&pp),
@@ -180,7 +189,7 @@ fn bench_compressed_snark_internal_with_arity<
   let compressed_snark = res.unwrap();
 
   // Benchmark the verification time
-  group.bench_function("Verify", |b| {
+  group.bench_function(bench_params.bench_id("Verify"), |b| {
     b.iter(|| {
       assert!(black_box(&compressed_snark)
         .verify(
@@ -211,10 +220,9 @@ fn bench_one_augmented_circuit_compressed_snark(c: &mut Criterion) {
     // number of constraints in the step circuit
     let num_cons = num_cons_in_augmented_circuit - NUM_CONS_VERIFIER_CIRCUIT_PRIMARY;
 
-    let mut group = c.benchmark_group(format!(
-      "CompressedSNARKSuperNova-1circuit-StepCircuitSize-{num_cons}"
-    ));
+    let mut group = c.benchmark_group("CompressedSNARKSuperNova-1circuit");
     group.sample_size(NUM_SAMPLES);
+    group.noise_threshold(noise_threshold_env().unwrap_or(0.05));
 
     bench_compressed_snark_internal_with_arity::<S1, S2>(&mut group, 1, num_cons);
 
@@ -239,10 +247,9 @@ fn bench_two_augmented_circuit_compressed_snark(c: &mut Criterion) {
     // number of constraints in the step circuit
     let num_cons = num_cons_in_augmented_circuit - NUM_CONS_VERIFIER_CIRCUIT_PRIMARY;
 
-    let mut group = c.benchmark_group(format!(
-      "CompressedSNARKSuperNova-2circuit-StepCircuitSize-{num_cons}"
-    ));
+    let mut group = c.benchmark_group("CompressedSNARKSuperNova-2circuit");
     group.sample_size(NUM_SAMPLES);
+    group.noise_threshold(noise_threshold_env().unwrap_or(0.05));
 
     bench_compressed_snark_internal_with_arity::<S1, S2>(&mut group, 2, num_cons);
 
@@ -267,10 +274,9 @@ fn bench_two_augmented_circuit_compressed_snark_with_computational_commitments(c
     // number of constraints in the step circuit
     let num_cons = num_cons_in_augmented_circuit - NUM_CONS_VERIFIER_CIRCUIT_PRIMARY;
 
-    let mut group = c.benchmark_group(format!(
-      "CompressedSNARKSuperNova-Commitments-2circuit-StepCircuitSize-{num_cons}"
-    ));
+    let mut group = c.benchmark_group("CompressedSNARKSuperNova-Commitments-2circuit");
     group.sample_size(NUM_SAMPLES);
+    group.noise_threshold(noise_threshold_env().unwrap_or(0.05));
 
     bench_compressed_snark_internal_with_arity::<SS1, SS2>(&mut group, 2, num_cons);
 

diff --git a/benches/compressed-snark.rs b/benches/compressed-snark.rs
@@ -14,6 +14,9 @@ use criterion::{measurement::WallTime, *};
 use ff::PrimeField;
 use std::time::Duration;
 
+mod common;
+use common::{noise_threshold_env, BenchParams};
+
 type E1 = PallasEngine;
 type E2 = VestaEngine;
 type EE1 = arecibo::provider::ipa_pc::EvaluationEngine<E1>;
@@ -101,8 +104,14 @@ fn bench_compressed_snark_internal<S1: RelaxedR1CSSNARKTrait<E1>, S2: RelaxedR1C
     assert!(res.is_ok());
   }
 
+  let bench_params = BenchParams {
+    step_size: num_cons,
+    date: env!("VERGEN_GIT_COMMIT_DATE"),
+    sha: env!("VERGEN_GIT_SHA"),
+  };
+
   // Bench time to produce a compressed SNARK
-  group.bench_function("Prove", |b| {
+  group.bench_function(bench_params.bench_id("Prove"), |b| {
     b.iter(|| {
       assert!(CompressedSNARK::<_, _, _, _, S1, S2>::prove(
         black_box(&pp),
@@ -117,7 +126,7 @@ fn bench_compressed_snark_internal<S1: RelaxedR1CSSNARKTrait<E1>, S2: RelaxedR1C
   let compressed_snark = res.unwrap();
 
   // Benchmark the verification time
-  group.bench_function("Verify", |b| {
+  group.bench_function(bench_params.bench_id("Verify"), |b| {
     b.iter(|| {
       assert!(black_box(&compressed_snark)
         .verify(
@@ -148,8 +157,9 @@ fn bench_compressed_snark(c: &mut Criterion) {
     // number of constraints in the step circuit
     let num_cons = num_cons_in_augmented_circuit - NUM_CONS_VERIFIER_CIRCUIT_PRIMARY;
 
-    let mut group = c.benchmark_group(format!("CompressedSNARK-StepCircuitSize-{num_cons}"));
+    let mut group = c.benchmark_group("CompressedSNARK");
     group.sample_size(NUM_SAMPLES);
+    group.noise_threshold(noise_threshold_env().unwrap_or(0.05));
 
     bench_compressed_snark_internal::<S1, S2>(&mut group, num_cons);
 
@@ -172,12 +182,10 @@ fn bench_compressed_snark_with_computational_commitments(c: &mut Criterion) {
     // number of constraints in the step circuit
     let num_cons = num_cons_in_augmented_circuit - NUM_CONS_VERIFIER_CIRCUIT_PRIMARY;
 
-    let mut group = c.benchmark_group(format!(
-      "CompressedSNARK-Commitments-StepCircuitSize-{num_cons}"
-    ));
-    group
-      .sampling_mode(SamplingMode::Flat)
-      .sample_size(NUM_SAMPLES);
+    let mut group = c.benchmark_group("CompressedSNARK-Commitments");
+    group.sampling_mode(SamplingMode::Flat);
+    group.sample_size(NUM_SAMPLES);
+    group.noise_threshold(noise_threshold_env().unwrap_or(0.05));
 
     bench_compressed_snark_internal::<SS1, SS2>(&mut group, num_cons);
 

diff --git a/benches/justfile b/benches/justfile
@@ -0,0 +1,36 @@
+# Install with `cargo install just`
+# Usage: `just <bench|gpu-bench|gpu-bench-ci> <args>`
+set dotenv-load
+set dotenv-filename := "bench.env"
+set ignore-comments := true
+
+commit := `git rev-parse HEAD`
+
+# Run CPU benchmarks
+bench +benches:
+  #!/bin/sh
+  for bench in {{benches}}; do
+    cargo criterion --bench $bench
+  done
+
+# Run CUDA benchmarks on GPU
+gpu-bench +benches:
+  #!/bin/sh
+  # The exports are plain shell in this shebang recipe so they share one shell with `cargo criterion`;
+  # `export NAME := value` is only valid at the top level of a justfile, not inside a recipe body
+  # The `compute`/`sm` number corresponds to the Nvidia GPU architecture
+  # In this case, the self-hosted machine uses the Ampere architecture, but we want this to be configurable
+  # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
+  export CUDA_ARCH="$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | sed 's/\.//g')"
+  export EC_GPU_CUDA_NVCC_ARGS="--fatbin --gpu-architecture=sm_$CUDA_ARCH --generate-code=arch=compute_$CUDA_ARCH,code=sm_$CUDA_ARCH"
+  export EC_GPU_FRAMEWORK="cuda"
+  for bench in {{benches}}; do
+    cargo criterion --bench $bench --features "cuda"
+  done
+
+# For each bench, redirects `cargo criterion`'s JSON message output to `<bench>-<commit>.json` in the
+# working directory, where `<commit>` is the `git rev-parse HEAD` value captured at the top of this file.
+# NOTE(review): this recipe sets no CUDA-related environment variables itself; presumably the CI
+# environment provides them -- confirm.
+# Run CUDA benchmarks on GPU, tuned for CI
+gpu-bench-ci +benches:
+  #!/bin/sh
+  for bench in {{benches}}; do
+    cargo criterion --bench $bench --features "cuda" --message-format=json > "$bench-{{commit}}".json
+  done