feat: [INFRA-2749] change labels to match for gpu to use gpu-based in…

…stances for build (#119) * feat: [INFRA-2749] change label to gpu for build on gpu-based instances * feat: [INFRA-2749] change labels to match for gpu to use gpu-based instances for build * feat: [INFRA-2749] change labels to match for gpu to use gpu-based instances for tests * ci: try running E2E test on self-hosted gpu runner * ci: install some missing packages * ci: also apt update * install cuda and nccl * cuda 12.1 * new try * more deps * more * more * manual * quiet * deb * quiet * another try * up * remove existing * up * up * up * install all deps * up * up * up * up * up * up * nccl * don't install nccl from source * up * up * up * up * cuda home * export path and cache * fix * use 12.1 * cuda 12.2 * up * up * chore: Use base docker image (#132) * Use base docker image * add curl * incorrect version fix * do not use gpu on cis where not needed * cleanup --------- Co-authored-by: Philipp Sippl <[email protected]> * dbg print cuda * up * . * up * new try * dbg * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * test * revert container options * test * Trigger Build * remove fstab changes * try NCCL_P2P_DIRECT_DISABLE * disable shm * also run normal tests * remove tests again * dbg * dbg * revert dbg * limit db * revert limit * only random * no insertions * sync * add compute-sanitizer test to ci * fix broken shell command * install compute-sanitizer as well * test * test * hardcode compute-sanitizer path * disable write * dbg * dbg * test waiting for all work to finish per job * only 1 batch * 2 batches * 3 batches * reducing complexity, db_sizes no longer need to be mutexed * make e2e test deterministic, add super sync after each iteration * only use 1 stream atm * alloc on streams * another sync after alloc * sync after each op * tracing * try not freeing stuff * Revert "try not freeing stuff" This reverts commit 987fa80. * Revert "tracing" This reverts commit 7569f98. * test replacing ptr casts with normal cuda types * Revert "test replacing ptr casts with normal cuda types" This reverts commit c480df7. * try dirty hack to use default streams * don't access null ptr streams * Revert "don't access null ptr streams" This reverts commit 4e04d49. * Revert "try dirty hack to use default streams" This reverts commit 7972c61. * Revert "sync after each op" This reverts commit 03aadd7. * log mem addresses * Revert "log mem addresses" This reverts commit 331257b. * remove bind thread * dbg * dbg * dbg * dbg * dbg * dbg * up * dbg * dbg * dbg * dbg * dbg * up * up * up * up * up * dbg: max for all * up * up * up * up * up * up * up * 2 byte aligned * odd len in phase 2 * dbg * up * up * cublas test * up * up * up * up * add asserts * cublasGetStatusString * remove cuda test * update batch size in server * PR feedback * Revert "alloc on streams" This reverts commit 898eb13. * fmt --------- Co-authored-by: Daniel Kales <[email protected]> Co-authored-by: philsippl <[email protected]> Co-authored-by: wojciechsromek <[email protected]> Co-authored-by: wojciechsromek <[email protected]>
worldcoin · Aug 5, 2024 · d725f84 · d725f84
1 parent d6bc404
commit d725f84
Show file tree

Hide file tree

Showing 11 changed files with 239 additions and 84 deletions.
diff --git a/.github/workflows/test-gpu.yaml b/.github/workflows/test-gpu.yaml
@@ -0,0 +1,133 @@
+name: Rust GPU Tests
+
+on:
+  push:
+
+concurrency:
+  group: "${{ github.workflow }} @ ${{ github.event.pull_request.head.label || github.head_ref || github.ref }}"
+  cancel-in-progress: true
+
+jobs:
+  e2e:
+    runs-on: gpu
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Validate presence of GPU devices
+        run: nvidia-smi
+
+      - name: Check shared memory size
+        run: df -h
+
+      - name: Install OpenSSL && pkg-config
+        run: sudo apt-get update && sudo apt-get install -y pkg-config libssl-dev
+
+      - name: Install CUDA and NCCL dependencies
+        if: steps.cache-cuda-nccl.outputs.cache-hit != 'true'
+        env:
+          DEBIAN_FRONTEND: noninteractive
+        run: |
+          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb
+          sudo dpkg -i cuda-keyring_1.1-1_all.deb
+          sudo apt update
+          sudo apt install -y cuda-toolkit-12-2 libnccl2 libnccl-dev
+
+      - name: Find libs
+        run: find /usr -name "libnvrtc*" && find /usr -name libcuda.so
+
+      - name: Cache Rust build
+        uses: actions/cache@v3
+        id: cache-rust
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            target
+          key: rust-build-${{ runner.os }}-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            rust-build-${{ runner.os }}-
+
+      - name: Find libs
+        run: find /usr -name "libnvrtc*" && find /usr -name libcuda.so
+
+      - name: Install Rust nightly
+        uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: nightly
+
+      - name: E2E Tests
+        run: cargo test --release e2e
+        shell: bash
+        env:
+          NCCL_P2P_LEVEL: LOC
+          NCCL_NET: Socket
+          NCCL_P2P_DIRECT_DISABLE: 1
+          NCCL_SHM_DISABLE: 1
+
+  e2e-sanitizer:
+    runs-on: gpu
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Validate presence of GPU devices
+        run: nvidia-smi
+
+      - name: Check shared memory size
+        run: df -h
+
+      - name: Install OpenSSL && pkg-config
+        run: sudo apt-get update && sudo apt-get install -y pkg-config libssl-dev
+
+      - name: Install CUDA and NCCL dependencies
+        if: steps.cache-cuda-nccl.outputs.cache-hit != 'true'
+        env:
+          DEBIAN_FRONTEND: noninteractive
+        run: |
+          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb
+          sudo dpkg -i cuda-keyring_1.1-1_all.deb
+          sudo apt update
+          sudo apt install -y cuda-toolkit-12-2 cuda-command-line-tools-12-2 libnccl2 libnccl-dev
+
+      - name: Find libs
+        run: find /usr -name "libnvrtc*" && find /usr -name libcuda.so
+
+      - name: Cache Rust build
+        uses: actions/cache@v3
+        id: cache-rust
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            target
+          key: rust-build-${{ runner.os }}-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            rust-build-${{ runner.os }}-
+
+      - name: Find libs
+        run: find /usr -name "libnvrtc*" && find /usr -name libcuda.so
+
+      - name: Find compute-sanitizer
+        run: find /usr -name "compute-sanitizer"
+
+      - name: Install Rust nightly
+        uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: nightly
+
+      - name: Build e2e test
+        run: cargo test --release e2e --no-run
+
+      - name: Build e2e test and grab executable name
+        run: echo TEST_NAME=$(cargo --color=never test --release e2e --no-run 2>&1 | grep "Executable tests/e2e.rs" | sed "s/.*(\(.*\))/\1/") >> $GITHUB_OUTPUT
+        id: build-e2e
+
+      - name: E2E Tests w/ compute-sanitizer
+        run: /usr/local/cuda-12.2/bin/compute-sanitizer --tool=memcheck ${{ steps.build-e2e.outputs.TEST_NAME }} --nocapture
+        env:
+          NCCL_DEBUG: info
+          NCCL_P2P_LEVEL: LOC
+          NCCL_NET: Socket
+          NCCL_P2P_DIRECT_DISABLE: 1
+          NCCL_SHM_DISABLE: 1
diff --git a/src/bin/client.rs b/src/bin/client.rs
@@ -24,7 +24,7 @@ use std::{collections::HashMap, sync::Arc, time::Duration};
 use tokio::{spawn, sync::Mutex, time::sleep};
 use uuid::Uuid;
 
-const N_QUERIES: usize = 32 * 20;
+const N_QUERIES: usize = 64 * 20;
 const REGION: &str = "eu-north-1";
 const RNG_SEED_SERVER: u64 = 42;
 const DB_SIZE: usize = 8 * 1_000;

diff --git a/src/bin/server.rs b/src/bin/server.rs
@@ -46,7 +46,7 @@ use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};
 
 const REGION: &str = "eu-north-1";
 const DB_SIZE: usize = 8 * 1_000;
-const N_QUERIES: usize = 32;
+const N_QUERIES: usize = 64;
 const N_BATCHES: usize = 100;
 const RNG_SEED: u64 = 42;
 const SYNC_RESULTS: usize = N_QUERIES * 2;

diff --git a/src/dot/distance_comparator.rs b/src/dot/distance_comparator.rs
@@ -73,6 +73,7 @@ impl DistanceComparator {
         results3: &[CudaView<u64>],
         results_ptrs: &[CudaSlice<u32>],
         db_sizes: &[usize],
+        real_db_sizes: &[usize],
         offset: usize,
         streams: &[CudaStream],
     ) {
@@ -102,6 +103,7 @@ impl DistanceComparator {
                             self.query_length,
                             offset,
                             num_elements,
+                            real_db_sizes[i],
                         ),
                     )
                     .unwrap();

diff --git a/src/dot/kernel.cu b/src/dot/kernel.cu
@@ -23,7 +23,7 @@ extern "C" __global__ void matmul_correct_and_reduce(int *c, unsigned short *out
     }
 }
 
-extern "C" __global__ void openResults(unsigned long long *result1, unsigned long long *result2, unsigned long long *result3, unsigned int *output, size_t dbLength, size_t queryLength, size_t offset, size_t numElements)
+extern "C" __global__ void openResults(unsigned long long *result1, unsigned long long *result2, unsigned long long *result3, unsigned int *output, size_t dbLength, size_t queryLength, size_t offset, size_t numElements, size_t realDbLen)
 {
     size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx < numElements)
@@ -36,7 +36,7 @@ extern "C" __global__ void openResults(unsigned long long *result1, unsigned lon
             bool match = (result & (1ULL << i));
 
             // Check if we are out of bounds for the query or db
-            if (queryIdx >= queryLength || dbIdx >= dbLength) {
+            if (queryIdx >= queryLength || dbIdx >= realDbLen) {
                 continue;
             }
 

diff --git a/src/dot/share_db.rs b/src/dot/share_db.rs
@@ -11,10 +11,15 @@ use crate::{
     rng::chacha::ChaChaCudaRng,
     threshold_ring::protocol::ChunkShare,
 };
+use core::panic;
 #[cfg(feature = "otp_encrypt")]
 use cudarc::driver::{CudaView, DeviceSlice};
 use cudarc::{
-    cublas::{result::gemm_ex, sys, CudaBlas},
+    cublas::{
+        result::gemm_ex,
+        sys::{self, lib},
+        CudaBlas,
+    },
     driver::{
         result::malloc_async, sys::CUdeviceptr, CudaFunction, CudaSlice, CudaStream, DevicePtr,
         LaunchAsync, LaunchConfig,
@@ -25,7 +30,11 @@ use cudarc::{
 #[cfg(feature = "otp_encrypt")]
 use itertools::Itertools;
 use rayon::prelude::*;
-use std::{ffi::c_void, mem, sync::Arc};
+use std::{
+    ffi::{c_void, CStr},
+    mem,
+    sync::Arc,
+};
 
 const PTX_SRC: &str = include_str!("kernel.cu");
 const REDUCE_FUNCTION_NAME: &str = "matmul_correct_and_reduce";
@@ -64,6 +73,13 @@ pub fn gemm(
     alpha: i32,
     beta: i32,
 ) {
+    // https://docs.nvidia.com/cuda/cublas/#cublasgemmex:
+    // "CUBLAS_COMPUTE_32I and CUBLAS_COMPUTE_32I_PEDANTIC compute types are only supported with A, B being 4-byte aligned and lda, ldb being multiples of 4."
+    assert!(m % 4 == 0, "m must be a multiple of 4");
+    // We don't enforce the following, since we use it for n=1 and emperial testing
+    // shows that it works. assert!(n % 4 == 0, "n must be a multiple of 4");
+    assert!(a % 4 == 0, "a must be aligned to 4 bytes");
+    assert!(b % 4 == 0, "b must be aligned to 4 bytes");
     unsafe {
         let status = gemm_ex(
             *handle.handle(),
@@ -87,14 +103,10 @@ pub fn gemm(
             sys::cublasGemmAlgo_t::CUBLAS_GEMM_DEFAULT,
         );
 
-        match status {
-            Ok(_) => {
-                println!("GEMM success");
-            }
-            Err(e) => {
-                // Handle error
-                eprintln!("CUBLAS error: {:?}", e);
-            }
+        // Try to fetch more information in case of an error
+        if let Err(e) = status {
+            let c_str = CStr::from_ptr(lib().cublasGetStatusString(e.0));
+            panic!("CUBLAS error: {:?}", c_str.to_str());
         }
     }
 }
@@ -825,7 +837,7 @@ mod tests {
     use std::sync::Arc;
 
     const WIDTH: usize = 12_800;
-    const QUERY_SIZE: usize = 31;
+    const QUERY_SIZE: usize = 32;
     const DB_SIZE: usize = 8 * 1000;
     const RNG_SEED: u64 = 42;
 
@@ -859,6 +871,7 @@ mod tests {
         let query = random_vec(QUERY_SIZE, WIDTH, u16::MAX as u32);
         let device_manager = Arc::new(DeviceManager::init());
         let n_devices = device_manager.device_count();
+
         let mut gpu_result = vec![0u16; DB_SIZE / n_devices * QUERY_SIZE];
         let db_sizes = vec![DB_SIZE / n_devices; n_devices];
 

diff --git a/src/helpers/device_manager.rs b/src/helpers/device_manager.rs
@@ -71,7 +71,6 @@ impl DeviceManager {
 
     pub fn await_streams(&self, streams: &[CudaStream]) {
         for i in 0..self.devices.len() {
-            self.devices[i].bind_to_thread().unwrap();
             unsafe { synchronize(streams[i].stream).unwrap() }
         }
     }