build: update operators-rs

Signed-off-by: YdrMaster <[email protected]>
InfiniTensor · Jul 15, 2024 · 4251763 · 4251763
1 parent 7847e93
commit 4251763
Show file tree

Hide file tree

Showing 13 changed files with 108 additions and 103 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -26,6 +26,11 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@v4
 
+      - name: cuda-toolkit
+        uses: Jimver/[email protected]
+        with:
+          method: 'network'
+
       - name: Check format
         run: cargo fmt --check
 

diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -35,7 +35,6 @@ tokio = { version = "1.38", features = ["rt-multi-thread", "sync"] }
 digit-layout = "0.0"
 build-script-cfg = "0.0"
 
-operators = { git = "https://github.com/YdrMaster/operators-rs", rev = "04e71d5", default-features = false }
-nccl = { git = "https://github.com/YdrMaster/cuda-driver", rev = "343b0e0" }
-search-cuda-tools = { git = "https://github.com/YdrMaster/cuda-driver", rev = "343b0e0" }
+operators = { git = "https://github.com/YdrMaster/operators-rs", rev = "9e621e6", default-features = false }
+search-cuda-tools = { git = "https://github.com/YdrMaster/cuda-driver", rev = "fb088b6" }
 search-neuware-tools = "0.0"
diff --git a/devices/nvidia-gpu/build.rs b/devices/nvidia-gpu/build.rs
@@ -1,10 +1,14 @@
 fn main() {
     use build_script_cfg::Cfg;
-    use search_cuda_tools::find_cuda_root;
+    use search_cuda_tools::{find_cuda_root, find_nccl_root};
 
     let cuda = Cfg::new("detected_cuda");
+    let nccl = Cfg::new("detected_nccl");
     if find_cuda_root().is_some() {
         cuda.define();
+        if find_nccl_root().is_some() {
+            nccl.define();
+        }
         println!("cargo:rerun-if-changed=src/sample.cu");
         cc::Build::new()
             .cuda(true)

diff --git a/devices/nvidia-gpu/src/lib.rs b/devices/nvidia-gpu/src/lib.rs
@@ -23,6 +23,9 @@ pub use operators::{cuda, nvidia_gpu::Handle as Gpu};
 pub use sample::{sample_cpu, sample_nv};
 pub use tensor::{reslice, reslice_mut, slice, split, udim, LocalSplitable, Tensor};
 
+#[cfg(detected_nccl)]
+pub use operators::nccl;
+
 pub struct NvidiaKernels(HashMap<i32, Internal>);
 
 struct Internal {
@@ -186,7 +189,9 @@ impl KernelsB for NvidiaKernels {
 }
 
 pub fn synchronize() {
-    cuda::init();
+    if let Err(cuda::NoDevice) = cuda::init() {
+        return;
+    }
     for i in 0..cuda::Device::count() {
         cuda::Device::new(i as _)
             .retain_primary()

diff --git a/models/llama/nvidia-gpu-distributed/Cargo.toml b/models/llama/nvidia-gpu-distributed/Cargo.toml
@@ -12,7 +12,6 @@ common-nv = { path = "../../../devices/nvidia-gpu" }
 causal-lm = { path = "../../../causal-lm" }
 llama = { path = "../common" }
 digit-layout.workspace = true
-nccl.workspace = true
 log.workspace = true
 itertools.workspace = true
 

diff --git a/models/llama/nvidia-gpu-distributed/src/lib.rs b/models/llama/nvidia-gpu-distributed/src/lib.rs
@@ -13,12 +13,12 @@ use common_nv::{
         AsRaw, Context, ContextResource, ContextSpore, DevByte, DevMem, DevMemSpore, Device,
         HostMemSpore, Stream, StreamSpore,
     },
+    nccl::{CommunicatorGroup, ReduceType},
     sample_nv, slice, split, udim, KernelsA, KernelsB, LocalSplitable, NvidiaKernels, Tensor,
 };
 use digit_layout::types::F16;
 use itertools::izip;
 use llama::InferenceConfig;
-use nccl::CommunicatorGroup;
 use parameters::{Layer, ParameterMatrix};
 use std::{
     iter::{repeat, zip},
@@ -312,7 +312,7 @@ impl CausalLM for Transformer {
                                     x.physical_mut(),
                                     None,
                                     self.config.dt,
-                                    nccl::ReduceType::ncclSum,
+                                    ReduceType::ncclSum,
                                     stream,
                                 );
 
@@ -321,7 +321,7 @@ impl CausalLM for Transformer {
                                     x.physical_mut(),
                                     None,
                                     self.config.dt,
-                                    nccl::ReduceType::ncclSum,
+                                    ReduceType::ncclSum,
                                     stream,
                                 );
                             }
@@ -623,7 +623,9 @@ fn malloc_all(contexts: &[Context], len: usize) -> Vec<DevMemSpore> {
 
 #[test]
 fn test_infer() {
-    cuda::init();
+    if let Err(cuda::NoDevice) = cuda::init() {
+        return;
+    }
     if cuda::Device::count() >= 2 {
         causal_lm::test_impl::<Transformer>(
             [0, 1].map(cuda::Device::new).into_iter().collect(),

diff --git a/models/llama/nvidia-gpu-distributed/src/parameters.rs b/models/llama/nvidia-gpu-distributed/src/parameters.rs
@@ -147,7 +147,9 @@ fn test_load() {
 
     const N: usize = 1;
 
-    cuda::init();
+    if let Err(cuda::NoDevice) = cuda::init() {
+        return;
+    }
     if Device::count() < N {
         return;
     }

diff --git a/models/llama/nvidia-gpu/src/lib.rs b/models/llama/nvidia-gpu/src/lib.rs
@@ -550,17 +550,18 @@ impl Drop for LayerLoader<'_> {
 
 #[test]
 fn test_infer() {
-    cuda::init();
-    if let Some(device) = cuda::Device::fetch() {
-        causal_lm::test_impl::<Transformer>(
-            ModelLoadMeta {
-                device,
-                load_layers: 20,
-            },
-            &[
-                29966, 29989, 1792, 29989, 29958, 13, 29903, 388, 376, 18567, 29908, 304, 592,
-                21106, 29879, 5299, 29989, 465, 22137, 29989, 29958, 13,
-            ],
-        );
-    };
+    if let Err(cuda::NoDevice) = cuda::init() {
+        return;
+    }
+    let device = cuda::Device::new(0);
+    causal_lm::test_impl::<Transformer>(
+        ModelLoadMeta {
+            device,
+            load_layers: 20,
+        },
+        &[
+            29966, 29989, 1792, 29989, 29958, 13, 29903, 388, 376, 18567, 29908, 304, 592, 21106,
+            29879, 5299, 29989, 465, 22137, 29989, 29958, 13,
+        ],
+    );
 }
diff --git a/web-api/src/manager.rs b/web-api/src/manager.rs
@@ -46,12 +46,12 @@ where
                     m.content = general_purpose::STANDARD
                         .decode(content)
                         .map(String::from_utf8)
-                        .map_err(|_| Error::ContentError(format!("Decode failed: {content}")))?
-                        .map_err(|_| Error::ContentError(format!("Decode failed: {content}")))?;
+                        .map_err(|_| Error::InvalidContent(format!("Decode failed: {content}")))?
+                        .map_err(|_| Error::InvalidContent(format!("Decode failed: {content}")))?;
                 }
             }
             Some("text") => {}
-            Some(e) => return Err(Error::ContentError(format!("Unknown encoding: {e}"))),
+            Some(e) => return Err(Error::InvalidContent(format!("Unknown encoding: {e}"))),
         };
 
         async fn infer<M: CausalLM>(

diff --git a/web-api/src/schemas.rs b/web-api/src/schemas.rs
@@ -83,7 +83,7 @@ impl Success for DropSuccess {
 pub(crate) enum Error {
     Session(SessionError),
     WrongJson(serde_json::Error),
-    ContentError(String),
+    InvalidContent(String),
     InvalidDialogPos(usize),
 }
 
@@ -103,7 +103,7 @@ impl Error {
             Self::Session(Busy) => StatusCode::NOT_ACCEPTABLE,
             Self::Session(Duplicate) => StatusCode::CONFLICT,
             Self::WrongJson(_) => StatusCode::BAD_REQUEST,
-            Self::ContentError(_) => StatusCode::BAD_REQUEST,
+            Self::InvalidContent(_) => StatusCode::BAD_REQUEST,
             Self::InvalidDialogPos(_) => StatusCode::RANGE_NOT_SATISFIABLE,
         }
     }
@@ -131,7 +131,7 @@ impl Error {
             Self::Session(Busy) => json(error!(0, "Session is busy")),
             Self::Session(Duplicate) => json(error!(0, "Session ID already exists")),
             Self::WrongJson(e) => json(error!(0, e.to_string())),
-            Self::ContentError(e) => json(error!(1, e)),
+            Self::InvalidContent(e) => json(error!(1, e)),
             &Self::InvalidDialogPos(current_dialog_pos) => {
                 #[derive(serde::Serialize)]
                 struct ErrorBodyExtra {

diff --git a/xtask/src/list_turbo.rs b/xtask/src/list_turbo.rs
@@ -9,7 +9,9 @@
 fn list_nv() {
     use llama_nv::cuda::{self, Device as Gpu};
 
-    cuda::init();
+    if let Err(cuda::NoDevice) = cuda::init() {
+        return;
+    }
     println!("NVidia CUDA environment detected, use `--turbo nv:` to select.");
     for i in 0..Gpu::count() {
         let gpu = Gpu::new(i as _);