diff --git a/Cargo.lock b/Cargo.lock
index 883a3577..f538b600 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -534,9 +534,9 @@ name = "distributed"
 version = "0.0.0"
 dependencies = [
  "common-nv",
- "cuda",
  "half",
  "log",
+ "nccl",
  "search-cuda-tools",
  "tokenizer",
  "transformer",
@@ -1036,6 +1036,16 @@ dependencies = [
  "syn 1.0.109",
 ]
 
+[[package]]
+name = "nccl"
+version = "0.1.0"
+source = "git+https://github.com/YdrMaster/cuda-driver?rev=431d02#431d02bfa96dc4da840b0aedc43f35a923c69bbe"
+dependencies = [
+ "bindgen",
+ "cuda",
+ "search-cuda-tools",
+]
+
 [[package]]
 name = "nom"
 version = "7.1.3"
@@ -1821,7 +1831,6 @@ name = "transformer-nv"
 version = "0.0.0"
 dependencies = [
  "common-nv",
- "cuda",
  "half",
  "log",
  "search-cuda-tools",
diff --git a/Cargo.toml b/Cargo.toml
index 0139ec01..905763bd 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -23,4 +23,5 @@
 tokio = { version = "1.37", features = ["rt", "rt-multi-thread", "sync"] }
 cuda = { git = "https://github.com/YdrMaster/cuda-driver", rev = "431d02" }
 cublas = { git = "https://github.com/YdrMaster/cuda-driver", rev = "431d02" }
+nccl = { git = "https://github.com/YdrMaster/cuda-driver", rev = "431d02" }
 search-cuda-tools = { git = "https://github.com/YdrMaster/cuda-driver", rev = "431d02" }
diff --git a/nvidia/common/src/lib.rs b/nvidia/common/src/lib.rs
index bfe5c8da..792bdaba 100644
--- a/nvidia/common/src/lib.rs
+++ b/nvidia/common/src/lib.rs
@@ -2,6 +2,7 @@
 
 #[macro_use]
 extern crate log;
+pub extern crate cuda;
 
 mod fused_softmax;
 mod gather;
diff --git a/nvidia/distributed/Cargo.toml b/nvidia/distributed/Cargo.toml
index d4214997..250db15d 100644
--- a/nvidia/distributed/Cargo.toml
+++ b/nvidia/distributed/Cargo.toml
@@ -9,7 +9,7 @@ authors = ["YdrMaster "]
 
 [dependencies]
 transformer = { path = "../../transformer" }
 common-nv = { path = "../common" }
-cuda.workspace = true
+nccl.workspace = true
 log.workspace = true
 half.workspace = true
diff --git a/nvidia/distributed/src/lib.rs b/nvidia/distributed/src/lib.rs
index 76d23ba7..c7b45cc5 100644
--- a/nvidia/distributed/src/lib.rs
+++ b/nvidia/distributed/src/lib.rs
@@ -1,60 +1,17 @@
 #![cfg(detected_nccl)]
 
-#[test]
-fn test_load() {
-    use cuda::{ContextResource, ContextSpore, Device};
-    use std::{io::ErrorKind::NotFound, time::Instant};
-    use transformer::{Distributer, Llama2, Memory, SafeTensorError};
+mod parameters;
 
-    const N: usize = 1;
+#[macro_use]
+extern crate log;
 
-    cuda::init();
-    if Device::count() < N {
-        return;
-    }
-    let devices = (0..N as _).map(Device::new).collect::<Vec<_>>();
-    let contexts = devices
-        .iter()
-        .map(Device::retain_primary)
-        .collect::<Vec<_>>();
-    let align = devices.iter().map(Device::alignment).max().unwrap();
+pub use common_nv::cuda;
 
-    let time = Instant::now();
-    let safetensors = Memory::load_safetensors_from_dir("../../../TinyLlama-1.1B-Chat-v1.0");
-    println!("mmap {:?}", time.elapsed());
+use parameters::ParameterMatrix;
 
-    let model = match safetensors {
-        Ok(m) => m,
-        Err(SafeTensorError::Io(e)) if e.kind() == NotFound => return,
-        Err(e) => panic!("{e:?}"),
-    };
-
-    let nlayers = model.num_hidden_layers();
-    let mut matrix = Vec::with_capacity(contexts.len() * nlayers);
-
-    let distributer = Distributer::new(&model, contexts.len(), align);
-    let time = Instant::now();
-    for (i, context) in contexts.iter().enumerate() {
-        context.apply(|ctx| {
-            let stream = ctx.stream();
-            for layer in 0..nlayers {
-                matrix.push(
-                    stream
-                        .from_host(distributer.distribute(layer, i).as_slice())
-                        .sporulate(),
-                );
-            }
-        });
-    }
-    println!("distribute {:?}", time.elapsed());
-
-    let time = Instant::now();
-    for (i, context) in contexts.iter().enumerate() {
-        context.apply(|ctx| {
-            for element in &mut matrix[i * nlayers..][..nlayers] {
-                unsafe { element.kill(ctx) };
-            }
-        });
-    }
-    println!("kill {:?}", time.elapsed());
+pub struct Transformer {
+    comms: nccl::CommunicatorGroup,
+    matrix: ParameterMatrix,
 }
+
+impl Transformer {}
diff --git a/nvidia/distributed/src/parameters.rs b/nvidia/distributed/src/parameters.rs
new file mode 100644
index 00000000..3b1b50ad
--- /dev/null
+++ b/nvidia/distributed/src/parameters.rs
@@ -0,0 +1,148 @@
+use common_nv::{
+    cuda::{Context, ContextGuard, ContextResource, ContextSpore, DevMem, DevMemSpore, DevSlice},
+    udim, Tensor,
+};
+use std::time::Instant;
+use transformer::{DistributeScheme, Distributer, Llama2};
+
+pub struct ParameterMatrix {
+    scheme: DistributeScheme,
+    matrix: Vec<DevMemSpore>,
+}
+
+impl ParameterMatrix {
+    pub fn load(model: &dyn Llama2, contexts: &[Context]) -> Self {
+        let align = contexts
+            .iter()
+            .map(|ctx| ctx.device().alignment())
+            .max()
+            .unwrap();
+
+        let nlayers = model.num_hidden_layers();
+        let mut matrix = Vec::with_capacity(contexts.len() * nlayers);
+
+        let distributer = Distributer::new(model, contexts.len(), align);
+        let time = Instant::now();
+        for (i, context) in contexts.iter().enumerate() {
+            context.apply(|ctx| {
+                let stream = ctx.stream();
+                for layer in 0..nlayers {
+                    matrix.push(
+                        stream
+                            .from_host(distributer.distribute(layer, i).as_slice())
+                            .sporulate(),
+                    );
+                }
+            });
+        }
+        info!("distribute {:?}", time.elapsed());
+
+        Self {
+            scheme: distributer.scheme().clone(),
+            matrix,
+        }
+    }
+
+    pub unsafe fn kill(&mut self, contexts: &[Context]) {
+        assert_eq!(contexts.len(), self.scheme.n);
+        let nlayers = self.matrix.len() / self.scheme.n;
+        for (i, context) in contexts.iter().enumerate() {
+            context.apply(|ctx| {
+                for element in &mut self.matrix[i * nlayers..][..nlayers] {
+                    element.kill(ctx);
+                }
+            });
+        }
+    }
+}
+
+pub struct Layer<'ctx> {
+    scheme: &'ctx DistributeScheme,
+    mem: DevMem<'ctx>,
+}
+
+impl ParameterMatrix {
+    pub fn get<'ctx>(&'ctx self, layer: usize, i: usize, ctx: &'ctx ContextGuard) -> Layer<'ctx> {
+        let nlayers = self.matrix.len() / self.scheme.n;
+        Layer {
+            scheme: &self.scheme,
+            mem: unsafe { self.matrix[i * nlayers + layer].sprout(ctx) },
+        }
+    }
+}
+
+impl Layer<'_> {
+    #[inline]
+    pub fn input_layernorm(&self) -> Tensor<&DevSlice> {
+        let d = self.scheme.nh * self.scheme.dh;
+        Tensor::new(self.scheme.dt, &[d], todo!())
+    }
+
+    #[inline]
+    pub fn w_qkv(&self) -> Tensor<&[u8]> {
+        let nh = self.scheme.nh;
+        let nkvh = self.scheme.nkvh;
+        let dh = self.scheme.dh;
+        let d = nh * dh;
+        let n = self.scheme.n as udim;
+        Tensor::new(self.scheme.dt, &[(nh + nkvh + nkvh) / n * dh, d], todo!())
+    }
+
+    #[inline]
+    pub fn w_o(&self) -> Tensor<&[u8]> {
+        let d = self.scheme.nh * self.scheme.dh;
+        let n = self.scheme.n as udim;
+        Tensor::new(self.scheme.dt, &[d / n, d], todo!())
+    }
+
+    #[inline]
+    pub fn post_att_layernorm(&self) -> Tensor<&[u8]> {
+        let d = self.scheme.nh * self.scheme.dh;
+        Tensor::new(self.scheme.dt, &[d], todo!())
+    }
+
+    #[inline]
+    pub fn mlp_gate_up(&self) -> Tensor<&[u8]> {
+        let di = self.scheme.di;
+        let d = self.scheme.nh * self.scheme.dh;
+        let n = self.scheme.n as udim;
+        Tensor::new(self.scheme.dt, &[(di + di) / n, d], todo!())
+    }
+
+    #[inline]
+    pub fn mlp_down(&self) -> Tensor<&[u8]> {
+        let di = self.scheme.di;
+        let d = self.scheme.nh * self.scheme.dh;
+        let n = self.scheme.n as udim;
+        Tensor::new(self.scheme.dt, &[d, di / n], todo!())
+    }
+}
+
+#[test]
+fn test_load() {
+    use common_nv::cuda::{self, Device};
+    use std::io::ErrorKind::NotFound;
+    use transformer::{Memory, SafeTensorError};
+
+    const N: usize = 1;
+
+    cuda::init();
+    if Device::count() < N {
+        return;
+    }
+
+    let time = Instant::now();
+    let safetensors = Memory::load_safetensors_from_dir("../../../TinyLlama-1.1B-Chat-v1.0");
+    println!("mmap {:?}", time.elapsed());
+
+    let model = match safetensors {
+        Ok(m) => m,
+        Err(SafeTensorError::Io(e)) if e.kind() == NotFound => return,
+        Err(e) => panic!("{e:?}"),
+    };
+
+    let contexts = (0..N as _)
+        .map(|i| Device::new(i).retain_primary())
+        .collect::<Vec<_>>();
+    unsafe { ParameterMatrix::load(&model, &contexts).kill(&contexts) };
+}
diff --git a/nvidia/transformer/Cargo.toml b/nvidia/transformer/Cargo.toml
index 55f4f688..7fa118c8 100644
--- a/nvidia/transformer/Cargo.toml
+++ b/nvidia/transformer/Cargo.toml
@@ -9,7 +9,6 @@ authors = ["YdrMaster "]
 
 [dependencies]
 transformer = { path = "../../transformer" }
 common-nv = { path = "../common" }
-cuda.workspace = true
 log.workspace = true
 half.workspace = true
diff --git a/nvidia/transformer/src/lib.rs b/nvidia/transformer/src/lib.rs
index c6baa5c9..2e6a33ef 100644
--- a/nvidia/transformer/src/lib.rs
+++ b/nvidia/transformer/src/lib.rs
@@ -4,7 +4,8 @@ mod parameters;
 
 #[macro_use]
 extern crate log;
-pub extern crate cuda;
+
+pub use common_nv::cuda;
 
 use ::half::f16;
 use common_nv::{slice, udim, utok, Cache, DataType, NvidiaKernels, Storage, Tensor};
diff --git a/nvidia/transformer/src/parameters.rs b/nvidia/transformer/src/parameters.rs
index d89accbf..c024166b 100644
--- a/nvidia/transformer/src/parameters.rs
+++ b/nvidia/transformer/src/parameters.rs
@@ -1,5 +1,7 @@
 use crate::{Llama2, Tensor};
-use cuda::{ContextGuard, ContextResource, ContextSpore, DevMem, DevMemSpore, EventSpore, Stream};
+use common_nv::cuda::{
+    ContextGuard, ContextResource, ContextSpore, DevMem, DevMemSpore, EventSpore, Stream,
+};
 
 pub(crate) struct ModelParameters {
     model_norm: Tensor<DevMemSpore>,
@@ -83,7 +85,7 @@ pub(crate) struct LayerParameter {
     pub mlp_down: Tensor<DevMemSpore>,
 
     layer: usize,
-    sync_event: cuda::EventSpore,
+    sync_event: EventSpore,
 }
 
 impl LayerParameter {
diff --git a/transformer/src/lib.rs b/transformer/src/lib.rs
index 840639ff..20fdb9eb 100644
--- a/transformer/src/lib.rs
+++ b/transformer/src/lib.rs
@@ -15,7 +15,9 @@ pub use blas::Matrix;
 pub use buffer::LayerBuffer;
 pub use cache::LayerCache;
 pub use kernels::Kernels;
-pub use parameters::{save, DistributedLayer, Distributer, Llama2, Memory, SafeTensorError};
+pub use parameters::{
+    save, DistributeScheme, DistributedLayer, Distributer, Llama2, Memory, SafeTensorError,
+};
 pub use pos::pos;
 pub use request::Request;
 pub use sample::{BetweenF32, SampleArgs};
diff --git a/transformer/src/parameters/distribute.rs b/transformer/src/parameters/distribute.rs
index 6a9e5653..6f043066 100644
--- a/transformer/src/parameters/distribute.rs
+++ b/transformer/src/parameters/distribute.rs
@@ -3,16 +3,11 @@ use std::sync::Arc;
 use tensor::{slice, udim, Blob, DataType, Tensor};
 
 pub struct DistributedLayer {
-    scheme: Arc<Scheme>,
+    scheme: Arc<DistributeScheme>,
     blob: Blob,
 }
 
 impl DistributedLayer {
-    #[inline]
-    pub fn scheme(&self) -> &Scheme {
-        &self.scheme
-    }
-
     #[inline]
     pub fn as_slice(&self) -> &[u8] {
         &self.blob
@@ -84,7 +79,7 @@
 
 pub struct Distributer<'a> {
     model: &'a dyn Llama2,
-    scheme: Arc<Scheme>,
+    scheme: Arc<DistributeScheme>,
 }
 
 impl<'a> Distributer<'a> {
@@ -92,10 +87,15 @@ impl<'a> Distributer<'a> {
     pub fn new(model: &'a dyn Llama2, n: usize, align: usize) -> Self {
         Self {
             model,
-            scheme: Scheme::new(model, n, align),
+            scheme: DistributeScheme::new(model, n, align),
         }
     }
 
+    #[inline]
+    pub fn scheme(&self) -> &DistributeScheme {
+        &self.scheme
+    }
+
     pub fn distribute(&self, layer: usize, i: usize) -> DistributedLayer {
         assert!(layer < self.model.num_hidden_layers());
         assert!(i < self.scheme.n);
@@ -211,7 +211,7 @@
 }
 
 #[derive(Clone, Debug)]
-pub struct Scheme {
+pub struct DistributeScheme {
     /// data type
     pub dt: DataType,
     /// num heads
@@ -233,7 +233,7 @@
     pub total_size: usize,
 }
 
-impl Scheme {
+impl DistributeScheme {
     #[inline]
     fn new(model: &dyn Llama2, n: usize, align: usize) -> Arc<Self> {
         assert_eq!(model.num_key_value_heads() % n, 0);
diff --git a/transformer/src/parameters/mod.rs b/transformer/src/parameters/mod.rs
index 06e64350..a709b176 100644
--- a/transformer/src/parameters/mod.rs
+++ b/transformer/src/parameters/mod.rs
@@ -8,7 +8,7 @@ use common::utok;
 use tensor::{DataType, Tensor};
 
 mod distribute;
-pub use distribute::{DistributedLayer, Distributer};
+pub use distribute::{DistributeScheme, DistributedLayer, Distributer};
 pub use memory::Memory;
 pub use safe_tensors::SafeTensorError;
 pub use save::save;