feat(distribute): initial support for loading layer matrices and slicing out parameter tensors
Signed-off-by: YdrMaster <[email protected]>
YdrMaster committed Apr 3, 2024
1 parent c14108b commit 7eac97a
Showing 12 changed files with 192 additions and 72 deletions.
13 changes: 11 additions & 2 deletions Cargo.lock

(Generated file; diff not rendered.)

1 change: 1 addition & 0 deletions Cargo.toml
@@ -23,4 +23,5 @@ tokio = { version = "1.37", features = ["rt", "rt-multi-thread", "sync"] }

cuda = { git = "https://github.com/YdrMaster/cuda-driver", rev = "431d02" }
cublas = { git = "https://github.com/YdrMaster/cuda-driver", rev = "431d02" }
+nccl = { git = "https://github.com/YdrMaster/cuda-driver", rev = "431d02" }
search-cuda-tools = { git = "https://github.com/YdrMaster/cuda-driver", rev = "431d02" }
1 change: 1 addition & 0 deletions nvidia/common/src/lib.rs
@@ -2,6 +2,7 @@

#[macro_use]
extern crate log;
+pub extern crate cuda;

mod fused_softmax;
mod gather;
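The `pub extern crate cuda;` re-export above is what lets the other crates in this commit drop their direct `cuda` dependency (see the Cargo.toml hunks below). A minimal sketch of the resulting usage, assuming only items that appear elsewhere in this commit (`cuda::init`, `Device::count`):

```rust
// Downstream crates reach the driver bindings through common-nv's re-export
// instead of declaring `cuda.workspace = true` themselves.
use common_nv::cuda::{self, Device};

fn visible_devices() -> usize {
    cuda::init();     // same initialization call the tests in this commit use
    Device::count()   // number of CUDA devices visible to the driver
}
```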
2 changes: 1 addition & 1 deletion nvidia/distributed/Cargo.toml
@@ -9,7 +9,7 @@ authors = ["YdrMaster <[email protected]>"]
[dependencies]
transformer = { path = "../../transformer" }
common-nv = { path = "../common" }
-cuda.workspace = true
+nccl.workspace = true
log.workspace = true
half.workspace = true

63 changes: 10 additions & 53 deletions nvidia/distributed/src/lib.rs
@@ -1,60 +1,17 @@
#![cfg(detected_nccl)]

-#[test]
-fn test_load() {
-    use cuda::{ContextResource, ContextSpore, Device};
-    use std::{io::ErrorKind::NotFound, time::Instant};
-    use transformer::{Distributer, Llama2, Memory, SafeTensorError};
-
-    const N: usize = 1;
-
-    cuda::init();
-    if Device::count() < N {
-        return;
-    }
-    let devices = (0..N as _).map(Device::new).collect::<Vec<_>>();
-    let contexts = devices
-        .iter()
-        .map(Device::retain_primary)
-        .collect::<Vec<_>>();
-    let align = devices.iter().map(Device::alignment).max().unwrap();
-
-    let time = Instant::now();
-    let safetensors = Memory::load_safetensors_from_dir("../../../TinyLlama-1.1B-Chat-v1.0");
-    println!("mmap {:?}", time.elapsed());
-
-    let model = match safetensors {
-        Ok(m) => m,
-        Err(SafeTensorError::Io(e)) if e.kind() == NotFound => return,
-        Err(e) => panic!("{e:?}"),
-    };
-
-    let nlayers = model.num_hidden_layers();
-    let mut matrix = Vec::with_capacity(contexts.len() * nlayers);
-
-    let distributer = Distributer::new(&model, contexts.len(), align);
-    let time = Instant::now();
-    for (i, context) in contexts.iter().enumerate() {
-        context.apply(|ctx| {
-            let stream = ctx.stream();
-            for layer in 0..nlayers {
-                matrix.push(
-                    stream
-                        .from_host(distributer.distribute(layer, i).as_slice())
-                        .sporulate(),
-                );
-            }
-        });
-    }
-    println!("distribute {:?}", time.elapsed());
-
-    let time = Instant::now();
-    for (i, context) in contexts.iter().enumerate() {
-        context.apply(|ctx| {
-            for element in &mut matrix[i * nlayers..][..nlayers] {
-                unsafe { element.kill(ctx) };
-            }
-        });
-    }
-    println!("kill {:?}", time.elapsed());
-}
+mod parameters;
+
+#[macro_use]
+extern crate log;
+
+pub use common_nv::cuda;
+
+use parameters::ParameterMatrix;
+
+pub struct Transformer {
+    comms: nccl::CommunicatorGroup,
+    matrix: ParameterMatrix,
+}
+
+impl Transformer {}
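`impl Transformer {}` is still empty; the struct only pairs an NCCL communicator group with the device-resident parameters. Because `ParameterMatrix::kill` is unsafe and must re-enter each owning context, `Transformer` cannot simply rely on `Drop`; the sketch below shows the explicit teardown this pairing will likely need. It is an assumption, not part of the commit.

```rust
// Sketch only (assumed follow-up, not in this commit): explicit teardown that
// forwards to ParameterMatrix::kill, which frees each layer's device memory
// inside the context that allocated it.
impl Transformer {
    pub unsafe fn kill(&mut self, contexts: &[cuda::Context]) {
        self.matrix.kill(contexts);
        // Releasing the NCCL communicators is left out here; this commit does
        // not show the nccl crate's API for that.
    }
}
```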
148 changes: 148 additions & 0 deletions nvidia/distributed/src/parameters.rs
@@ -0,0 +1,148 @@
use common_nv::{
    cuda::{Context, ContextGuard, ContextResource, ContextSpore, DevMem, DevMemSpore, DevSlice},
    udim, Tensor,
};
use std::time::Instant;
use transformer::{DistributeScheme, Distributer, Llama2};

pub struct ParameterMatrix {
    scheme: DistributeScheme,
    matrix: Vec<DevMemSpore>,
}

impl ParameterMatrix {
    pub fn load(model: &dyn Llama2, contexts: &[Context]) -> Self {
        let align = contexts
            .iter()
            .map(|ctx| ctx.device().alignment())
            .max()
            .unwrap();

        let nlayers = model.num_hidden_layers();
        let mut matrix = Vec::with_capacity(contexts.len() * nlayers);

        let distributer = Distributer::new(model, contexts.len(), align);
        let time = Instant::now();
        for (i, context) in contexts.iter().enumerate() {
            context.apply(|ctx| {
                let stream = ctx.stream();
                for layer in 0..nlayers {
                    matrix.push(
                        stream
                            .from_host(distributer.distribute(layer, i).as_slice())
                            .sporulate(),
                    );
                }
            });
        }
        info!("distribute {:?}", time.elapsed());

        Self {
            scheme: distributer.scheme().clone(),
            matrix,
        }
    }

    pub unsafe fn kill(&mut self, contexts: &[Context]) {
        assert_eq!(contexts.len(), self.scheme.n);
        let nlayers = self.matrix.len() / self.scheme.n;
        for (i, context) in contexts.iter().enumerate() {
            context.apply(|ctx| {
                for element in &mut self.matrix[i * nlayers..][..nlayers] {
                    element.kill(ctx);
                }
            });
        }
    }
}

pub struct Layer<'ctx> {
    scheme: &'ctx DistributeScheme,
    mem: DevMem<'ctx>,
}

impl ParameterMatrix {
    pub fn get<'ctx>(&'ctx self, layer: usize, i: usize, ctx: &'ctx ContextGuard) -> Layer<'ctx> {
        let nlayers = self.matrix.len() / self.scheme.n;
        Layer {
            scheme: &self.scheme,
            mem: unsafe { self.matrix[i * nlayers + layer].sprout(ctx) },
        }
    }
}

impl Layer<'_> {
    #[inline]
    pub fn input_layernorm(&self) -> Tensor<&DevSlice> {
        let d = self.scheme.nh * self.scheme.dh;
        Tensor::new(self.scheme.dt, &[d], todo!())
    }

    #[inline]
    pub fn w_qkv(&self) -> Tensor<&[u8]> {
        let nh = self.scheme.nh;
        let nkvh = self.scheme.nkvh;
        let dh = self.scheme.dh;
        let d = nh * dh;
        let n = self.scheme.n as udim;
        Tensor::new(self.scheme.dt, &[(nh + nkvh + nkvh) / n * dh, d], todo!())
    }

    #[inline]
    pub fn w_o(&self) -> Tensor<&[u8]> {
        let d = self.scheme.nh * self.scheme.dh;
        let n = self.scheme.n as udim;
        Tensor::new(self.scheme.dt, &[d / n, d], todo!())
    }

    #[inline]
    pub fn post_att_layernorm(&self) -> Tensor<&[u8]> {
        let d = self.scheme.nh * self.scheme.dh;
        Tensor::new(self.scheme.dt, &[d], todo!())
    }

    #[inline]
    pub fn mlp_gate_up(&self) -> Tensor<&[u8]> {
        let di = self.scheme.di;
        let d = self.scheme.nh * self.scheme.dh;
        let n = self.scheme.n as udim;
        Tensor::new(self.scheme.dt, &[(di + di) / n, d], todo!())
    }

    #[inline]
    pub fn mlp_down(&self) -> Tensor<&[u8]> {
        let di = self.scheme.di;
        let d = self.scheme.nh * self.scheme.dh;
        let n = self.scheme.n as udim;
        Tensor::new(self.scheme.dt, &[d, di / n], todo!())
    }
}

#[test]
fn test_load() {
    use common_nv::cuda::{self, Device};
    use std::io::ErrorKind::NotFound;
    use transformer::{Memory, SafeTensorError};

    const N: usize = 1;

    cuda::init();
    if Device::count() < N {
        return;
    }

    let time = Instant::now();
    let safetensors = Memory::load_safetensors_from_dir("../../../TinyLlama-1.1B-Chat-v1.0");
    println!("mmap {:?}", time.elapsed());

    let model = match safetensors {
        Ok(m) => m,
        Err(SafeTensorError::Io(e)) if e.kind() == NotFound => return,
        Err(e) => panic!("{e:?}"),
    };

    let contexts = (0..N as _)
        .map(|i| Device::new(i).retain_primary())
        .collect::<Vec<_>>();
    unsafe { ParameterMatrix::load(&model, &contexts).kill(&contexts) };
}
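Taken together, the new `ParameterMatrix` API has a three-step lifecycle: `load` distributes every layer onto every device and keeps the results as `DevMemSpore`s, `get` re-sprouts one layer inside a live context, and `kill` frees the memory from inside each owning context again. A usage sketch follows, assuming the same crate context as the test above; note that the `Layer` accessors still carry `todo!()` for their data pointers, so only the load/kill path actually runs in this commit.

```rust
use common_nv::cuda::{self, Device};
use transformer::Memory;

fn demo() {
    const N: usize = 1;

    cuda::init();
    if Device::count() < N {
        return;
    }
    let contexts = (0..N as _)
        .map(|i| Device::new(i).retain_primary())
        .collect::<Vec<_>>();

    // 1. Load the model and distribute every layer onto the devices.
    let model = Memory::load_safetensors_from_dir("../../../TinyLlama-1.1B-Chat-v1.0").unwrap();
    let mut matrix = ParameterMatrix::load(&model, &contexts);

    // 2. Re-enter a context to view one layer's slice of the parameters.
    contexts[0].apply(|ctx| {
        let _layer = matrix.get(0, 0, ctx);
        // _layer.w_qkv(), _layer.mlp_down(), ... once the todo!()s are filled in.
    });

    // 3. Device memory held as spores must be released in its own context.
    unsafe { matrix.kill(&contexts) };
}
```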
1 change: 0 additions & 1 deletion nvidia/transformer/Cargo.toml
@@ -9,7 +9,6 @@ authors = ["YdrMaster <[email protected]>"]
[dependencies]
transformer = { path = "../../transformer" }
common-nv = { path = "../common" }
-cuda.workspace = true
log.workspace = true
half.workspace = true

3 changes: 2 additions & 1 deletion nvidia/transformer/src/lib.rs
@@ -4,7 +4,8 @@ mod parameters;

#[macro_use]
extern crate log;
-pub extern crate cuda;
+
+pub use common_nv::cuda;

use ::half::f16;
use common_nv::{slice, udim, utok, Cache, DataType, NvidiaKernels, Storage, Tensor};
6 changes: 4 additions & 2 deletions nvidia/transformer/src/parameters.rs
@@ -1,5 +1,7 @@
use crate::{Llama2, Tensor};
-use cuda::{ContextGuard, ContextResource, ContextSpore, DevMem, DevMemSpore, EventSpore, Stream};
+use common_nv::cuda::{
+    ContextGuard, ContextResource, ContextSpore, DevMem, DevMemSpore, EventSpore, Stream,
+};

pub(crate) struct ModelParameters {
    model_norm: Tensor<DevMemSpore>,
@@ -83,7 +85,7 @@ pub(crate) struct LayerParameter {
    pub mlp_down: Tensor<DevMemSpore>,

    layer: usize,
-    sync_event: cuda::EventSpore,
+    sync_event: EventSpore,
}

impl LayerParameter {
4 changes: 3 additions & 1 deletion transformer/src/lib.rs
@@ -15,7 +15,9 @@ pub use blas::Matrix;
pub use buffer::LayerBuffer;
pub use cache::LayerCache;
pub use kernels::Kernels;
-pub use parameters::{save, DistributedLayer, Distributer, Llama2, Memory, SafeTensorError};
+pub use parameters::{
+    save, DistributeScheme, DistributedLayer, Distributer, Llama2, Memory, SafeTensorError,
+};
pub use pos::pos;
pub use request::Request;
pub use sample::{BetweenF32, SampleArgs};
20 changes: 10 additions & 10 deletions transformer/src/parameters/distribute.rs
@@ -3,16 +3,11 @@ use std::sync::Arc;
use tensor::{slice, udim, Blob, DataType, Tensor};

pub struct DistributedLayer {
-    scheme: Arc<Scheme>,
+    scheme: Arc<DistributeScheme>,
    blob: Blob,
}

impl DistributedLayer {
-    #[inline]
-    pub fn scheme(&self) -> &Scheme {
-        &self.scheme
-    }
-
    #[inline]
    pub fn as_slice(&self) -> &[u8] {
        &self.blob
@@ -84,18 +79,23 @@ impl DistributedLayer {

pub struct Distributer<'a> {
    model: &'a dyn Llama2,
-    scheme: Arc<Scheme>,
+    scheme: Arc<DistributeScheme>,
}

impl<'a> Distributer<'a> {
    #[inline]
    pub fn new(model: &'a dyn Llama2, n: usize, align: usize) -> Self {
        Self {
            model,
-            scheme: Scheme::new(model, n, align),
+            scheme: DistributeScheme::new(model, n, align),
        }
    }

+    #[inline]
+    pub fn scheme(&self) -> &DistributeScheme {
+        &self.scheme
+    }
+
    pub fn distribute(&self, layer: usize, i: usize) -> DistributedLayer {
        assert!(layer < self.model.num_hidden_layers());
        assert!(i < self.scheme.n);
@@ -211,7 +211,7 @@ impl<'a> Distributer<'a> {
}

#[derive(Clone, Debug)]
-pub struct Scheme {
+pub struct DistributeScheme {
    /// data type
    pub dt: DataType,
    /// num heads
@@ -233,7 +233,7 @@ pub struct Scheme {
    pub total_size: usize,
}

-impl Scheme {
+impl DistributeScheme {
    #[inline]
    fn new(model: &dyn Llama2, n: usize, align: usize) -> Arc<Self> {
        assert_eq!(model.num_key_value_heads() % n, 0);
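Making `DistributeScheme` public (and adding `Distributer::scheme()`) is what lets nvidia/distributed compute per-device tensor shapes without re-deriving them from the model config. A sketch of the kind of downstream use this enables, mirroring `Layer::w_qkv` above; the `usize` casts are only there to avoid assuming the exact `udim` alias:

```rust
use transformer::{Distributer, Llama2};

// Per-device shape of the fused QKV weight for an n-way split, derived from
// the distribution scheme rather than recomputed from the model config.
fn qkv_shape_per_device(model: &dyn Llama2, n: usize, align: usize) -> (usize, usize) {
    let distributer = Distributer::new(model, n, align);
    let s = distributer.scheme();
    let rows = (s.nh + s.nkvh + s.nkvh) as usize / s.n * s.dh as usize;
    let cols = (s.nh * s.dh) as usize;
    (rows, cols)
}
```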
2 changes: 1 addition & 1 deletion transformer/src/parameters/mod.rs
@@ -8,7 +8,7 @@ use common::utok;
use tensor::{DataType, Tensor};
mod distribute;

-pub use distribute::{DistributedLayer, Distributer};
+pub use distribute::{DistributeScheme, DistributedLayer, Distributer};
pub use memory::Memory;
pub use safe_tensors::SafeTensorError;
pub use save::save;
