refactor(nv): remove all CUDA code from the framework and use the operator library's sampling implementation
Signed-off-by: YdrMaster <[email protected]>
YdrMaster committed Jul 19, 2024
1 parent 7f1a14c commit 34e8cad
Showing 11 changed files with 132 additions and 516 deletions.
14 changes: 7 additions & 7 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -35,6 +35,6 @@ tokio = { version = "1.38", features = ["rt-multi-thread", "sync"] }
 digit-layout = "0.0"
 build-script-cfg = "0.0"
 
-operators = { git = "https://github.com/YdrMaster/operators-rs", rev = "b05568a", default-features = false }
+operators = { git = "https://github.com/YdrMaster/operators-rs", rev = "d059b1a", default-features = false }
 search-cuda-tools = { git = "https://github.com/YdrMaster/cuda-driver", rev = "fb088b6" }
 search-neuware-tools = "0.0"
32 changes: 28 additions & 4 deletions devices/common-cpu/src/lib.rs
@@ -7,12 +7,18 @@ macro_rules! slice {
 
 mod gather;
 
-use common::utok;
+use common::{f16, utok};
 use common_devices::{Operators, SliceOn};
+use digit_layout::types::F16;
 use operators::{
-    fuesd_softmax::common_cpu as softmax, mat_mul::common_cpu as mat_mul,
-    reform::common_cpu as reform, rms_norm::common_cpu as rms_norm, rope::common_cpu as rope,
-    swiglu::common_cpu as swiglu, Operator, QueueOf,
+    fuesd_softmax::common_cpu as softmax,
+    mat_mul::common_cpu as mat_mul,
+    random_sample::{common_cpu as random_sample, Args, KVPair, SampleArgs},
+    reform::common_cpu as reform,
+    rms_norm::common_cpu as rms_norm,
+    rope::common_cpu as rope,
+    swiglu::common_cpu as swiglu,
+    Operator, QueueOf,
 };
 use std::ops::{Deref, DerefMut};
 use tensor::Tensor;
@@ -29,6 +35,23 @@ pub struct CpuKernels {
     rope: rope::Operator,
     softmax: softmax::Operator,
     swiglu: swiglu::Operator,
+    sample: random_sample::Operator,
 }
 
+impl CpuKernels {
+    pub fn sample(&self, temperature: f32, top_p: f32, top_k: usize, logits: &[f16]) -> utok {
+        let mut kv_pair = KVPair::new(0, f16::ZERO);
+        let mut args = Args::<Cpu>::new(F16, logits.len());
+        args.kv_pair_base = &mut kv_pair as *mut _ as _;
+        args.data_base = logits.as_ptr() as _;
+        args.detail = SampleArgs {
+            temperature,
+            top_p,
+            top_k,
+        };
+        self.sample.launch(&args, &ThisThread).unwrap();
+        kv_pair.idx() as _
+    }
+}
+
 impl Default for CpuKernels {
@@ -40,6 +63,7 @@ impl Default for CpuKernels {
             rope: rope::Operator::new(&Cpu),
             softmax: softmax::Operator::new(&Cpu),
             swiglu: swiglu::Operator::new(&Cpu),
+            sample: random_sample::Operator::new(&Cpu),
         }
     }
 }
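For orientation, a minimal sketch (not part of the commit) of how the new CpuKernels::sample entry point might be driven. The next_token helper and its greedy settings are hypothetical; it assumes that temperature 0.0 with top_k spanning the whole vocabulary reduces the operator to a plain argmax:

    use common::{f16, utok};

    // Hypothetical helper: pick the next token from one row of logits.
    // Assumption: temperature = 0.0 with top_k = vocabulary size makes the
    // random_sample operator behave as argmax, so top_p has no effect.
    fn next_token(kernels: &CpuKernels, logits: &[f16]) -> utok {
        kernels.sample(0.0, 1.0, logits.len(), logits)
    }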
1 change: 0 additions & 1 deletion devices/nvidia-gpu/Cargo.toml
@@ -18,4 +18,3 @@ digit-layout.workspace = true
 [build-dependencies]
 build-script-cfg.workspace = true
 search-cuda-tools.workspace = true
-cc = "1.0"
8 changes: 0 additions & 8 deletions devices/nvidia-gpu/build.rs
@@ -9,13 +9,5 @@ fn main() {
         if find_nccl_root().is_some() {
             nccl.define();
         }
-        println!("cargo:rerun-if-changed=src/sample.cu");
-        cc::Build::new()
-            .cuda(true)
-            .flag("-gencode")
-            .flag("arch=compute_80,code=sm_80")
-            .flag("-allow-unsupported-compiler")
-            .file("src/sample.cu")
-            .compile("sample");
     }
 }
66 changes: 0 additions & 66 deletions devices/nvidia-gpu/src/.clang-format

This file was deleted.

80 changes: 69 additions & 11 deletions devices/nvidia-gpu/src/lib.rs
@@ -1,26 +1,33 @@
 #![cfg(detected_cuda)]
 
 mod gather;
-mod sample;
 
-use common::utok;
+use ::sample::SampleArgs;
+use common::{f16, utok};
 use common_devices::{Operators, SliceOn};
 use cuda::{AsRaw, Device};
 use digit_layout::types::{F16, U32};
 use operators::{
-    dyn_, fuesd_softmax::nvidia_gpu as softmax, mat_mul::nvidia_gpu as mat_mul,
-    reform::nvidia_gpu as reform, rms_norm::nvidia_gpu as rms_norm, rope::nvidia_gpu as rope,
-    swiglu::nvidia_gpu as swiglu, Operator, QueueOf, TensorLayout,
+    cuda::{memcpy_d2h, DevByte, DevMem, Stream},
+    dyn_,
+    fuesd_softmax::nvidia_gpu as softmax,
+    mat_mul::nvidia_gpu as mat_mul,
+    random_sample::{nvidia_gpu as random_sample, KVPair, RandomSample},
+    reform::nvidia_gpu as reform,
+    rms_norm::nvidia_gpu as rms_norm,
+    rope::nvidia_gpu as rope,
+    swiglu::nvidia_gpu as swiglu,
+    Operator, QueueOf, TensorLayout, Workspace,
 };
 use std::{
     collections::HashMap,
+    mem::size_of,
     ops::{Deref, DerefMut},
     ptr::{null, null_mut},
 };
 
 pub use common_devices::{Kernels, KernelsA, KernelsB};
 pub use operators::{cuda, nvidia_gpu::Handle as Gpu};
-pub use sample::{sample_cpu, sample_nv};
 pub use tensor::{reslice, reslice_mut, slice, split, udim, LocalSplitable, Tensor};
 
 #[cfg(detected_nccl)]
@@ -35,10 +42,11 @@ struct Internal {
     reform: reform::Operator,
     softmax: softmax::Operator,
     swiglu: swiglu::Operator,
+    random_sample: random_sample::Operator,
 }
 
 impl Internal {
-    pub fn new(handle: &Gpu, d: usize) -> Self {
+    pub fn new(handle: &Gpu, d: usize, voc: usize) -> Self {
         let mat_mul = mat_mul::Operator::new(handle);
 
         let mut rms_norm = rms_norm::Operator::new(handle);
@@ -92,37 +100,87 @@ impl Internal {
             })
             .unwrap();
 
+        let mut random_sample = random_sample::Operator::new(handle);
+        random_sample
+            .scheme(&operators::random_sample::Args::new(F16, voc))
+            .unwrap();
+
         Self {
             mat_mul,
             rms_norm,
             rope,
             reform,
             softmax,
             swiglu,
+            random_sample,
         }
     }
 }
 
 impl NvidiaKernels {
-    pub fn new(devices: &[Device], rms_norm_size: usize) -> Self {
+    pub fn new(devices: &[Device], rms_norm_size: usize, voc_size: usize) -> Self {
         Self(
             devices
                 .iter()
                 .map(|d| {
                     (
                         unsafe { d.as_raw() },
-                        Internal::new(&Gpu::new(d.retain_primary()), rms_norm_size),
+                        Internal::new(&Gpu::new(d.retain_primary()), rms_norm_size, voc_size),
                     )
                 })
                 .collect(),
         )
     }
 }
 
 impl NvidiaKernels {
     fn get(&self, queue: &QueueOf<Gpu>) -> &Internal {
         self.0.get(&unsafe { queue.ctx().dev().as_raw() }).unwrap()
     }
 
+    pub fn sample_workspace<'ctx>(&self, queue: &'ctx QueueOf<Gpu>) -> DevMem<'ctx> {
+        let random_sample = &self.get(queue).random_sample;
+        let workspace_len = random_sample.workspace();
+        let scheme_n = random_sample.scheme_n();
+        let mut workspace = queue.malloc::<u8>(workspace_len);
+        let host = (0..scheme_n).map(|i| i as u32).collect::<Vec<_>>();
+        queue.memcpy_h2d(&mut workspace[..scheme_n * size_of::<u32>()], &host);
+        workspace
+    }
+
+    pub fn sample(
+        &self,
+        args: impl IntoIterator<Item = SampleArgs>,
+        logits: &[DevByte],
+        workspace: &mut [DevByte],
+        stream: &Stream,
+    ) -> Vec<utok> {
+        let random_sample = &self.get(stream).random_sample;
+        let voc = random_sample.scheme_n();
+        let logits = logits.as_ptr();
+
+        let details = args.into_iter().collect::<Vec<_>>();
+        let kv_pair_size = KVPair::<()>::LAYOUT.nbytes();
+        let mut kv_pairs = stream.malloc::<u8>(details.len() * kv_pair_size);
+
+        let mut args = operators::random_sample::Args::<Gpu>::new(F16, voc);
+        args.workspace = Workspace {
+            ptr: workspace.as_mut_ptr(),
+            len: workspace.len(),
+        };
+        for (i, arg) in details.iter().enumerate() {
+            args.kv_pair_base = unsafe { kv_pairs.as_mut_ptr().add(i * kv_pair_size) };
+            args.data_base = unsafe { logits.add(i * voc * F16.nbytes()) };
+            args.detail.temperature = arg.temperature;
+            args.detail.top_p = arg.top_p;
+            args.detail.top_k = arg.top_k;
+            random_sample.launch(&args, stream).unwrap();
+        }
+
+        let mut host = vec![KVPair::new(0, f16::ZERO); details.len()];
+        stream.synchronize();
+        memcpy_d2h(&mut host, &kv_pairs);
+
+        host.into_iter().map(|kv| kv.idx() as _).collect()
+    }
 }
 
 impl Kernels<Gpu> for NvidiaKernels {}
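A sketch of the intended call pattern on the GPU side, assuming a batch of f16 logit rows already resident in device memory. The sample_batch driver and the greedy SampleArgs values are illustrative, not from the commit; it also assumes SampleArgs can be built with a struct literal and that DevMem derefs to [DevByte], so the workspace can be passed as a mutable slice:

    use common::utok;
    use operators::cuda::{DevByte, Stream};

    // Hypothetical driver: one SampleArgs per sequence in the batch.
    // The workspace is built once per stream and reused across decode steps;
    // it holds the operator's scratch memory plus the pre-filled 0..voc
    // index table written by sample_workspace.
    fn sample_batch(
        kernels: &NvidiaKernels,
        logits: &[DevByte],
        stream: &Stream,
    ) -> Vec<utok> {
        let mut workspace = kernels.sample_workspace(stream);
        let args = vec![SampleArgs {
            temperature: 0.0, // assumed greedy: top_p / top_k then have no effect
            top_p: 1.0,
            top_k: usize::MAX,
        }];
        kernels.sample(args, logits, &mut workspace, stream)
    }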
