refactor: call the mlp operator from the operator library directly
Signed-off-by: YdrMaster <[email protected]>
YdrMaster committed Jul 30, 2024
1 parent c742dc8 commit 0f86b40
Showing 7 changed files with 97 additions and 70 deletions.
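In short: instead of issuing a mat_mul for the fused gate/up projection, splitting the result, applying swiglu, and issuing a second mat_mul for the down projection, the model code now makes a single call to the mlp operator exposed by operators-rs, and each backend provides that operator through a new mlp_op accessor in place of the old swiglu_op.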
4 changes: 2 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -36,7 +36,7 @@ tokio = { version = "1.38", features = ["rt-multi-thread", "sync"] }
digit-layout = "0.0"
build-script-cfg = "0.0"

operators = { git = "https://github.com/YdrMaster/operators-rs", rev = "1d55b48", default-features = false }
operators = { git = "https://github.com/YdrMaster/operators-rs", rev = "e6ee6ea", default-features = false }
search-cuda-tools = { git = "https://github.com/YdrMaster/cuda-driver", rev = "d089ada" }
search-neuware-tools = "0.0"
search-ascend-tools = { git = "https://github.com/InfiniTensor/ascendcl", rev = "1e7a696" }
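The rev bump presumably pulls in the operators-rs revision that ships the mlp operator used in the changes below; no other dependency changes.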
13 changes: 5 additions & 8 deletions devices/common-cpu/src/lib.rs
@@ -13,11 +13,11 @@ use digit_layout::types::F16;
use operators::{
fuesd_softmax::common_cpu as softmax,
mat_mul::common_cpu as mat_mul,
mlp::common_cpu as mlp,
random_sample::{common_cpu as random_sample, Args, KVPair, SampleArgs},
reform::common_cpu as reform,
rms_norm::common_cpu as rms_norm,
rope::common_cpu as rope,
swiglu::common_cpu as swiglu,
Operator, QueueOf,
};
use std::ops::{Deref, DerefMut};
@@ -34,7 +34,7 @@ pub struct CpuKernels {
rms_norm: rms_norm::Operator,
rope: rope::Operator,
softmax: softmax::Operator,
swiglu: swiglu::Operator,
mlp: mlp::Operator,
sample: random_sample::Operator,
}

@@ -62,7 +62,7 @@ impl Default for CpuKernels {
rms_norm: rms_norm::Operator::new(&Cpu),
rope: rope::Operator::new(&Cpu),
softmax: softmax::Operator::new(&Cpu),
swiglu: swiglu::Operator::new(&Cpu),
mlp: mlp::Operator::new(&Cpu),
sample: random_sample::Operator::new(&Cpu),
}
}
@@ -100,11 +100,8 @@ impl Operators for CpuKernels {
) -> &impl operators::fuesd_softmax::FusedSoftmax<Self::Handle> {
&self.softmax
}
fn swiglu_op(
&self,
_: &QueueOf<Self::Handle>,
) -> &impl operators::swiglu::Swiglu<Self::Handle> {
&self.swiglu
fn mlp_op(&self, _: &QueueOf<Self::Handle>) -> &impl operators::mlp::Mlp<Self::Handle> {
&self.mlp
}
}

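The CPU backend change is mechanical: the swiglu import, the CpuKernels field, the constructor call, and the swiglu_op accessor are each replaced one-for-one by their mlp counterparts, so CpuKernels satisfies the reworked Operators trait defined in devices/common below.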
67 changes: 49 additions & 18 deletions devices/common/src/lib.rs
@@ -1,7 +1,5 @@
use common::utok;
use operators::{
fuesd_softmax, mat_mul, reform, rms_norm, rope, swiglu, Handle, Operator, QueueOf,
};
use operators::{fuesd_softmax, mat_mul, mlp, reform, rms_norm, rope, Handle, Operator, QueueOf};
use std::ops::{Deref, DerefMut};
use tensor::Tensor;

@@ -18,7 +16,7 @@ pub trait Operators {
&self,
queue: &QueueOf<Self::Handle>,
) -> &impl fuesd_softmax::FusedSoftmax<Self::Handle>;
fn swiglu_op(&self, queue: &QueueOf<Self::Handle>) -> &impl swiglu::Swiglu<Self::Handle>;
fn mlp_op(&self, queue: &QueueOf<Self::Handle>) -> &impl mlp::Mlp<Self::Handle>;
}

pub trait KernelsA {
@@ -68,10 +66,23 @@ pub trait KernelsA {
where
T: DerefMut<Target = SliceOn<Self::Handle>>;

fn swiglu<T, U>(&self, gate: &mut Tensor<T>, up: &Tensor<U>, queue: &QueueOf<Self::Handle>)
where
T: DerefMut<Target = SliceOn<Self::Handle>>,
U: Deref<Target = SliceOn<Self::Handle>>;
#[allow(clippy::too_many_arguments)]
fn mlp<M0, M1, C0, C1, C2>(
&self,
x: &mut Tensor<M0>,
x1: &Tensor<C0>,
gate_up: &mut Tensor<M1>,
w_gate_up: &Tensor<C1>,
w_down: &Tensor<C2>,
down_alpha: f32,
down_bias: bool,
queue: &QueueOf<Self::Handle>,
) where
M0: DerefMut<Target = SliceOn<Self::Handle>>,
M1: DerefMut<Target = SliceOn<Self::Handle>>,
C0: Deref<Target = SliceOn<Self::Handle>>,
C1: Deref<Target = SliceOn<Self::Handle>>,
C2: Deref<Target = SliceOn<Self::Handle>>;
}

pub trait KernelsB {
@@ -216,18 +227,38 @@ reform failed: {e}
.unwrap();
}

fn swiglu<T, U>(&self, gate: &mut Tensor<T>, up: &Tensor<U>, queue: &QueueOf<Self::Handle>)
where
T: DerefMut<Target = SliceOn<Self::Handle>>,
U: Deref<Target = SliceOn<Self::Handle>>,
fn mlp<M0, M1, C0, C1, C2>(
&self,
x: &mut Tensor<M0>,
x1: &Tensor<C0>,
gate_up: &mut Tensor<M1>,
w_gate_up: &Tensor<C1>,
w_down: &Tensor<C2>,
down_alpha: f32,
down_bias: bool,
queue: &QueueOf<Self::Handle>,
) where
M0: DerefMut<Target = SliceOn<Self::Handle>>,
M1: DerefMut<Target = SliceOn<Self::Handle>>,
C0: Deref<Target = SliceOn<Self::Handle>>,
C1: Deref<Target = SliceOn<Self::Handle>>,
C2: Deref<Target = SliceOn<Self::Handle>>,
{
self.swiglu_op(queue)
self.mlp_op(queue)
.launch(
&swiglu::Args {
gate_layout: gate.layout(),
gate_base: gate.base_mut(),
up_layout: up.layout(),
up_base: up.base(),
&mlp::Args {
y_layout: x.layout(),
y_base: x.base_mut(),
x_layout: x1.layout(),
x_base: x1.base(),
gate_up_layout: gate_up.layout(),
gate_up_base: gate_up.base_mut(),
w_gate_up_layout: w_gate_up.layout(),
w_gate_up_base: w_gate_up.base(),
w_down_layout: w_down.layout(),
w_down_base: w_down.base(),
down_alpha,
down_bias,
},
queue,
)
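For reference, the fused operator is expected to compute exactly what the removed mat_mul → split → swiglu → mat_mul sequence computed. The sketch below is a minimal, self-contained illustration over plain f32 slices, not the operators-rs implementation: the function name mlp_reference, the row-major contiguous layouts, the silu-based swiglu formula, and the reading of down_bias as "accumulate into y" (matching the beta = 1 mat_mul it replaces at the call sites) are all assumptions made for illustration.

// Minimal reference sketch (assumptions noted above); plain Rust, no operators-rs types.
fn mlp_reference(
    y: &mut [f32],     // [n, d]: output rows; accumulated into when `down_bias` is true
    x: &[f32],         // [n, d]: input hidden states (the `x1` argument at the call sites)
    w_gate_up: &[f32], // [d, 2*di]: fused gate/up projection weights
    w_down: &[f32],    // [di, d]: down projection weights
    n: usize,
    d: usize,
    di: usize,
    down_alpha: f32,
    down_bias: bool,
) {
    for row in 0..n {
        // gate_up = x · w_gate_up (the old mat_mul with beta = 0)
        let mut gate_up = vec![0.0f32; 2 * di];
        for j in 0..2 * di {
            for k in 0..d {
                gate_up[j] += x[row * d + k] * w_gate_up[k * 2 * di + j];
            }
        }
        // swiglu over the two halves: h[i] = silu(gate[i]) * up[i]
        let mut h = vec![0.0f32; di];
        for i in 0..di {
            let (g, u) = (gate_up[i], gate_up[di + i]);
            h[i] = g / (1.0 + (-g).exp()) * u;
        }
        // down projection: y = down_alpha · (h · w_down), added to the old y when down_bias is set
        for j in 0..d {
            let mut acc = 0.0f32;
            for i in 0..di {
                acc += h[i] * w_down[i * d + j];
            }
            let prev = if down_bias { y[row * d + j] } else { 0.0 };
            y[row * d + j] = prev + down_alpha * acc;
        }
    }
}

In the real API the same information travels through mlp::Args as the layout/base pairs plus down_alpha and down_bias shown in the launch call above.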
43 changes: 24 additions & 19 deletions devices/nvidia-gpu/src/lib.rs
@@ -11,11 +11,11 @@ use operators::{
dyn_,
fuesd_softmax::nvidia_gpu as softmax,
mat_mul::nvidia_gpu as mat_mul,
mlp::nvidia_gpu as mlp,
random_sample::{nvidia_gpu as random_sample, KVPair, RandomSample, SampleArgs},
reform::nvidia_gpu as reform,
rms_norm::nvidia_gpu as rms_norm,
rope::nvidia_gpu as rope,
swiglu::nvidia_gpu as swiglu,
Operator, QueueOf, TensorLayout, Workspace,
};
use std::{
@@ -39,20 +39,21 @@ struct Internal {
rope: rope::Operator,
reform: reform::Operator,
softmax: softmax::Operator,
swiglu: swiglu::Operator,
mlp: mlp::Operator,
random_sample: random_sample::Operator,
}

impl Internal {
pub fn new(handle: &Gpu, d: usize, voc: usize) -> Self {
let hidden_layout = TensorLayout::new(F16, [dyn_(), d.into()], [dyn_(); 2]);
let mat_mul = mat_mul::Operator::new(handle);

let mut rms_norm = rms_norm::Operator::new(handle);
rms_norm
.scheme(&operators::rms_norm::Args {
y_layout: TensorLayout::new(F16, [dyn_(), d.into()], [dyn_(); 2]),
y_layout: hidden_layout.clone(),
y_base: null_mut(),
x_layout: TensorLayout::new(F16, [dyn_(), d.into()], [dyn_(); 2]),
x_layout: hidden_layout.clone(),
x_base: null(),
w_layout: TensorLayout::new(F16, [d.into()], [dyn_()]),
w_base: null(),
@@ -88,15 +89,22 @@ impl Internal {
})
.unwrap();

let mut swiglu = swiglu::Operator::new(handle);
swiglu
.scheme(&operators::swiglu::Args {
gate_layout: TensorLayout::new(F16, [dyn_(); 2], [dyn_(); 2]),
gate_base: null_mut(),
up_layout: TensorLayout::new(F16, [dyn_(); 2], [dyn_(); 2]),
up_base: null(),
})
.unwrap();
let mut mlp = mlp::Operator::new(handle);
mlp.scheme(&operators::mlp::Args {
y_layout: hidden_layout.clone(),
y_base: null_mut(),
x_layout: hidden_layout.clone(),
x_base: null(),
gate_up_layout: TensorLayout::new(F16, [dyn_(); 2], [dyn_(); 2]),
gate_up_base: null_mut(),
w_gate_up_layout: TensorLayout::new(F16, [dyn_(); 2], [dyn_(); 2]),
w_gate_up_base: null(),
w_down_layout: TensorLayout::new(F16, [dyn_(); 2], [dyn_(); 2]),
w_down_base: null(),
down_alpha: 1.,
down_bias: true,
})
.unwrap();

let mut random_sample = random_sample::Operator::new(handle);
random_sample
@@ -109,7 +117,7 @@ impl Internal {
rope,
reform,
softmax,
swiglu,
mlp,
random_sample,
}
}
@@ -217,11 +225,8 @@ impl Operators for NvidiaKernels {
&self.get(queue).softmax
}

fn swiglu_op(
&self,
queue: &QueueOf<Self::Handle>,
) -> &impl operators::swiglu::Swiglu<Self::Handle> {
&self.get(queue).swiglu
fn mlp_op(&self, queue: &QueueOf<Self::Handle>) -> &impl operators::mlp::Mlp<Self::Handle> {
&self.get(queue).mlp
}
}

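As with rms_norm, rope, reform, and softmax above, the GPU backend pre-plans the new operator when Internal is constructed: mlp.scheme(...) runs with placeholder dyn_() layouts, null bases, and nominal down_alpha/down_bias values, and the repeated F16 [dyn_, d] layout is factored into the shared hidden_layout. The concrete tensors arrive later through the generic launch path in devices/common.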
16 changes: 10 additions & 6 deletions models/llama/common/src/compute.rs
@@ -157,12 +157,16 @@ pub trait ComputeStream {
.mat_mul(&mut x, 1., &x1, &params.att_o(), 1., queue);
self.kernels()
.rms_norm(&mut x1, &x, &params.mlp_layernorm(), epsilon, queue);
self.kernels()
.mat_mul(&mut gate_up, 0., &x1, &params.mlp_gate_up(), 1., queue);
let (mut gate, up) = split!(gate_up; [1]: di, di);
self.kernels().swiglu(&mut gate, &up, queue);
self.kernels()
.mat_mul(&mut x, 1., &gate, &params.mlp_down(), 1., queue);
self.kernels().mlp(
&mut x,
&x1,
&mut gate_up,
&params.mlp_gate_up(),
&params.mlp_down(),
1.,
true,
queue,
);
}
self.free_pos(pos.take_physical());
self.free(state_buf.take_physical());
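The argument mapping keeps the old numerics: gate_up remains the scratch buffer for the fused gate/up projection, down_alpha = 1. takes the place of the old mat_mul alpha of 1., and down_bias = true appears to stand in for the old beta = 1 accumulation of the down projection into x. The explicit split!/swiglu step disappears into the operator.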
22 changes: 6 additions & 16 deletions models/mixtral/cpu/src/infer.rs
@@ -225,25 +225,15 @@ impl CausalLM for MixtralCPU {
let expert = indices[(tok * self.k + k) as usize];
let expert_w = weights[(tok * self.k + k) as usize].to_f32() / sum;
let w_gate_up = self.params.mlp_gate_up(layer, expert).transpose(&[1, 0]);
self.kernels.mat_mul(
&mut gate_up_slice,
0.,
let w_down = self.params.mlp_down(layer, expert).transpose(&[1, 0]);
self.kernels.mlp(
&mut x0_slice,
&x1_slice,
&mut gate_up_slice,
&w_gate_up,
1.,
&ThisThread,
);
let mut gate_up_slice = gate_up_slice.split(1, &[di as _, di as _]);
let up = gate_up_slice.pop_back().unwrap();
let mut gate = gate_up_slice.pop_back().unwrap();
self.kernels.swiglu(&mut gate, &up, &ThisThread);
let mlp_down = self.params.mlp_down(layer, expert).transpose(&[1, 0]);
self.kernels.mat_mul(
&mut x0_slice,
1.,
&gate,
&mlp_down,
&w_down,
expert_w,
true,
&ThisThread,
);
}
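Here the fusion also absorbs the per-expert scaling: expert_w, formerly the alpha of the final mat_mul, is now passed as down_alpha, and down_bias = true keeps each expert's output accumulating into x0_slice, weighted by its routing weight. The manual split of gate_up_slice into gate and up halves goes away for the same reason as in the llama path.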
