refactor: call the mlp operator from the operator library directly
Signed-off-by: YdrMaster <[email protected]>
YdrMaster committed Jul 30, 2024
1 parent c742dc8 commit 0f86b40
Showing 7 changed files with 97 additions and 70 deletions.
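In short: instead of issuing a mat_mul for the fused gate/up projection, splitting the result, applying swiglu, and issuing a second mat_mul for the down projection, the model code now makes a single call to the mlp operator exposed by operators-rs, and each backend provides that operator through a new mlp_op accessor in place of the old swiglu_op.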
4 changes: 2 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -36,7 +36,7 @@ tokio = { version = "1.38", features = ["rt-multi-thread", "sync"] }
digit-layout = "0.0"
build-script-cfg = "0.0"

operators = { git = "https://github.com/YdrMaster/operators-rs", rev = "1d55b48", default-features = false }
operators = { git = "https://github.com/YdrMaster/operators-rs", rev = "e6ee6ea", default-features = false }
search-cuda-tools = { git = "https://github.com/YdrMaster/cuda-driver", rev = "d089ada" }
search-neuware-tools = "0.0"
search-ascend-tools = { git = "https://github.com/InfiniTensor/ascendcl", rev = "1e7a696" }
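The rev bump presumably pulls in the operators-rs revision that ships the mlp operator used in the changes below; no other dependency changes.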
13 changes: 5 additions & 8 deletions devices/common-cpu/src/lib.rs
@@ -13,11 +13,11 @@ use digit_layout::types::F16;
use operators::{
fuesd_softmax::common_cpu as softmax,
mat_mul::common_cpu as mat_mul,
mlp::common_cpu as mlp,
random_sample::{common_cpu as random_sample, Args, KVPair, SampleArgs},
reform::common_cpu as reform,
rms_norm::common_cpu as rms_norm,
rope::common_cpu as rope,
swiglu::common_cpu as swiglu,
Operator, QueueOf,
};
use std::ops::{Deref, DerefMut};
@@ -34,7 +34,7 @@ pub struct CpuKernels {
rms_norm: rms_norm::Operator,
rope: rope::Operator,
softmax: softmax::Operator,
swiglu: swiglu::Operator,
mlp: mlp::Operator,
sample: random_sample::Operator,
}

@@ -62,7 +62,7 @@ impl Default for CpuKernels {
rms_norm: rms_norm::Operator::new(&Cpu),
rope: rope::Operator::new(&Cpu),
softmax: softmax::Operator::new(&Cpu),
swiglu: swiglu::Operator::new(&Cpu),
mlp: mlp::Operator::new(&Cpu),
sample: random_sample::Operator::new(&Cpu),
}
}
@@ -100,11 +100,8 @@ impl Operators for CpuKernels {
) -> &impl operators::fuesd_softmax::FusedSoftmax<Self::Handle> {
&self.softmax
}
fn swiglu_op(
&self,
_: &QueueOf<Self::Handle>,
) -> &impl operators::swiglu::Swiglu<Self::Handle> {
&self.swiglu
fn mlp_op(&self, _: &QueueOf<Self::Handle>) -> &impl operators::mlp::Mlp<Self::Handle> {
&self.mlp
}
}

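The CPU backend change is mechanical: the swiglu import, the CpuKernels field, the constructor call, and the swiglu_op accessor are each replaced one-for-one by their mlp counterparts, so CpuKernels satisfies the reworked Operators trait defined in devices/common below.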
67 changes: 49 additions & 18 deletions devices/common/src/lib.rs
@@ -1,7 +1,5 @@
use common::utok;
use operators::{
fuesd_softmax, mat_mul, reform, rms_norm, rope, swiglu, Handle, Operator, QueueOf,
};
use operators::{fuesd_softmax, mat_mul, mlp, reform, rms_norm, rope, Handle, Operator, QueueOf};
use std::ops::{Deref, DerefMut};
use tensor::Tensor;

@@ -18,7 +16,7 @@ pub trait Operators {
&self,
queue: &QueueOf<Self::Handle>,
) -> &impl fuesd_softmax::FusedSoftmax<Self::Handle>;
fn swiglu_op(&self, queue: &QueueOf<Self::Handle>) -> &impl swiglu::Swiglu<Self::Handle>;
fn mlp_op(&self, queue: &QueueOf<Self::Handle>) -> &impl mlp::Mlp<Self::Handle>;
}

pub trait KernelsA {
@@ -68,10 +66,23 @@ pub trait KernelsA {
where
T: DerefMut<Target = SliceOn<Self::Handle>>;

fn swiglu<T, U>(&self, gate: &mut Tensor<T>, up: &Tensor<U>, queue: &QueueOf<Self::Handle>)
where
T: DerefMut<Target = SliceOn<Self::Handle>>,
U: Deref<Target = SliceOn<Self::Handle>>;
#[allow(clippy::too_many_arguments)]
fn mlp<M0, M1, C0, C1, C2>(
&self,
x: &mut Tensor<M0>,
x1: &Tensor<C0>,
gate_up: &mut Tensor<M1>,
w_gate_up: &Tensor<C1>,
w_down: &Tensor<C2>,
down_alpha: f32,
down_bias: bool,
queue: &QueueOf<Self::Handle>,
) where
M0: DerefMut<Target = SliceOn<Self::Handle>>,
M1: DerefMut<Target = SliceOn<Self::Handle>>,
C0: Deref<Target = SliceOn<Self::Handle>>,
C1: Deref<Target = SliceOn<Self::Handle>>,
C2: Deref<Target = SliceOn<Self::Handle>>;
}

pub trait KernelsB {
@@ -216,18 +227,38 @@ reform failed: {e}
.unwrap();
}

fn swiglu<T, U>(&self, gate: &mut Tensor<T>, up: &Tensor<U>, queue: &QueueOf<Self::Handle>)
where
T: DerefMut<Target = SliceOn<Self::Handle>>,
U: Deref<Target = SliceOn<Self::Handle>>,
fn mlp<M0, M1, C0, C1, C2>(
&self,
x: &mut Tensor<M0>,
x1: &Tensor<C0>,
gate_up: &mut Tensor<M1>,
w_gate_up: &Tensor<C1>,
w_down: &Tensor<C2>,
down_alpha: f32,
down_bias: bool,
queue: &QueueOf<Self::Handle>,
) where
M0: DerefMut<Target = SliceOn<Self::Handle>>,
M1: DerefMut<Target = SliceOn<Self::Handle>>,
C0: Deref<Target = SliceOn<Self::Handle>>,
C1: Deref<Target = SliceOn<Self::Handle>>,
C2: Deref<Target = SliceOn<Self::Handle>>,
{
self.swiglu_op(queue)
self.mlp_op(queue)
.launch(
&swiglu::Args {
gate_layout: gate.layout(),
gate_base: gate.base_mut(),
up_layout: up.layout(),
up_base: up.base(),
&mlp::Args {
y_layout: x.layout(),
y_base: x.base_mut(),
x_layout: x1.layout(),
x_base: x1.base(),
gate_up_layout: gate_up.layout(),
gate_up_base: gate_up.base_mut(),
w_gate_up_layout: w_gate_up.layout(),
w_gate_up_base: w_gate_up.base(),
w_down_layout: w_down.layout(),
w_down_base: w_down.base(),
down_alpha,
down_bias,
},
queue,
)
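For reference, the fused operator is expected to compute exactly what the removed mat_mul → split → swiglu → mat_mul sequence computed. The sketch below is a minimal, self-contained illustration over plain f32 slices, not the operators-rs implementation: the function name mlp_reference, the row-major contiguous layouts, the silu-based swiglu formula, and the reading of down_bias as "accumulate into y" (matching the beta = 1 mat_mul it replaces at the call sites) are all assumptions made for illustration.

// Minimal reference sketch (assumptions noted above); plain Rust, no operators-rs types.
fn mlp_reference(
    y: &mut [f32],     // [n, d]: output rows; accumulated into when `down_bias` is true
    x: &[f32],         // [n, d]: input hidden states (the `x1` argument at the call sites)
    w_gate_up: &[f32], // [d, 2*di]: fused gate/up projection weights
    w_down: &[f32],    // [di, d]: down projection weights
    n: usize,
    d: usize,
    di: usize,
    down_alpha: f32,
    down_bias: bool,
) {
    for row in 0..n {
        // gate_up = x · w_gate_up (the old mat_mul with beta = 0)
        let mut gate_up = vec![0.0f32; 2 * di];
        for j in 0..2 * di {
            for k in 0..d {
                gate_up[j] += x[row * d + k] * w_gate_up[k * 2 * di + j];
            }
        }
        // swiglu over the two halves: h[i] = silu(gate[i]) * up[i]
        let mut h = vec![0.0f32; di];
        for i in 0..di {
            let (g, u) = (gate_up[i], gate_up[di + i]);
            h[i] = g / (1.0 + (-g).exp()) * u;
        }
        // down projection: y = down_alpha · (h · w_down), added to the old y when down_bias is set
        for j in 0..d {
            let mut acc = 0.0f32;
            for i in 0..di {
                acc += h[i] * w_down[i * d + j];
            }
            let prev = if down_bias { y[row * d + j] } else { 0.0 };
            y[row * d + j] = prev + down_alpha * acc;
        }
    }
}

In the real API the same information travels through mlp::Args as the layout/base pairs plus down_alpha and down_bias shown in the launch call above.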
43 changes: 24 additions & 19 deletions devices/nvidia-gpu/src/lib.rs
@@ -11,11 +11,11 @@ use operators::{
dyn_,
fuesd_softmax::nvidia_gpu as softmax,
mat_mul::nvidia_gpu as mat_mul,
mlp::nvidia_gpu as mlp,
random_sample::{nvidia_gpu as random_sample, KVPair, RandomSample, SampleArgs},
reform::nvidia_gpu as reform,
rms_norm::nvidia_gpu as rms_norm,
rope::nvidia_gpu as rope,
swiglu::nvidia_gpu as swiglu,
Operator, QueueOf, TensorLayout, Workspace,
};
use std::{
@@ -39,20 +39,21 @@ struct Internal {
rope: rope::Operator,
reform: reform::Operator,
softmax: softmax::Operator,
swiglu: swiglu::Operator,
mlp: mlp::Operator,
random_sample: random_sample::Operator,
}

impl Internal {
pub fn new(handle: &Gpu, d: usize, voc: usize) -> Self {
let hidden_layout = TensorLayout::new(F16, [dyn_(), d.into()], [dyn_(); 2]);
let mat_mul = mat_mul::Operator::new(handle);

let mut rms_norm = rms_norm::Operator::new(handle);
rms_norm
.scheme(&operators::rms_norm::Args {
y_layout: TensorLayout::new(F16, [dyn_(), d.into()], [dyn_(); 2]),
y_layout: hidden_layout.clone(),
y_base: null_mut(),
x_layout: TensorLayout::new(F16, [dyn_(), d.into()], [dyn_(); 2]),
x_layout: hidden_layout.clone(),
x_base: null(),
w_layout: TensorLayout::new(F16, [d.into()], [dyn_()]),
w_base: null(),
@@ -88,15 +89,22 @@ impl Internal {
})
.unwrap();

let mut swiglu = swiglu::Operator::new(handle);
swiglu
.scheme(&operators::swiglu::Args {
gate_layout: TensorLayout::new(F16, [dyn_(); 2], [dyn_(); 2]),
gate_base: null_mut(),
up_layout: TensorLayout::new(F16, [dyn_(); 2], [dyn_(); 2]),
up_base: null(),
})
.unwrap();
let mut mlp = mlp::Operator::new(handle);
mlp.scheme(&operators::mlp::Args {
y_layout: hidden_layout.clone(),
y_base: null_mut(),
x_layout: hidden_layout.clone(),
x_base: null(),
gate_up_layout: TensorLayout::new(F16, [dyn_(); 2], [dyn_(); 2]),
gate_up_base: null_mut(),
w_gate_up_layout: TensorLayout::new(F16, [dyn_(); 2], [dyn_(); 2]),
w_gate_up_base: null(),
w_down_layout: TensorLayout::new(F16, [dyn_(); 2], [dyn_(); 2]),
w_down_base: null(),
down_alpha: 1.,
down_bias: true,
})
.unwrap();

let mut random_sample = random_sample::Operator::new(handle);
random_sample
@@ -109,7 +117,7 @@ impl Internal {
rope,
reform,
softmax,
swiglu,
mlp,
random_sample,
}
}
@@ -217,11 +225,8 @@ impl Operators for NvidiaKernels {
&self.get(queue).softmax
}

fn swiglu_op(
&self,
queue: &QueueOf<Self::Handle>,
) -> &impl operators::swiglu::Swiglu<Self::Handle> {
&self.get(queue).swiglu
fn mlp_op(&self, queue: &QueueOf<Self::Handle>) -> &impl operators::mlp::Mlp<Self::Handle> {
&self.get(queue).mlp
}
}

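As with rms_norm, rope, reform, and softmax above, the GPU backend pre-plans the new operator when Internal is constructed: mlp.scheme(...) runs with placeholder dyn_() layouts, null bases, and nominal down_alpha/down_bias values, and the repeated F16 [dyn_, d] layout is factored into the shared hidden_layout. The concrete tensors arrive later through the generic launch path in devices/common.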
16 changes: 10 additions & 6 deletions models/llama/common/src/compute.rs
@@ -157,12 +157,16 @@ pub trait ComputeStream {
.mat_mul(&mut x, 1., &x1, &params.att_o(), 1., queue);
self.kernels()
.rms_norm(&mut x1, &x, &params.mlp_layernorm(), epsilon, queue);
self.kernels()
.mat_mul(&mut gate_up, 0., &x1, &params.mlp_gate_up(), 1., queue);
let (mut gate, up) = split!(gate_up; [1]: di, di);
self.kernels().swiglu(&mut gate, &up, queue);
self.kernels()
.mat_mul(&mut x, 1., &gate, &params.mlp_down(), 1., queue);
self.kernels().mlp(
&mut x,
&x1,
&mut gate_up,
&params.mlp_gate_up(),
&params.mlp_down(),
1.,
true,
queue,
);
}
self.free_pos(pos.take_physical());
self.free(state_buf.take_physical());
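The argument mapping keeps the old numerics: gate_up remains the scratch buffer for the fused gate/up projection, down_alpha = 1. takes the place of the old mat_mul alpha of 1., and down_bias = true appears to stand in for the old beta = 1 accumulation of the down projection into x. The explicit split!/swiglu step disappears into the operator.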
22 changes: 6 additions & 16 deletions models/mixtral/cpu/src/infer.rs
@@ -225,25 +225,15 @@ impl CausalLM for MixtralCPU {
let expert = indices[(tok * self.k + k) as usize];
let expert_w = weights[(tok * self.k + k) as usize].to_f32() / sum;
let w_gate_up = self.params.mlp_gate_up(layer, expert).transpose(&[1, 0]);
self.kernels.mat_mul(
&mut gate_up_slice,
0.,
let w_down = self.params.mlp_down(layer, expert).transpose(&[1, 0]);
self.kernels.mlp(
&mut x0_slice,
&x1_slice,
&mut gate_up_slice,
&w_gate_up,
1.,
&ThisThread,
);
let mut gate_up_slice = gate_up_slice.split(1, &[di as _, di as _]);
let up = gate_up_slice.pop_back().unwrap();
let mut gate = gate_up_slice.pop_back().unwrap();
self.kernels.swiglu(&mut gate, &up, &ThisThread);
let mlp_down = self.params.mlp_down(layer, expert).transpose(&[1, 0]);
self.kernels.mat_mul(
&mut x0_slice,
1.,
&gate,
&mlp_down,
&w_down,
expert_w,
true,
&ThisThread,
);
}
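Here the fusion also absorbs the per-expert scaling: expert_w, formerly the alpha of the final mat_mul, is now passed as down_alpha, and down_bias = true keeps each expert's output accumulating into x0_slice, weighted by its routing weight. The manual split of gate_up_slice into gate and up halves goes away for the same reason as in the llama path.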
