diff --git a/README.md b/README.md
index baefc6f..d1ce12c 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@
 
 ### 1. Operator: the SiLU function (10 points)
 
-Please implement the SiLU operator in `src/operators.rs`. Its formula is:
+Please implement the SwiGLU operator in `src/operators.rs`. Its formula is:
 
 $$
 y=silu(x) × y
@@ -85,8 +85,8 @@ $$
 hidden = rms_norm(residual)
 gate = hidden @ gate_weight.T
 up = hidden @ up_weight.T
-itermediate = gate * sigmoid(gate) * up ## silu
-output = itermediate @ down_weight.T
+act = gate * sigmoid(gate) * up ## SwiGLU
+output = act @ down_weight.T
 residual = output + residual
 ```
 
@@ -149,9 +149,9 @@ V = cat(V_cache, V)
 ### The following is the part you need to implement
 score = Q @ K.T / sqrt(dim)
 attn = softmax(score)
-x = attn @ V
-x = x @ O_weight.T
-residual = x + residual
+attn_V = attn @ V
+out = attn_V @ O_weight.T
+residual = out + residual
 ```
 
 Debugging Self-Attention is difficult. We recommend using PyTorch to assist with debugging: you can load and run the model with the transformers library (using the llama model code) and inspect the intermediate tensors layer by layer.
diff --git a/src/main.rs b/src/main.rs
index 8226fb1..75cb4b7 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -20,8 +20,8 @@ fn main() {
     let output_ids = llama.generate(
         input_ids,
         500,
-        0.9,
-        4,
+        0.8,
+        30,
         1.,
     );
     println!("{}", tokenizer.decode(&output_ids, true).unwrap());
diff --git a/src/operators.rs b/src/operators.rs
index e2b026d..2d7355e 100644
--- a/src/operators.rs
+++ b/src/operators.rs
@@ -74,9 +74,9 @@ pub fn rms_norm(y: &mut Tensor<f32>, x: &Tensor<f32>, w: &Tensor<f32>, epsilon:
     todo!("implement rms_norm; doing some necessary checks before computing will help you debug later")
 }
 
-// y = sigmoid(x) * x * y
+// y = silu(x) * y
 // hint: this is an element-wise operation
-pub fn silu(y: &mut Tensor<f32>, x: &Tensor<f32>) {
+pub fn swiglu(y: &mut Tensor<f32>, x: &Tensor<f32>) {
     // let len = y.size();
     // assert!(len == x.size());
 
@@ -176,7 +176,7 @@ pub fn random_sample(x: &Tensor<f32>, top_p: f32, top_k: u32, temperature: f32)
 fn test_silu() {
     let mut y = Tensor::<f32>::new(vec![2., 3., 4.], &vec![1, 3]);
     let x = Tensor::<f32>::new(vec![1., 2., 3.], &vec![1, 3]);
-    silu(&mut y, &x);
+    swiglu(&mut y, &x);
     assert!(y.close_to(
         &Tensor::<f32>::new(vec![1.4621172, 5.2847824, 11.43089], &vec![1, 3]),
         1e-3
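
For reference, below is a minimal standalone sketch of the element-wise SwiGLU update that the renamed operator is meant to compute: y[i] = silu(x[i]) * y[i], with silu(x) = x * sigmoid(x). This is an illustrative assumption only, written against plain slices rather than the project's `Tensor<f32>` type (whose data-access API is not shown in this diff); the helper name `swiglu_inplace` is made up for the example.

```rust
// Illustrative sketch (not the repository's implementation): the element-wise
// SwiGLU update y[i] = silu(x[i]) * y[i], where silu(x) = x * sigmoid(x).
fn swiglu_inplace(y: &mut [f32], x: &[f32]) {
    assert_eq!(y.len(), x.len(), "x and y must have the same number of elements");
    for (yi, &xi) in y.iter_mut().zip(x.iter()) {
        // silu(x) = x * sigmoid(x) = x / (1 + e^(-x))
        *yi *= xi / (1.0 + (-xi).exp());
    }
}

fn main() {
    let mut y = vec![2.0_f32, 3.0, 4.0];
    let x = vec![1.0_f32, 2.0, 3.0];
    swiglu_inplace(&mut y, &x);
    // Prints values close to the expected result in `test_silu`:
    // approximately [1.4621172, 5.2847824, 11.43089]
    println!("{:?}", y);
}
```

The printed values line up with the expectation in `test_silu` above, which is a quick way to sanity-check whatever in-place implementation ends up behind `swiglu`.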