diff --git a/Cargo.toml b/Cargo.toml
index cdf267f..50311b3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -34,7 +34,7 @@ itertools = "0.13"
 env_logger = "0.11"
 build-script-cfg = "0.0"
 
-operators = { git = "https://github.com/YdrMaster/operators-rs", rev = "91be1cc", default-features = false }
+operators = { git = "https://github.com/YdrMaster/operators-rs", rev = "78b578d", default-features = false }
 search-cl-tools = { git = "https://github.com/InfiniTensor/clrt", rev = "9b6289d" }
 search-infini-tools = { git = "https://github.com/InfiniTensor/infini-rt", rev = "f40bcb5" }
diff --git a/models/gpt2/common-cpu/src/infer.rs b/models/gpt2/common-cpu/src/infer.rs
index 37dc27e..73cb842 100644
--- a/models/gpt2/common-cpu/src/infer.rs
+++ b/models/gpt2/common-cpu/src/infer.rs
@@ -27,13 +27,19 @@ fn test_infer() {
         return;
     };
     let gguf = GGufModel::read(model.iter().map(|s| &**s));
-    let model = Storage::from_gguf(&gguf);
+
     let TokenizerAndPrompt {
         eos,
         tokenizer,
         prompt,
     } = TokenizerAndPrompt::new(&gguf, prompt, as_user);
+
+    let model = Storage::from_gguf(&gguf);
+    println!("{:?}", model.meta);
+
     let sample_args = SampleArgs::new(temperature, top_p, top_k).expect("invalid sample args");
+    println!("{sample_args:?}");
+
     let &Gpt2Meta {
         dt_embd,
         nctx,
@@ -49,7 +55,7 @@ fn test_infer() {
 
     test_utils::test_infer(eos, tokenizer, &prompt, max_steps, |input, pos| {
         // token embedding buffer
-        let mut embd = Tensor::new(dt_embd, &[1, input.len(), d]).map(Blob::new);
+        let mut embd = Tensor::new(dt_embd, &[input.len(), d]).map(Blob::new);
         // token position buffer
         let mut logits = model.meta.logits(1).map(Blob::new);
         let l = embd.get().len() / input.len();
@@ -60,7 +66,7 @@ fn test_infer() {
         worker
             .launch(
                 gpt2::args::Args {
-                    token_embd: embd.map_slice_mut(),
+                    embd: embd.map_slice_mut(),
                     logits: logits.map_slice_mut(),
                     idx: postion(input.len(), pos).map_slice(),
                     idx_add: postion(input.len(), 0).map_slice(),
@@ -70,7 +76,6 @@ fn test_infer() {
                         out_len: 1,
                         pos,
                     }],
-                    num_tokens: input.len(),
                     max_seq_len: input.len(),
                     max_att_len: pos + input.len(),
                 },
diff --git a/models/gpt2/common-cpu/src/lib.rs b/models/gpt2/common-cpu/src/lib.rs
index 86d48e4..f7bf314 100644
--- a/models/gpt2/common-cpu/src/lib.rs
+++ b/models/gpt2/common-cpu/src/lib.rs
@@ -37,13 +37,14 @@ where
 {
     type Hardware = Cpu;
     type TopoNode = N;
+    type AddRows = op!(add_rows);
     type LayerNorm = op!(layer_norm);
     type MatMul = op!(mat_mul);
     type AttnKVCached = op!(attention_kv_cached);
+    type Gelu = op!(gelu);
     type Rearrange = op!(rearrange);
     type AllReduce = R;
-    type AddRows = op!(add_rows);
-    type Mlp = op!(gpt2_mlp);
+
     fn debug<T>(tensor: &Tensor<T>)
     where
         T: Deref<Target = [ByteOf<Self::Hardware>]>,
diff --git a/models/gpt2/common/src/args.rs b/models/gpt2/common/src/args.rs
index e0f9734..44d5412 100644
--- a/models/gpt2/common/src/args.rs
+++ b/models/gpt2/common/src/args.rs
@@ -3,14 +3,13 @@ use tensor::Tensor;
 
 pub struct Args<'a, H: Hardware> {
     /// shape: [nt, d]
-    pub token_embd: Tensor<&'a mut [H::Byte]>,
+    pub embd: Tensor<&'a mut [H::Byte]>,
     /// shape: [nout, nvoc]
     pub logits: Tensor<&'a mut [H::Byte]>,
     pub idx: Tensor<&'a [H::Byte]>,
     pub idx_add: Tensor<&'a [H::Byte]>,
     pub requests: Vec<Request<'a, H>>,
-    pub num_tokens: usize,
     pub max_seq_len: usize,
     pub max_att_len: usize,
 }
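The `token_embd`/`num_tokens` pair in `Args` collapses into a single 2-D `embd` buffer: the worker reads the token count off the buffer's leading dimension (`let nt = embd.shape()[0]` in compute.rs below), so callers such as infer.rs no longer pass the count separately or pre-shape the buffer as `[1, nt, d]`. A minimal sketch of that bookkeeping, using a plain array shape rather than the crate's `Tensor` type:

    // Hypothetical helper: the token count is implied by the [nt, d] buffer shape.
    fn token_count(embd_shape: [usize; 2]) -> usize {
        let [nt, _d] = embd_shape;
        nt
    }

    fn main() {
        // e.g. a 5-token prompt with d = 768 (illustrative numbers only)
        assert_eq!(token_count([5, 768]), 5);
    }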
diff --git a/models/gpt2/common/src/compute.rs b/models/gpt2/common/src/compute.rs
index 7ea3916..86a8b2c 100644
--- a/models/gpt2/common/src/compute.rs
+++ b/models/gpt2/common/src/compute.rs
@@ -4,7 +4,7 @@ use operators::{
     add_rows::{self, AddRows},
     all_reduce::{self, AllReduce, ReduceOp},
     attention_kv_cached::{self, AttnKVCached},
-    gpt2_mlp::{self, Gpt2Mlp},
+    gelu::{self, Gelu},
     layer_norm::{self, LayerNorm},
     mat_mul::{self, MatMul},
     rearrange::{self, Rearrange},
@@ -16,13 +16,13 @@ use tensor::{split, Tensor};
 pub trait Operators {
     type Hardware: Hardware;
     type TopoNode: TopoNode<Self::Hardware>;
+    type AddRows: AddRows<Self::Hardware>;
     type LayerNorm: LayerNorm<Self::Hardware>;
     type MatMul: MatMul<Self::Hardware>;
     type AttnKVCached: AttnKVCached<Self::Hardware>;
+    type Gelu: Gelu<Self::Hardware>;
     type Rearrange: Rearrange<Self::Hardware>;
     type AllReduce: AllReduce<Self::Hardware, Self::TopoNode>;
-    type AddRows: AddRows<Self::Hardware>;
-    type Mlp: Gpt2Mlp<Self::Hardware>;
 
     fn debug<T>(tensor: &Tensor<T>)
     where
@@ -60,13 +60,13 @@ pub trait WeightLoader {
 
 pub struct Gpt2Worker<Ops: Operators, W> {
     meta: Gpt2Meta,
     weights: WeightDecorator<W>,
+    add_rows: Ops::AddRows,
     layer_norm: Ops::LayerNorm,
     mat_mul: Ops::MatMul,
     attn_kv_cached: Ops::AttnKVCached,
+    gelu: Ops::Gelu,
     rearrange: Ops::Rearrange,
     all_reduce: Ops::AllReduce,
-    add_rows: Ops::AddRows,
-    mlp: Ops::Mlp,
     pub debug: bool,
 }
@@ -76,13 +76,13 @@ impl<Ops: Operators, W> Gpt2Worker<Ops, W> {
         Self {
             weights: meta.decorator(weights), // meta.decorator
             meta,
+            add_rows: Ops::AddRows::new(processor),
             layer_norm: Ops::LayerNorm::new(processor),
             mat_mul: Ops::MatMul::new(processor),
             attn_kv_cached: Ops::AttnKVCached::new(processor),
+            gelu: Ops::Gelu::new(processor),
             rearrange: Ops::Rearrange::new(processor),
             all_reduce: Ops::AllReduce::new(node),
-            add_rows: Ops::AddRows::new(processor),
-            mlp: Ops::Mlp::new(processor),
             debug: true,
         }
     }
@@ -94,23 +94,19 @@ impl<Ops: Operators, W> Gpt2Worker<Ops, W> {
     pub fn workspace_size(&self, nt: usize, max_seq_len: usize, max_att_len: usize) -> usize {
         let Gpt2Meta {
-            dt_mat,
-            nh,
-            nkvh,
-            d,
-            // dh,
-            di,
-            ..
+            nh, nkvh, dh, di, ..
         } = self.meta;
-        let ele = dt_mat.nbytes();
-        let embd = nt * d * ele;
-        let qkv = nt * (nh + nkvh + nkvh) * ele;
-        let gate_up = nt * di * 2 * ele;
-        let q = max_seq_len * nh * ele;
-        let att = nkvh * max_seq_len * max_att_len * ele;
+        let embd = self.meta.embd(nt);
+        let dt = embd.dt();
+        let embd = embd.take();
+
+        let qkv = Tensor::new(dt, &[nt * (nh + nkvh + nkvh), dh]).take();
+        let q = Tensor::new(dt, &[max_seq_len, nh, dh]).take();
+        let att = Tensor::new(dt, &[nkvh, max_seq_len, max_att_len]).take();
 
-        embd + qkv.max(gate_up) + q + att
+        let up = Tensor::new(dt, &[nt, di]).take();
+        embd + (qkv + q + att).max(up)
    }
 }
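`workspace_size` now derives each scratch buffer from the same tensor descriptions the launch loop allocates, with `.take()` read here as the byte size of the described tensor. Two consequences are visible in the formula: the old double-width `gate_up` buffer shrinks to a single `[nt, di]` up buffer (GELU is applied in place), and the attention scratch (`qkv + q + att`) and the FFN up buffer are folded with `.max(...)`, presumably because they are never live at the same time. A self-contained sketch of the new bound with plain arithmetic, an element size `ele` standing in for the data type:

    // Sketch only: mirrors embd + (qkv + q + att).max(up) from workspace_size.
    fn workspace_size(
        nt: usize, max_seq_len: usize, max_att_len: usize,
        nh: usize, nkvh: usize, dh: usize, d: usize, di: usize, ele: usize,
    ) -> usize {
        let embd = nt * d * ele;                          // residual stream [nt, d]
        let qkv = nt * (nh + nkvh + nkvh) * dh * ele;     // fused q/k/v [nt, (nh + 2*nkvh) * dh]
        let q = max_seq_len * nh * dh * ele;              // per-request query view
        let att = nkvh * max_seq_len * max_att_len * ele; // attention scores
        let up = nt * di * ele;                           // FFN up buffer, activated in place
        embd + (qkv + q + att).max(up)
    }

    fn main() {
        // GPT-2-small-like numbers with f16 elements (illustrative only)
        println!("{} bytes", workspace_size(8, 8, 128, 12, 12, 64, 768, 3072, 2));
    }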
@@ -130,126 +126,112 @@ where
         QA: QueueAlloc,
     {
         let Args {
-            mut token_embd,
+            mut embd,
             mut logits,
             mut requests,
-            num_tokens: nt,
             max_seq_len,
             max_att_len,
             idx,
             idx_add,
         } = args;
         let Gpt2Meta {
-            dt_embd,
             nblk,
             nh,
             nkvh,
             dh,
+            di,
             ..
         } = self.meta;
-        let workspace_size = self.workspace_size(nt, max_seq_len, max_att_len);
-        let mut workspace = Workspace::new(queue_alloc, workspace, workspace_size);
         let queue = queue_alloc.queue();
-        let old_token_embd_l = token_embd.layout();
-        // wpe+wte
         {
-            self.add_rows(
-                &mut token_embd,
-                &self.weights.pos_embd(queue),
-                &idx,
-                &mut workspace,
-                queue_alloc,
-            )?;
-            token_embd = token_embd.merge(0..2).unwrap();
+            let pos_embd = self.weights.pos_embd(queue);
+            self.add_rows(&mut embd, &pos_embd, &idx, workspace, queue_alloc)?
         }
-        let mut x = token_embd;
+
+        let nt = embd.shape()[0];
+        let mut x = embd;
         let x1 = Tensor::new(x.dt(), x.shape());
+        let qkv = Tensor::new(x.dt(), &[nt, (nh + nkvh + nkvh) * dh]);
+        let up = Tensor::new(x.dt(), &[nt, di]);
+
+        let workspace_size = self.workspace_size(nt, max_seq_len, max_att_len);
+        let mut workspace = Workspace::new(queue_alloc, workspace, workspace_size);
         let (buf, workspace) = workspace.split_at_mut(*x1.get());
         let mut x1 = x1.map(|_| buf);
-        let qkv = Tensor::new(dt_embd, &[nt, (nh + nkvh + nkvh) * dh]);
         let req_split = requests.iter().map(|req| req.seq_len).collect::<Vec<_>>();
 
         for iblk in 0..nblk {
             {
-                let [scale, bias] = self.weights.attn_norm(iblk, queue);
-                self.layer_norm(&mut x1, &x, &scale, &bias, workspace, queue_alloc)?
-            }
-            let (buf, workspace) = workspace.split_at_mut(*qkv.get());
-            let mut qkv = qkv.clone().map(|_| buf);
-            {
-                let [scale, bias] = self.weights.attn_qkv(iblk, queue);
-                let bias = bias.broadcast(0, nt);
-                self.rearrange(&mut qkv, &bias, workspace, queue_alloc)?;
-                self.mat_mul(&mut qkv, 1., &x1, &scale, 1., workspace, queue_alloc)?
-            }
-            let qkv = qkv.tile(1, &[nh + nkvh + nkvh, dh]);
-            split!(qkv => q, k, v; [nh, nkvh, nkvh] @ 1);
-            let mut q = q;
-            let k = k;
-            let v = v;
-            {
-                let q = q.map_slice_mut().transpose(&[1, 0]);
-                let k = k.map_slice().transpose(&[1, 0]);
-                let v = v.map_slice().transpose(&[1, 0]);
-                let q = q.split(1, &req_split);
-                let k = k.split(1, &req_split);
-                let v = v.split(1, &req_split);
-
-                for (mut q, k, v, req) in izip!(q, k, v, &mut requests) {
-                    let cache = req
-                        .cache
-                        .as_mut() // [buf, nblk, 2, nkvh, dh]
-                        .index(1, iblk) // [buf, 2, nkvh, dh]
-                        .transpose(&[2, 0]) // [nkvh, 2, buf, dh]
-                        .map(|t| &mut t[..]);
-
-                    split!(cache => kc, vc; [1, 1] @ 1);
-                    let mut o = unsafe { q.map_slice_static_mut() };
-                    self.attn_kv_cached(
-                        &mut q,
-                        &k,
-                        &v,
-                        &mut o,
-                        &mut kc.index(1, 0),
-                        &mut vc.index(1, 0),
-                        req.pos,
-                        workspace,
-                        queue_alloc,
-                    )?
+                let wb = self.weights.attn_norm(iblk, queue);
+                self.layer_norm(&mut x1, &x, wb, workspace, queue_alloc)?;
+
+                let (buf, workspace) = workspace.split_at_mut(*qkv.get());
+                let mut qkv = qkv.clone().map(|_| buf);
+
+                let [w, b] = self.weights.attn_qkv(iblk, queue);
+                self.mat_mul(&mut qkv, &x1, (w, Some(b)), workspace, queue_alloc)?;
+
+                let qkv = qkv.tile(1, &[nh + nkvh + nkvh, dh]);
+                split!(qkv => q, k, v; [nh, nkvh, nkvh] @ 1);
+                let mut q = q;
+                let k = k;
+                let v = v;
+                {
+                    let q = q.map_slice_mut().transpose(&[1, 0]);
+                    let k = k.map_slice().transpose(&[1, 0]);
+                    let v = v.map_slice().transpose(&[1, 0]);
+                    let q = q.split(1, &req_split);
+                    let k = k.split(1, &req_split);
+                    let v = v.split(1, &req_split);
+
+                    for (mut q, k, v, req) in izip!(q, k, v, &mut requests) {
+                        let cache = req
+                            .cache
+                            .as_mut() // [buf, nblk, 2, nkvh, dh]
+                            .index(1, iblk) // [buf, 2, nkvh, dh]
+                            .transpose(&[2, 0]) // [nkvh, 2, buf, dh]
+                            .map(|t| &mut t[..]);
+
+                        split!(cache => kc, vc; [1, 1] @ 1);
+                        let mut o = unsafe { q.map_slice_static_mut() };
+                        self.attn_kv_cached(
+                            &mut q,
+                            &k,
+                            &v,
+                            &mut o,
+                            &mut kc.index(1, 0),
+                            &mut vc.index(1, 0),
+                            req.pos,
+                            workspace,
+                            queue_alloc,
+                        )?
+                    }
                 }
-            }
-            {
+
                 let o = q.map_slice().merge(1..3).unwrap();
-                let [scale, bias] = self.weights.attn_o(iblk, queue);
-                let bias = bias.broadcast(0, nt);
-                self.rearrange(&mut x1, &bias, workspace, queue_alloc)?;
-                self.mat_mul(&mut x1, 1., &o, &scale, 1., workspace, queue_alloc)?;
+                let [w, b] = self.weights.attn_o(iblk, queue);
+                self.mat_mul(&mut x1, &o, (w, Some(b)), workspace, queue_alloc)?
             }
+            self.add_rows(&mut x1, &x, &idx_add, workspace, queue_alloc)?;
             self.all_reduce(&mut x1, workspace, queue_alloc)?;
-            // residual connection with the wte+wpe data
-            self.add_rows.launch(
-                &add_rows::Args {
-                    dst_layout: old_token_embd_l.clone(),
-                    dst_base: x1.base_mut(),
-                    src_layout: x.layout(),
-                    src_base: x.base(),
-                    idx_layout: idx_add.layout(),
-                    idx_base: idx_add.map_slice().base(),
-                },
-                workspace,
-                queue_alloc,
-            )?;
+
+            let wb = self.weights.ffn_norm(iblk, queue);
+            self.layer_norm(&mut x, &x1, wb, workspace, queue_alloc)?;
             {
-                let [scale, bias] = self.weights.ffn_norm(iblk, queue);
-                self.layer_norm(&mut x, &x1, &scale, &bias, workspace, queue_alloc)?
+                let (buf, workspace) = workspace.split_at_mut(*up.get());
+                let mut up = up.clone().map(|_| buf);
+
+                let [w, b] = self.weights.ffn_up(iblk, queue);
+                self.mat_mul(&mut up, &x, (w, Some(b)), workspace, queue_alloc)?;
+
+                self.gelu(&mut up, workspace, queue_alloc)?;
+
+                let [w, b] = self.weights.ffn_down(iblk, queue);
+                self.mat_mul(&mut x, &up, (w, Some(b)), workspace, queue_alloc)?
             }
-            self.mlp(&mut x, iblk, workspace, queue_alloc)?;
-            // residual connection with the post-attention data
-            let mut x = x.map_slice_mut().tile(0, &[1, nt]);
-            self.add_rows(&mut x, &x1, &idx_add, workspace, queue_alloc)?
+            self.add_rows(&mut x, &x1, &idx_add, workspace, queue_alloc)?;
+            self.all_reduce(&mut x1, workspace, queue_alloc)?
         }
         if logits.shape()[0] == 0 {
             return Ok(());
         }
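With the fused `gpt2_mlp` operator gone, the FFN half of each block is spelled out as two plain GEMMs around an in-place GELU: `up = x·W_up + b_up`, `up = gelu(up)`, `x = up·W_down + b_down`, each bias folded in by the new `mat_mul` helper. A reference sketch of that path on plain `f32` slices; the row-major `[d, di]` / `[di, d]` weight layouts and the tanh GELU approximation are assumptions of the sketch, not something this diff fixes:

    // Reference FFN: up -> GELU (in place) -> down, on plain slices.
    fn gelu(x: f32) -> f32 {
        // tanh approximation; the Gelu operator may use a different variant
        0.5 * x * (1.0 + ((2.0 / std::f32::consts::PI).sqrt() * (x + 0.044_715 * x * x * x)).tanh())
    }

    fn ffn(x: &[f32], w_up: &[f32], b_up: &[f32], w_down: &[f32], b_down: &[f32], d: usize, di: usize) -> Vec<f32> {
        // up = x * W_up + b_up, shape [di]
        let mut up: Vec<f32> = (0..di)
            .map(|j| b_up[j] + (0..d).map(|i| x[i] * w_up[i * di + j]).sum::<f32>())
            .collect();
        // mirrors self.gelu(&mut up, ...): activation happens in place
        up.iter_mut().for_each(|v| *v = gelu(*v));
        // x = up * W_down + b_down, shape [d]
        (0..d)
            .map(|j| b_down[j] + (0..di).map(|i| up[i] * w_down[i * d + j]).sum::<f32>())
            .collect()
    }

    fn main() {
        let x = [1.0, -1.0];
        let w_up = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]; // [d = 2, di = 3]
        let b_up = [0.0; 3];
        let w_down = [1.0, 0.0, 0.0, 1.0, 1.0, 1.0]; // [di = 3, d = 2]
        let b_down = [0.0; 2];
        println!("{:?}", ffn(&x, &w_up, &b_up, &w_down, &b_down, 2, 3));
    }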
@@ -273,13 +255,13 @@ where
         assert_eq!(dst, logits.shape()[0]);
 
         let mut x = x.map_slice_mut().slice(0, 0, 1, dst);
-        {
-            let inplace = unsafe { x.map_slice_static() };
-            let [scale, bias] = self.weights.output_norm(queue);
-            self.layer_norm(&mut x, &inplace, &scale, &bias, workspace, queue_alloc)?
-        }
-        let output = self.weights.output_weight(queue).transpose(&[1, 0]);
-        self.mat_mul(&mut logits, 0., &x, &output, 1., workspace, queue_alloc)
+
+        let inplace = unsafe { x.map_slice_static() };
+        let wb = self.weights.output_norm(queue);
+        self.layer_norm(&mut x, &inplace, wb, workspace, queue_alloc)?;
+
+        let w = self.weights.output_weight(queue).transpose(&[1, 0]);
+        self.mat_mul(&mut logits, &x, (w, None), workspace, queue_alloc)
     }
 }
@@ -289,20 +271,18 @@ where
     Ops: Operators,
     W: WeightLoader,
 {
-    fn layer_norm<Y, X, W_, B, QA>(
+    fn layer_norm<Y, X, WB, QA>(
         &self,
         y: &mut Tensor<Y>,
         x: &Tensor<X>,
-        s: &Tensor<W_>,
-        b: &Tensor<B>,
+        [w, b]: [Tensor<WB>; 2],
         workspace: &mut [ByteOf<Ops::Hardware>],
         queue_alloc: &QA,
     ) -> Result<(), LaunchError>
     where
         Y: DerefMut<Target = [ByteOf<Ops::Hardware>]>,
         X: Deref<Target = [ByteOf<Ops::Hardware>]>,
-        W_: Deref<Target = [ByteOf<Ops::Hardware>]>,
-        B: Deref<Target = [ByteOf<Ops::Hardware>]>,
+        WB: Deref<Target = [ByteOf<Ops::Hardware>]>,
         QA: QueueAlloc,
     {
         self.layer_norm.launch(
             &layer_norm::Args {
                 y_layout: y.layout(),
@@ -311,8 +291,8 @@ where
                 y_base: y.base_mut(),
                 x_layout: x.layout(),
                 x_base: x.base(),
-                scale_layout: s.layout(),
-                scale_base: s.base(),
+                scale_layout: w.layout(),
+                scale_base: w.base(),
                 bias_layout: b.layout(),
                 bias_base: b.base(),
                 epsilon: self.meta.epsilon,
@@ -322,22 +302,28 @@ where
             },
             workspace,
             queue_alloc,
         )
     }
 
-    fn mat_mul<C, A, B, QA>(
+    fn mat_mul<C, A, WB, QA>(
         &self,
         c: &mut Tensor<C>,
-        beta: f32,
         a: &Tensor<A>,
-        b: &Tensor<B>,
-        alpha: f32,
+        (w, b): (Tensor<WB>, Option<Tensor<WB>>),
         workspace: &mut [ByteOf<Ops::Hardware>],
         queue_alloc: &QA,
     ) -> Result<(), LaunchError>
     where
         C: DerefMut<Target = [ByteOf<Ops::Hardware>]>,
         A: Deref<Target = [ByteOf<Ops::Hardware>]>,
-        B: Deref<Target = [ByteOf<Ops::Hardware>]>,
+        WB: Deref<Target = [ByteOf<Ops::Hardware>]>,
         QA: QueueAlloc,
     {
+        let beta = if let Some(b) = b {
+            let n = c.shape()[0];
+            let b = b.broadcast(0, n);
+            self.rearrange(c, &b, workspace, queue_alloc)?;
+            1.
+        } else {
+            0.
+        };
         self.mat_mul.launch(
             &mat_mul::Args {
                 c_layout: c.layout(),
@@ -345,9 +331,9 @@ where
                 beta,
                 a_layout: a.layout(),
                 a_base: a.base(),
-                b_layout: b.layout(),
-                b_base: b.base(),
-                alpha,
+                b_layout: w.layout(),
+                b_base: w.base(),
+                alpha: 1.,
             },
             workspace,
             queue_alloc,
         )
     }
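The helper replaces the caller-supplied `alpha`/`beta` pair with an optional bias: when a bias is present it is first broadcast into the output (the `rearrange` call), and the GEMM then runs with `beta = 1` so it accumulates on top of it; without a bias `beta = 0` and the output is overwritten. `alpha` is pinned to 1. A self-contained sketch of that contract on plain slices:

    // c (m x n) = a (m x k) * w (k x n) [+ bias broadcast over rows]
    fn matmul_bias(c: &mut [f32], a: &[f32], w: &[f32], bias: Option<&[f32]>, m: usize, k: usize, n: usize) {
        let beta = if let Some(b) = bias {
            for row in 0..m {
                c[row * n..][..n].copy_from_slice(b); // the rearrange step: broadcast bias into c
            }
            1.0
        } else {
            0.0
        };
        for i in 0..m {
            for j in 0..n {
                let dot: f32 = (0..k).map(|p| a[i * k + p] * w[p * n + j]).sum();
                c[i * n + j] = dot + beta * c[i * n + j]; // alpha is fixed to 1
            }
        }
    }

    fn main() {
        let (a, w, b) = ([1.0, 2.0], [3.0, 4.0], [0.5]); // 1x2 * 2x1 + bias
        let mut c = [0.0];
        matmul_bias(&mut c, &a, &w, Some(&b[..]), 1, 2, 1);
        assert_eq!(c[0], 11.5);
    }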
@@ -396,6 +382,26 @@ where
         )
     }
 
+    fn gelu<X, QA>(
+        &self,
+        x: &mut Tensor<X>,
+        workspace: &mut [ByteOf<Ops::Hardware>],
+        queue_alloc: &QA,
+    ) -> Result<(), LaunchError>
+    where
+        X: DerefMut<Target = [ByteOf<Ops::Hardware>]>,
+        QA: QueueAlloc,
+    {
+        self.gelu.launch(
+            &gelu::Args {
+                layout: x.layout(),
+                base: x.base_mut(),
+            },
+            workspace,
+            queue_alloc,
+        )
+    }
+
     fn rearrange(
         &self,
         dst: &mut Tensor,
@@ -459,6 +465,8 @@ where
         Idx: Deref<Target = [ByteOf<Ops::Hardware>]>,
         QA: QueueAlloc,
     {
+        let n = dst.shape()[0];
+        let mut dst = dst.map_slice_mut().tile(0, &[1, n]);
         self.add_rows.launch(
             &add_rows::Args {
                 dst_layout: dst.layout(),
@@ -472,38 +480,6 @@ where
             queue_alloc,
         )
     }
-
-    fn mlp<Y, QA>(
-        &self,
-        y: &mut Tensor<Y>,
-        iblk: usize,
-        workspace: &mut [ByteOf<Ops::Hardware>],
-        queue_alloc: &QA,
-    ) -> Result<(), LaunchError>
-    where
-        Y: DerefMut<Target = [ByteOf<Ops::Hardware>]>,
-        QA: QueueAlloc,
-    {
-        let queue = queue_alloc.queue();
-        let [up_weight, up_bias] = self.weights.ffn_up(iblk, queue);
-        let [down_weight, down_bias] = self.weights.ffn_down(iblk, queue);
-        self.mlp.launch(
-            &gpt2_mlp::Args {
-                y_layout: y.layout(),
-                y_base: y.base_mut(),
-                up_weight_layout: up_weight.layout(),
-                up_weight_base: up_weight.base(),
-                up_bias_layout: up_bias.layout(),
-                up_bias_base: up_bias.base(),
-                down_weight_layout: down_weight.layout(),
-                down_weight_base: down_weight.base(),
-                down_bias_layout: down_bias.layout(),
-                down_bias_base: down_bias.base(),
-            },
-            workspace,
-            queue_alloc,
-        )
-    }
 }
 
 struct WeightDecorator<W> {
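The `add_rows` helper now reshapes its destination internally (`tile(0, &[1, n])`), so callers hand in a flat `[n, d]` tensor and the batch dimension the operator expects is added in one place; that is also why the launch loop above no longer tiles `x` before the residual add. Reading `add_rows` as "add row `idx[i]` of the source table to row `i` of the destination" (an assumption inferred from its wpe-lookup and residual uses), a plain-slice sketch:

    // dst[i, ..] += src[idx[i], ..]; with idx = [0, 1, ..., n-1] this is a plain residual add.
    fn add_rows(dst: &mut [f32], src: &[f32], idx: &[u32], d: usize) {
        for (i, &row) in idx.iter().enumerate() {
            let src_row = &src[row as usize * d..][..d];
            dst[i * d..][..d]
                .iter_mut()
                .zip(src_row)
                .for_each(|(o, s)| *o += s);
        }
    }

    fn main() {
        let mut dst = vec![0.0; 4];                   // 2 rows, d = 2
        let src = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]; // 3-row table
        add_rows(&mut dst, &src, &[2, 0], 2);         // add table rows 2 and 0
        assert_eq!(dst, [5.0, 6.0, 1.0, 2.0]);
    }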
diff --git a/models/gpt2/common/src/lib.rs b/models/gpt2/common/src/lib.rs
index ab43e59..d882786 100644
--- a/models/gpt2/common/src/lib.rs
+++ b/models/gpt2/common/src/lib.rs
@@ -25,9 +25,10 @@ pub struct Gpt2Meta {
     pub dt_norm: DigitLayout,
     pub dt_mat: DigitLayout,
 
-    pub nblk: usize,
     pub nctx: usize,
     pub nvoc: usize,
+
+    pub nblk: usize,
     pub nh: usize,
     pub nkvh: usize,
     pub d: usize,
@@ -35,7 +36,6 @@ pub struct Gpt2Meta {
     pub di: usize,
 
     pub epsilon: f32,
-    pub theta: f32,
 }
 
 #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
@@ -76,13 +76,16 @@ impl Gpt2Meta {
         Tensor::new(dt_embd, &[nt, nvoc])
     }
 
-    // wte
-    pub fn token_embd(&self) -> Tensor {
-        self.embd(self.nvoc)
+    pub fn pos_embd(&self) -> Tensor {
+        let &Self {
+            dt_embd, nvoc, d, ..
+        } = self;
+        Tensor::new(dt_embd, &[nvoc, d])
     }
-    // wpe
-    pub fn position_embd(&self) -> Tensor {
-        self.embd(self.nctx)
+
+    pub fn norm(&self) -> Tensor {
+        let &Self { dt_norm, d, .. } = self;
+        Tensor::new(dt_norm, &[d])
     }
 
     pub fn attn_qkv_w(&self, usage: TensorUsage) -> Tensor {
@@ -110,8 +113,9 @@ impl Gpt2Meta {
         self.mat(di, d, usage)
     }
 
-    pub fn ffn_up_b(&self, _usage: TensorUsage) -> Tensor {
-        Tensor::new(self.dt_embd, &[self.di])
+    pub fn ffn_up_b(&self, usage: TensorUsage) -> Tensor {
+        let &Self { di, .. } = self;
+        self.mat(di, 1, usage)
     }
 
     pub fn ffn_down_w(&self, usage: TensorUsage) -> Tensor {
@@ -119,24 +123,15 @@ impl Gpt2Meta {
         self.mat(d, di, usage)
     }
 
-    pub fn ffn_down_b(&self, _usage: TensorUsage) -> Tensor {
-        Tensor::new(self.dt_embd, &[self.d])
+    pub fn ffn_down_b(&self, usage: TensorUsage) -> Tensor {
+        let &Self { d, .. } = self;
+        self.mat(d, 1, usage)
     }
 
     pub fn output_weight(&self) -> Tensor {
         Tensor::new(self.dt_embd, &[self.nvoc, self.d])
     }
 
-    pub fn norm(&self) -> Tensor {
-        let &Self { dt_norm, d, .. } = self;
-        Tensor::new(dt_norm, &[d])
-    }
-
-    pub fn pos_embd(&self) -> Tensor {
-        let &Self { nvoc, d, .. } = self;
-        Tensor::new(self.dt_embd, &[nvoc, d])
-    }
-
     fn mat(&self, row: usize, col: usize, usage: TensorUsage) -> Tensor {
         // NOTICE: weight matrices are stored with the mat data type but take part in compute with the embd data type
         match usage {
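`ffn_up_b` and `ffn_down_b` now go through the same `mat()` helper as the weight matrices (as `[di, 1]` / `[d, 1]` columns) instead of being fixed `dt_embd` vectors, so the storage-versus-compute split that the NOTICE in `mat()` describes now covers the biases as well. A sketch of that distinction, with tensors reduced to byte counts and all names hypothetical:

    // Hypothetical, simplified stand-in for Gpt2Meta::mat(): pick the element
    // size by usage, the storage-vs-compute split the NOTICE describes.
    #[derive(Clone, Copy)]
    enum TensorUsage {
        Storage,
        Computation,
    }

    struct Meta {
        dt_mat_bytes: usize,  // element size as stored (e.g. f32 = 4)
        dt_embd_bytes: usize, // element size used during compute (e.g. f16 = 2)
    }

    impl Meta {
        fn mat_bytes(&self, row: usize, col: usize, usage: TensorUsage) -> usize {
            let ele = match usage {
                TensorUsage::Storage => self.dt_mat_bytes,
                TensorUsage::Computation => self.dt_embd_bytes,
            };
            row * col * ele
        }
        // was a fixed dt_embd [di] tensor before this change
        fn ffn_up_b_bytes(&self, di: usize, usage: TensorUsage) -> usize {
            self.mat_bytes(di, 1, usage)
        }
    }

    fn main() {
        let meta = Meta { dt_mat_bytes: 4, dt_embd_bytes: 2 };
        assert_eq!(meta.ffn_up_b_bytes(3072, TensorUsage::Storage), 12288);
        assert_eq!(meta.ffn_up_b_bytes(3072, TensorUsage::Computation), 6144);
    }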
diff --git a/models/gpt2/common/src/storage.rs b/models/gpt2/common/src/storage.rs
index 00e3148..aa7fc7c 100644
--- a/models/gpt2/common/src/storage.rs
+++ b/models/gpt2/common/src/storage.rs
@@ -1,5 +1,5 @@
 use crate::Gpt2Meta;
-use gguf::{ext::Mmap, map_files, GGufMetaMapExt, GGufModel};
+use gguf::{ext::Mmap, map_files, meta, tensor, GGufMetaMapExt, GGufModel};
 use std::path::Path;
 
 #[derive(Clone)]
@@ -38,6 +38,10 @@ impl<'a> Storage<&'a [u8]> {
         let output_norm_w = &gguf.tensors["output_norm.weight"];
         let output = &gguf.tensors["output.weight"];
         let qkv0 = &gguf.tensors["blk.0.attn_qkv.weight"];
+
+        let d = meta![gguf => llm_embedding_length];
+        let nh = meta![gguf => llm_attention_head_count];
+
         #[rustfmt::skip]
         let meta = Gpt2Meta {
             dt_embd: token_embd.ty,
@@ -46,33 +50,33 @@ impl<'a> Storage<&'a [u8]> {
             dt_norm: output_norm_w.ty,
             dt_mat : qkv0.ty,
 
-            nblk: gguf.llm_block_count            ().unwrap(),
-            nctx: gguf.llm_context_length         ().unwrap(),
-            nvoc: gguf.tokenizer_ggml_tokens      ().unwrap().len(),
-            nh  : gguf.llm_attention_head_count   ().unwrap(),
-            nkvh: gguf.llm_attention_head_count_kv().unwrap(),
-            d   : gguf.llm_embedding_length       ().unwrap(),
-            dh  : gguf.llm_embedding_length       ().unwrap()/gguf.llm_attention_head_count().unwrap(),
-            di  : gguf.llm_feed_forward_length    ().unwrap(),
-            epsilon: 1e-5,
-            theta: 1e4,
+            nctx: meta![gguf => llm_context_length   ],
+            nvoc: meta![gguf => tokenizer_ggml_tokens].len(),
+
+            d, nh,
+            nblk: meta![gguf => llm_block_count          ],
+            nkvh: meta![gguf => llm_attention_head_count_kv; nh],
+            dh  : meta![gguf => llm_rope_dimension_count; d / nh],
+            di  : meta![gguf => llm_feed_forward_length  ],
+
+            epsilon: meta![gguf => llm_attention_layer_norm_rms_epsilon; 1e-5],
         };
         #[rustfmt::skip]
         let blocks = (0..meta.nblk)
             .map(|i| BlkStorage {
-                attn_norm_w: gguf.tensors[&*format!("blk.{i}.attn_norm.weight"  )].data,
-                attn_norm_b: gguf.tensors[&*format!("blk.{i}.attn_norm.bias"    )].data,
-                attn_qkv_w:  gguf.tensors[&*format!("blk.{i}.attn_qkv.weight"   )].data,
-                attn_qkv_b:  gguf.tensors[&*format!("blk.{i}.attn_qkv.bias"     )].data,
-                attn_o_w:    gguf.tensors[&*format!("blk.{i}.attn_output.weight")].data,
-                attn_o_b:    gguf.tensors[&*format!("blk.{i}.attn_output.bias"  )].data,
-
-                ffn_norm_w:  gguf.tensors[&*format!("blk.{i}.ffn_norm.weight"   )].data,
-                ffn_norm_b:  gguf.tensors[&*format!("blk.{i}.ffn_norm.bias"     )].data,
-                ffn_up_w:    gguf.tensors[&*format!("blk.{i}.ffn_up.weight"     )].data,
-                ffn_up_b:    gguf.tensors[&*format!("blk.{i}.ffn_up.bias"       )].data,
-                ffn_down_w:  gguf.tensors[&*format!("blk.{i}.ffn_down.weight"   )].data,
-                ffn_down_b:  gguf.tensors[&*format!("blk.{i}.ffn_down.bias"     )].data,
+                attn_norm_w: tensor![gguf => format!("blk.{i}.attn_norm.weight"  )].data,
+                attn_norm_b: tensor![gguf => format!("blk.{i}.attn_norm.bias"    )].data,
+                attn_qkv_w:  tensor![gguf => format!("blk.{i}.attn_qkv.weight"   )].data,
+                attn_qkv_b:  tensor![gguf => format!("blk.{i}.attn_qkv.bias"     )].data,
+                attn_o_w:    tensor![gguf => format!("blk.{i}.attn_output.weight")].data,
+                attn_o_b:    tensor![gguf => format!("blk.{i}.attn_output.bias"  )].data,
+
+                ffn_norm_w:  tensor![gguf => format!("blk.{i}.ffn_norm.weight"   )].data,
+                ffn_norm_b:  tensor![gguf => format!("blk.{i}.ffn_norm.bias"     )].data,
+                ffn_up_w:    tensor![gguf => format!("blk.{i}.ffn_up.weight"     )].data,
+                ffn_up_b:    tensor![gguf => format!("blk.{i}.ffn_up.bias"       )].data,
+                ffn_down_w:  tensor![gguf => format!("blk.{i}.ffn_down.weight"   )].data,
+                ffn_down_b:  tensor![gguf => format!("blk.{i}.ffn_down.bias"     )].data,
             })
             .collect();
@@ -146,5 +150,5 @@ fn test_load() {
     };
     let gguf = GGufModel::read(shards.iter().map(|s| &**s));
     let gpt2 = Storage::from_gguf(&gguf);
-    println!("{:?}", gpt2.meta);
+    println!("{:?}", gpt2.meta)
 }
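The `meta![gguf => key]` form reads a required GGUF metadata key, and the `meta![gguf => key; default]` form appears to fall back to the expression after the `;` when the key is absent — that is how `nkvh` defaults to `nh`, `dh` to `d / nh`, and `epsilon` to `1e-5` above. A sketch of just that fallback logic with plain `Option`s (the real macro resolves names against the GGUF metadata map):

    // Hypothetical stand-in for the `; default` fallback behaviour.
    fn resolve(nkvh: Option<usize>, dh: Option<usize>, nh: usize, d: usize) -> (usize, usize) {
        (
            nkvh.unwrap_or(nh),   // llm_attention_head_count_kv; nh
            dh.unwrap_or(d / nh), // llm_rope_dimension_count; d / nh
        )
    }

    fn main() {
        // A GPT-2 GGUF may ship neither key, so both fall back to derived values.
        assert_eq!(resolve(None, None, 12, 768), (12, 64));
    }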