diff --git a/Cargo.lock b/Cargo.lock index fdf77a20..ec6f9ac5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1224,9 +1224,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.14.3" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" [[package]] name = "heck" @@ -1451,6 +1451,7 @@ name = "llama" version = "0.0.0" dependencies = [ "common", + "rayon", "serde", "serde_json", "tensor", @@ -2954,13 +2955,13 @@ dependencies = [ "colored", "common", "distributed", + "llama", "log", "search-cuda-tools", "service", "simple_logger", "tensor", "tokio", - "transformer", "transformer-cpu", "transformer-nv", "web-api", diff --git a/models/llama-legacy/src/cast.rs b/models/llama-legacy/src/cast.rs deleted file mode 100644 index 34e02c36..00000000 --- a/models/llama-legacy/src/cast.rs +++ /dev/null @@ -1,63 +0,0 @@ -use super::{memory::Layer, ConfigJson, Llama2, Memory, Storage}; -use common::{bf16, f16, Blob}; -use std::sync::Arc; -use tensor::{DataType, Tensor, Ty}; - -impl Memory { - pub fn cast(src: &(dyn Llama2), new_dtype: DataType) -> Self { - Self { - config: ConfigJson { - torch_dtype: new_dtype, - ..ConfigJson::from(src) - }, - embed_tokens: cast(src.embed_tokens(), new_dtype), - layers: (0..src.num_hidden_layers()) - .map(|l| Layer { - input_layernorm: cast(src.input_layernorm(l), new_dtype), - w_qkv: cast(src.w_qkv(l), new_dtype), - self_attn_o_proj: cast(src.self_attn_o_proj(l), new_dtype), - post_attention_layernorm: cast(src.post_attention_layernorm(l), new_dtype), - mlp_gate_up: cast(src.mlp_gate_up(l), new_dtype), - mlp_down: cast(src.mlp_down(l), new_dtype), - }) - .collect(), - model_norm: cast(src.model_norm(), new_dtype), - lm_head: cast(src.lm_head(), new_dtype), - } - } -} - -fn cast(src: Tensor, new_dtype: DataType) -> Tensor { - match (src.data_type(), new_dtype) { - (DataType::F16, DataType::BF16) => typed(src, |x: &f16| bf16::from_f32(x.to_f32())), - (DataType::F16, DataType::F32) => typed(src, |x: &f16| x.to_f32()), - (DataType::BF16, DataType::F16) => typed(src, |x: &bf16| f16::from_f32(x.to_f32())), - (DataType::BF16, DataType::F32) => typed(src, |x: &bf16| x.to_f32()), - (DataType::F32, DataType::F16) => typed(src, |x: &f32| f16::from_f32(*x)), - (DataType::F32, DataType::BF16) => typed(src, |x: &f32| bf16::from_f32(*x)), - _ => todo!(), - } -} - -fn typed( - src: Tensor, - cast: impl Fn(&T) -> U + Sync, -) -> Tensor { - use rayon::iter::*; - use tensor::{reslice, reslice_mut}; - - assert_eq!(src.data_type(), T::DATA_TYPE); - if src.data_type() == U::DATA_TYPE { - return src; - } - - assert!(src.is_contiguous()); - let mut ans = Tensor::alloc(U::DATA_TYPE, src.shape(), Blob::new); - - reslice(src.physical()) - .par_iter() - .zip(reslice_mut(ans.physical_mut())) - .for_each(|(src, dst)| *dst = cast(src)); - - ans.map_physical(|b| Storage::Others(Arc::new(b))) -} diff --git a/models/llama-legacy/src/lib.rs b/models/llama-legacy/src/lib.rs index d29a8bb2..da4ced28 100644 --- a/models/llama-legacy/src/lib.rs +++ b/models/llama-legacy/src/lib.rs @@ -1,7 +1,5 @@ -mod cast; mod memory; mod safe_tensors; -mod save; mod storage; use common::utok; @@ -10,7 +8,6 @@ mod distribute; pub use distribute::{DistributeScheme, DistributedLayer, Distributer}; pub use memory::Memory; -pub use save::save; pub use storage::Storage; pub trait Llama2 { diff --git a/models/llama-legacy/src/memory.rs 
b/models/llama-legacy/src/memory.rs index 8fc9916c..07040745 100644 --- a/models/llama-legacy/src/memory.rs +++ b/models/llama-legacy/src/memory.rs @@ -172,23 +172,3 @@ impl Llama2 for Memory { self.lm_head.clone() } } - -#[test] -fn test_load() { - use std::time::Instant; - - let Some(model_dir) = common::test_model::find() else { - return; - }; - println!("model_dir: {}", model_dir.display()); - - let t0 = Instant::now(); - let model = Memory::load_safetensors(model_dir).unwrap(); - let t1 = Instant::now(); - println!("mmap {:?}", t1 - t0); - - let t0 = Instant::now(); - let _inside_memory = Memory::cast(&model, DataType::F32); - let t1 = Instant::now(); - println!("cast {:?}", t1 - t0); -} diff --git a/models/llama-legacy/src/save.rs b/models/llama-legacy/src/save.rs deleted file mode 100644 index 04e29bcc..00000000 --- a/models/llama-legacy/src/save.rs +++ /dev/null @@ -1,117 +0,0 @@ -use super::{ConfigJson, Llama2, Storage}; -use common::safe_tensors::{Dtype, SafeTensorsHeader, SafeTensorsHeaderMetadata, TensorInfo}; -use std::{ - collections::HashMap, - fs, - io::{self, BufWriter, Write}, - path::Path, -}; -use tensor::{DataType, Tensor}; - -pub fn save(model: &dyn Llama2, dir: impl AsRef) -> io::Result<()> { - let dir = dir.as_ref(); - fs::create_dir_all(dir)?; - let config = serde_json::to_string_pretty(&ConfigJson::from(model))?; - fs::write(dir.join("config.json"), config)?; - - let mut offset = 0usize; - let mut header = SafeTensorsHeader { - tensors: HashMap::new(), - metadata: SafeTensorsHeaderMetadata { - format: "pt".into(), - }, - }; - - let mut tensor_info = |tensor: Tensor| TensorInfo { - dtype: match tensor.data_type() { - DataType::Bool => Dtype::BOOL, - DataType::I8 => Dtype::I8, - DataType::I16 => Dtype::I16, - DataType::I32 => Dtype::I32, - DataType::I64 => Dtype::I64, - DataType::U8 => Dtype::U8, - DataType::U16 => Dtype::U16, - DataType::U32 => Dtype::U32, - DataType::U64 => Dtype::U64, - DataType::F16 => Dtype::F16, - DataType::BF16 => Dtype::BF16, - DataType::F32 => Dtype::F32, - DataType::F64 => Dtype::F64, - }, - shape: tensor.shape().iter().map(|&d| d as _).collect(), - data_offsets: { - let start = offset; - offset += tensor.bytes_size(); - (start, offset) - }, - }; - - header.tensors.insert( - "model.embed_tokens.weight".into(), - tensor_info(model.embed_tokens()), - ); - for layer in 0..model.num_hidden_layers() { - header.tensors.insert( - format!("model.layers.{layer}.input_layernorm.weight"), - tensor_info(model.input_layernorm(layer)), - ); - header.tensors.insert( - format!("model.layers.{layer}.self_attn.qkv_proj.weight"), - tensor_info(model.w_qkv(layer)), - ); - header.tensors.insert( - format!("model.layers.{layer}.self_attn.o_proj.weight"), - tensor_info(model.self_attn_o_proj(layer)), - ); - header.tensors.insert( - format!("model.layers.{layer}.post_attention_layernorm.weight"), - tensor_info(model.post_attention_layernorm(layer)), - ); - header.tensors.insert( - format!("model.layers.{layer}.mlp.gate_up_proj.weight"), - tensor_info(model.mlp_gate_up(layer)), - ); - header.tensors.insert( - format!("model.layers.{layer}.mlp.down_proj.weight"), - tensor_info(model.mlp_down(layer)), - ); - } - header - .tensors - .insert("model.norm.weight".into(), tensor_info(model.model_norm())); - header - .tensors - .insert("lm_head.weight".into(), tensor_info(model.lm_head())); - - let header = { - let str = serde_json::to_string(&header)?; - let len = str.len(); - const ALIGN: usize = std::mem::size_of::(); - let aligned = (len + ALIGN - 1) & !(ALIGN - 1); - 
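// The round-up above aligns the JSON header to ALIGN == 8 bytes, the width
// of the little-endian u64 length prefix written just below: e.g.
// len = 13 -> (13 + 7) & !7 = 16. The gap is padded with ASCII spaces
// (byte 32), which is valid JSON whitespace, so safetensors readers accept
// the padded header.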
- let mut buffer = Vec::with_capacity(aligned); - let mut write = BufWriter::new(&mut buffer); - write.write_all(&(aligned as u64).to_le_bytes())?; - write.write_all(str.as_bytes())?; - for _ in len..aligned { - write.write_all(&[32])?; - } - drop(write); - buffer - }; - - let mut file = fs::File::create(dir.join("model.safetensors"))?; - file.write_all(&header)?; - file.write_all(model.embed_tokens().as_slice())?; - for layer in 0..model.num_hidden_layers() { - file.write_all(model.input_layernorm(layer).as_slice())?; - file.write_all(model.w_qkv(layer).as_slice())?; - file.write_all(model.self_attn_o_proj(layer).as_slice())?; - file.write_all(model.post_attention_layernorm(layer).as_slice())?; - file.write_all(model.mlp_gate_up(layer).as_slice())?; - file.write_all(model.mlp_down(layer).as_slice())?; - } - file.write_all(model.model_norm().as_slice())?; - file.write_all(model.lm_head().as_slice())?; - Ok(()) -} diff --git a/models/llama/Cargo.toml b/models/llama/Cargo.toml index 49d3a214..99add97d 100644 --- a/models/llama/Cargo.toml +++ b/models/llama/Cargo.toml @@ -11,3 +11,4 @@ common = { path = "../../common" } tensor = { path = "../../tensor" } serde = { workspace = true, features = ["derive"] } serde_json.workspace = true +rayon.workspace = true diff --git a/models/llama/src/cast.rs b/models/llama/src/cast.rs new file mode 100644 index 00000000..0b1887f5 --- /dev/null +++ b/models/llama/src/cast.rs @@ -0,0 +1,59 @@ +use crate::{InferenceConfig, LayerStorage, Storage, Weight}; +use common::{bf16, f16, Blob}; +use tensor::{DataType, Tensor, Ty}; + +impl Storage { + pub fn cast(self, dt: DataType) -> Self { + if self.config.dt == dt { + return self; + } + Self { + config: InferenceConfig { dt, ..self.config }, + embed_tokens: cast(self.embed_tokens, dt), + layers: self + .layers + .into_iter() + .map(|l| LayerStorage { + att_layernorm: cast(l.att_layernorm, dt), + att_qkv: cast(l.att_qkv, dt), + att_o: cast(l.att_o, dt), + mlp_layernorm: cast(l.mlp_layernorm, dt), + mlp_gate_up: cast(l.mlp_gate_up, dt), + mlp_down: cast(l.mlp_down, dt), + }) + .collect(), + lm_layernorm: cast(self.lm_layernorm, dt), + lm_head: cast(self.lm_head, dt), + } + } +} + +fn cast(src: Tensor, dt: DataType) -> Tensor { + match (src.data_type(), dt) { + (DataType::F16, DataType::BF16) => typed(src, |x: &f16| bf16::from_f32(x.to_f32())), + (DataType::F16, DataType::F32) => typed(src, |x: &f16| x.to_f32()), + (DataType::BF16, DataType::F16) => typed(src, |x: &bf16| f16::from_f32(x.to_f32())), + (DataType::BF16, DataType::F32) => typed(src, |x: &bf16| x.to_f32()), + (DataType::F32, DataType::F16) => typed(src, |x: &f32| f16::from_f32(*x)), + (DataType::F32, DataType::BF16) => typed(src, |x: &f32| bf16::from_f32(*x)), + _ => todo!(), + } +} + +fn typed( + src: Tensor, + cast: impl Fn(&T) -> U + Sync, +) -> Tensor { + use rayon::iter::*; + use tensor::{reslice, reslice_mut}; + + assert_eq!(src.data_type(), T::DATA_TYPE); + let mut ans = Tensor::alloc(U::DATA_TYPE, src.shape(), Blob::new); + + reslice(src.physical()) + .par_iter() + .zip(reslice_mut(ans.physical_mut())) + .for_each(|(src, dst)| *dst = cast(src)); + + ans.map_physical(|b| b.into()) +} diff --git a/models/llama/src/json.rs b/models/llama/src/json.rs new file mode 100644 index 00000000..c87ed868 --- /dev/null +++ b/models/llama/src/json.rs @@ -0,0 +1,77 @@ +use common::utok; +use tensor::DataType; + +#[derive(serde::Serialize, serde::Deserialize, Debug)] +pub(crate) struct ConfigJson { + pub bos_token_id: utok, + pub eos_token_id: utok, + pub 
hidden_size: usize, + pub intermediate_size: usize, + pub max_position_embeddings: usize, + pub num_attention_heads: usize, + pub num_hidden_layers: usize, + pub num_key_value_heads: usize, + pub vocab_size: usize, + #[serde(default = "default_rms_norm_eps")] + pub rms_norm_eps: f32, + #[serde(default = "default_rope_theta")] + pub rope_theta: f32, + pub torch_dtype: DataType, +} + +#[inline(always)] +const fn default_rms_norm_eps() -> f32 { + 1e-5 +} + +#[inline(always)] +const fn default_rope_theta() -> f32 { + 1e4 +} + +macro_rules! convert { + (Dtype: $dtype:expr) => {{ + use common::safe_tensors::Dtype; + use tensor::DataType; + + match $dtype { + Dtype::BOOL => DataType::Bool, + Dtype::I8 => DataType::I8, + Dtype::I16 => DataType::I16, + Dtype::I32 => DataType::I32, + Dtype::I64 => DataType::I64, + Dtype::U8 => DataType::U8, + Dtype::U16 => DataType::U16, + Dtype::U32 => DataType::U32, + Dtype::U64 => DataType::U64, + Dtype::F16 => DataType::F16, + Dtype::BF16 => DataType::BF16, + Dtype::F32 => DataType::F32, + Dtype::F64 => DataType::F64, + _ => unreachable!(), + } + }}; + + (DataType: $data_type:expr) => {{ + use common::safe_tensors::Dtype; + use tensor::DataType; + + match $data_type { + DataType::Bool => Dtype::BOOL, + DataType::I8 => Dtype::I8, + DataType::I16 => Dtype::I16, + DataType::I32 => Dtype::I32, + DataType::I64 => Dtype::I64, + DataType::U8 => Dtype::U8, + DataType::U16 => Dtype::U16, + DataType::U32 => Dtype::U32, + DataType::U64 => Dtype::U64, + DataType::F16 => Dtype::F16, + DataType::BF16 => Dtype::BF16, + DataType::F32 => Dtype::F32, + DataType::F64 => Dtype::F64, + } + }}; +} + +pub(crate) use convert; diff --git a/models/llama/src/lib.rs b/models/llama/src/lib.rs index 312c9283..3e6a36fb 100644 --- a/models/llama/src/lib.rs +++ b/models/llama/src/lib.rs @@ -1,58 +1,74 @@ -use common::{safe_tensors::Dtype, utok, FileLoadError}; -use std::{fs, path::Path}; -use tensor::DataType; - -#[derive(serde::Serialize, serde::Deserialize, Debug)] -pub struct ConfigJson { - pub bos_token_id: utok, - pub eos_token_id: utok, - pub hidden_size: usize, - pub intermediate_size: usize, - pub max_position_embeddings: usize, - pub num_attention_heads: usize, - pub num_hidden_layers: usize, - pub num_key_value_heads: usize, - pub vocab_size: usize, - #[serde(default = "default_rms_norm_eps")] - pub rms_norm_eps: f32, - #[serde(default = "default_rope_theta")] - pub rope_theta: f32, - pub torch_dtype: DataType, +mod cast; +mod json; +mod load; +mod save; + +use common::{safe_tensors::SharedTensor, utok, Blob}; +use std::{ops::Deref, sync::Arc}; +use tensor::{udim, DataType, Tensor}; + +pub struct Storage { + pub config: InferenceConfig, + + pub embed_tokens: Tensor, + pub layers: Vec, + pub lm_layernorm: Tensor, + pub lm_head: Tensor, } -impl ConfigJson { - pub fn load(model_dir: impl AsRef) -> Result { - let path = model_dir.as_ref().join("config.json"); - let content = fs::read_to_string(path).map_err(FileLoadError::Io)?; - serde_json::from_str(&content).map_err(FileLoadError::Json) - } +pub struct LayerStorage { + pub att_layernorm: Tensor, + pub att_qkv: Tensor, + pub att_o: Tensor, + pub mlp_layernorm: Tensor, + pub mlp_gate_up: Tensor, + pub mlp_down: Tensor, +} + +#[derive(Clone, Debug)] +pub struct InferenceConfig { + pub dt: DataType, + pub voc: udim, + pub nlayers: udim, + pub nh: udim, + pub nkvh: udim, + pub d: udim, + pub dkv: udim, + pub di: udim, + pub max_seq_len: udim, + pub bos_token: utok, + pub eos_token: utok, + pub epsilon: f32, + pub theta: f32, } 
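The derived fields of `InferenceConfig` come straight from `ConfigJson` in `load.rs` below: `dh = d / nh` and `dkv = dh * nkvh`. A minimal sanity-check sketch of that bookkeeping, with hypothetical GQA-style numbers (none of these values come from this repo):

```rust
fn main() {
    // Mirrors load.rs: d = hidden_size, nh = num_attention_heads,
    // nkvh = num_key_value_heads.
    let (d, nh, nkvh): (u32, u32, u32) = (4096, 32, 8);
    let dh = d / nh; // per-head dimension: 128
    let dkv = dh * nkvh; // width of one K (or V) projection: 1024 under GQA
    assert_eq!(dh, 128);
    assert_eq!(dkv, 1024);
    // The fused QKV weight loaded below therefore has shape
    // [d + dkv + dkv, d] = [6144, 4096] before the final [1, 0] transpose.
    assert_eq!(d + dkv + dkv, 6144);
}
```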
-#[inline(always)] -const fn default_rms_norm_eps() -> f32 { - 1e-5 +#[derive(Clone)] +pub enum Weight { + SafeTensor(SharedTensor), + Blob(Arc), } -#[inline(always)] -const fn default_rope_theta() -> f32 { - 1e4 +impl From for Weight { + #[inline] + fn from(tensor: SharedTensor) -> Self { + Self::SafeTensor(tensor) + } +} + +impl From for Weight { + #[inline] + fn from(blob: Blob) -> Self { + Self::Blob(Arc::new(blob)) + } } -pub fn convert(dtype: Dtype) -> DataType { - match dtype { - Dtype::BOOL => DataType::Bool, - Dtype::I8 => DataType::I8, - Dtype::I16 => DataType::I16, - Dtype::I32 => DataType::I32, - Dtype::I64 => DataType::I64, - Dtype::U8 => DataType::U8, - Dtype::U16 => DataType::U16, - Dtype::U32 => DataType::U32, - Dtype::U64 => DataType::U64, - Dtype::F16 => DataType::F16, - Dtype::BF16 => DataType::BF16, - Dtype::F32 => DataType::F32, - Dtype::F64 => DataType::F64, - _ => unreachable!(), +impl Deref for Weight { + type Target = [u8]; + #[inline] + fn deref(&self) -> &[u8] { + match self { + Self::SafeTensor(tensor) => tensor, + Self::Blob(blob) => blob, + } } } diff --git a/models/llama/src/load.rs b/models/llama/src/load.rs new file mode 100644 index 00000000..4fc0e5ed --- /dev/null +++ b/models/llama/src/load.rs @@ -0,0 +1,144 @@ +use crate::{ + json::{convert, ConfigJson}, + InferenceConfig, LayerStorage, Storage, Weight, +}; +use common::{ + safe_tensors::SafeTensors, + Blob, + FileLoadError::{self, Io, Json}, +}; +use std::{fs::File, path::Path, pin::Pin, sync::Arc, usize}; +use tensor::{udim, DataType, Shape, Tensor}; + +impl Storage { + pub fn load_safetensors(model_dir: impl AsRef) -> Result { + let config = File::open(model_dir.as_ref().join("config.json")).map_err(Io)?; + let config: ConfigJson = serde_json::from_reader(&config).map_err(Json)?; + let model = SafeTensors::load_from_dir(model_dir)?.share(); + + let dt = config.torch_dtype; + let voc = config.vocab_size as udim; + let d = config.hidden_size as udim; + let nh = config.num_attention_heads as udim; + let nkvh = config.num_key_value_heads as udim; + let dh = d / nh; + let dkv = dh * nkvh; + let di = config.intermediate_size as udim; + + Ok(Self { + config: InferenceConfig { + dt, + voc, + nlayers: config.num_hidden_layers as _, + nh, + nkvh, + d, + dkv, + di, + max_seq_len: config.max_position_embeddings as _, + bos_token: config.bos_token_id, + eos_token: config.eos_token_id, + epsilon: config.rms_norm_eps, + theta: config.rope_theta, + }, + + embed_tokens: tensor(&model, "model.embed_tokens.weight", dt, [voc, d]), + layers: (0..config.num_hidden_layers) + .map(|l| { + let name = |name: &str| format!("model.layers.{l}.{name}.weight"); + LayerStorage { + att_layernorm: tensor(&model, &name("input_layernorm"), dt, [d]), + att_qkv: { + let qkv = name("self_attn.qkv_proj"); + if model.contains(&qkv) { + tensor(&model, &qkv, dt, [d + dkv + dkv, d]) + } else { + let sq = &[nh, 2, dh / 2, d]; + let skv = &[nkvh, 2, dh / 2, d]; + let perm = &[0, 2, 1, 3]; + + let q = tensor(&model, &name("self_attn.q_proj"), dt, [d, d]) + .reshape(sq) + .transpose(perm); + let k = tensor(&model, &name("self_attn.k_proj"), dt, [dkv, d]) + .reshape(skv) + .transpose(perm); + let v = tensor(&model, &name("self_attn.v_proj"), dt, [dkv, d]) + .reshape(skv); + concat0(&[q, k, v]).reshape(&[d + dkv + dkv, d]) + } + } + .transpose(&[1, 0]), + att_o: tensor(&model, &name("self_attn.o_proj"), dt, [d, d]) + .transpose(&[1, 0]), + mlp_layernorm: tensor(&model, &name("post_attention_layernorm"), dt, [d]), + mlp_gate_up: { + let gate_up = 
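// Same fused-or-concat pattern as att_qkv above: if the checkpoint already
// ships a fused `mlp.gate_up_proj`, use it as-is; otherwise stack
// `mlp.gate_proj` and `mlp.up_proj` along dim 0 with concat0, yielding a
// [di + di, d] matrix.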
name("mlp.gate_up_proj"); + if model.contains(&gate_up) { + tensor(&model, &gate_up, dt, [di + di, d]) + } else { + concat0(&[ + tensor(&model, &name("mlp.gate_proj"), dt, [di, d]), + tensor(&model, &name("mlp.up_proj"), dt, [di, d]), + ]) + } + } + .transpose(&[1, 0]), + mlp_down: tensor(&model, &name("mlp.down_proj"), dt, [d, di]) + .transpose(&[1, 0]), + } + }) + .collect(), + lm_layernorm: tensor(&model, "model.norm.weight", dt, [d]), + lm_head: tensor(&model, "lm_head.weight", dt, [voc, d]).transpose(&[1, 0]), + }) + } +} + +fn tensor( + model: &Pin>, + name: &str, + dt: DataType, + shape: [udim; N], +) -> Tensor { + let shared = model + .share_tensor(name) + .unwrap_or_else(|| panic!("missing tensor: {name}")); + assert_eq!(convert!(Dtype: shared.dtype()), dt); + assert_eq!( + &*shared.shape().iter().map(|&d| d as udim).collect::(), + shape + ); + Tensor::new(dt, &shape, Weight::SafeTensor(shared)) +} + +fn concat0(tensors: &[Tensor]) -> Tensor { + assert!(tensors + .windows(2) + .all(|t| t[0].data_type() == t[1].data_type())); + assert!(!tensors.is_empty()); + + let data_type = tensors[0].data_type(); + let mut shape = Shape::from_slice(tensors[0].shape()); + shape[0] = tensors.iter().map(|t| t.shape()[0]).sum(); + + let mut ans = Tensor::alloc(data_type, &shape, Blob::new); + let mut offset = 0; + for t in tensors { + let len = t.bytes_size(); + unsafe { t.reform_to_raw(&mut ans.physical_mut()[offset..][..len]) }; + offset += len; + } + ans.map_physical(|b| b.into()) +} + +#[test] +fn test_load() { + if let Some(model_dir) = common::test_model::find() { + println!("model_dir: {}", model_dir.display()); + + let time = std::time::Instant::now(); + let _storage = Storage::load_safetensors(model_dir).unwrap(); + println!("load: {:?}", time.elapsed()); + }; +} diff --git a/models/llama/src/save.rs b/models/llama/src/save.rs new file mode 100644 index 00000000..39ea2d37 --- /dev/null +++ b/models/llama/src/save.rs @@ -0,0 +1,109 @@ +use crate::{ + json::{convert, ConfigJson}, + Storage, Weight, +}; +use common::safe_tensors::{SafeTensorsHeader, SafeTensorsHeaderMetadata, TensorInfo}; +use std::{ + collections::HashMap, + fs, + io::{self, BufWriter, Write}, + path::Path, +}; +use tensor::Tensor; + +impl Storage { + pub fn save(&self, dir: impl AsRef) -> io::Result<()> { + let dir = dir.as_ref(); + fs::create_dir_all(dir)?; + let config = serde_json::to_string_pretty(&ConfigJson { + bos_token_id: self.config.bos_token, + eos_token_id: self.config.eos_token, + hidden_size: self.config.d as _, + intermediate_size: self.config.di as _, + max_position_embeddings: self.config.max_seq_len as _, + num_attention_heads: self.config.nh as _, + num_hidden_layers: self.config.nlayers as _, + num_key_value_heads: self.config.nkvh as _, + vocab_size: self.config.voc as _, + rms_norm_eps: self.config.epsilon, + rope_theta: self.config.theta, + torch_dtype: self.config.dt, + })?; + fs::write(dir.join("config.json"), config)?; + + let mut offset = 0usize; + let mut header = SafeTensorsHeader { + tensors: HashMap::new(), + metadata: SafeTensorsHeaderMetadata { + format: "rs".into(), + }, + }; + + let mut t = |tensor: &Tensor| TensorInfo { + dtype: convert!(DataType: tensor.data_type()), + shape: tensor.shape().iter().map(|&d| d as _).collect(), + data_offsets: { + let start = offset; + offset += tensor.bytes_size(); + (start, offset) + }, + }; + + header + .tensors + .insert("model.embed_tokens.weight".into(), t(&self.embed_tokens)); + for (i, l) in self.layers.iter().enumerate() { + #[rustfmt::skip] + let 
iter = [ + ("input_layernorm" , &l.att_layernorm), + ("self_attn.qkv_proj" , &l.att_qkv .clone().transpose(&[1, 0])), + ("self_attn.o_proj" , &l.att_o .clone().transpose(&[1, 0])), + ("post_attention_layernorm", &l.mlp_layernorm), + ("mlp.gate_up_proj" , &l.mlp_gate_up.clone().transpose(&[1, 0])), + ("mlp.down_proj" , &l.mlp_down .clone().transpose(&[1, 0])), + ]; + header.tensors.extend( + iter.map(|(name, tensor)| (format!("model.layers.{i}.{name}.weight"), t(tensor))), + ); + } + header.tensors.extend([ + ("model.norm.weight".into(), t(&self.lm_layernorm)), + ( + "lm_head.weight".into(), + t(&self.lm_head.clone().transpose(&[1, 0])), + ), + ]); + + let header = { + let str = serde_json::to_string(&header)?; + let len = str.len(); + const ALIGN: usize = std::mem::size_of::(); + let aligned = (len + ALIGN - 1) & !(ALIGN - 1); + + let mut buffer = Vec::with_capacity(aligned); + let mut write = BufWriter::new(&mut buffer); + write.write_all(&(aligned as u64).to_le_bytes())?; + write.write_all(str.as_bytes())?; + for _ in len..aligned { + write.write_all(&[32])?; + } + drop(write); + buffer + }; + + let mut file = fs::File::create(dir.join("model.safetensors"))?; + file.write_all(&header)?; + file.write_all(self.embed_tokens.physical())?; + for l in self.layers.iter() { + file.write_all(l.att_layernorm.physical())?; + file.write_all(l.att_qkv.physical())?; + file.write_all(l.att_o.physical())?; + file.write_all(l.mlp_layernorm.physical())?; + file.write_all(l.mlp_gate_up.physical())?; + file.write_all(l.mlp_down.physical())?; + } + file.write_all(self.lm_layernorm.physical())?; + file.write_all(self.lm_head.physical())?; + Ok(()) + } +} diff --git a/transformer-cpu/src/lib.rs b/transformer-cpu/src/lib.rs index d0aac586..e85c70fb 100644 --- a/transformer-cpu/src/lib.rs +++ b/transformer-cpu/src/lib.rs @@ -1,84 +1,18 @@ mod kernel; use causal_lm::{CausalLM, DecodingMeta, Model, QueryContext, SampleMeta}; -use common::{safe_tensors::SafeTensors, upos, utok, Blob, FileLoadError}; +use common::{upos, utok, Blob, FileLoadError}; use gemm::f16; use itertools::izip; use kernel::{ fused_softmax::softmax, gather::gather, mat_mul::mat_mul, rms_norm::rms_norm, rotary_embedding::rotary_embedding, swiglu::swiglu, }; -use llama::ConfigJson; +use llama::Storage; use std::{iter::repeat, path::Path, slice::from_raw_parts}; -use tensor::{reslice, slice, split, udim, DataType, LocalSplitable, Tensor}; - -pub struct Transformer { - eos_token: utok, - data_type: DataType, - nlayers: udim, - nh: udim, - nkvh: udim, - max_seq_len: udim, - d: udim, - di: udim, - epsilon: f32, - theta: f32, - safe_tensors: SafeTensors, -} - -impl Transformer { - pub fn embed_tokens(&self) -> Tensor<&[u8]> { - convert(&self.safe_tensors, "model.embed_tokens.weight") - } - - pub fn input_layernorm(&self, layer: udim) -> Tensor<&[u8]> { - convert(&self.safe_tensors, layer_name(layer, "input_layernorm")) - } - - pub fn w_qkv(&self, layer: udim) -> Tensor<&[u8]> { - convert(&self.safe_tensors, layer_name(layer, "self_attn.qkv_proj")) - } - - pub fn w_o(&self, layer: udim) -> Tensor<&[u8]> { - convert(&self.safe_tensors, layer_name(layer, "self_attn.o_proj")) - } - - pub fn post_attention_layernorm(&self, layer: udim) -> Tensor<&[u8]> { - convert( - &self.safe_tensors, - layer_name(layer, "post_attention_layernorm"), - ) - } - - pub fn mlp_gate_up(&self, layer: udim) -> Tensor<&[u8]> { - convert(&self.safe_tensors, layer_name(layer, "mlp.gate_up_proj")) - } - - pub fn mlp_down(&self, layer: udim) -> Tensor<&[u8]> { - 
convert(&self.safe_tensors, layer_name(layer, "mlp.down_proj")) - } - - pub fn model_norm(&self) -> Tensor<&[u8]> { - convert(&self.safe_tensors, "model.norm.weight") - } - - pub fn lm_head(&self) -> Tensor<&[u8]> { - convert(&self.safe_tensors, "lm_head.weight") - } -} - -fn layer_name(layer: udim, name: &str) -> String { - format!("model.layers.{layer}.{name}.weight") -} +use tensor::{reslice, slice, split, udim, LocalSplitable, Tensor}; -fn convert<'a>(tensors: &'a SafeTensors, name: impl AsRef) -> Tensor<&'a [u8]> { - let tensor = tensors - .get(name.as_ref()) - .expect(&format!("Tensor {} not found", name.as_ref())); - let data_type = llama::convert(tensor.dtype); - let shape = tensor.shape.iter().map(|&x| x as udim).collect::>(); - Tensor::new(data_type, &shape, tensor.data) -} +pub struct Transformer(Storage); impl Model for Transformer { type Meta = (); @@ -86,20 +20,7 @@ impl Model for Transformer { #[inline] fn load(model_dir: impl AsRef, _meta: Self::Meta) -> Result { - let config = ConfigJson::load(&model_dir)?; - Ok(Self { - eos_token: config.eos_token_id, - data_type: config.torch_dtype, - nlayers: config.num_hidden_layers as _, - nh: config.num_attention_heads as _, - nkvh: config.num_key_value_heads as _, - max_seq_len: config.max_position_embeddings as _, - d: config.hidden_size as _, - di: config.intermediate_size as _, - epsilon: config.rms_norm_eps, - theta: config.rope_theta, - safe_tensors: SafeTensors::load_from_dir(model_dir)?, - }) + Ok(Self(llama::Storage::load_safetensors(model_dir)?)) } } @@ -108,16 +29,16 @@ impl CausalLM for Transformer { #[inline] fn eos_token(&self) -> utok { - self.eos_token + self.0.config.eos_token } fn new_cache(&self) -> Tensor { - let dt = self.data_type; - let nlayers = self.nlayers; - let nkvh = self.nkvh; - let max_seq_len = self.max_seq_len; - let d = self.d; - let nh = self.nh; + let dt = self.0.config.dt; + let nlayers = self.0.config.nlayers; + let nkvh = self.0.config.nkvh; + let max_seq_len = self.0.config.max_seq_len; + let d = self.0.config.d; + let nh = self.0.config.nh; Tensor::alloc(dt, &[nlayers, 2, nkvh, max_seq_len, d / nh], Blob::new) } @@ -144,14 +65,14 @@ impl CausalLM for Transformer { } fn token_embed(&self, queries: impl IntoIterator) -> Tensor { - let dt = self.data_type; - let d = self.d; + let dt = self.0.config.dt; + let d = self.0.config.d; let tokens = queries.into_iter().collect::>(); let nt = tokens.len() as udim; let mut x = Tensor::alloc(dt, &[nt, d], Blob::new); - gather(&mut x, &self.embed_tokens(), tokens); + gather(&mut x, &self.0.embed_tokens, tokens); x } @@ -179,13 +100,13 @@ impl CausalLM for Transformer { }) .collect::>(); - let dt = self.data_type; - let d = self.d; - let nh = self.nh; - let nkvh = self.nkvh; + let dt = self.0.config.dt; + let d = self.0.config.d; + let nh = self.0.config.nh; + let nkvh = self.0.config.nkvh; let dh = d / nh; let dkv = nkvh * dh; - let di = self.di; + let di = self.0.config.di; let head_group = nh / nkvh; let head_div = (dh as f32).sqrt().recip(); @@ -203,15 +124,12 @@ impl CausalLM for Transformer { let pos = pos.as_ref().map_physical(|u| reslice(u)); let mut x = token_embedded; - for layer in 0..self.nlayers { + for (layer, params) in self.0.layers.iter().enumerate() { let (mut x1, qkv) = state!(); let mut qkv = qkv.slice(&[slice![=>], slice![=> d + dkv + dkv]]); - let input_layernorm = self.input_layernorm(layer); - rms_norm(&mut x1, &x, &input_layernorm, self.epsilon); - - let w_qkv = self.w_qkv(layer).transpose(&[1, 0]); - mat_mul(&mut qkv, 0., &x1, 
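// (old path: the weight was re-transposed here on every forward call; the
// new params.att_qkv below is transposed once at load time instead)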
&w_qkv, 1.); + rms_norm(&mut x1, &x, ¶ms.att_layernorm, self.0.config.epsilon); + mat_mul(&mut qkv, 0., &x1, ¶ms.att_qkv, 1.); let (q, k, v) = split!(qkv; [1]: d, dkv, dkv); let mut q = q.reshape(&[nt, nh, dh]); @@ -219,8 +137,8 @@ impl CausalLM for Transformer { let v = v.reshape(&[nt, nkvh, dh]); let o = x1.reshape(&[nt, nh, dh]); - rotary_embedding(&mut q, &pos, self.theta); - rotary_embedding(&mut k, &pos, self.theta); + rotary_embedding(&mut q, &pos, self.0.config.theta); + rotary_embedding(&mut k, &pos, self.0.config.theta); let q = q.transpose(&[1, 0, 2]).split(1, &seq_len); let k = k.transpose(&[1, 0, 2]).split(1, &seq_len); @@ -266,20 +184,12 @@ impl CausalLM for Transformer { let (mut x1, gate_up) = state!(); let mut gate_up = gate_up.slice(&[slice![=>], slice![=> di + di]]); - let wo = self.w_o(layer).transpose(&[1, 0]); - mat_mul(&mut x, 1., &x1, &wo, 1.); - - let post_layernorm = self.post_attention_layernorm(layer); - rms_norm(&mut x1, &x, &post_layernorm, self.epsilon); - - let w_gate_up = self.mlp_gate_up(layer).transpose(&[1, 0]); - mat_mul(&mut gate_up, 0., &x1, &w_gate_up, 1.); - + mat_mul(&mut x, 1., &x1, ¶ms.att_o, 1.); + rms_norm(&mut x1, &x, ¶ms.mlp_layernorm, self.0.config.epsilon); + mat_mul(&mut gate_up, 0., &x1, ¶ms.mlp_gate_up, 1.); let (mut gate, up) = split!(gate_up; [1]: di, di); swiglu(&mut gate, &up); - - let mlp_down = self.mlp_down(layer).transpose(&[1, 0]); - mat_mul(&mut x, 1., &gate, &mlp_down, 1.); + mat_mul(&mut x, 1., &gate, ¶ms.mlp_down, 1.); } x @@ -290,8 +200,8 @@ impl CausalLM for Transformer { decoding: impl IntoIterator, mut hidden_state: Tensor, ) -> Tensor { - let dt = self.data_type; - let d = self.d; + let dt = self.0.config.dt; + let d = self.0.config.d; let buf = hidden_state.as_mut_slice(); let len = d as usize * dt.size(); @@ -335,7 +245,7 @@ impl CausalLM for Transformer { return Tensor::alloc(dt, &[0, d as _], Blob::new); } - let lm_head = self.lm_head().transpose(&[1, 0]); + let lm_head = &self.0.lm_head; let mut x = hidden_state.slice(&[slice![begin => dst], slice![=>]]); let mut logits = Tensor::alloc(dt, &[x.shape()[0], lm_head.shape()[1]], Blob::new); @@ -343,8 +253,8 @@ impl CausalLM for Transformer { let x_ = x .as_ref() .map_physical(|u| unsafe { from_raw_parts(u.as_ptr(), u.len()) }); - rms_norm(&mut x, &x_, &self.model_norm(), self.epsilon); - mat_mul(&mut logits, 0., &x, &lm_head, 1.); + rms_norm(&mut x, &x_, &self.0.lm_layernorm, self.0.config.epsilon); + mat_mul(&mut logits, 0., &x, lm_head, 1.); logits } diff --git a/xtask/Cargo.toml b/xtask/Cargo.toml index 63e100a1..89222aec 100644 --- a/xtask/Cargo.toml +++ b/xtask/Cargo.toml @@ -10,7 +10,7 @@ authors = ["YdrMaster "] common = { path = "../common" } tensor = { path = "../tensor" } causal-lm = { path = "../causal-lm" } -transformer = { path = "../models/llama-legacy" } +llama = { path = "../models/llama" } transformer-cpu = { path = "../transformer-cpu" } transformer-nv = { path = "../nvidia/transformer", optional = true } distributed = { path = "../nvidia/distributed", optional = true } diff --git a/xtask/src/cast.rs b/xtask/src/cast.rs index 26ad5e49..adff3515 100644 --- a/xtask/src/cast.rs +++ b/xtask/src/cast.rs @@ -1,6 +1,5 @@ use std::{fs, path::PathBuf, time::Instant}; use tensor::DataType; -use transformer::{save, Memory}; #[derive(Args, Default)] pub(crate) struct CastArgs { @@ -26,7 +25,7 @@ impl CastArgs { let model_dir = PathBuf::from(self.model); let time = Instant::now(); - let model = Memory::load_safetensors(&model_dir).unwrap(); + let model = 
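// xtask now drives the llama::Storage API end to end:
// load_safetensors -> cast -> save replaces the legacy Memory type and
// the free `save` function.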
llama::Storage::load_safetensors(&model_dir).unwrap(); println!("load model ... {:?}", time.elapsed()); let target = self.target.map(PathBuf::from).unwrap_or_else(|| { @@ -38,11 +37,11 @@ impl CastArgs { fs::create_dir_all(&target).unwrap(); let time = Instant::now(); - let model = Memory::cast(&model, ty); + let model = model.cast(ty); println!("cast data type ... {:?}", time.elapsed()); let time = Instant::now(); - save(&model, &target).unwrap(); + model.save(&target).unwrap(); println!("save model ... {:?}", time.elapsed()); let copy_file = |name: &str| {
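Taken together, the refactor collapses the legacy `Memory`/`save` free functions into methods on `llama::Storage`. A minimal sketch of the resulting cast pipeline, roughly the path `xtask/src/cast.rs` now takes, assuming the crate layout in this diff (the paths and target dtype are illustrative):

```rust
use std::path::PathBuf;
use tensor::DataType;

fn cast_model(src: PathBuf, dst: PathBuf) -> std::io::Result<()> {
    // mmap the safetensors checkpoint; tensors stay file-backed as
    // Weight::SafeTensor until something forces an owned copy.
    let model = llama::Storage::load_safetensors(&src).unwrap();
    // cast() is a no-op when the dtypes already match; otherwise it
    // rewrites every tensor in parallel (rayon) into Weight::Blob buffers.
    let model = model.cast(DataType::F32);
    // save() re-serializes config.json plus a single model.safetensors
    // with the 8-byte-aligned JSON header shown above.
    model.save(&dst)
}
```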