diff --git a/.cargo/config.toml b/.cargo/config.toml
index 85801ee6..3ce4e99f 100644
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@@ -3,3 +3,4 @@ xtask = "run --package xtask --release --"
 debug = "run --package xtask --"
 cast = "xtask cast"
 generate = "xtask generate"
+service = "xtask service"
diff --git a/Cargo.toml b/Cargo.toml
index 7997db85..edc2b45a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,3 +9,9 @@ members = [
     "xtask",
 ]
 resolver = "2"
+
+[workspace.dependencies]
+find_cuda_helper = "0.2"
+half = "2.4"
+serde_json = "1.0"
+serde = "1.0"
diff --git a/model-parameters/Cargo.toml b/model-parameters/Cargo.toml
index 08feeca1..f04cf675 100644
--- a/model-parameters/Cargo.toml
+++ b/model-parameters/Cargo.toml
@@ -9,9 +9,9 @@ authors = ["YdrMaster "]
 [dependencies]
 common = { path = "../common" }
 tensor = { path = "../tensor" }
-half = "2.4"
+half.workspace = true
 rayon = "1.9"
 memmap2 = "0.9"
 safetensors = "0.4"
-serde_json = "1.0"
-serde = { version = "1.0", features = ["derive"] }
+serde = { workspace = true, features = ["derive"] }
+serde_json.workspace = true
diff --git a/tensor/Cargo.toml b/tensor/Cargo.toml
index 98c437e1..301f3db5 100644
--- a/tensor/Cargo.toml
+++ b/tensor/Cargo.toml
@@ -7,8 +7,8 @@ authors = ["YdrMaster "]
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-half = "2.4"
+half.workspace = true
 smallvec = "1.13"
 nalgebra = "0.32"
 rayon = "1.9"
-serde = "1.0"
+serde.workspace = true
diff --git a/tokenizer/src/bpe.rs b/tokenizer/src/bpe.rs
index de9f0879..327787f1 100644
--- a/tokenizer/src/bpe.rs
+++ b/tokenizer/src/bpe.rs
@@ -91,6 +91,7 @@ impl Tokenizer for BPE {
     }
 
     fn encode(&self, text: &str) -> Vec<utok> {
+        let text = text.replace(' ', "▁"); // FIXME: read the normalizer from tokenizer.json
         let mut tokens = Vec::new();
 
         text.chars().map(|c| c.to_string()).for_each(|c| {
diff --git a/transformer-cpu/src/lib.rs b/transformer-cpu/src/lib.rs
index 50b3144c..d6d6a091 100644
--- a/transformer-cpu/src/lib.rs
+++ b/transformer-cpu/src/lib.rs
@@ -34,6 +34,11 @@ impl Transformer {
         LayerCache::new_layers(&*self.model)
     }
 
+    #[inline]
+    pub fn max_seq_len(&self) -> usize {
+        self.model.max_position_embeddings()
+    }
+
     pub fn update(&self, tokens: &[utok], cache: &mut [LayerCache], pos: upos) -> Tensor<Storage> {
         let seq_len = tokens.len() as udim;
         let d = self.model.hidden_size() as udim;
diff --git a/transformer-nvidia/Cargo.toml b/transformer-nvidia/Cargo.toml
index 1d3f53fe..62036a08 100644
--- a/transformer-nvidia/Cargo.toml
+++ b/transformer-nvidia/Cargo.toml
@@ -12,10 +12,10 @@ tensor = { path = "../tensor" }
 model-parameters = { path = "../model-parameters" }
 cuda = { git = "https://github.com/YdrMaster/cuda-bench" }
 cublas = { git = "https://github.com/YdrMaster/cuda-bench" }
-half = "2.4"
+half.workspace = true
 
 [dev-dependencies]
 tokenizer = { path = "../tokenizer" }
 
 [build-dependencies]
-find_cuda_helper = "0.2"
+find_cuda_helper.workspace = true
diff --git a/transformer-nvidia/src/kernel/fused_softmax.rs b/transformer-nvidia/src/kernel/fused_softmax.rs
index 2c3729af..625c35fd 100644
--- a/transformer-nvidia/src/kernel/fused_softmax.rs
+++ b/transformer-nvidia/src/kernel/fused_softmax.rs
@@ -69,7 +69,7 @@ extern "C" __global__ void {folding}(
             unreachable!();
         };
 
-        let grid_dims = (seq_len, nh);
+        let grid_dims = (nh, seq_len);
        let (kernel, block_dims) = if att_len <= self.block_size {
             (&self.padding, att_len)
         } else {
diff --git a/transformer-nvidia/src/kernel/reform.rs b/transformer-nvidia/src/kernel/reform.rs
index ccc793f7..c8686239 100644
--- a/transformer-nvidia/src/kernel/reform.rs
+++ b/transformer-nvidia/src/kernel/reform.rs
@@ -91,8 +91,8 @@ extern "C" __global__ void {name}(
         ];
 
         let max_warp_per_block = self.block_size / self.warp_size;
-        let grid_dims = ((c + max_warp_per_block - 1) / max_warp_per_block, r);
-        let block_dims = (self.warp_size, (c + grid_dims.0 - 1) / grid_dims.0);
+        let grid_dims = (r, (c + max_warp_per_block - 1) / max_warp_per_block);
+        let block_dims = ((c + grid_dims.1 - 1) / grid_dims.1, self.warp_size);
         self.f
             .launch(grid_dims, block_dims, params.as_ptr(), 0, Some(stream));
     }
diff --git a/transformer-nvidia/src/kernel/rotary_embedding.rs b/transformer-nvidia/src/kernel/rotary_embedding.rs
index fe6e7c5e..7fe211e3 100644
--- a/transformer-nvidia/src/kernel/rotary_embedding.rs
+++ b/transformer-nvidia/src/kernel/rotary_embedding.rs
@@ -62,6 +62,6 @@ extern "C" __global__ void {name}(
         ];
 
         self.f
-            .launch((n, nh), dh / 2, params.as_ptr(), 0, Some(stream))
+            .launch((nh, n), dh / 2, params.as_ptr(), 0, Some(stream))
     }
 }
diff --git a/transformer-nvidia/src/kernel/swiglu.rs b/transformer-nvidia/src/kernel/swiglu.rs
index 76e7fcbf..f3a66dfb 100644
--- a/transformer-nvidia/src/kernel/swiglu.rs
+++ b/transformer-nvidia/src/kernel/swiglu.rs
@@ -67,8 +67,8 @@ extern "C" __global__ void {name}(
             a
         }
 
-        let block_dims = gcd(di, self.block_size);
-        let grid_dims = (di / block_dims, seq_len);
+        let block_dims = gcd(self.block_size, di);
+        let grid_dims = (seq_len, di / block_dims);
         self.f
             .launch(grid_dims, block_dims, params.as_ptr(), 0, Some(stream));
     }
diff --git a/transformer-nvidia/src/lib.rs b/transformer-nvidia/src/lib.rs
index c06f5b45..b3b5fad8 100644
--- a/transformer-nvidia/src/lib.rs
+++ b/transformer-nvidia/src/lib.rs
@@ -52,8 +52,8 @@ impl<'a> Transformer<'a> {
         cublas!(cublasCreate_v2(&mut cublas_handle));
 
         let ctx = stream.ctx();
-        let _dev = ctx.dev();
-        let block_size = 1024;
+        let dev = ctx.dev();
+        let (block_size, _) = dev.max_block_dims();
         Self {
             model: ModelParameters::new(host, stream),
             layers: LayersParameters::new(load_layers, host, stream),
diff --git a/xtask/Cargo.toml b/xtask/Cargo.toml
index 81d6dc08..4d6bf54f 100644
--- a/xtask/Cargo.toml
+++ b/xtask/Cargo.toml
@@ -12,9 +12,11 @@ tensor = { path = "../tensor" }
 tokenizer = { path = "../tokenizer" }
 transformer-cpu = { path = "../transformer-cpu" }
 transformer-nvidia = { path = "../transformer-nvidia" }
+serde = { workspace = true, features = ["derive"] }
+serde_json.workspace = true
 log = "0.4"
 simple_logger = "4.3"
 clap = { version = "4.5", features = ["derive"] }
 
 [build-dependencies]
-find_cuda_helper = "0.2"
+find_cuda_helper.workspace = true
diff --git a/xtask/src/common.rs b/xtask/src/common.rs
new file mode 100644
index 00000000..cdfd970f
--- /dev/null
+++ b/xtask/src/common.rs
@@ -0,0 +1,52 @@
+use common::utok;
+use log::LevelFilter;
+use simple_logger::SimpleLogger;
+use std::{io::ErrorKind::NotFound, path::Path};
+use tokenizer::{Tokenizer, VocabTxt, BPE};
+
+pub(crate) fn logger_init(log_level: &Option<String>) {
+    let log = log_level
+        .as_ref()
+        .and_then(|log| match log.to_lowercase().as_str() {
+            "off" | "none" => Some(LevelFilter::Off),
+            "trace" => Some(LevelFilter::Trace),
+            "debug" => Some(LevelFilter::Debug),
+            "info" => Some(LevelFilter::Info),
+            "error" => Some(LevelFilter::Error),
+            _ => None,
+        })
+        .unwrap_or(LevelFilter::Warn);
+    SimpleLogger::new().with_level(log).init().unwrap();
+}
+
+pub(crate) fn tokenizer(path: Option<String>, model_dir: impl AsRef<Path>) -> Box<dyn Tokenizer> {
+    match path {
+        Some(path) => match Path::new(&path).extension() {
+            Some(ext) if ext == "txt" => Box::new(VocabTxt::from_txt_file(path).unwrap()),
+            Some(ext) if ext == "model" => Box::new(BPE::from_model_file(path).unwrap()),
+            _ => panic!("Tokenizer file {path:?} not supported"),
+        },
+        None => {
+            match BPE::from_model_file(model_dir.as_ref().join("tokenizer.model")) {
+                Ok(bpe) => return Box::new(bpe),
+                Err(e) if e.kind() == NotFound => {}
+                Err(e) => panic!("{e:?}"),
+            }
+            match VocabTxt::from_txt_file(model_dir.as_ref().join("vocabs.txt")) {
+                Ok(voc) => return Box::new(voc),
+                Err(e) if e.kind() == NotFound => {}
+                Err(e) => panic!("{e:?}"),
+            }
+            panic!("Tokenizer file not found");
+        }
+    }
+}
+
+pub(crate) fn argmax<T: PartialOrd>(logits: &[T]) -> utok {
+    logits
+        .iter()
+        .enumerate()
+        .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
+        .unwrap()
+        .0 as _
+}
diff --git a/xtask/src/generate.rs b/xtask/src/generate.rs
index 4bf0d43d..bbe2c11a 100644
--- a/xtask/src/generate.rs
+++ b/xtask/src/generate.rs
@@ -1,16 +1,14 @@
-use common::utok;
-use log::LevelFilter;
-use simple_logger::SimpleLogger;
+use crate::common::{argmax, logger_init, tokenizer};
 use std::{
     alloc::Layout,
     collections::HashMap,
-    io::{ErrorKind, Write},
+    io::Write,
     path::{Path, PathBuf},
     ptr::NonNull,
     sync::Mutex,
     time::Instant,
 };
-use tokenizer::{Tokenizer, VocabTxt, BPE};
+use tokenizer::Tokenizer;
 use transformer_cpu::{
     model_parameters::{Allocator, Llama2, Memory},
     Transformer,
@@ -75,18 +73,7 @@ impl Allocator for NormalAllocator {
 
 impl GenerateArgs {
     pub fn invoke(self) {
-        let log = self
-            .log
-            .and_then(|log| match log.to_lowercase().as_str() {
-                "off" | "none" => Some(LevelFilter::Off),
-                "trace" => Some(LevelFilter::Trace),
-                "debug" => Some(LevelFilter::Debug),
-                "info" => Some(LevelFilter::Info),
-                "error" => Some(LevelFilter::Error),
-                _ => None,
-            })
-            .unwrap_or(LevelFilter::Warn);
-        SimpleLogger::new().with_level(log).init().unwrap();
+        logger_init(&self.log);
 
         let model_dir = PathBuf::from(self.model);
         let step = self.step.unwrap_or(usize::MAX);
@@ -114,29 +101,6 @@ impl GenerateArgs {
     }
 }
 
-fn tokenizer(path: Option<String>, model_dir: impl AsRef<Path>) -> Box<dyn Tokenizer> {
-    match path {
-        Some(path) => match Path::new(&path).extension() {
-            Some(ext) if ext == "txt" => Box::new(VocabTxt::from_txt_file(path).unwrap()),
-            Some(ext) if ext == "model" => Box::new(BPE::from_model_file(path).unwrap()),
-            _ => panic!("Tokenizer file {path:?} not supported"),
-        },
-        None => {
-            match BPE::from_model_file(model_dir.as_ref().join("tokenizer.model")) {
-                Ok(bpe) => return Box::new(bpe),
-                Err(e) if e.kind() == ErrorKind::NotFound => {}
-                Err(e) => panic!("{e:?}"),
-            }
-            match VocabTxt::from_txt_file(model_dir.as_ref().join("vocabs.txt")) {
-                Ok(voc) => return Box::new(voc),
-                Err(e) if e.kind() == ErrorKind::NotFound => {}
-                Err(e) => panic!("{e:?}"),
-            }
-            panic!("Tokenizer file not found");
-        }
-    }
-}
-
 fn on_host(
     model_dir: impl AsRef<Path>,
     tokenizer: Box<dyn Tokenizer>,
@@ -157,14 +121,13 @@ fn on_host(
         model = Box::new(Memory::realloc_with(&*model, allocator));
         info!("copy model ... {:?}", time.elapsed());
     }
-    let step = step.min(model.max_position_embeddings());
     let time = Instant::now();
     let mut transformer = Transformer::new(model);
     let mut kv_cache = transformer.new_cache();
     info!("build transformer ... {:?}", time.elapsed());
 
     let time = Instant::now();
-    let prompt_tokens = tokenizer.encode(&prompt.trim().replace(' ', "▁"));
+    let prompt_tokens = tokenizer.encode(&prompt.trim());
     info!("encode prompt ... {:?}", time.elapsed());
 
     let time = Instant::now();
@@ -179,7 +142,7 @@ fn on_host(
     let mut token = *last;
     let mut pos = tokens.len();
     let time = Instant::now();
-    while pos < step {
+    while pos < step.min(transformer.max_seq_len()) {
         let logits = transformer.forward(token, &mut kv_cache, pos as _);
         let next = argmax(logits);
 
@@ -301,12 +264,3 @@ fn on_nvidia_gpu(
         )
     });
 }
-
-fn argmax<T: PartialOrd>(logits: &[T]) -> utok {
-    logits
-        .iter()
-        .enumerate()
-        .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
-        .unwrap()
-        .0 as _
-}
diff --git a/xtask/src/main.rs b/xtask/src/main.rs
index d585dd69..6dc70f17 100644
--- a/xtask/src/main.rs
+++ b/xtask/src/main.rs
@@ -1,5 +1,7 @@
 mod cast;
+mod common;
 mod generate;
+mod service;
 
 use clap::Parser;
 
@@ -13,6 +15,7 @@ fn main() {
     match Cli::parse().command {
         Cast(cast) => cast.invode(),
         Generate(generate) => generate.invoke(),
+        Service(service) => service.launch(),
     }
 }
 
@@ -30,4 +33,6 @@ enum Commands {
     Cast(cast::CastArgs),
     /// Generate following text
     Generate(generate::GenerateArgs),
+    /// Start LLM inference service
+    Service(service::ServiceArgs),
 }
diff --git a/xtask/src/service/cpu.rs b/xtask/src/service/cpu.rs
new file mode 100644
index 00000000..debf32f4
--- /dev/null
+++ b/xtask/src/service/cpu.rs
@@ -0,0 +1,58 @@
+use super::ServiceArgs;
+use crate::common::{argmax, tokenizer};
+use common::upos;
+use std::{collections::HashMap, path::Path, time::Instant};
+use transformer_cpu::{model_parameters::Memory, LayerCache, Transformer};
+
+pub(super) fn run(args: ServiceArgs) {
+    let model_dir = Path::new(&args.model);
+
+    let time = Instant::now();
+    let tokenizer = tokenizer(args.tokenizer, &model_dir);
+    info!("build tokenizer ... {:?}", time.elapsed());
+
+    let time = Instant::now();
+    let model = Box::new(Memory::load_safetensors_from_dir(model_dir).unwrap());
+    info!("load model ... {:?}", time.elapsed());
+
+    let time = Instant::now();
+    let mut transformer = Transformer::new(model);
+    info!("build transformer ... {:?}", time.elapsed());
+
+    struct SessionContext {
+        pos: upos,
+        kv_cache: Vec<LayerCache>,
+    }
+
+    let mut sessions = HashMap::<usize, SessionContext>::new();
+
+    loop {
+        let id = 0;
+        let prompt = "The quick brown fox jumps over the lazy dog";
+
+        let session = sessions.entry(id).or_insert_with(|| SessionContext {
+            pos: 0,
+            kv_cache: transformer.new_cache(),
+        });
+
+        let prompt_tokens = tokenizer.encode(&prompt.trim());
+        let (last, tokens) = prompt_tokens.split_last().expect("prompt is empty");
+        if !tokens.is_empty() {
+            transformer.update(tokens, &mut session.kv_cache, session.pos as _);
+            session.pos += tokens.len() as upos;
+        }
+
+        let mut token = *last;
+        let max_pos = transformer.max_seq_len() as upos;
+        let mut out = String::new();
+        while session.pos < max_pos {
+            let logits = transformer.forward(token, &mut session.kv_cache, session.pos as _);
+            let next = argmax(logits);
+
+            token = next;
+            session.pos += 1;
+
+            out.push_str(&tokenizer.decode(next).replace('▁', " "));
+        }
+    }
+}
diff --git a/xtask/src/service/mod.rs b/xtask/src/service/mod.rs
new file mode 100644
index 00000000..5c7241fe
--- /dev/null
+++ b/xtask/src/service/mod.rs
@@ -0,0 +1,40 @@
+mod cpu;
+#[cfg(detected_cuda)]
+mod nvidia;
+
+use clap::Args;
+
+#[derive(Args, Default)]
+pub(crate) struct ServiceArgs {
+    /// Model directory.
+    #[clap(short, long)]
+    model: String,
+    /// Tokenizer file.
+    #[clap(short, long)]
+    tokenizer: Option<String>,
+    /// Log level, may be "off", "trace", "debug", "info" or "error".
+    #[clap(long)]
+    log: Option<String>,
+
+    /// Use Nvidia GPU.
+    #[clap(long)]
+    nvidia: bool,
+}
+
+impl ServiceArgs {
+    pub fn launch(self) {
+        crate::common::logger_init(&self.log);
+        if self.nvidia {
+            #[cfg(detected_cuda)]
+            {
+                nvidia::run(self);
+            }
+            #[cfg(not(detected_cuda))]
+            {
+                panic!("Nvidia GPU is not available");
+            }
+        } else {
+            cpu::run(self);
+        }
+    }
+}
diff --git a/xtask/src/service/nvidia.rs b/xtask/src/service/nvidia.rs
new file mode 100644
index 00000000..99ec0db7
--- /dev/null
+++ b/xtask/src/service/nvidia.rs
@@ -0,0 +1,5 @@
+use super::ServiceArgs;
+
+pub(super) fn run(args: ServiceArgs) {
+    todo!()
+}
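
For context: the service loop in `xtask/src/service/cpu.rs` above still hard-codes one session (`id = 0`) and a fixed prompt, and `nvidia.rs` is a `todo!()` stub. The sketch below shows how the per-session KV-cache pattern from `cpu.rs` could serve real requests once a transport exists. `Request` and the `requests` source are hypothetical placeholders; `Transformer`, `LayerCache`, `argmax`, and the tokenizer are the items introduced in this diff, assumed to behave exactly as they do in `cpu.rs`:

```rust
use common::{upos, utok};
use std::collections::HashMap;
use tokenizer::Tokenizer;
use transformer_cpu::{LayerCache, Transformer};

use crate::common::argmax;

/// Hypothetical request type; a real transport (HTTP, stdin, ...) would produce these.
struct Request {
    session_id: usize,
    prompt: String,
}

/// Per-session decoding state, as in `service/cpu.rs`.
struct SessionContext {
    pos: upos,
    kv_cache: Vec<LayerCache>,
}

fn serve(
    transformer: &mut Transformer,
    tokenizer: &dyn Tokenizer,
    requests: impl IntoIterator<Item = Request>,
) {
    let mut sessions = HashMap::<usize, SessionContext>::new();
    for req in requests {
        // Reuse the KV cache of a known session, or allocate a fresh one.
        let session = sessions.entry(req.session_id).or_insert_with(|| SessionContext {
            pos: 0,
            kv_cache: transformer.new_cache(),
        });

        // Prefill all prompt tokens except the last in one `update` call.
        let prompt_tokens = tokenizer.encode(req.prompt.trim());
        let (last, tokens) = prompt_tokens.split_last().expect("prompt is empty");
        if !tokens.is_empty() {
            transformer.update(tokens, &mut session.kv_cache, session.pos as _);
            session.pos += tokens.len() as upos;
        }

        // Greedy decoding until the position limit; a real service would also
        // stop at the end-of-sequence token and stream `answer` back.
        let mut token: utok = *last;
        let mut answer = String::new();
        while session.pos < transformer.max_seq_len() as upos {
            let logits = transformer.forward(token, &mut session.kv_cache, session.pos as _);
            token = argmax(logits);
            session.pos += 1;
            answer.push_str(&tokenizer.decode(token).replace('▁', " "));
        }
    }
}
```

With something along these lines, `cargo service --model <dir>` (the alias added in `.cargo/config.toml`) would keep one `SessionContext` per client, so a follow-up prompt continues decoding from the cached position instead of re-running the whole conversation history.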