
Commit

feat(service): split the CPU service into its own module to make adding NVIDIA support easier
Signed-off-by: YdrMaster <ydrml@hotmail.com>
YdrMaster committed Mar 12, 2024
1 parent 50550ea commit 7f7f6a5
Showing 8 changed files with 131 additions and 86 deletions.
Cargo.lock: 3 additions & 1 deletion

(Generated file; diff not rendered.)

service/Cargo.toml: 4 additions & 0 deletions
@@ -11,5 +11,9 @@ common = { path = "../common" }
tensor = { path = "../tensor" }
tokenizer = { path = "../tokenizer" }
transformer-cpu = { path = "../transformer-cpu" }
transformer-nvidia = { path = "../transformer-nvidia" }
half.workspace = true
log.workspace = true

[build-dependencies]
find_cuda_helper.workspace = true
service/build.rs: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
fn main() {
if find_cuda_helper::find_cuda_root().is_some() {
println!("cargo:rustc-cfg=detected_cuda");
}
}
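
A minimal sketch, not part of this commit, of how code in the crate could consume the detected_cuda cfg emitted above. Everything below (the backend_name function and the printed message) is an illustrative assumption, not code from the repository:

// Hypothetical consumer of the detected_cuda cfg set by build.rs.
#[cfg(detected_cuda)]
fn backend_name() -> &'static str {
    "nvidia"
}

#[cfg(not(detected_cuda))]
fn backend_name() -> &'static str {
    "cpu"
}

fn main() {
    // Prints "nvidia" only when find_cuda_root() succeeded at build time.
    println!("selected backend: {}", backend_name());
}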
service/src/cpu.rs: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
use crate::Command;
use common::{upos, utok};
use half::f16;
use std::{collections::HashMap, path::Path, time::Instant};
use tensor::reslice;
use transformer_cpu::{LayerCache, Llama2, Memory, Prompt, Request, Transformer};

pub struct CpuTask {
eos: utok,
transformer: Transformer,
sessions: HashMap<usize, SessionContext>,
}

impl CpuTask {
pub fn new(model_dir: impl AsRef<Path>) -> Self {
let time = Instant::now();
let model = Box::new(Memory::load_safetensors_from_dir(model_dir).unwrap());
info!("load model ... {:?}", time.elapsed());

let eos = model.eos_token_id();
let time = Instant::now();
let transformer = Transformer::new(model);
info!("build transformer ... {:?}", time.elapsed());

let sessions = HashMap::new();
Self {
eos,
transformer,
sessions,
}
}

pub fn invoke(&mut self, cmd: Command) {
match cmd {
Command::Chat {
id,
prompt,
responsing,
} => {
let ctx = self
.sessions
.entry(id)
.or_insert_with(|| SessionContext::new(&self.transformer));

let time = Instant::now();
let (last, tokens) = prompt.split_last().expect("prompt is empty");
if !tokens.is_empty() {
self.transformer.decode(vec![Request {
prompt: Prompt::Prefill(tokens),
cache: &mut ctx.cache,
pos: ctx.pos,
}]);
}
info!("prefill transformer ... {:?}", time.elapsed());

ctx.pos += tokens.len() as upos;
let mut token = *last;
let max_seq_len = self.transformer.max_seq_len() as upos;
while ctx.pos < max_seq_len {
let logits = self.transformer.decode(vec![Request {
prompt: transformer_cpu::Prompt::Decode(token),
cache: &mut ctx.cache,
pos: ctx.pos,
}]);
token = argmax(reslice::<u8, f16>(logits.access().as_slice()));
responsing.send(token).unwrap();

if token == self.eos {
break;
}

ctx.pos += 1;
}
}
Command::Drop { id } => {
self.sessions.remove(&id);
}
}
}
}

struct SessionContext {
pos: upos,
cache: Vec<LayerCache>,
}

impl SessionContext {
fn new(transformer: &Transformer) -> Self {
Self {
pos: 0,
cache: transformer.new_cache(),
}
}
}

fn argmax<T: PartialOrd>(logits: &[T]) -> utok {
logits
.iter()
.enumerate()
.max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
.unwrap()
.0 as _
}
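
For orientation, a sketch of how CpuTask could be driven from inside the service crate. It is not part of the diff, and it assumes the Command fields have the types the invoke body above suggests (prompt as a Vec<utok>, responsing as an mpsc Sender<utok>):

use common::utok;
use std::sync::mpsc::channel;

// Hypothetical helper: run one chat turn against the CPU backend and
// print every generated token id.
fn chat_once(model_dir: &std::path::Path, prompt: Vec<utok>) {
    let mut task = CpuTask::new(model_dir);
    let (responsing, tokens) = channel();

    // Prefill the prompt, then decode greedily; each new token is sent
    // on the channel before invoke returns.
    task.invoke(Command::Chat { id: 0, prompt, responsing });

    // The sender was moved into the command and has been dropped, so this
    // loop ends after the last generated token (EOS or max_seq_len reached).
    for token in tokens {
        println!("generated token id: {token}");
    }

    // Drop the session to free its layer cache.
    task.invoke(Command::Drop { id: 0 });
}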
service/src/lib.rs: 14 additions & 82 deletions
@@ -1,11 +1,11 @@
mod cpu;
mod session;
mod template;

use common::{upos, utok};
use half::f16;
use common::utok;
use cpu::CpuTask;
use session::SessionComponent;
use std::{
collections::HashMap,
path::Path,
sync::{
mpsc::{channel, Sender},
@@ -15,9 +15,7 @@ use std::{
time::Instant,
};
use template::Template;
use tensor::reslice;
use tokenizer::{Tokenizer, VocabTxt, BPE};
use transformer_cpu::{LayerCache, Llama2, Memory, Prompt, Request, Transformer};

pub use session::Session;

@@ -55,63 +53,20 @@ impl Service {
sender,
}),
_manager: thread::spawn(move || {
let time = Instant::now();
let model = Box::new(Memory::load_safetensors_from_dir(model_dir).unwrap());
info!("load model ... {:?}", time.elapsed());

let eos = model.eos_token_id();
let time = Instant::now();
let mut transformer = Transformer::new(model);
info!("build transformer ... {:?}", time.elapsed());

let mut sessions = HashMap::new();

let mut task = CpuTask::new(model_dir);
while let Ok(cmd) = receiver.recv() {
match cmd {
Command::Chat {
id,
prompt,
responsing,
} => {
let ctx = sessions
.entry(id)
.or_insert_with(|| SessionContext::new(&transformer));

let time = Instant::now();
let (last, tokens) = prompt.split_last().expect("prompt is empty");
if !tokens.is_empty() {
transformer.decode(vec![Request {
prompt: Prompt::Prefill(tokens),
cache: &mut ctx.cache,
pos: ctx.pos,
}]);
}
info!("prefill transformer ... {:?}", time.elapsed());

ctx.pos += tokens.len() as upos;
let mut token = *last;
let max_seq_len = transformer.max_seq_len() as upos;
while ctx.pos < max_seq_len {
let logits = transformer.decode(vec![Request {
prompt: transformer_cpu::Prompt::Decode(token),
cache: &mut ctx.cache,
pos: ctx.pos,
}]);
token = argmax(reslice::<u8, f16>(logits.access().as_slice()));
responsing.send(token).unwrap();
task.invoke(cmd);
}
// {
// use transformer_nvidia::cuda;

if token == eos {
break;
}
// cuda::init();
// let Some(dev) = cuda::Device::fetch() else {
// panic!("No Nvidia GPU is detected");
// };

ctx.pos += 1;
}
}
Command::Drop { id } => {
sessions.remove(&id);
}
}
}
// dev.set_mempool_threshold(u64::MAX);
// }
}),
}
}
@@ -133,20 +88,6 @@ enum Command {
},
}

struct SessionContext {
pos: upos,
cache: Vec<LayerCache>,
}

impl SessionContext {
fn new(transformer: &Transformer) -> Self {
Self {
pos: 0,
cache: transformer.new_cache(),
}
}
}

fn tokenizer(model_dir: impl AsRef<Path>) -> Box<dyn Tokenizer> {
use std::io::ErrorKind::NotFound;
match BPE::from_model_file(model_dir.as_ref().join("tokenizer.model")) {
@@ -161,12 +102,3 @@ fn tokenizer(model_dir: impl AsRef<Path>) -> Box<dyn Tokenizer> {
}
panic!("Tokenizer file not found");
}

fn argmax<T: PartialOrd>(logits: &[T]) -> utok {
logits
.iter()
.enumerate()
.max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
.unwrap()
.0 as _
}
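
The commented-out block in the manager thread above hints at the GPU bring-up to come. Below is a speculative sketch assembled only from those comments, gated on the detected_cuda cfg that build.rs now emits; the init_gpu function itself is an assumption, and only the transformer_nvidia::cuda calls appear (commented out) in the source:

// Hypothetical GPU initialization, lifted from the commented-out block.
#[cfg(detected_cuda)]
fn init_gpu() {
    use transformer_nvidia::cuda;

    cuda::init();
    let Some(dev) = cuda::Device::fetch() else {
        panic!("No Nvidia GPU is detected");
    };
    // A release threshold of u64::MAX lets the device's memory pool keep
    // freed blocks instead of returning them to the driver after each sync.
    dev.set_mempool_threshold(u64::MAX);
}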
xtask/Cargo.toml: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ authors = ["YdrMaster <ydrml@hotmail.com>"]
common = { path = "../common" }
tensor = { path = "../tensor" }
tokenizer = { path = "../tokenizer" }
transformer-cpu = { path = "../transformer-cpu" }
transformer = { path = "../transformer" }
transformer-nvidia = { path = "../transformer-nvidia" }
service = { path = "../service" }
serde = { workspace = true, features = ["derive"] }
xtask/src/cast.rs: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
use std::{fs, path::PathBuf, time::Instant};
use tensor::DataType;
use transformer_cpu::{save, Memory};
use transformer::{save, Memory};

#[derive(Args, Default)]
pub(crate) struct CastArgs {
xtask/src/main.rs: 0 additions & 1 deletion
@@ -1,7 +1,6 @@
mod cast;
mod common;
mod generate;
// mod service;

use clap::Parser;
