
Commit

Merge pull request #17 from InfiniTensor/dev

Extract Tokenizer
YdrMaster authored Aug 8, 2024
2 parents c27bb14 + 7efc7e7 commit 8798e56
Showing 9 changed files with 48 additions and 685 deletions.
14 changes: 9 additions & 5 deletions Cargo.lock

(Generated file; diff not rendered.)

1 change: 0 additions & 1 deletion Cargo.toml
@@ -2,7 +2,6 @@
 members = [
     "common",
     "tensor",
-    "tokenizer",
     "causal-lm",
     "chat-template",
     "service",
3 changes: 2 additions & 1 deletion service/Cargo.toml
@@ -9,11 +9,12 @@ authors = ["YdrMaster <[email protected]>"]
 [dependencies]
 common = { path = "../common" }
 tensor = { path = "../tensor" }
-tokenizer = { path = "../tokenizer" }
 causal-lm = { path = "../causal-lm" }
 chat-template = { path = "../chat-template" }
 log.workspace = true
 tokio.workspace = true
+memmap2.workspace = true
+tokeneer = "0.0"
 lru = "0.12"
 rangemap = "1.5"

35 changes: 19 additions & 16 deletions service/src/lib.rs
@@ -2,16 +2,19 @@

 mod session;
 mod session_manager;
+mod tokenizer;

 use causal_lm::{CausalLM, SampleArgs};
 use chat_template::ChatTemplate;
 use session::{Dispatcher, Generator};
 use std::{
     fmt::{self, Debug},
+    fs::File,
     path::Path,
     sync::Arc,
 };
-use tokenizer::{BPECommonNormalizer, Normalizer, Tokenizer, VocabTxt, BPE};
+use tokeneer::{Bpe, Lpe, Tokeneer};
+use tokenizer::{BPECommonNormalizer, Normalizer, Tokenize};
 use tokio::task::JoinHandle;

 pub use chat_template::Message;
@@ -29,7 +32,7 @@ pub struct Service<M: CausalLM> {
 /// The lifetime of the inference thread is bound to this component.
 struct ServiceComponent<M: CausalLM> {
     handle: Arc<Dispatcher<M>>,
-    tokenizer: Box<dyn Tokenizer + Send + Sync>,
+    tokenizer: Box<dyn Tokenize + Send + Sync>,
     normalizer: Box<dyn Normalizer + Send + Sync>,
     template: ChatTemplate,
     bos: String,
@@ -151,29 +154,29 @@ fn template(model_dir: impl AsRef<Path>) -> ChatTemplate {
 }

 fn normalizer(model_dir: impl AsRef<Path>) -> Box<dyn Normalizer + Send + Sync> {
-    use std::io::ErrorKind::NotFound;
-    match BPE::from_tokenizer_model(model_dir.as_ref().join("tokenizer.model")) {
-        Ok(_) => return Box::new(BPECommonNormalizer {}),
-        Err(e) if e.kind() == NotFound => {}
-        Err(e) => panic!("{e:?}"),
+    if model_dir.as_ref().join("tokenizer.model").is_file() {
+        return Box::new(BPECommonNormalizer {});
     }
-    match VocabTxt::from_txt_file(model_dir.as_ref().join("vocabs.txt")) {
-        Ok(_) => return Box::new(()),
-        Err(e) if e.kind() == NotFound => {}
-        Err(e) => panic!("{e:?}"),
+    if model_dir.as_ref().join("vocabs.txt").is_file() {
+        return Box::new(());
     }
     panic!("Tokenizer file not found");
 }

-fn tokenizer(model_dir: impl AsRef<Path>) -> Box<dyn Tokenizer + Send + Sync> {
+fn tokenizer(model_dir: impl AsRef<Path>) -> Box<dyn Tokenize + Send + Sync> {
     use std::io::ErrorKind::NotFound;
-    match BPE::from_tokenizer_model(model_dir.as_ref().join("tokenizer.model")) {
-        Ok(bpe) => return Box::new(bpe),
+
+    let mmap = |name: &str| {
+        File::open(model_dir.as_ref().join(name)).and_then(|f| unsafe { memmap2::Mmap::map(&f) })
+    };
+
+    match mmap("tokenizer.model") {
+        Ok(f) => return Box::new(Tokeneer::new(Bpe::from_tokenizer_model(&f))),
         Err(e) if e.kind() == NotFound => {}
         Err(e) => panic!("{e:?}"),
     }
-    match VocabTxt::from_txt_file(model_dir.as_ref().join("vocabs.txt")) {
-        Ok(voc) => return Box::new(voc),
+    match mmap("vocabs.txt") {
+        Ok(f) => return Box::new(Tokeneer::new(Lpe::from_vocabs_txt(&f))),
         Err(e) if e.kind() == NotFound => {}
         Err(e) => panic!("{e:?}"),
     }
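For orientation, a minimal usage sketch of the two loader functions above. The `demo` wrapper, the sample text, and the assumption that `utok` is a plain copyable token id are illustrative and not part of this commit:

// Hypothetical call site (not in this diff): pick the normalizer and
// tokenizer for a model directory, then run a simple round trip.
fn demo(model_dir: &std::path::Path) {
    let n = normalizer(model_dir); // Box<dyn Normalizer + Send + Sync>
    let t = tokenizer(model_dir);  // Box<dyn Tokenize + Send + Sync>

    // Normalize first, then encode; decode maps each token id back to text.
    let tokens = t.encode(&n.encode("Hello, world!"));
    let text: String = tokens.iter().map(|&tok| t.decode(tok)).collect();
    println!("{} token(s): {text}", tokens.len());
}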
19 changes: 18 additions & 1 deletion tokenizer/src/normalizer.rs → service/src/tokenizer.rs
@@ -1,4 +1,21 @@
-use std::borrow::Cow;
+use std::borrow::Cow;
+use tokeneer::{utok, Tokeneer};
+
+pub trait Tokenize {
+    fn encode(&self, text: &str) -> Vec<utok>;
+    fn decode(&self, token: utok) -> &str;
+}
+
+impl<M: tokeneer::Method> Tokenize for Tokeneer<M> {
+    #[inline]
+    fn encode(&self, text: &str) -> Vec<utok> {
+        self.encode(text)
+    }
+    #[inline]
+    fn decode(&self, token: utok) -> &str {
+        unsafe { std::str::from_utf8_unchecked(self.internal().decode(token)) }
+    }
+}

 pub trait Normalizer {
     fn encode<'a>(&self, text: &'a str) -> Cow<'a, str>;
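The moved file now defines the `Tokenize` trait that lib.rs consumes. The impl erases `Tokeneer`'s `M: tokeneer::Method` type parameter, which is what lets `ServiceComponent` hold a BPE- or LPE-backed tokenizer behind one boxed trait object; note that `decode` relies on `from_utf8_unchecked`, so it assumes every vocabulary entry is valid UTF-8. A minimal sketch of the type erasure, assuming the constructors accept byte slices as in the lib.rs diff and that `Tokeneer<M>` is `Send + Sync`:

use tokeneer::{Bpe, Lpe, Tokeneer};

// Illustrative only: either backend fits the same boxed trait object,
// so callers never learn which vocabulary format the model dir shipped.
fn erase(model_bytes: Option<&[u8]>, vocab_bytes: &[u8]) -> Box<dyn Tokenize + Send + Sync> {
    match model_bytes {
        Some(bytes) => Box::new(Tokeneer::new(Bpe::from_tokenizer_model(bytes))),
        None => Box::new(Tokeneer::new(Lpe::from_vocabs_txt(vocab_bytes))),
    }
}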
11 changes: 0 additions & 11 deletions tokenizer/Cargo.toml

This file was deleted.

(Diffs for the remaining changed files are not shown.)
