refactor(tokenizer): move the concrete tokenizer implementation into the tokeneer crate and simplify the code accordingly
Signed-off-by: YdrMaster <[email protected]>
YdrMaster committed Aug 7, 2024
1 parent 03964a8 commit 607ec0b
Showing 8 changed files with 38 additions and 184 deletions.
19 changes: 6 additions & 13 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 0 additions & 1 deletion Cargo.toml
@@ -2,7 +2,6 @@
 members = [
     "common",
     "tensor",
-    "tokenizer",
     "causal-lm",
     "chat-template",
     "service",
2 changes: 1 addition & 1 deletion service/Cargo.toml
@@ -9,12 +9,12 @@ authors = ["YdrMaster <[email protected]>"]
 [dependencies]
 common = { path = "../common" }
 tensor = { path = "../tensor" }
-tokenizer = { path = "../tokenizer" }
 causal-lm = { path = "../causal-lm" }
 chat-template = { path = "../chat-template" }
 log.workspace = true
 tokio.workspace = true
 memmap2.workspace = true
+tokeneer = "0.0"
 lru = "0.12"
 rangemap = "1.5"

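The dependency change above swaps the workspace-local tokenizer crate for the published tokeneer crate; the thin trait glue that stays in-tree moves to service/src/tokenizer.rs (shown further down). A minimal sketch of driving the new dependency directly, using only the constructors that appear later in this diff (Bpe::from_tokenizer_model, Tokeneer::new, encode); the model path and exact argument types are illustrative assumptions, not part of the commit:

use std::fs::File;
use tokeneer::{Bpe, Tokeneer};

fn main() -> std::io::Result<()> {
    // Illustrative path; the real code derives it from the model directory.
    let file = File::open("models/llama/tokenizer.model")?;
    // Memory-map the file, exactly as service/src/lib.rs does below.
    let mmap = unsafe { memmap2::Mmap::map(&file)? };
    // Build a BPE tokenizer from the mapped bytes and wrap it in the generic front end.
    let tokenizer = Tokeneer::new(Bpe::from_tokenizer_model(&mmap));
    // encode is assumed to return Vec<utok> (token ids), as the new Tokenize trait suggests.
    let tokens = tokenizer.encode("Hello, world!");
    println!("{} tokens", tokens.len());
    Ok(())
}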
24 changes: 13 additions & 11 deletions service/src/lib.rs
@@ -2,6 +2,7 @@
 
 mod session;
 mod session_manager;
+mod tokenizer;
 
 use causal_lm::{CausalLM, SampleArgs};
 use chat_template::ChatTemplate;
@@ -12,7 +13,8 @@ use std::{
     path::Path,
     sync::Arc,
 };
-use tokenizer::{BPECommonNormalizer, Bpe, Normalizer, Tokeneer, Tokenize, VocabTxt};
+use tokeneer::{Bpe, Lpe, Tokeneer};
+use tokenizer::{BPECommonNormalizer, Normalizer, Tokenize};
 use tokio::task::JoinHandle;
 
 pub use chat_template::Message;
@@ -152,29 +154,29 @@ fn template(model_dir: impl AsRef<Path>) -> ChatTemplate {
 }
 
 fn normalizer(model_dir: impl AsRef<Path>) -> Box<dyn Normalizer + Send + Sync> {
-    use std::io::ErrorKind::NotFound;
     if model_dir.as_ref().join("tokenizer.model").is_file() {
         return Box::new(BPECommonNormalizer {});
     }
-    match VocabTxt::from_txt_file(model_dir.as_ref().join("vocabs.txt")) {
-        Ok(_) => return Box::new(()),
-        Err(e) if e.kind() == NotFound => {}
-        Err(e) => panic!("{e:?}"),
+    if model_dir.as_ref().join("vocabs.txt").is_file() {
+        return Box::new(());
     }
     panic!("Tokenizer file not found");
 }
 
 fn tokenizer(model_dir: impl AsRef<Path>) -> Box<dyn Tokenize + Send + Sync> {
     use std::io::ErrorKind::NotFound;
-    let file = File::open(model_dir.as_ref().join("tokenizer.model"))
-        .and_then(|f| unsafe { memmap2::Mmap::map(&f) });
-    match file {
+
+    let mmap = |name: &str| {
+        File::open(model_dir.as_ref().join(name)).and_then(|f| unsafe { memmap2::Mmap::map(&f) })
+    };
+
+    match mmap("tokenizer.model") {
         Ok(f) => return Box::new(Tokeneer::new(Bpe::from_tokenizer_model(&f))),
         Err(e) if e.kind() == NotFound => {}
         Err(e) => panic!("{e:?}"),
     }
-    match VocabTxt::from_txt_file(model_dir.as_ref().join("vocabs.txt")) {
-        Ok(voc) => return Box::new(voc),
+    match mmap("vocabs.txt") {
+        Ok(f) => return Box::new(Tokeneer::new(Lpe::from_vocabs_txt(&f))),
         Err(e) if e.kind() == NotFound => {}
         Err(e) => panic!("{e:?}"),
     }
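After this change, tokenizer() probes tokenizer.model first (BPE) and falls back to vocabs.txt (LPE), panicking only if neither file is readable; normalizer() follows the same file probe. A rough sketch of how a caller inside this crate might combine the two free functions, assuming only the trait methods visible in this diff (Normalizer::encode, Tokenize::encode/decode); the function and directory are illustrative:

use std::path::Path;

fn demo(model_dir: &Path) {
    let normalizer = normalizer(model_dir);
    let tokenizer = tokenizer(model_dir);

    // Normalize, encode to token ids, then decode each id back to its text piece.
    let text = normalizer.encode("How are you?");
    let tokens = tokenizer.encode(&text);
    let round_trip: String = tokens.iter().map(|&t| tokenizer.decode(t)).collect();
    assert!(!round_trip.is_empty());
}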
19 changes: 18 additions & 1 deletion tokenizer/src/normalizer.rs → service/src/tokenizer.rs
@@ -1,4 +1,21 @@
-use std::borrow::Cow;
+use std::borrow::Cow;
+use tokeneer::{utok, Tokeneer};
+
+pub trait Tokenize {
+    fn encode(&self, text: &str) -> Vec<utok>;
+    fn decode(&self, token: utok) -> &str;
+}
+
+impl<M: tokeneer::Method> Tokenize for Tokeneer<M> {
+    #[inline]
+    fn encode(&self, text: &str) -> Vec<utok> {
+        self.encode(text)
+    }
+    #[inline]
+    fn decode(&self, token: utok) -> &str {
+        unsafe { std::str::from_utf8_unchecked(self.internal().decode(token)) }
+    }
+}
 
 pub trait Normalizer {
     fn encode<'a>(&self, text: &'a str) -> Cow<'a, str>;
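The point of the blanket impl<M: tokeneer::Method> Tokenize for Tokeneer<M> above is that any tokenization method from tokeneer (here Bpe and Lpe) satisfies the service's own Tokenize trait without per-method glue. A sketch of the two constructors used in this commit boxed behind that trait; it assumes the from_* constructors accept mapped bytes, as the calls in service/src/lib.rs suggest, and relies on the Send + Sync bound already required there:

use tokeneer::{Bpe, Lpe, Tokeneer};

// Both loaders return the same trait object, so the rest of the service
// never needs to know which tokenization method sits behind it.
fn boxed_bpe(model_bytes: &[u8]) -> Box<dyn Tokenize + Send + Sync> {
    Box::new(Tokeneer::new(Bpe::from_tokenizer_model(model_bytes)))
}

fn boxed_lpe(vocab_bytes: &[u8]) -> Box<dyn Tokenize + Send + Sync> {
    Box::new(Tokeneer::new(Lpe::from_vocabs_txt(vocab_bytes)))
}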
12 changes: 0 additions & 12 deletions tokenizer/Cargo.toml

This file was deleted.

87 changes: 0 additions & 87 deletions tokenizer/src/lib.rs

This file was deleted.

58 changes: 0 additions & 58 deletions tokenizer/src/vocab_txt.rs

This file was deleted.
