Skip to content

Commit

Permalink
refactor(tokenizer): use the BPE tokenizer from its standalone repository
Browse files Browse the repository at this point in the history
Signed-off-by: YdrMaster <[email protected]>
  • Loading branch information
YdrMaster committed Aug 6, 2024
1 parent 9f1384f commit c5a3c38
Show file tree
Hide file tree
Showing 7 changed files with 37 additions and 678 deletions.
15 changes: 12 additions & 3 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions service/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ causal-lm = { path = "../causal-lm" }
chat-template = { path = "../chat-template" }
log.workspace = true
tokio.workspace = true
memmap2.workspace = true
lru = "0.12"
rangemap = "1.5"

Expand Down
15 changes: 8 additions & 7 deletions service/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@ use chat_template::ChatTemplate;
use session::{Dispatcher, Generator};
use std::{
fmt::{self, Debug},
fs::File,
path::Path,
sync::Arc,
};
use tokenizer::{BPECommonNormalizer, Normalizer, Tokenize, Tokenizer, VocabTxt, BPE};
use tokenizer::{BPECommonNormalizer, Bpe, Normalizer, Tokeneer, Tokenize, VocabTxt};
use tokio::task::JoinHandle;

pub use chat_template::Message;
Expand Down Expand Up @@ -152,10 +153,8 @@ fn template(model_dir: impl AsRef<Path>) -> ChatTemplate {

fn normalizer(model_dir: impl AsRef<Path>) -> Box<dyn Normalizer + Send + Sync> {
use std::io::ErrorKind::NotFound;
match BPE::from_tokenizer_model(model_dir.as_ref().join("tokenizer.model")) {
Ok(_) => return Box::new(BPECommonNormalizer {}),
Err(e) if e.kind() == NotFound => {}
Err(e) => panic!("{e:?}"),
if model_dir.as_ref().join("tokenizer.model").is_file() {
return Box::new(BPECommonNormalizer {});
}
match VocabTxt::from_txt_file(model_dir.as_ref().join("vocabs.txt")) {
Ok(_) => return Box::new(()),
Expand All @@ -167,8 +166,10 @@ fn normalizer(model_dir: impl AsRef<Path>) -> Box<dyn Normalizer + Send + Sync>

fn tokenizer(model_dir: impl AsRef<Path>) -> Box<dyn Tokenize + Send + Sync> {
use std::io::ErrorKind::NotFound;
match BPE::from_tokenizer_model(model_dir.as_ref().join("tokenizer.model")) {
Ok(bpe) => return Box::new(Tokenizer::new(bpe)),
let file = File::open(model_dir.as_ref().join("tokenizer.model"))
.and_then(|f| unsafe { memmap2::Mmap::map(&f) });
match file {
Ok(f) => return Box::new(Tokeneer::new(Bpe::from_tokenizer_model(&f))),
Err(e) if e.kind() == NotFound => {}
Err(e) => panic!("{e:?}"),
}
Expand Down
2 changes: 1 addition & 1 deletion tokenizer/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ authors = ["YdrMaster <[email protected]>"]
[dependencies]
memmap2.workspace = true
patricia_tree = "0.8"
regex = "1.10"
tokeneer = { git = "https://github.com/YdrMaster/tokeneer", rev = "7d0477c" }
Loading

0 comments on commit c5a3c38

Please sign in to comment.