Commit
Auto-detect wordpiece tokenizer when model.type is missing (#1151)
* Auto-detect wordpiece tokenizer when model.type is missing

* Update test name
xenova authored Jan 18, 2025
1 parent 761f257 commit 16ff98d
Showing 3 changed files with 23 additions and 3 deletions.
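Before the per-file diffs, here is a hypothetical standalone sketch of the decision tree this commit extends. It is illustrative only — the real logic lives in `TokenizerModel.fromConfig` in src/tokenizers.js, and the `inferModelType` name is made up:

// Hypothetical helper mirroring the `default:` fallback (not library code).
// `config` is the `model` section of a tokenizer.json that has no `type` field.
function inferModelType(config) {
    if (config.vocab) {
        if (Array.isArray(config.vocab)) {
            return "Unigram"; // vocab serialized as [string, number][] pairs
        } else if (typeof config.vocab === "object" && config.continuing_subword_prefix && config.unk_token) {
            return "WordPiece"; // the case this commit adds
        } else {
            return "Legacy"; // previous catch-all for object-shaped vocabs
        }
    }
    return "unknown"; // no vocab at all: handling not covered by this diff
}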
6 changes: 4 additions & 2 deletions src/tokenizers.js
@@ -364,13 +364,15 @@ export class TokenizerModel extends Callable {
                 return new BPE(config);
 
             default:
-                // Some tokenizers, like for google-t5/t5-small, do not have a `type` field.
-                // In this case, we can infer the tokenizer type based on the structure of the `vocab` field.
+                // Some older tokenizers, like `google-t5/t5-small` and `distilbert/distilbert-base-uncased`, do not have a `type` field.
+                // In this case, we can infer the tokenizer type based on the structure of the `vocab` field and other properties.
                 if (config.vocab) {
                     if (Array.isArray(config.vocab)) {
                         // config.vocab is of type `[string, number][]`
                         // @ts-ignore
                         return new Unigram(config, ...args);
+                    } else if (typeof config.vocab === 'object' && config.continuing_subword_prefix && config.unk_token) {
+                        return new WordPieceTokenizer(config);
                     } else {
                         // @ts-ignore
                         return new LegacyTokenizerModel(config, ...args);
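For context, this is the shape of `model` section that now hits the new branch — a minimal illustrative fragment of a WordPiece-style tokenizer.json that omits `type` (the vocab values are made up, though the field names match the standard WordPiece serialization):

// Illustrative `model` section of a tokenizer.json with no `type` field.
// A plain-object `vocab` together with `continuing_subword_prefix` and
// `unk_token` now causes a WordPieceTokenizer to be constructed.
const config = {
    unk_token: "[UNK]",
    continuing_subword_prefix: "##",
    max_input_chars_per_word: 100,
    vocab: { "[PAD]": 0, "[UNK]": 100, "ah": 18257 }, // truncated for illustration
};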
9 changes: 9 additions & 0 deletions tests/models/bert/test_tokenization_bert.js
@@ -1332,4 +1332,13 @@ export const TEST_CONFIG = {
       decoded: "[CLS] test $ 1 r2 # 3 [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] test [SEP]",
     },
   },
+  // `model.type` field missing in tokenizer.json
+  "google-bert/bert-base-cased": {
+    CHINESE_LATIN_MIXED: {
+      text: BERT_TEST_STRINGS.CHINESE_LATIN_MIXED,
+      tokens: ["ah", "[UNK]", "[UNK]", "z", "##z"],
+      ids: [101, 18257, 100, 100, 195, 1584, 102],
+      decoded: "[CLS] ah [UNK] [UNK] zz [SEP]",
+    },
+  },
 };
11 changes: 10 additions & 1 deletion tests/models/distilbert/test_tokenization_distilbert.js
@@ -1,5 +1,5 @@
 import { DistilBertTokenizer } from "../../../src/tokenizers.js";
-import { BASE_TEST_STRINGS } from "../test_strings.js";
+import { BASE_TEST_STRINGS, BERT_TEST_STRINGS } from "../test_strings.js";
 
 export const TOKENIZER_CLASS = DistilBertTokenizer;
 export const TEST_CONFIG = {
@@ -303,4 +303,13 @@ export const TEST_CONFIG = {
       decoded: "[CLS] weird \uff5e edge \uff5e case [SEP]",
     },
   },
+  // `model.type` field missing in tokenizer.json
+  "distilbert/distilbert-base-multilingual-cased": {
+    CHINESE_LATIN_MIXED: {
+      text: BERT_TEST_STRINGS.CHINESE_LATIN_MIXED,
+      tokens: ["ah", "\u535a", "\u63a8", "z", "##z"],
+      ids: [101, 69863, 2684, 4163, 194, 10305, 102],
+      decoded: "[CLS] ah \u535a \u63a8 zz [SEP]",
+    },
+  },
 };
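Both new test cases exercise checkpoints whose tokenizer.json lacks `model.type`. As a hand-run counterpart to the generic test runner, a hedged sketch using the library's public `AutoTokenizer` API (the literal input string is an assumption about the value of `BERT_TEST_STRINGS.CHINESE_LATIN_MIXED`):

import { AutoTokenizer } from "@huggingface/transformers";

// Before this commit, these checkpoints fell through to the legacy model;
// now the plain-object vocab plus WordPiece fields select WordPieceTokenizer.
const tokenizer = await AutoTokenizer.from_pretrained("distilbert/distilbert-base-multilingual-cased");

const tokens = tokenizer.tokenize("ah\u535a\u63a8zz"); // assumed CHINESE_LATIN_MIXED value
console.log(tokens); // per the test above: ["ah", "博", "推", "z", "##z"]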
