Skip to content

Commit

Permalink
Join tokens with space if decoder is null
Browse files Browse the repository at this point in the history
  • Loading branch information
xenova committed Dec 10, 2023
1 parent 958a923 commit 599b4e2
Showing 1 changed file with 16 additions and 12 deletions.
28 changes: 16 additions & 12 deletions src/tokenizers.js
Original file line number Diff line number Diff line change
Expand Up @@ -1617,6 +1617,7 @@ class Decoder extends Callable {
* @throws {Error} If an unknown decoder type is provided.
*/
static fromConfig(config) {
if (config === null) return null;
switch (config.type) {
case 'WordPiece':
return new WordPieceDecoder(config);
Expand Down Expand Up @@ -2216,13 +2217,6 @@ export class PreTrainedTokenizer extends Callable {
// TODO: maybe, allow this to be null; in which case, we use model as decoder too?
this.decoder = Decoder.fromConfig(tokenizerJSON.decoder);


// Another slight hack to add `end_of_word_suffix` (if present) to the decoder
// This is needed for cases where BPE model and ByteLevel decoder are used
// For more information, see https://github.com/xenova/transformers.js/issues/74
// TODO: save this to the decoder when exporting?
this.decoder.end_of_word_suffix = this.model.end_of_word_suffix;

// Add added_tokens to model
this.special_tokens = [];
this.all_special_ids = [];
Expand All @@ -2246,8 +2240,17 @@ export class PreTrainedTokenizer extends Callable {
this.special_tokens.push(...(tokenizerConfig.additional_special_tokens ?? []));
this.special_tokens = [...new Set(this.special_tokens)]; // Remove duplicates

// Slight hack, but it prevents code duplication:
this.decoder.added_tokens = this.added_tokens;
if (this.decoder) {
// Slight hack, but it prevents code duplication:
this.decoder.added_tokens = this.added_tokens;

// Another slight hack to add `end_of_word_suffix` (if present) to the decoder
// This is needed for cases where BPE model and ByteLevel decoder are used
// For more information, see https://github.com/xenova/transformers.js/issues/74
// TODO: save this to the decoder when exporting?
this.decoder.end_of_word_suffix = this.model.end_of_word_suffix;
}


this.added_tokens_regex = this.added_tokens.length > 0 ? new RegExp(
'(' + this.added_tokens.map(escapeRegExp).join('|') + ')'
Expand Down Expand Up @@ -2634,13 +2637,14 @@ export class PreTrainedTokenizer extends Callable {
tokens = tokens.filter(x => !this.special_tokens.includes(x));
}

// If `this.decoder` is null, we just join tokens with a space:
// https://github.com/huggingface/tokenizers/blob/8edec536a737cb04494b454805be16c020abb14f/tokenizers/src/tokenizer/mod.rs#L835
/** @type {string} */
let decoded = this.decoder(tokens);

let decoded = this.decoder ? this.decoder(tokens) : tokens.join(' ');

// Slight hack, but prevents having to pass `skip_special_tokens` to
// each call to `decode`, which would lead to code duplication.
if (this.decoder.end_of_word_suffix) {
if (this.decoder && this.decoder.end_of_word_suffix) {
decoded = decoded.replaceAll(this.decoder.end_of_word_suffix, ' ');
if (skip_special_tokens) {
decoded = decoded.trim();
Expand Down

0 comments on commit 599b4e2

Please sign in to comment.