Skip to content

Commit

Permalink
Treat all tokens as added tokens
Browse files: browse the repository at this point in the history
  • Loading branch information
xenova committed Dec 10, 2023
1 parent 599b4e2 commit 4c53b09
Showing 1 changed file with 7 additions and 1 deletion.
8 changes: 7 additions & 1 deletion scripts/extra/esm.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from transformers.convert_slow_tokenizer import Converter
from tokenizers import Tokenizer, decoders, pre_tokenizers, processors
from tokenizers import Tokenizer, AddedToken, pre_tokenizers, processors
from tokenizers.models import WordPiece

class EsmConverter(Converter):
Expand Down Expand Up @@ -35,6 +35,12 @@ def converted(self) -> Tokenizer:
if x[0] == '<' and x[-1] == '>'
])

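# The slow ESM tokenizer treats every vocabulary entry as an added token: the remaining (non-bracketed) entries are not special tokens, but they still need added-token splitting, so register them as added tokens too.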
# See https://github.com/huggingface/transformers/blob/df5c5c62ae253055336f5bb0828ca8e3e15ab6bd/src/transformers/models/esm/tokenization_esm.py#L79-L80
tokenizer.add_tokens([
x for x in vocab.keys()
if x[0] != '<' or x[-1] != '>'
])
return tokenizer

def generate_fast_tokenizer(tokenizer):
Expand Down

0 comments on commit 4c53b09

Please sign in to comment.