Skip to content

Commit

Permalink
Add benchmark OtherLibs.FastBertTokenizer_SameDataAsBertTokenizers
Browse files Browse the repository at this point in the history
  • Loading branch information
georg-jung committed Apr 29, 2024
1 parent 6a05411 commit fb2762c
Showing 1 changed file with 15 additions and 0 deletions.
15 changes: 15 additions & 0 deletions src/Benchmarks/OtherLibs.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Diagnosers;
using BERTTokenizers.Base;
using FastBertTokenizer;
using RustLibWrapper;

namespace Benchmarks;
Expand All @@ -19,6 +20,7 @@ public class OtherLibs
private ConcreteUncasedTokenizer _nmZivkovicTokenizer;
private string[] _corpus = null!;
private List<string> _nmZivkovicCorpus = null!;
private readonly BertTokenizer _tokenizer = new();

public OtherLibs()
: this("data/wiki-simple.json.br", "data/baai-bge-small-en/vocab.txt", "data/baai-bge-small-en/tokenizer.json", 512)
Expand All @@ -38,6 +40,7 @@ public OtherLibs(string corpusPath, string vocabTxtFile, string tokenizerJsonPat
public async Task SetupAsync()
{
RustTokenizer.LoadTokenizer(_tokenizerJsonPath, _maxSequenceLength);
await _tokenizer.LoadTokenizerJsonAsync(_tokenizerJsonPath);
_corpus = await CorpusReader.ReadBrotliJsonCorpusAsync(_corpusPath);

_nmZivkovicCorpus = new(_corpus.Length);
Expand Down Expand Up @@ -70,6 +73,18 @@ public IReadOnlyCollection<object> NMZivkovic_BertTokenizers()
return res;
}

/// <summary>
/// Benchmark that calls <c>_tokenizer.Encode</c> once for every entry of
/// <c>_nmZivkovicCorpus</c>, passing <c>_maxSequenceLength</c> as the second argument.
/// </summary>
/// <returns>
/// A list holding one encoding result per corpus entry, in corpus order. The results are
/// returned rather than discarded so the encoding work stays observable to the caller.
/// </returns>
/// <remarks>
/// NOTE(review): judging by the method name, the input is meant to be the same data that
/// the BertTokenizers benchmark consumes - confirm against the setup code.
/// </remarks>
[Benchmark]
public IReadOnlyCollection<object> FastBertTokenizer_SameDataAsBertTokenizers()
{
    var texts = _nmZivkovicCorpus;
    var count = texts.Count;

    // Presize to the corpus size so the list never has to grow during the run.
    var encodings = new List<object>(count);

    for (var i = 0; i < count; i++)
    {
        object encoded = _tokenizer.Encode(texts[i], _maxSequenceLength);
        encodings.Add(encoded);
    }

    return encodings;
}

[Benchmark]
public object RustHuggingfaceWrapperSinglethreadedMemReuse()
{
Expand Down

0 comments on commit fb2762c

Please sign in to comment.