Commit: Improve Benchmark architecture
georg-jung committed Nov 27, 2023
1 parent 930d4f8 commit 41ab5aa
Showing 4 changed files with 131 additions and 72 deletions.
19 changes: 19 additions & 0 deletions src/Benchmarks/CorpusReader.cs
@@ -0,0 +1,19 @@
// Copyright (c) Georg Jung. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.

using System.IO.Compression;
using System.Text.Json;

namespace Benchmarks;

internal static class CorpusReader
{
public static async Task<string[]> ReadBrotliJsonCorpusAsync(string filePath)
{
using var fs = File.OpenRead(filePath);
using var uncompress = new BrotliStream(fs, CompressionMode.Decompress);
var dict = await JsonSerializer.DeserializeAsync<Dictionary<int, string>>(uncompress);

return dict!.Values.ToArray();
}
}
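
The reader above implies the corpus layout: a Brotli-compressed JSON object mapping integer ids to article texts. As a hedged illustration (not part of this commit; the file name and sample texts are made up), such a file could be produced like this:

// Illustrative sketch only: writes a corpus in the format ReadBrotliJsonCorpusAsync expects.
// The id -> text mapping shown here is invented; the real data is a prepared Wikipedia extract.
using System.IO.Compression;
using System.Text.Json;

var articles = new Dictionary<int, string>
{
    [1] = "First article text ...",
    [2] = "Second article text ...",
};

using var fs = File.Create("data/wiki-simple.json.br");
using var brotli = new BrotliStream(fs, CompressionLevel.Optimal);
await JsonSerializer.SerializeAsync(brotli, articles);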
93 changes: 93 additions & 0 deletions src/Benchmarks/OtherLibs.cs
@@ -0,0 +1,93 @@
// Copyright (c) Georg Jung. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.

using System.Text.RegularExpressions;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Diagnosers;
using BERTTokenizers.Base;
using RustLibWrapper;

namespace Benchmarks;

[MemoryDiagnoser]
public class OtherLibs
{
private readonly string _corpusPath;
private readonly string _vocabTxtFile;
private readonly string _tokenizerJsonPath;
private readonly int _maxSequenceLength;
private ConcreteUncasedTokenizer _nmZivkovicTokenizer;
private string[] _corpus = null!;
private List<string> _nmZivkovicCorpus = null!;

public OtherLibs()
: this("data/wiki-simple.json.br", "data/baai-bge-small-en-vocab.txt", "data/baai-bge-small-en-tokenizer.json", 512)
{
}

public OtherLibs(string corpusPath, string vocabTxtFile, string tokenizerJsonPath, int maxSequenceLength)
{
_nmZivkovicTokenizer = new(vocabTxtFile);
_corpusPath = corpusPath;
_vocabTxtFile = vocabTxtFile;
_tokenizerJsonPath = tokenizerJsonPath;
_maxSequenceLength = maxSequenceLength;
}

[GlobalSetup]
public async Task SetupAsync()
{
RustTokenizer.LoadTokenizer(_tokenizerJsonPath, _maxSequenceLength);
_corpus = await CorpusReader.ReadBrotliJsonCorpusAsync(_corpusPath);

_nmZivkovicCorpus = new(_corpus.Length);
var cnt = 0;
foreach (var tx in _corpus)
{
_corpus[cnt] = tx;

// this preprocessing gives NMZivkovic/BertTokenizers kind of an unfair advantage, but it throws otherwise
var nmZivkovicText = tx.Substring(0, Math.Min(tx.Length, 1250)); // NMZivkovic/BertTokenizers throws if text is too long; 1250 works with 512 tokens, 1500 doesn't; 5000 works with 2048 tokens
nmZivkovicText = Regex.Replace(nmZivkovicText, @"\s+", " "); // required due to bad whitespace processing of NMZivkovic/BertTokenizers
nmZivkovicText = Regex.Replace(nmZivkovicText, @"[^A-Za-z0-9\s\.\,;:\\/?!#$%()=+\-*\""'–_`<>&^@{}[\]\|~']+", string.Empty); // NMZivkovic/BertTokenizers doesn't handle unknown characters
_nmZivkovicCorpus.Add(nmZivkovicText);

cnt++;
}

_nmZivkovicTokenizer = new(_vocabTxtFile);
}

[Benchmark]
public IReadOnlyCollection<object> NMZivkovic_BertTokenizers()
{
List<object> res = new(_nmZivkovicCorpus.Count);
foreach (var text in _nmZivkovicCorpus)
{
res.Add(_nmZivkovicTokenizer.Encode(_maxSequenceLength, text));
}

return res;
}

[Benchmark]
public object RustHuggingfaceWrapperSinglethreadedMemReuse()
{
var inputIds = new uint[_maxSequenceLength];
var attMask = new uint[_maxSequenceLength];
foreach (var text in _corpus)
{
RustTokenizer.TokenizeAndGetIds(text, inputIds.AsSpan(), attMask.AsSpan());
}

return (inputIds, attMask);
}

private sealed class ConcreteUncasedTokenizer : UncasedTokenizer
{
public ConcreteUncasedTokenizer(string vocabularyFilePath)
: base(vocabularyFilePath)
{
}
}
}
4 changes: 2 additions & 2 deletions src/Benchmarks/Program.cs
@@ -1,8 +1,8 @@
// Copyright (c) Georg Jung. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.

using BenchmarkDotNet.Configs;
using BenchmarkDotNet.Running;
using Benchmarks;

var summary = BenchmarkRunner.Run<TokenizeSpeed>();
var tokenizeSpeed = BenchmarkRunner.Run<TokenizeSpeed>();
var otherLibs = BenchmarkRunner.Run<OtherLibs>();
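
With two benchmark classes in the project, an alternative entry point (not part of this commit, shown only as a sketch) would be to let BenchmarkDotNet pick the class from the command line via BenchmarkSwitcher:

// Sketch only: lets the caller choose which benchmark class to run,
// e.g. dotnet run -c Release -- --filter *TokenizeSpeed*
using System.Reflection;
using BenchmarkDotNet.Running;

BenchmarkSwitcher.FromAssembly(Assembly.GetExecutingAssembly()).Run(args);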
87 changes: 17 additions & 70 deletions src/Benchmarks/TokenizeSpeed.cs
@@ -1,14 +1,9 @@
// Copyright (c) Georg Jung. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.

using System.IO.Compression;
using System.Text.Json;
using System.Text.RegularExpressions;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Diagnosers;
using BERTTokenizers.Base;
using FastBertTokenizer;
using RustLibWrapper;

namespace Benchmarks;

@@ -20,75 +15,35 @@ namespace Benchmarks;
*/
public class TokenizeSpeed
{
private readonly string[] _corpus;
private readonly List<string> _otherLibCorpus;
private readonly ConcreteUncasedTokenizer _otherLibTokenizer;
private readonly BertTokenizer _tokenizer;
private readonly string _corpusPath;
private readonly string _vocabTxtFile;
private readonly int _maxSequenceLength;
private string[] _corpus = null!;

public TokenizeSpeed()
: this("data/wiki-simple.json.br", "data/baai-bge-small-en-vocab.txt", "data/baai-bge-small-en-tokenizer.json", 512)
: this("data/wiki-simple.json.br", "data/baai-bge-small-en-vocab.txt", 512)
{
}

public TokenizeSpeed(string corpusPath, string vocabTxtFile, string tokenizerJsonPath, int maxSequenceLength)
public TokenizeSpeed(string corpusPath, string vocabTxtFile, int maxSequenceLength)
{
RustTokenizer.LoadTokenizer(tokenizerJsonPath, maxSequenceLength);
using var fs = File.OpenRead(corpusPath);
using var uncompress = new BrotliStream(fs, CompressionMode.Decompress);
var dict = JsonSerializer.Deserialize<Dictionary<int, string>>(uncompress)!;

_corpus = new string[dict.Count];
_otherLibCorpus = new(dict.Count);
var cnt = 0;
foreach (var tx in dict.Values)
{
_corpus[cnt] = tx;

// this preprocessing gives the other lib kind of an unfair advantage, but it throws otherwise
var otherLib = tx.Substring(0, Math.Min(tx.Length, 1250)); // other lib throws if text is too long; 1250 works with 512 tokens, 1500 doesn't; 5000 works with 2048 tokens
otherLib = Regex.Replace(otherLib, @"\s+", " "); // required due to bad whitespace processing of other lib
otherLib = Regex.Replace(otherLib, @"[^A-Za-z0-9\s\.\,;:\\/?!#$%()=+\-*\""'–_`<>&^@{}[\]\|~']+", string.Empty); // other lib doesn't handle unknown characters
_otherLibCorpus.Add(otherLib);

cnt++;
}

_otherLibTokenizer = new(vocabTxtFile);
_tokenizer = new();

using var sr = File.OpenText(vocabTxtFile);
_tokenizer.LoadVocabulary(sr, true);
_corpusPath = corpusPath;
_vocabTxtFile = vocabTxtFile;
_maxSequenceLength = maxSequenceLength;
_tokenizer = new();
}

[Benchmark]
public IReadOnlyCollection<object> OtherLib()
{
List<object> res = new(_otherLibCorpus.Count);
foreach (var text in _otherLibCorpus)
{
res.Add(_otherLibTokenizer.Encode(_maxSequenceLength, text));
}

return res;
}

[Benchmark]
public object RustHuggingfaceWrapperSinglethreadedMemReuse()
[GlobalSetup]
public async Task SetupAsync()
{
var inputIds = new uint[_maxSequenceLength];
var attMask = new uint[_maxSequenceLength];
foreach (var text in _otherLibCorpus)
{
RustTokenizer.TokenizeAndGetIds(text, inputIds.AsSpan(), attMask.AsSpan());
}

return (inputIds, attMask);
using var sr = File.OpenText(_vocabTxtFile);
_tokenizer.LoadVocabulary(sr, true);
_corpus = await CorpusReader.ReadBrotliJsonCorpusAsync(_corpusPath);
}

[Benchmark(Baseline = true)]
public IReadOnlyCollection<object> FastBertTokenizerSinglethreadedAllocating()
public IReadOnlyCollection<object> SinglethreadedAllocating()
{
List<object> res = new(_corpus.Length);
foreach (var text in _corpus)
@@ -100,7 +55,7 @@ public IReadOnlyCollection<object> FastBertTokenizerSinglethreadedAllocating()
}

[Benchmark]
public object FastBertTokenizerSingleThreadedMemReuse()
public object SingleThreadedMemReuse()
{
var iids = new long[_maxSequenceLength];
var attm = new long[_maxSequenceLength];
@@ -115,7 +70,7 @@ public object FastBertTokenizerSingleThreadedMemReuse()
}

[Benchmark]
public IReadOnlyCollection<(Memory<long> InputIds, Memory<long> AttentionMask, Memory<long> TokenTypeIds)> FastBertTokenizerMultithreadedAllocating()
public IReadOnlyCollection<(Memory<long> InputIds, Memory<long> AttentionMask, Memory<long> TokenTypeIds)> MultithreadedAllocating()
{
// this might be interesting to benchmark but doesn't make much sense as a real world use case
List<(Memory<long> InputIds, Memory<long> AttentionMask, Memory<long> TokenTypeIds)> res = new(_corpus.Length);
@@ -125,7 +80,7 @@ public object FastBertTokenizerSingleThreadedMemReuse()
}

[Benchmark]
public (Memory<long> InputIds, Memory<long> AttentionMask, Memory<long> TokenTypeIds) FastBertTokenizerMultithreadedMemReuse()
public (Memory<long> InputIds, Memory<long> AttentionMask, Memory<long> TokenTypeIds) MultithreadedMemReuse()
{
var batchSize = 1000;
var iids = new long[_maxSequenceLength * batchSize];
@@ -145,12 +100,4 @@ public object FastBertTokenizerSingleThreadedMemReuse()

return (iids.AsMemory(), attm.AsMemory(), toktyp.AsMemory());
}

private sealed class ConcreteUncasedTokenizer : UncasedTokenizer
{
public ConcreteUncasedTokenizer(string vocabularyFilePath)
: base(vocabularyFilePath)
{
}
}
}
