Commit: Improve Benchmark architecture
georg-jung committed Nov 27, 2023
1 parent 930d4f8 commit 41ab5aa
Showing 4 changed files with 131 additions and 72 deletions.
19 changes: 19 additions & 0 deletions src/Benchmarks/CorpusReader.cs
@@ -0,0 +1,19 @@
// Copyright (c) Georg Jung. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.

using System.IO.Compression;
using System.Text.Json;

namespace Benchmarks;

internal static class CorpusReader
{
public static async Task<string[]> ReadBrotliJsonCorpusAsync(string filePath)
{
using var fs = File.OpenRead(filePath);
using var uncompress = new BrotliStream(fs, CompressionMode.Decompress);
var dict = await JsonSerializer.DeserializeAsync<Dictionary<int, string>>(uncompress);

return dict!.Values.ToArray();
}
}
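
The reader above implies the corpus layout: a Brotli-compressed JSON object mapping integer ids to article texts. As a hedged illustration (not part of this commit; the file name and sample texts are made up), such a file could be produced like this:

// Illustrative sketch only: writes a corpus in the format ReadBrotliJsonCorpusAsync expects.
// The id -> text mapping shown here is invented; the real data is a prepared Wikipedia extract.
using System.IO.Compression;
using System.Text.Json;

var articles = new Dictionary<int, string>
{
    [1] = "First article text ...",
    [2] = "Second article text ...",
};

using var fs = File.Create("data/wiki-simple.json.br");
using var brotli = new BrotliStream(fs, CompressionLevel.Optimal);
await JsonSerializer.SerializeAsync(brotli, articles);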
93 changes: 93 additions & 0 deletions src/Benchmarks/OtherLibs.cs
@@ -0,0 +1,93 @@
// Copyright (c) Georg Jung. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.

using System.Text.RegularExpressions;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Diagnosers;
using BERTTokenizers.Base;
using RustLibWrapper;

namespace Benchmarks;

[MemoryDiagnoser]
public class OtherLibs
{
private readonly string _corpusPath;
private readonly string _vocabTxtFile;
private readonly string _tokenizerJsonPath;
private readonly int _maxSequenceLength;
private ConcreteUncasedTokenizer _nmZivkovicTokenizer;
private string[] _corpus = null!;
private List<string> _nmZivkovicCorpus = null!;

public OtherLibs()
: this("data/wiki-simple.json.br", "data/baai-bge-small-en-vocab.txt", "data/baai-bge-small-en-tokenizer.json", 512)
{
}

public OtherLibs(string corpusPath, string vocabTxtFile, string tokenizerJsonPath, int maxSequenceLength)
{
_nmZivkovicTokenizer = new(vocabTxtFile);
_corpusPath = corpusPath;
_vocabTxtFile = vocabTxtFile;
_tokenizerJsonPath = tokenizerJsonPath;
_maxSequenceLength = maxSequenceLength;
}

[GlobalSetup]
public async Task SetupAsync()
{
RustTokenizer.LoadTokenizer(_tokenizerJsonPath, _maxSequenceLength);
_corpus = await CorpusReader.ReadBrotliJsonCorpusAsync(_corpusPath);

_nmZivkovicCorpus = new(_corpus.Length);
var cnt = 0;
foreach (var tx in _corpus)
{
_corpus[cnt] = tx;

// this preprocessing gives NMZivkovic/BertTokenizers kind of an unfair advantage, but it throws otherwise
var nmZivkovicText = tx.Substring(0, Math.Min(tx.Length, 1250)); // NMZivkovic/BertTokenizers throws if text is too long; 1250 works with 512 tokens, 1500 doesn't; 5000 works with 2048 tokens
nmZivkovicText = Regex.Replace(nmZivkovicText, @"\s+", " "); // required due to bad whitespace processing of NMZivkovic/BertTokenizers
nmZivkovicText = Regex.Replace(nmZivkovicText, @"[^A-Za-z0-9\s\.\,;:\\/?!#$%()=+\-*\""'–_`<>&^@{}[\]\|~']+", string.Empty); // NMZivkovic/BertTokenizers doesn't handle unknown characters
_nmZivkovicCorpus.Add(nmZivkovicText);

cnt++;
}

_nmZivkovicTokenizer = new(_vocabTxtFile);
}

[Benchmark]
public IReadOnlyCollection<object> NMZivkovic_BertTokenizers()
{
List<object> res = new(_nmZivkovicCorpus.Count);
foreach (var text in _nmZivkovicCorpus)
{
res.Add(_nmZivkovicTokenizer.Encode(_maxSequenceLength, text));
}

return res;
}

[Benchmark]
public object RustHuggingfaceWrapperSinglethreadedMemReuse()
{
var inputIds = new uint[_maxSequenceLength];
var attMask = new uint[_maxSequenceLength];
foreach (var text in _corpus)
{
RustTokenizer.TokenizeAndGetIds(text, inputIds.AsSpan(), attMask.AsSpan());
}

return (inputIds, attMask);
}

private sealed class ConcreteUncasedTokenizer : UncasedTokenizer
{
public ConcreteUncasedTokenizer(string vocabularyFilePath)
: base(vocabularyFilePath)
{
}
}
}
4 changes: 2 additions & 2 deletions src/Benchmarks/Program.cs
@@ -1,8 +1,8 @@
// Copyright (c) Georg Jung. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.

using BenchmarkDotNet.Configs;
using BenchmarkDotNet.Running;
using Benchmarks;

var summary = BenchmarkRunner.Run<TokenizeSpeed>();
var tokenizeSpeed = BenchmarkRunner.Run<TokenizeSpeed>();
var otherLibs = BenchmarkRunner.Run<OtherLibs>();
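
With two benchmark classes in the project, an alternative entry point (not part of this commit, shown only as a sketch) would be to let BenchmarkDotNet pick the class from the command line via BenchmarkSwitcher:

// Sketch only: lets the caller choose which benchmark class to run,
// e.g. dotnet run -c Release -- --filter *TokenizeSpeed*
using System.Reflection;
using BenchmarkDotNet.Running;

BenchmarkSwitcher.FromAssembly(Assembly.GetExecutingAssembly()).Run(args);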
87 changes: 17 additions & 70 deletions src/Benchmarks/TokenizeSpeed.cs
@@ -1,14 +1,9 @@
// Copyright (c) Georg Jung. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.

using System.IO.Compression;
using System.Text.Json;
using System.Text.RegularExpressions;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Diagnosers;
using BERTTokenizers.Base;
using FastBertTokenizer;
using RustLibWrapper;

namespace Benchmarks;

@@ -20,75 +15,35 @@ namespace Benchmarks;
*/
public class TokenizeSpeed
{
private readonly string[] _corpus;
private readonly List<string> _otherLibCorpus;
private readonly ConcreteUncasedTokenizer _otherLibTokenizer;
private readonly BertTokenizer _tokenizer;
private readonly string _corpusPath;
private readonly string _vocabTxtFile;
private readonly int _maxSequenceLength;
private string[] _corpus = null!;

public TokenizeSpeed()
: this("data/wiki-simple.json.br", "data/baai-bge-small-en-vocab.txt", "data/baai-bge-small-en-tokenizer.json", 512)
: this("data/wiki-simple.json.br", "data/baai-bge-small-en-vocab.txt", 512)
{
}

public TokenizeSpeed(string corpusPath, string vocabTxtFile, string tokenizerJsonPath, int maxSequenceLength)
public TokenizeSpeed(string corpusPath, string vocabTxtFile, int maxSequenceLength)
{
RustTokenizer.LoadTokenizer(tokenizerJsonPath, maxSequenceLength);
using var fs = File.OpenRead(corpusPath);
using var uncompress = new BrotliStream(fs, CompressionMode.Decompress);
var dict = JsonSerializer.Deserialize<Dictionary<int, string>>(uncompress)!;

_corpus = new string[dict.Count];
_otherLibCorpus = new(dict.Count);
var cnt = 0;
foreach (var tx in dict.Values)
{
_corpus[cnt] = tx;

// this preprocessing gives the other lib kind of an unfair advantage, but it throws otherwise
var otherLib = tx.Substring(0, Math.Min(tx.Length, 1250)); // other lib throws if text is too long; 1250 works with 512 tokens, 1500 doesn't; 5000 works with 2048 tokens
otherLib = Regex.Replace(otherLib, @"\s+", " "); // required due to bad whitespace processing of other lib
otherLib = Regex.Replace(otherLib, @"[^A-Za-z0-9\s\.\,;:\\/?!#$%()=+\-*\""'–_`<>&^@{}[\]\|~']+", string.Empty); // other lib doesn't handle unknown characters
_otherLibCorpus.Add(otherLib);

cnt++;
}

_otherLibTokenizer = new(vocabTxtFile);
_tokenizer = new();

using var sr = File.OpenText(vocabTxtFile);
_tokenizer.LoadVocabulary(sr, true);
_corpusPath = corpusPath;
_vocabTxtFile = vocabTxtFile;
_maxSequenceLength = maxSequenceLength;
_tokenizer = new();
}

[Benchmark]
public IReadOnlyCollection<object> OtherLib()
{
List<object> res = new(_otherLibCorpus.Count);
foreach (var text in _otherLibCorpus)
{
res.Add(_otherLibTokenizer.Encode(_maxSequenceLength, text));
}

return res;
}

[Benchmark]
public object RustHuggingfaceWrapperSinglethreadedMemReuse()
[GlobalSetup]
public async Task SetupAsync()
{
var inputIds = new uint[_maxSequenceLength];
var attMask = new uint[_maxSequenceLength];
foreach (var text in _otherLibCorpus)
{
RustTokenizer.TokenizeAndGetIds(text, inputIds.AsSpan(), attMask.AsSpan());
}

return (inputIds, attMask);
using var sr = File.OpenText(_vocabTxtFile);
_tokenizer.LoadVocabulary(sr, true);
_corpus = await CorpusReader.ReadBrotliJsonCorpusAsync(_corpusPath);
}

[Benchmark(Baseline = true)]
public IReadOnlyCollection<object> FastBertTokenizerSinglethreadedAllocating()
public IReadOnlyCollection<object> SinglethreadedAllocating()
{
List<object> res = new(_corpus.Length);
foreach (var text in _corpus)
@@ -100,7 +55,7 @@ public IReadOnlyCollection<object> FastBertTokenizerSinglethreadedAllocating()
}

[Benchmark]
public object FastBertTokenizerSingleThreadedMemReuse()
public object SingleThreadedMemReuse()
{
var iids = new long[_maxSequenceLength];
var attm = new long[_maxSequenceLength];
@@ -115,7 +70,7 @@ public object FastBertTokenizerSingleThreadedMemReuse()
}

[Benchmark]
public IReadOnlyCollection<(Memory<long> InputIds, Memory<long> AttentionMask, Memory<long> TokenTypeIds)> FastBertTokenizerMultithreadedAllocating()
public IReadOnlyCollection<(Memory<long> InputIds, Memory<long> AttentionMask, Memory<long> TokenTypeIds)> MultithreadedAllocating()
{
// this might be interesting to benchmark but doesn't make much sense as a real world use case
List<(Memory<long> InputIds, Memory<long> AttentionMask, Memory<long> TokenTypeIds)> res = new(_corpus.Length);
@@ -125,7 +80,7 @@ public object FastBertTokenizerSingleThreadedMemReuse()
}

[Benchmark]
public (Memory<long> InputIds, Memory<long> AttentionMask, Memory<long> TokenTypeIds) FastBertTokenizerMultithreadedMemReuse()
public (Memory<long> InputIds, Memory<long> AttentionMask, Memory<long> TokenTypeIds) MultithreadedMemReuse()
{
var batchSize = 1000;
var iids = new long[_maxSequenceLength * batchSize];
@@ -145,12 +100,4 @@ public object FastBertTokenizerSingleThreadedMemReuse()

return (iids.AsMemory(), attm.AsMemory(), toktyp.AsMemory());
}

private sealed class ConcreteUncasedTokenizer : UncasedTokenizer
{
public ConcreteUncasedTokenizer(string vocabularyFilePath)
: base(vocabularyFilePath)
{
}
}
}
