From 02eedd9477616b11dd30ea8eee0fc5114dfe574f Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Tue, 21 Jan 2025 00:24:36 +0000 Subject: [PATCH] January 2025 Update (#1036) * code changes for december update (not working yet) * Changes to support up to https://github.com/ggerganov/llama.cpp/commit/d408bb9268a988c5a60a5746d3a6430386e7604d * Updated to latest llama.cpp binaries, this works on Windows CPU but needs more changes for other backends * Updated to latest deps, fixed kernel memory failing to load * Copy missing Mac flibraries libggml-base and libggml-cpu * Removed any mention of AVX in MacOS loading * Added file copying for some more targets (still missing macos) * Updated to latest set of binaries * Fixed copy path for CUDA12 DLLs * Compatibility with llama.cpp backend split (PR #10256) on all platforms * Restore original comment * Update the dependency loader for ggml-metal and ggml-blas * Update the runtime targets for ggml-metal and ggml-blas * Add CPU backend (fallback) dependency for the GPU backends * Fix icons for the nuget backends * Update nuspec files for the GPU backends * Update BinaryReleaseId * Update nuspec for CPU & OSX * Update CPU nuspec to use noavx folder * Update Runtime.targets to use noavx folder * Update BinaryReleaseId * CUDA & Vulkan native libraries now correctly store the detected or user defined AVX level --------- Co-authored-by: SignalRT Co-authored-by: m0nsky --- LLama.Examples/Program.cs | 8 +- .../LLamaSharpTextEmbeddingGenerator.cs | 4 +- LLama.KernelMemory/LlamaSharpTextGenerator.cs | 2 - .../LLamaSharpTextEmbeddingGeneratorTests.cs | 14 +- .../LlamaSharpTextGeneratorTests.cs | 18 +- LLama.Unittest/SamplingTests.cs | 6 +- LLama.Web/Common/ModelOptions.cs | 5 +- LLama/Abstractions/IModelParams.cs | 7 +- LLama/Common/ModelParams.cs | 5 +- LLama/Extensions/IModelParamsExtensions.cs | 7 +- LLama/Extensions/LLamaExecutorExtensions.cs | 2 +- LLama/LLamaQuantizer.cs | 3 - LLama/LLamaSharp.Runtime.targets | 191 +++++++++++++++++- LLama/LLamaSharp.csproj | 2 +- LLama/LLamaStatelessExecutor.cs | 12 +- LLama/Native/GPUSplitMode.cs | 2 +- LLama/Native/LLamaFtype.cs | 24 +-- LLama/Native/LLamaModelParams.cs | 18 +- LLama/Native/LLamaRopeType.cs | 2 + .../DefaultNativeLibrarySelectingPolicy.cs | 8 +- LLama/Native/Load/NativeLibraryConfig.cs | 4 +- LLama/Native/Load/NativeLibraryUtils.cs | 95 +++++++-- LLama/Native/Load/NativeLibraryWithCuda.cs | 4 +- .../Load/NativeLibraryWithMacOrFallback.cs | 31 +-- LLama/Native/Load/NativeLibraryWithVulkan.cs | 4 +- LLama/Native/NativeApi.cs | 9 + LLama/Native/RopeScalingType.cs | 7 +- LLama/Native/SafeLLamaContextHandle.cs | 17 +- LLama/Native/SafeLLamaSamplerHandle.cs | 94 +++++++-- LLama/Native/SafeLlamaModelHandle.cs | 3 - LLama/Sampling/DefaultSamplingPipeline.cs | 58 +----- .../build/LLamaSharp.Backend.Cpu.nuspec | 61 ++++-- .../LLamaSharp.Backend.Cuda11.Linux.nuspec | 11 +- .../LLamaSharp.Backend.Cuda11.Windows.nuspec | 11 +- .../build/LLamaSharp.Backend.Cuda11.nuspec | 1 + .../LLamaSharp.Backend.Cuda12.Linux.nuspec | 12 +- .../LLamaSharp.Backend.Cuda12.Windows.nuspec | 12 +- .../LLamaSharp.Backend.Vulkan.Linux.nuspec | 11 +- .../LLamaSharp.Backend.Vulkan.Windows.nuspec | 12 +- .../build/LLamaSharp.Backend.Vulkan.nuspec | 3 +- 40 files changed, 563 insertions(+), 237 deletions(-) diff --git a/LLama.Examples/Program.cs b/LLama.Examples/Program.cs index 63114120d..f8ef7d5aa 100644 --- a/LLama.Examples/Program.cs +++ b/LLama.Examples/Program.cs @@ -1,6 +1,5 @@ -using LLama.Native; +using LLama.Native; using 
Spectre.Console; -using System.Runtime.InteropServices; AnsiConsole.MarkupLineInterpolated( $""" @@ -18,7 +17,7 @@ __ __ ____ __ """); // Configure logging. Change this to `true` to see log messages from llama.cpp -var showLLamaCppLogs = false; +var showLLamaCppLogs = true; NativeLibraryConfig .All .WithLogCallback((level, message) => @@ -31,8 +30,7 @@ __ __ ____ __ NativeLibraryConfig .All .WithCuda() - //.WithAutoDownload() // An experimental feature - .DryRun(out var loadedllamaLibrary, out var loadedLLavaLibrary); + .WithVulkan(); // Calling this method forces loading to occur now. NativeApi.llama_empty_call(); diff --git a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs index 7f9ae1e4d..6efd44f7b 100644 --- a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs +++ b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs @@ -31,11 +31,9 @@ public LLamaSharpTextEmbeddingGenerator(LLamaSharpConfig config) var @params = new ModelParams(config.ModelPath) { - ContextSize = config.ContextSize ?? 2048, + ContextSize = config.ContextSize, GpuLayerCount = config.GpuLayerCount ?? 20, Embeddings = true, - MainGpu = config.MainGpu, - SplitMode = config.SplitMode, PoolingType = LLamaPoolingType.Mean, }; _weights = LLamaWeights.LoadFromFile(@params); diff --git a/LLama.KernelMemory/LlamaSharpTextGenerator.cs b/LLama.KernelMemory/LlamaSharpTextGenerator.cs index adfc89317..3fc96db9a 100644 --- a/LLama.KernelMemory/LlamaSharpTextGenerator.cs +++ b/LLama.KernelMemory/LlamaSharpTextGenerator.cs @@ -33,8 +33,6 @@ public LlamaSharpTextGenerator(LLamaSharpConfig config) { ContextSize = config.ContextSize ?? 2048, GpuLayerCount = config.GpuLayerCount ?? 20, - MainGpu = config.MainGpu, - SplitMode = config.SplitMode }; _weights = LLamaWeights.LoadFromFile(parameters); _context = _weights.CreateContext(parameters); diff --git a/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs b/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs index 91161b72c..5c7b4213d 100644 --- a/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs +++ b/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs @@ -1,21 +1,15 @@ -using LLama.Common; using LLamaSharp.KernelMemory; -using Microsoft.KernelMemory.AI; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Text.RegularExpressions; -using System.Threading.Tasks; using Xunit.Abstractions; namespace LLama.Unittest.KernelMemory { - public class LLamaSharpTextEmbeddingGeneratorTests : ITextTokenizerTests, IDisposable + public class LLamaSharpTextEmbeddingGeneratorTests + : ITextTokenizerTests, IDisposable { private readonly LLamaSharpTextEmbeddingGenerator _embeddingGenerator; - public LLamaSharpTextEmbeddingGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper) + public LLamaSharpTextEmbeddingGeneratorTests(ITestOutputHelper testOutputHelper) + : base(testOutputHelper) { _embeddingGenerator = new LLamaSharpTextEmbeddingGenerator(_lsConfig); diff --git a/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs b/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs index 02001f8cf..d21d7f959 100644 --- a/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs +++ b/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs @@ -1,25 +1,15 @@ -using LLama.Common; using LLamaSharp.KernelMemory; -using Microsoft.KernelMemory.AI; -using System; -using 
System.Collections.Generic; -using System.Diagnostics; -using System.Linq; -using System.Reflection.Emit; -using System.Text; -using System.Text.RegularExpressions; -using System.Threading.Tasks; using Xunit.Abstractions; -using Xunit.Sdk; -using static System.Net.Mime.MediaTypeNames; namespace LLama.Unittest.KernelMemory { - public class LlamaSharpTextGeneratorTests : ITextTokenizerTests, IDisposable + public class LlamaSharpTextGeneratorTests + : ITextTokenizerTests, IDisposable { private readonly LlamaSharpTextGenerator _textGenerator; - public LlamaSharpTextGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper) + public LlamaSharpTextGeneratorTests(ITestOutputHelper testOutputHelper) + : base(testOutputHelper) { _textGenerator = new LlamaSharpTextGenerator(_lsConfig); diff --git a/LLama.Unittest/SamplingTests.cs b/LLama.Unittest/SamplingTests.cs index f322bc250..bae7e3dea 100644 --- a/LLama.Unittest/SamplingTests.cs +++ b/LLama.Unittest/SamplingTests.cs @@ -167,11 +167,7 @@ private static SafeLLamaSamplerChainHandle CreateChain(SafeLLamaContextHandle co var chain = SafeLLamaSamplerChainHandle.Create(LLamaSamplerChainParams.Default()); chain.AddPenalties( - vocabSize: context.VocabCount, - eos: context.ModelHandle.Tokens.EOS, - newline: context.ModelHandle.Tokens.Newline ?? 0, - penaltyCount: 60, repeat: 1, freq: 0, presence: 0, - penalizeNewline: false, ignoreEOS: false + penaltyCount: 60, repeat: 1, freq: 0, presence: 0 ); if (logit_bias != null) { chain.AddLogitBias(context.VocabCount, logit_bias); } diff --git a/LLama.Web/Common/ModelOptions.cs b/LLama.Web/Common/ModelOptions.cs index 4e002c93f..a67a11a96 100644 --- a/LLama.Web/Common/ModelOptions.cs +++ b/LLama.Web/Common/ModelOptions.cs @@ -24,7 +24,7 @@ public class ModelOptions public int MainGpu { get; set; } = 0; /// - public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None; + public GPUSplitMode? SplitMode { get; set; } /// public int GpuLayerCount { get; set; } = 20; @@ -59,6 +59,9 @@ public class ModelOptions /// public TensorSplitsCollection TensorSplits { get; set; } = new(); + /// + public bool CheckTensors { get; } + /// public List MetadataOverrides { get; } = new(); diff --git a/LLama/Abstractions/IModelParams.cs b/LLama/Abstractions/IModelParams.cs index 7dc28f671..cbbacafe5 100644 --- a/LLama/Abstractions/IModelParams.cs +++ b/LLama/Abstractions/IModelParams.cs @@ -36,7 +36,7 @@ public interface IModelParams /// /// How to split the model across multiple GPUs /// - GPUSplitMode SplitMode { get; } + GPUSplitMode? SplitMode { get; } /// /// Number of layers to run in VRAM / GPU memory (n_gpu_layers) @@ -68,6 +68,11 @@ public interface IModelParams /// bool VocabOnly { get; } + /// + /// Validate model tensor data before loading + /// + bool CheckTensors { get; } + /// /// Override specific metadata items in the model /// diff --git a/LLama/Common/ModelParams.cs b/LLama/Common/ModelParams.cs index b276ed73a..7e4b1a967 100644 --- a/LLama/Common/ModelParams.cs +++ b/LLama/Common/ModelParams.cs @@ -19,7 +19,7 @@ public record ModelParams public int MainGpu { get; set; } = 0; /// - public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None; + public GPUSplitMode? 
SplitMode { get; set; } /// public int GpuLayerCount { get; set; } = 20; @@ -54,6 +54,9 @@ public record ModelParams /// public TensorSplitsCollection TensorSplits { get; set; } = new(); + /// + public bool CheckTensors { get; } + /// public List MetadataOverrides { get; set; } = new(); diff --git a/LLama/Extensions/IModelParamsExtensions.cs b/LLama/Extensions/IModelParamsExtensions.cs index 523ec737a..588564e33 100644 --- a/LLama/Extensions/IModelParamsExtensions.cs +++ b/LLama/Extensions/IModelParamsExtensions.cs @@ -1,4 +1,4 @@ -using System.IO; +using System.IO; using System; using System.Text; using LLama.Abstractions; @@ -31,11 +31,14 @@ public static IDisposable ToLlamaModelParams(this IModelParams @params, out LLam result = LLamaModelParams.Default(); result.main_gpu = @params.MainGpu; - result.split_mode = @params.SplitMode; result.n_gpu_layers = @params.GpuLayerCount < 0 ? int.MaxValue : @params.GpuLayerCount; + if (@params.SplitMode.HasValue) + result.split_mode = @params.SplitMode.Value; + result.use_mlock = @params.UseMemoryLock; result.use_mmap = @params.UseMemorymap; result.vocab_only = @params.VocabOnly; + result.check_tensors = @params.CheckTensors; unsafe { diff --git a/LLama/Extensions/LLamaExecutorExtensions.cs b/LLama/Extensions/LLamaExecutorExtensions.cs index 19c8d33df..e38ccf98d 100644 --- a/LLama/Extensions/LLamaExecutorExtensions.cs +++ b/LLama/Extensions/LLamaExecutorExtensions.cs @@ -147,7 +147,7 @@ private string CreatePrompt(IList messages) PreventEOS = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PreventEOS), out bool eos) is true ? eos : s_defaultPipeline.PreventEOS, PenalizeNewline = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PenalizeNewline), out bool pnl) is true ? pnl : s_defaultPipeline.PenalizeNewline, RepeatPenalty = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.RepeatPenalty), out float rp) is true ? rp : s_defaultPipeline.RepeatPenalty, - RepeatPenaltyCount = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.RepeatPenaltyCount), out int rpc) is true ? rpc : s_defaultPipeline.RepeatPenaltyCount, + PenaltyCount = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PenaltyCount), out int rpc) is true ? rpc : s_defaultPipeline.PenaltyCount, Grammar = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.Grammar), out Grammar? g) is true ? g : s_defaultPipeline.Grammar, MinKeep = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.MinKeep), out int mk) is true ? mk : s_defaultPipeline.MinKeep, MinP = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.MinP), out float mp) is true ? 
mp : s_defaultPipeline.MinP, diff --git a/LLama/LLamaQuantizer.cs b/LLama/LLamaQuantizer.cs index 23f0f8b4e..9e90b732e 100644 --- a/LLama/LLamaQuantizer.cs +++ b/LLama/LLamaQuantizer.cs @@ -106,9 +106,6 @@ private static bool ValidateFtype(LLamaFtype ftype) case LLamaFtype.MOSTLY_IQ3_S: case LLamaFtype.MOSTLY_IQ3_M: - case LLamaFtype.MOSTLY_Q4_0_4_4: - case LLamaFtype.MOSTLY_Q4_0_4_8: - case LLamaFtype.MOSTLY_Q4_0_8_8: return true; case LLamaFtype.GUESSED: diff --git a/LLama/LLamaSharp.Runtime.targets b/LLama/LLamaSharp.Runtime.targets index 6466a1204..76292aaf5 100644 --- a/LLama/LLamaSharp.Runtime.targets +++ b/LLama/LLamaSharp.Runtime.targets @@ -4,14 +4,24 @@ - + PreserveNewest runtimes/win-x64/native/noavx/llama.dll - + PreserveNewest runtimes/win-x64/native/noavx/ggml.dll + + PreserveNewest + runtimes/win-x64/native/noavx/ggml-base.dll + + + PreserveNewest + runtimes/win-x64/native/noavx/ggml-cpu.dll + + + PreserveNewest runtimes/win-x64/native/avx/llama.dll @@ -20,55 +30,124 @@ PreserveNewest runtimes/win-x64/native/avx/ggml.dll + + PreserveNewest + runtimes/win-x64/native/avx/ggml-base.dll + + + PreserveNewest + runtimes/win-x64/native/avx/ggml-cpu.dll + + + PreserveNewest runtimes/win-x64/native/avx2/llama.dll + + PreserveNewest + runtimes/win-x64/native/avx2/ggml-base.dll + PreserveNewest runtimes/win-x64/native/avx2/ggml.dll + + PreserveNewest + runtimes/win-x64/native/avx2/ggml-cpu.dll + + + PreserveNewest runtimes/win-x64/native/avx512/llama.dll + + PreserveNewest + runtimes/win-x64/native/avx512/ggml-base.dll + PreserveNewest runtimes/win-x64/native/avx512/ggml.dll + + PreserveNewest + runtimes/win-x64/native/avx512/ggml-cpu.dll + + + PreserveNewest runtimes/win-x64/native/cuda11/llama.dll + + PreserveNewest + runtimes/win-x64/native/cuda11/ggml-base.dll + PreserveNewest runtimes/win-x64/native/cuda11/ggml.dll + + PreserveNewest + runtimes/win-x64/native/cuda11/ggml-cuda.dll + + + PreserveNewest runtimes/win-x64/native/cuda12/llama.dll + + PreserveNewest + runtimes/win-x64/native/cuda12/ggml-base.dll + PreserveNewest runtimes/win-x64/native/cuda12/ggml.dll + + PreserveNewest + runtimes/win-x64/native/cuda12/ggml-cuda.dll + + + PreserveNewest runtimes/win-x64/native/vulkan/llama.dll + + PreserveNewest + runtimes/win-x64/native/vulkan/ggml-base.dll + PreserveNewest runtimes/win-x64/native/vulkan/ggml.dll + + PreserveNewest + runtimes/win-x64/native/vulkan/ggml-vulkan.dll + + - + PreserveNewest runtimes/linux-x64/native/noavx/libllama.so - + PreserveNewest runtimes/linux-x64/native/noavx/libggml.so + + PreserveNewest + runtimes/linux-x64/native/noavx/libggml-base.so + + + PreserveNewest + runtimes/linux-x64/native/noavx/libggml-cpu.so + + + PreserveNewest runtimes/linux-x64/native/avx/libllama.so @@ -77,6 +156,17 @@ PreserveNewest runtimes/linux-x64/native/avx/libggml.so + + PreserveNewest + runtimes/linux-x64/native/avx/libggml-base.so + + + PreserveNewest + runtimes/linux-x64/native/avx/libggml-cpu.so + + + + PreserveNewest runtimes/linux-x64/native/avx2/libllama.so @@ -85,6 +175,15 @@ PreserveNewest runtimes/linux-x64/native/avx2/libggml.so + + PreserveNewest + runtimes/linux-x64/native/avx2/libggml-base.so + + + PreserveNewest + runtimes/linux-x64/native/avx2/libggml-cpu.so + + PreserveNewest runtimes/linux-x64/native/avx512/libllama.so @@ -93,6 +192,15 @@ PreserveNewest runtimes/linux-x64/native/avx512/libggml.so + + PreserveNewest + runtimes/linux-x64/native/avx512/libggml-base.so + + + PreserveNewest + runtimes/linux-x64/native/avx512/libggml-cpu.so + + PreserveNewest 
runtimes/linux-x64/native/cuda11/libllama.so @@ -101,6 +209,16 @@ PreserveNewest runtimes/linux-x64/native/cuda11/libggml.so + + PreserveNewest + runtimes/linux-x64/native/cuda11/libggml-base.so + + + PreserveNewest + runtimes/linux-x64/native/cuda11/libggml-cuda.so + + + PreserveNewest runtimes/linux-x64/native/cuda12/libllama.so @@ -109,6 +227,16 @@ PreserveNewest runtimes/linux-x64/native/cuda12/libggml.so + + PreserveNewest + runtimes/linux-x64/native/cuda12/libggml-base.so + + + PreserveNewest + runtimes/linux-x64/native/cuda12/libggml-cuda.so + + + PreserveNewest runtimes/linux-x64/native/vulkan/libllama.so @@ -117,7 +245,32 @@ PreserveNewest runtimes/linux-x64/native/vulkan/libggml.so + + PreserveNewest + runtimes/linux-x64/native/vulkan/libggml-base.so + + + PreserveNewest + runtimes/linux-x64/native/vulkan/libggml-vulkan.so + + + + PreserveNewest + runtimes/osx-arm64/native/libggml-base.dylib + + + PreserveNewest + runtimes/osx-arm64/native/libggml-cpu.dylib + + + PreserveNewest + runtimes/osx-arm64/native/libggml-metal.dylib + + + PreserveNewest + runtimes/osx-arm64/native/libggml-blas.dylib + PreserveNewest runtimes/osx-arm64/native/libggml.dylib @@ -134,7 +287,19 @@ PreserveNewest runtimes/osx-arm64/native/ggml-metal.metal - + + + PreserveNewest + runtimes/osx-x64/native/libggml-base.dylib + + + PreserveNewest + runtimes/osx-x64/native/libggml-cpu.dylib + + + PreserveNewest + runtimes/osx-x64/native/libggml-blas.dylib + PreserveNewest runtimes/osx-x64/native/libggml.dylib @@ -148,6 +313,18 @@ runtimes/osx-x64/native/libllava_shared.dylib + + PreserveNewest + runtimes/osx-x64/native/rosetta2/libggml-base.dylib + + + PreserveNewest + runtimes/osx-x64/native/rosetta2/libggml-cpu.dylib + + + PreserveNewest + runtimes/osx-x64/native/rosetta2/libggml-blas.dylib + PreserveNewest runtimes/osx-x64/native/rosetta2/libggml.dylib @@ -161,7 +338,7 @@ runtimes/osx-x64/native/rosetta2/libllava_shared.dylib - + PreserveNewest runtimes/win-x64/native/noavx/llava_shared.dll @@ -190,7 +367,7 @@ runtimes/win-x64/native/vulkan/llava_shared.dll - + PreserveNewest runtimes/linux-x64/native/noavx/libllava_shared.so diff --git a/LLama/LLamaSharp.csproj b/LLama/LLamaSharp.csproj index d50771f3a..a32aa7647 100644 --- a/LLama/LLamaSharp.csproj +++ b/LLama/LLamaSharp.csproj @@ -56,7 +56,7 @@ - 958367bf530d943a90 + 0827b2c1da-v6 diff --git a/LLama/LLamaStatelessExecutor.cs b/LLama/LLamaStatelessExecutor.cs index 003cf9827..ace8e2581 100644 --- a/LLama/LLamaStatelessExecutor.cs +++ b/LLama/LLamaStatelessExecutor.cs @@ -24,14 +24,14 @@ public class StatelessExecutor private readonly ILogger? _logger; private readonly LLamaBatch _batch; - // LLava Section + /// public bool IsMultiModal => false; /// - public LLavaWeights? ClipModel { get; } + public LLavaWeights? ClipModel => default; /// - public List Images { get; set; } + public List Images { get; } /// /// The context used by the executor when running the inference. 
@@ -80,7 +80,7 @@ public async IAsyncEnumerable InferAsync(string prompt, IInferenceParams Context = context; // Reset the sampling pipeline (if there is one) - inferenceParams?.SamplingPipeline?.Reset(); + inferenceParams?.SamplingPipeline.Reset(); // Sanity check inference params inferenceParams ??= new InferenceParams(); @@ -155,8 +155,8 @@ public async IAsyncEnumerable InferAsync(string prompt, IInferenceParams var n_left = n_past - tokensKeep; var n_discard = n_left / 2; - NativeApi.llama_kv_cache_seq_rm(Context.NativeHandle, (LLamaSeqId)0, tokensKeep , tokensKeep + n_discard); - NativeApi.llama_kv_cache_seq_add(Context.NativeHandle, (LLamaSeqId)0, tokensKeep + n_discard, n_past, -n_discard); + NativeApi.llama_kv_cache_seq_rm(Context.NativeHandle, LLamaSeqId.Zero, tokensKeep , tokensKeep + n_discard); + NativeApi.llama_kv_cache_seq_add(Context.NativeHandle, LLamaSeqId.Zero, tokensKeep + n_discard, n_past, -n_discard); n_past -= n_discard; } diff --git a/LLama/Native/GPUSplitMode.cs b/LLama/Native/GPUSplitMode.cs index 54fa095c1..27ee7ae49 100644 --- a/LLama/Native/GPUSplitMode.cs +++ b/LLama/Native/GPUSplitMode.cs @@ -17,7 +17,7 @@ public enum GPUSplitMode Layer = 1, /// - /// split rows across GPUs + /// split layers and KV across GPUs, use tensor parallelism if supported /// Row = 2, } \ No newline at end of file diff --git a/LLama/Native/LLamaFtype.cs b/LLama/Native/LLamaFtype.cs index 6970a4728..705f8032e 100644 --- a/LLama/Native/LLamaFtype.cs +++ b/LLama/Native/LLamaFtype.cs @@ -177,20 +177,20 @@ public enum LLamaFtype /// MOSTLY_BF16 = 32, - /// - /// except 1d tensors - /// - MOSTLY_Q4_0_4_4 = 33, + ///// + ///// except 1d tensors (no longer supported by llama.cpp) + ///// + //MOSTLY_Q4_0_4_4 = 33, - /// - /// except 1d tensors - /// - MOSTLY_Q4_0_4_8 = 34, + ///// + ///// except 1d tensors (no longer supported by llama.cpp) + ///// + //MOSTLY_Q4_0_4_8 = 34, - /// - /// except 1d tensors - /// - MOSTLY_Q4_0_8_8 = 35, + ///// + ///// except 1d tensors (no longer supported by llama.cpp) + ///// + //MOSTLY_Q4_0_8_8 = 35, /// /// except 1d tensors diff --git a/LLama/Native/LLamaModelParams.cs b/LLama/Native/LLamaModelParams.cs index c0437d9db..e16e3263e 100644 --- a/LLama/Native/LLamaModelParams.cs +++ b/LLama/Native/LLamaModelParams.cs @@ -8,6 +8,12 @@ namespace LLama.Native [StructLayout(LayoutKind.Sequential)] public unsafe struct LLamaModelParams { + /// + /// NULL-terminated list of devices to use for offloading (if NULL, all available devices are used) + /// todo: add support for llama_model_params.devices + /// + private IntPtr devices; + /// /// // number of layers to store in VRAM /// @@ -19,19 +25,19 @@ public unsafe struct LLamaModelParams public GPUSplitMode split_mode; /// - /// the GPU that is used for scratch and small tensors + /// the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE /// public int main_gpu; /// /// how to split layers across multiple GPUs (size: ) /// - public float* tensor_split; - - /// - /// comma separated list of RPC servers to use for offloading + public float* tensor_split; + + /// + /// comma separated list of RPC servers to use for offloading /// - public byte* rpc_servers; + public byte* rpc_servers; /// /// called with a progress value between 0 and 1, pass NULL to disable. 
If the provided progress_callback diff --git a/LLama/Native/LLamaRopeType.cs b/LLama/Native/LLamaRopeType.cs index ebad9e77b..3f1188112 100644 --- a/LLama/Native/LLamaRopeType.cs +++ b/LLama/Native/LLamaRopeType.cs @@ -9,4 +9,6 @@ public enum LLamaRopeType None = -1, Norm = 0, NEOX = 2,//GGML_ROPE_TYPE_NEOX, + //todo:LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE, + //todo:LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION, } \ No newline at end of file diff --git a/LLama/Native/Load/DefaultNativeLibrarySelectingPolicy.cs b/LLama/Native/Load/DefaultNativeLibrarySelectingPolicy.cs index 6f5ad35fe..36ab0c0c8 100644 --- a/LLama/Native/Load/DefaultNativeLibrarySelectingPolicy.cs +++ b/LLama/Native/Load/DefaultNativeLibrarySelectingPolicy.cs @@ -10,8 +10,6 @@ public class DefaultNativeLibrarySelectingPolicy: INativeLibrarySelectingPolicy /// public IEnumerable Apply(NativeLibraryConfig.Description description, SystemInfo systemInfo, NativeLogConfig.LLamaLogCallback? logCallback) { - List results = new(); - // Show the configuration we're working with Log(description.ToString(), LLamaLogLevel.Info, logCallback); @@ -24,12 +22,12 @@ public IEnumerable Apply(NativeLibraryConfig.Description descrip { if (description.UseCuda) { - yield return new NativeLibraryWithCuda(systemInfo.CudaMajorVersion, description.Library, description.SkipCheck); + yield return new NativeLibraryWithCuda(systemInfo.CudaMajorVersion, description.Library, description.AvxLevel, description.SkipCheck); } if (description.UseVulkan) { - yield return new NativeLibraryWithVulkan(systemInfo.VulkanVersion, description.Library, description.SkipCheck); + yield return new NativeLibraryWithVulkan(systemInfo.VulkanVersion, description.Library, description.AvxLevel, description.SkipCheck); } if((!description.UseCuda || !description.UseVulkan) || description.AllowFallback) @@ -56,7 +54,7 @@ public IEnumerable Apply(NativeLibraryConfig.Description descrip if(systemInfo.OSPlatform == OSPlatform.OSX || description.AllowFallback) { - yield return new NativeLibraryWithMacOrFallback(description.Library, description.SkipCheck); + yield return new NativeLibraryWithMacOrFallback(description.Library); } } } diff --git a/LLama/Native/Load/NativeLibraryConfig.cs b/LLama/Native/Load/NativeLibraryConfig.cs index 02e47b695..2bfa0554b 100644 --- a/LLama/Native/Load/NativeLibraryConfig.cs +++ b/LLama/Native/Load/NativeLibraryConfig.cs @@ -178,7 +178,7 @@ internal Description CheckAndGatherDescription() _avxLevel, _allowFallback, _skipCheck, - _searchDirectories.Concat(new[] { "./" }).ToArray() + _searchDirectories.Concat([ "./" ]).ToArray() ); } @@ -186,7 +186,7 @@ internal static string AvxLevelToString(AvxLevel level) { return level switch { - AvxLevel.None => string.Empty, + AvxLevel.None => "noavx", AvxLevel.Avx => "avx", AvxLevel.Avx2 => "avx2", AvxLevel.Avx512 => "avx512", diff --git a/LLama/Native/Load/NativeLibraryUtils.cs b/LLama/Native/Load/NativeLibraryUtils.cs index d5b014ce0..13e68be4d 100644 --- a/LLama/Native/Load/NativeLibraryUtils.cs +++ b/LLama/Native/Load/NativeLibraryUtils.cs @@ -45,33 +45,86 @@ internal static IntPtr TryLoadLibrary(NativeLibraryConfig config, out INativeLib { Log($"Got relative library path '{path}' from local with {library.Metadata}, trying to load it...", LLamaLogLevel.Debug, config.LogCallback); - // If we are on Linux / OSX, we need to manually load the GGML dependency - if (systemInfo.OSPlatform == OSPlatform.Linux || systemInfo.OSPlatform == OSPlatform.OSX) + // After the llama.cpp binaries have been split up (PR 
#10256), we need to load the dependencies manually. + // It can't be done automatically on Windows, because the dependencies can be in different folders (for example, ggml-cuda.dll from the cuda12 folder, and ggml-cpu.dll from the avx2 folder) + // It can't be done automatically on Linux, because Linux uses the environment variable "LD_LIBRARY_PATH" to automatically load dependencies, and LD_LIBRARY_PATH can only be + // set before running LLamaSharp, but we only know which folders to search in when running LLamaSharp (decided by the NativeLibrary). + + // Get the directory of the current runtime + string? currentRuntimeDirectory = Path.GetDirectoryName(path); + + // If we failed to get the directory of the current runtime, log it and continue on to the next library + if (currentRuntimeDirectory == null) { - // Get the directory of the library - string? libraryDirectory = Path.GetDirectoryName(path); - - if (libraryDirectory != null) + Log($"Failed to get the directory of the current runtime from path '{path}'", LLamaLogLevel.Error, config.LogCallback); + continue; + } + + // List which will hold all paths to dependencies to load + var dependencyPaths = new List(); + + // We should always load ggml-base from the current runtime directory + dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-base{ext}")); + + // If the library has metadata, we can check if we need to load additional dependencies + if (library.Metadata != null) + { + if (systemInfo.OSPlatform == OSPlatform.OSX) { - // Construct the dependency (libggml) path - string dependencyPath = Path.Combine(libraryDirectory, $"{libPrefix}ggml{ext}"); - - // Try to load the dependency - var dependencyResult = TryLoad(dependencyPath, description.SearchDirectories, config.LogCallback); + // On OSX, we should load the CPU backend from the current directory + + // ggml-cpu + dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-cpu{ext}")); + + // ggml-metal (only supported on osx-arm64) + if (os == "osx-arm64") + dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-metal{ext}")); + + // ggml-blas (osx-x64, osx-x64-rosetta2 and osx-arm64 all have blas) + dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-blas{ext}")); + } + else + { + // On other platforms (Windows, Linux), we need to load the CPU backend from the specified AVX level directory + // We are using the AVX level supplied by NativeLibraryConfig, which automatically detects the highest supported AVX level for us + + // ggml-cpu + dependencyPaths.Add(Path.Combine( + $"runtimes/{os}/native/{NativeLibraryConfig.AvxLevelToString(library.Metadata.AvxLevel)}", + $"{libPrefix}ggml-cpu{ext}" + )); + + // ggml-cuda + if (library.Metadata.UseCuda) + dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-cuda{ext}")); + + // ggml-vulkan + if (library.Metadata.UseVulkan) + dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-vulkan{ext}")); + } + } + + // And finally, we can add ggml + dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml{ext}")); + + // Now, we will loop through our dependencyPaths and try to load them one by one + foreach (var dependencyPath in dependencyPaths) + { + // Try to load the dependency + var dependencyResult = TryLoad(dependencyPath, description.SearchDirectories, config.LogCallback); - // If we successfully loaded the library, log it - if (dependencyResult != IntPtr.Zero) - { - Log($"Successfully loaded 
dependency '{dependencyPath}'", LLamaLogLevel.Info, config.LogCallback); - } - else - { - Log($"Failed loading dependency '{dependencyPath}'", LLamaLogLevel.Info, config.LogCallback); - } + // If we successfully loaded the library, log it + if (dependencyResult != IntPtr.Zero) + { + Log($"Successfully loaded dependency '{dependencyPath}'", LLamaLogLevel.Info, config.LogCallback); + } + else + { + Log($"Failed loading dependency '{dependencyPath}'", LLamaLogLevel.Info, config.LogCallback); } } - // Try to load the library + // Try to load the main library var result = TryLoad(path, description.SearchDirectories, config.LogCallback); // If we successfully loaded the library, return the handle diff --git a/LLama/Native/Load/NativeLibraryWithCuda.cs b/LLama/Native/Load/NativeLibraryWithCuda.cs index 12da095dc..36dc4ca81 100644 --- a/LLama/Native/Load/NativeLibraryWithCuda.cs +++ b/LLama/Native/Load/NativeLibraryWithCuda.cs @@ -28,11 +28,13 @@ public NativeLibraryMetadata? Metadata /// /// /// + /// /// - public NativeLibraryWithCuda(int majorCudaVersion, NativeLibraryName libraryName, bool skipCheck) + public NativeLibraryWithCuda(int majorCudaVersion, NativeLibraryName libraryName, AvxLevel avxLevel, bool skipCheck) { _majorCudaVersion = majorCudaVersion; _libraryName = libraryName; + _avxLevel = avxLevel; _skipCheck = skipCheck; } diff --git a/LLama/Native/Load/NativeLibraryWithMacOrFallback.cs b/LLama/Native/Load/NativeLibraryWithMacOrFallback.cs index 6bcd55049..59754be03 100644 --- a/LLama/Native/Load/NativeLibraryWithMacOrFallback.cs +++ b/LLama/Native/Load/NativeLibraryWithMacOrFallback.cs @@ -1,5 +1,5 @@ -using LLama.Abstractions; using System.Collections.Generic; +using LLama.Abstractions; namespace LLama.Native { @@ -7,39 +7,30 @@ namespace LLama.Native /// /// A native library compiled on Mac, or fallbacks from all other libraries in the selection. /// - public class NativeLibraryWithMacOrFallback : INativeLibrary + public class NativeLibraryWithMacOrFallback + : INativeLibrary { - private NativeLibraryName _libraryName; - private bool _skipCheck; + private readonly NativeLibraryName _libraryName; /// - public NativeLibraryMetadata? Metadata - { - get - { - return new NativeLibraryMetadata(_libraryName, false, false, AvxLevel.None); - } - } + public NativeLibraryMetadata Metadata => new(_libraryName, false, false, AvxLevel.None); /// /// /// /// - /// - public NativeLibraryWithMacOrFallback(NativeLibraryName libraryName, bool skipCheck) + public NativeLibraryWithMacOrFallback(NativeLibraryName libraryName) { _libraryName = libraryName; - _skipCheck = skipCheck; } /// public IEnumerable Prepare(SystemInfo systemInfo, NativeLogConfig.LLamaLogCallback? logCallback) { - var path = GetPath(systemInfo, AvxLevel.None, logCallback); - return path is null ?[] : [path]; + yield return GetPath(systemInfo); } - private string? GetPath(SystemInfo systemInfo, AvxLevel avxLevel, NativeLogConfig.LLamaLogCallback? 
logCallback) + private string GetPath(SystemInfo systemInfo) { NativeLibraryUtils.GetPlatformPathParts(systemInfo.OSPlatform, out var os, out var fileExtension, out var libPrefix); string relativePath; @@ -50,11 +41,7 @@ public IEnumerable Prepare(SystemInfo systemInfo, NativeLogConfig.LLamaL } else { - var avxStr = NativeLibraryConfig.AvxLevelToString(AvxLevel.None); - if (!string.IsNullOrEmpty(avxStr)) - avxStr += "/"; - - relativePath = $"runtimes/{os}/native/{avxStr}{libPrefix}{_libraryName.GetLibraryName()}{fileExtension}"; + relativePath = $"runtimes/{os}/native/{libPrefix}{_libraryName.GetLibraryName()}{fileExtension}"; } return relativePath; diff --git a/LLama/Native/Load/NativeLibraryWithVulkan.cs b/LLama/Native/Load/NativeLibraryWithVulkan.cs index fe4eef01e..c3fe94de3 100644 --- a/LLama/Native/Load/NativeLibraryWithVulkan.cs +++ b/LLama/Native/Load/NativeLibraryWithVulkan.cs @@ -28,11 +28,13 @@ public NativeLibraryMetadata? Metadata /// /// /// + /// /// - public NativeLibraryWithVulkan(string? vulkanVersion, NativeLibraryName libraryName, bool skipCheck) + public NativeLibraryWithVulkan(string? vulkanVersion, NativeLibraryName libraryName, AvxLevel avxLevel, bool skipCheck) { _vulkanVersion = vulkanVersion; _libraryName = libraryName; + _avxLevel = avxLevel; _skipCheck = skipCheck; } diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs index 041cc0dd5..0d6bc1984 100644 --- a/LLama/Native/NativeApi.cs +++ b/LLama/Native/NativeApi.cs @@ -172,6 +172,15 @@ public static unsafe int llama_chat_apply_template(SafeLlamaModelHandle? model, static extern int internal_llama_chat_apply_template(IntPtr model, byte* tmpl, LLamaChatMessage* chat, nuint n_msg, [MarshalAs(UnmanagedType.U1)] bool add_ass, byte* buf, int length); } + /// + /// Get list of built-in chat templates + /// + /// + /// + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern unsafe int llama_chat_builtin_templates(char** output, nuint len); + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] [return: MarshalAs(UnmanagedType.U1)] public static extern bool llama_add_bos_token(SafeLlamaModelHandle model); diff --git a/LLama/Native/RopeScalingType.cs b/LLama/Native/RopeScalingType.cs index 8d4552b80..61ae82942 100644 --- a/LLama/Native/RopeScalingType.cs +++ b/LLama/Native/RopeScalingType.cs @@ -1,4 +1,4 @@ -namespace LLama.Native +namespace LLama.Native { /// /// RoPE scaling type. 
@@ -26,5 +26,10 @@ public enum RopeScalingType /// YaRN scaling: https://arxiv.org/pdf/2309.00071.pdf /// Yarn = 2, + + /// + /// LongRope scaling + /// + LongRope = 3, } } diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs index 450f4998a..19187ded9 100644 --- a/LLama/Native/SafeLLamaContextHandle.cs +++ b/LLama/Native/SafeLLamaContextHandle.cs @@ -333,6 +333,14 @@ static SafeLLamaContextHandle() [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] private static extern void llama_kv_cache_update(SafeLLamaContextHandle ctx); + /// + /// Check if the context supports KV cache shifting + /// + /// + /// + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + private static extern bool llama_kv_cache_can_shift(SafeLLamaContextHandle ctx); + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] private static extern LLamaPerfContextTimings llama_perf_context(SafeLLamaContextHandle ctx); @@ -566,7 +574,7 @@ public void Synchronize() /// internally for later use by the decoder cross-attention layers. /// /// - /// 0 = success
< 0 = error
+ /// 0 = success < 0 = error (the KV cache state is restored to the state before this call)
public DecodeResult Encode(LLamaBatch batch) { if (batch.TokenCount == 0) @@ -583,7 +591,7 @@ public DecodeResult Encode(LLamaBatch batch) /// Positive return values does not mean a fatal error, but rather a warning:
/// - 0: success
/// - 1: could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
- /// - < 0: error
+ /// - < 0: error (the KV cache state is restored to the state before this call)
///
public DecodeResult Decode(LLamaBatch batch) { @@ -746,6 +754,11 @@ public void ResetTimings() #endregion #region KV Cache Management + /// + /// Check if the context supports KV cache shifting + /// + public bool KvCacheCanShift => llama_kv_cache_can_shift(this); + /// /// Apply KV cache updates (such as K-shifts, defragmentation, etc.) /// diff --git a/LLama/Native/SafeLLamaSamplerHandle.cs b/LLama/Native/SafeLLamaSamplerHandle.cs index ef6a7ae30..9099c2f32 100644 --- a/LLama/Native/SafeLLamaSamplerHandle.cs +++ b/LLama/Native/SafeLLamaSamplerHandle.cs @@ -1,5 +1,5 @@ using System; -using System.Runtime.CompilerServices; +using System.Collections.Generic; using System.Text; namespace LLama.Native; @@ -410,40 +410,94 @@ public void AddGrammar(SafeLlamaModelHandle model, string grammar, string root) } /// - /// Create a sampler that applies various repetition penalties + /// Create a sampler that applies various repetition penalties. + /// + /// Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first. /// - /// Vocab size - /// EOS token (if this model has one) - /// Newline token /// How many tokens of history to consider when calculating penalties /// Repetition penalty /// Frequency penalty /// Presence penalty - /// Whether or not to penalize the newline token - /// Whether or not to ignore EOS token /// - public void AddPenalties( - int vocabSize, LLamaToken? eos, LLamaToken newline, int penaltyCount, float repeat, float freq, float presence, bool penalizeNewline, bool ignoreEOS - ) + public void AddPenalties(int penaltyCount, float repeat, float freq, float presence) { - llama_sampler_chain_add(this, llama_sampler_init_penalties(vocabSize, eos ?? LLamaToken.InvalidToken, newline, penaltyCount, repeat, freq, presence, penalizeNewline, ignoreEOS)); + llama_sampler_chain_add( + this, + llama_sampler_init_penalties( + penaltyCount, + repeat, + freq, + presence + ) + ); // ReSharper disable InconsistentNaming [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] static extern IntPtr llama_sampler_init_penalties( - int n_vocab, // llama_n_vocab() - LLamaToken special_eos_id, // llama_token_eos() - LLamaToken linefeed_id, // llama_token_nl() - int penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size) - float penalty_repeat, // 1.0 = disabled - float penalty_freq, // 0.0 = disabled - float penalty_present, // 0.0 = disabled - bool penalize_nl, // consider newlines as a repeatable token - bool ignore_eos // ignore the end-of-sequence token + int penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size) + float penalty_repeat, // 1.0 = disabled + float penalty_freq, // 0.0 = disabled + float penalty_present // 0.0 = disabled ); // ReSharper restore InconsistentNaming } + /// + /// DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677. 
+ /// Porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982 + /// + /// The model this sampler will be used with + /// + /// penalty multiplier, 0.0 = disabled + /// exponential base + /// repeated sequences longer than this are penalized + /// how many tokens to scan for repetitions (0 = entire context) + public void AddDry(SafeLlamaModelHandle model, ReadOnlySpan sequenceBreakers, float multiplier = 0.8f, float @base = 1.75f, int allowedLength = 2, int penaltyLastN = 0) + { + unsafe + { + // Convert strings, fix memory in place, build array of pointers + var handles = new List(); + var breakers = stackalloc byte*[sequenceBreakers.Length]; + for (var i = 0; i < sequenceBreakers.Length; i++) + { + var chars = Encoding.Default.GetBytes(sequenceBreakers[i]); + handles.Add(chars.AsMemory().Pin()); + + breakers[i] = (byte*)handles[i].Pointer; + } + + llama_sampler_chain_add( + this, + llama_sampler_init_dry( + model, + multiplier, + @base, + allowedLength, + penaltyLastN, + breakers, + (nuint)sequenceBreakers.Length + ) + ); + + // Clear up all the handles fixing the memory in place + for (var i = 0; i < handles.Count; i++) + handles[i].Dispose(); + } + + // ReSharper disable InconsistentNaming + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + static extern unsafe IntPtr llama_sampler_init_dry( + SafeLlamaModelHandle model, + float dry_multiplier, + float dry_base, + int dry_allowed_length, + int dry_penalty_last_n, + byte** seq_breakers, + nuint num_breakers + ); + } + /// /// Create a sampler that applies a bias directly to the logits /// diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs index 718b81809..303ae3352 100644 --- a/LLama/Native/SafeLlamaModelHandle.cs +++ b/LLama/Native/SafeLlamaModelHandle.cs @@ -441,9 +441,6 @@ private static int llama_model_meta_val_str(SafeLlamaModelHandle model, string k [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] private static extern LLamaTokenAttr llama_token_get_attr(SafeLlamaModelHandle model, LLamaToken token); - //[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - //private static extern GGMLTensor llama_get_model_tensor(SafeLlamaModelHandle model, string name); - /// /// Returns true if the model contains an encoder that requires llama_encode() call /// diff --git a/LLama/Sampling/DefaultSamplingPipeline.cs b/LLama/Sampling/DefaultSamplingPipeline.cs index 3d166f0c6..76404bc95 100644 --- a/LLama/Sampling/DefaultSamplingPipeline.cs +++ b/LLama/Sampling/DefaultSamplingPipeline.cs @@ -20,44 +20,6 @@ public sealed class DefaultSamplingPipeline /// public float RepeatPenalty { get; init; } = 1; - /// - /// Frequency penalty as described by OpenAI: https://platform.openai.com/docs/api-reference/chat/create
- /// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text - /// so far, decreasing the model's likelihood to repeat the same line verbatim. - ///
- [Obsolete($"Use {nameof(FrequencyPenalty)} instead.")] - public float AlphaFrequency - { - get => _frequencyPenalty; - init - { - if (value < -2) - throw new ArgumentOutOfRangeException(nameof(value), $"{nameof(AlphaFrequency)} must be greater than -2"); - if (value > 2) - throw new ArgumentOutOfRangeException(nameof(value), $"{nameof(AlphaFrequency)} must be less than 2"); - _frequencyPenalty = value; - } - } - - /// - /// Presence penalty as described by OpenAI: https://platform.openai.com/docs/api-reference/chat/create
- /// Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the - /// text so far, increasing the model's likelihood to talk about new topics. - ///
- [Obsolete($"Use {nameof(PresencePenalty)} instead.")] - public float AlphaPresence - { - get => _presencePenalty; - init - { - if (value < -2) - throw new ArgumentOutOfRangeException(nameof(value), $"{nameof(AlphaPresence)} must be greater than -2"); - if (value > 2) - throw new ArgumentOutOfRangeException(nameof(value), $"{nameof(AlphaPresence)} must be less than 2"); - _presencePenalty = value; - } - } - /// /// Frequency penalty as described by OpenAI: https://platform.openai.com/docs/api-reference/chat/create
/// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text @@ -97,21 +59,15 @@ public float PresencePenalty private readonly float _presencePenalty; /// - /// How many tokens should be considered for penalizing repetition + /// How many tokens should be considered for penalties /// - public int RepeatPenaltyCount { get; init; } = 64; + public int PenaltyCount { get; init; } = 64; /// /// Whether the newline token should be protected from being modified by penalty /// public bool PenalizeNewline { get; init; } = false; - /// - /// Whether the EOS token should be protected from being modified by penalty - /// - [Obsolete($"This doesn't do what the name implies. If you're sure you want to use it, use {nameof(PreventEOS)}.")] - public bool PenalizeEOS { get; init; } = false; - /// /// Whether the EOS token should be suppressed. Setting this to 'true' prevents EOS from being sampled /// @@ -158,7 +114,7 @@ public float PresencePenalty public uint Seed { get; set; } = GetRandomSeed(); - private static Random RandomSeedGenerator = new(); + private static readonly Random RandomSeedGenerator = new(); private static uint GetRandomSeed() { lock (RandomSeedGenerator) @@ -196,13 +152,7 @@ protected override SafeLLamaSamplerChainHandle CreateChain(SafeLLamaContextHandl if (Grammar != null) chain.AddGrammar(context.ModelHandle, Grammar.Gbnf, Grammar.Root); - chain.AddPenalties( - context.VocabCount, - context.ModelHandle.Tokens.EOS, context.ModelHandle.Tokens.Newline ?? 0, - RepeatPenaltyCount, RepeatPenalty, - FrequencyPenalty, PresencePenalty, - PenalizeNewline, PreventEOS - ); + chain.AddPenalties(PenaltyCount, RepeatPenalty, FrequencyPenalty, PresencePenalty); chain.AddTopK(TopK); chain.AddTypical(TypicalP, MinKeep); diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec index 0203aad2b..debc99506 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec @@ -18,46 +18,77 @@ - - + + + + + + + + + + + + + + + + + - - + + + + + + + + + + + + + + + + + + + + + + + - + + + + + - - - - - - - - - - + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec index 7b4f959f4..6abd16ccc 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec @@ -7,18 +7,27 @@ llama.cpp Authors false MIT + icon512.png https://github.com/SciSharp/LLamaSharp LLamaSharp.Backend.Cuda11.Linux contains the Linux binaries for LLamaSharp with Cuda11 support. Copyright 2023 The llama.cpp Authors. All rights reserved. LLamaSharp LLama LLM GPT AI ChatBot SciSharp + + + + - + + + + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec index 34bc6781d..a412e2e6f 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec @@ -7,18 +7,27 @@ llama.cpp Authors false MIT + icon512.png https://github.com/SciSharp/LLamaSharp LLamaSharp.Backend.Cuda11.Windows contains the Windows binaries for LLamaSharp with Cuda11 support. Copyright 2023 The llama.cpp Authors. All rights reserved. 
LLamaSharp LLama LLM GPT AI ChatBot SciSharp + + + + - + + + + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec index 1beeeaafc..5ac473914 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec @@ -22,6 +22,7 @@ + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec index 8834ae413..687283221 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec @@ -7,19 +7,27 @@ llama.cpp Authors false MIT + icon512.png https://github.com/SciSharp/LLamaSharp LLamaSharp.Backend.Cuda12.Linux contains the Linux binaries for LLamaSharp with Cuda12 support. Copyright 2023 The llama.cpp Authors. All rights reserved. LLamaSharp LLama LLM GPT AI ChatBot SciSharp + + + + - - + + + + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec index 3d37accec..1fd01edb9 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec @@ -7,19 +7,27 @@ llama.cpp Authors false MIT + icon512.png https://github.com/SciSharp/LLamaSharp LLamaSharp.Backend.Cuda12.Windows contains the Windows binaries for LLamaSharp with Cuda12 support. Copyright 2023 The llama.cpp Authors. All rights reserved. LLamaSharp LLama LLM GPT AI ChatBot SciSharp + + + + - - + + + + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Linux.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Linux.nuspec index 725764097..3f2202db4 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Linux.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Linux.nuspec @@ -7,18 +7,27 @@ llama.cpp Authors false MIT + icon512.png https://github.com/SciSharp/LLamaSharp LLamaSharp.Backend.Vulkan.Linux contains the Linux binaries for LLamaSharp with Vulkan support. Copyright 2023 The llama.cpp Authors. All rights reserved. LLamaSharp LLama LLM GPT AI ChatBot SciSharp + + + + - + + + + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Windows.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Windows.nuspec index 5c5b83f94..3f7487bcd 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Windows.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Windows.nuspec @@ -7,18 +7,28 @@ llama.cpp Authors false MIT + icon512.png https://github.com/SciSharp/LLamaSharp LLamaSharp.Backend.Vulkan.Windows contains the Windows binaries for LLamaSharp with Vulkan support. Copyright 2023 The llama.cpp Authors. All rights reserved. LLamaSharp LLama LLM GPT AI ChatBot SciSharp + + + + - + + + + + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.nuspec index b4f26ec97..c972ad0fc 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.nuspec @@ -22,6 +22,7 @@ + - + \ No newline at end of file
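
Reference sketches for the API changes in this patch (illustrative only, appended for readers; not part of the diff to apply).

1) Native backend selection. The loader now handles the split llama.cpp binaries (ggml-base, ggml-cpu, ggml-cuda, ggml-vulkan, ...) and the CUDA/Vulkan native libraries carry an AVX level used when loading the CPU fallback backend. The sketch below uses only the NativeLibraryConfig calls visible in the LLama.Examples/Program.cs hunk; the WithAvx override mentioned in the comment is assumed to exist on the same configuration surface and is not added by this patch.

using LLama.Native;

// Configure logging for the native loader before anything else touches llama.cpp.
NativeLibraryConfig
    .All
    .WithLogCallback((level, message) => Console.Write($"{level}: {message}"));

// Enable GPU backends. With both enabled, the selection policy tries CUDA first,
// then Vulkan, and keeps the CPU build as the fallback.
NativeLibraryConfig
    .All
    .WithCuda()
    .WithVulkan();
// An explicit AVX override (for example NativeLibraryConfig.All.WithAvx(AvxLevel.Avx2))
// is assumed to be available but is not shown in this patch; by default the highest
// supported AVX level is detected automatically and stored with the selected backend.

// Force loading to happen now: ggml-base, the CPU backend for the detected (or
// configured) AVX level, and any usable GPU backend are loaded at this point.
NativeApi.llama_empty_call();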
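
2) Model parameters. IModelParams.SplitMode is now nullable: leaving it null keeps llama.cpp's default split mode, otherwise the value is copied into llama_model_params.split_mode. The new CheckTensors flag is forwarded to llama_model_params.check_tensors; it is declared get-only in this patch, so it is not set below. A minimal sketch, with "model.gguf" as a placeholder path:

using LLama;
using LLama.Common;
using LLama.Native;

var parameters = new ModelParams("model.gguf")   // placeholder path
{
    ContextSize = 2048,
    GpuLayerCount = 20,
    // Nullable: omit (null) to keep llama.cpp's default, or set it explicitly.
    SplitMode = GPUSplitMode.Layer,
    MainGpu = 0,
};

using var weights = LLamaWeights.LoadFromFile(parameters);
using var context = weights.CreateContext(parameters);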
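
3) Low-level sampler chain. AddPenalties no longer takes the vocabulary size, the EOS/newline tokens, or the penalize-newline/ignore-EOS flags, and a DRY sampler (AddDry) is new. The sketch is built only from the signatures shown in the diff; the sequence-breaker strings are illustrative rather than library defaults, and context stands for an existing SafeLLamaContextHandle created elsewhere:

using LLama.Native;

static SafeLLamaSamplerChainHandle CreateChain(SafeLLamaContextHandle context)
{
    var chain = SafeLLamaSamplerChainHandle.Create(LLamaSamplerChainParams.Default());

    // New signature: only the penalty window and the three penalty strengths.
    chain.AddPenalties(penaltyCount: 64, repeat: 1.1f, freq: 0.0f, presence: 0.0f);

    // New DRY sampler; the breaker strings here are placeholders chosen for illustration.
    chain.AddDry(context.ModelHandle, new[] { "\n", ":", "\"", "*" });

    chain.AddTopK(40);
    // A terminal sampler (for example a distribution sampler) would normally be appended before use.
    return chain;
}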
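
4) High-level pipeline. RepeatPenaltyCount is renamed to PenaltyCount, and the obsolete AlphaFrequency, AlphaPresence and PenalizeEOS members are removed (the old obsolete message pointed users at PreventEOS). Property names below are taken from the diff; the values are illustrative:

using LLama.Sampling;

var pipeline = new DefaultSamplingPipeline
{
    RepeatPenalty = 1.1f,
    FrequencyPenalty = 0.1f,   // replaces the removed AlphaFrequency
    PresencePenalty = 0.1f,    // replaces the removed AlphaPresence
    PenaltyCount = 64,         // renamed from RepeatPenaltyCount
    PreventEOS = false,        // PenalizeEOS is gone; PreventEOS is the supported option
    PenalizeNewline = false,
};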