January 2025 Update (#1036)
* Code changes for December update (not working yet)

* Changes to support up to ggerganov/llama.cpp@d408bb9

* Updated to latest llama.cpp binaries; this works on Windows CPU but needs more changes for other backends

* Updated to latest deps, fixed kernel memory failing to load

* Copy missing Mac libraries libggml-base and libggml-cpu

* Removed any mention of AVX in macOS loading

* Added file copying for some more targets (still missing macOS)

* Updated to latest set of binaries

* Fixed copy path for CUDA12 DLLs

* Compatibility with llama.cpp backend split (PR #10256) on all platforms

* Restore original comment

* Update the dependency loader for ggml-metal and ggml-blas

* Update the runtime targets for ggml-metal and ggml-blas

* Add CPU backend (fallback) dependency for the GPU backends

* Fix icons for the nuget backends

* Update nuspec files for the GPU backends

* Update BinaryReleaseId

* Update nuspec for CPU & OSX

* Update CPU nuspec to use noavx folder

* Update Runtime.targets to use noavx folder

* Update BinaryReleaseId

* CUDA & Vulkan native libraries now correctly store the detected or user defined AVX level

---------

Co-authored-by: SignalRT <[email protected]>
Co-authored-by: m0nsky <[email protected]>
3 people authored Jan 21, 2025
1 parent f55252f · commit 02eedd9
Showing 40 changed files with 563 additions and 237 deletions.
LLama.Examples/Program.cs (8 changes: 3 additions & 5 deletions)

@@ -1,6 +1,5 @@
-using LLama.Native;
+using LLama.Native;
 using Spectre.Console;
-using System.Runtime.InteropServices;

 AnsiConsole.MarkupLineInterpolated(
     $"""
@@ -18,7 +17,7 @@ __ __ ____ __
     """);

 // Configure logging. Change this to `true` to see log messages from llama.cpp
-var showLLamaCppLogs = false;
+var showLLamaCppLogs = true;
 NativeLibraryConfig
     .All
     .WithLogCallback((level, message) =>
@@ -31,8 +30,7 @@ __ __ ____ __
 NativeLibraryConfig
     .All
     .WithCuda()
-    //.WithAutoDownload() // An experimental feature
-    .DryRun(out var loadedllamaLibrary, out var loadedLLavaLibrary);
+    .WithVulkan();

 // Calling this method forces loading to occur now.
 NativeApi.llama_empty_call();
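
A minimal sketch of the resulting backend selection (hypothetical application code mirroring the diff above; per the commit notes, the CPU backend remains the fallback when no GPU backend can be loaded):

    using LLama.Native;

    // Prefer the CUDA backend, then Vulkan. If neither native library
    // loads, LLamaSharp falls back to the bundled CPU backend.
    NativeLibraryConfig
        .All
        .WithCuda()
        .WithVulkan();

    // Force native library loading to occur now rather than on first use.
    NativeApi.llama_empty_call();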
LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs (4 changes: 1 addition & 3 deletions)

@@ -31,11 +31,9 @@ public LLamaSharpTextEmbeddingGenerator(LLamaSharpConfig config)

         var @params = new ModelParams(config.ModelPath)
         {
-            ContextSize = config.ContextSize ?? 2048,
+            ContextSize = config.ContextSize,
             GpuLayerCount = config.GpuLayerCount ?? 20,
             Embeddings = true,
-            MainGpu = config.MainGpu,
-            SplitMode = config.SplitMode,
             PoolingType = LLamaPoolingType.Mean,
         };
         _weights = LLamaWeights.LoadFromFile(@params);
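
Note the dropped "?? 2048" fallback: a null config.ContextSize now flows through to ModelParams unchanged, presumably so that an unset value defers to the model's own context length instead of pinning it to 2048.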
LLama.KernelMemory/LlamaSharpTextGenerator.cs (2 changes: 0 additions & 2 deletions)

@@ -33,8 +33,6 @@ public LlamaSharpTextGenerator(LLamaSharpConfig config)
         {
             ContextSize = config.ContextSize ?? 2048,
             GpuLayerCount = config.GpuLayerCount ?? 20,
-            MainGpu = config.MainGpu,
-            SplitMode = config.SplitMode
         };
         _weights = LLamaWeights.LoadFromFile(parameters);
         _context = _weights.CreateContext(parameters);
LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs

@@ -1,21 +1,15 @@
 using LLama.Common;
 using LLamaSharp.KernelMemory;
 using Microsoft.KernelMemory.AI;
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Text;
-using System.Text.RegularExpressions;
-using System.Threading.Tasks;
 using Xunit.Abstractions;

 namespace LLama.Unittest.KernelMemory
 {
-    public class LLamaSharpTextEmbeddingGeneratorTests : ITextTokenizerTests, IDisposable
+    public class LLamaSharpTextEmbeddingGeneratorTests
+        : ITextTokenizerTests, IDisposable
     {
         private readonly LLamaSharpTextEmbeddingGenerator _embeddingGenerator;

-        public LLamaSharpTextEmbeddingGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper)
+        public LLamaSharpTextEmbeddingGeneratorTests(ITestOutputHelper testOutputHelper)
+            : base(testOutputHelper)
         {
             _embeddingGenerator = new LLamaSharpTextEmbeddingGenerator(_lsConfig);
LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs (18 changes: 4 additions & 14 deletions)

@@ -1,25 +1,15 @@
 using LLama.Common;
 using LLamaSharp.KernelMemory;
 using Microsoft.KernelMemory.AI;
-using System;
-using System.Collections.Generic;
-using System.Diagnostics;
-using System.Linq;
-using System.Reflection.Emit;
-using System.Text;
-using System.Text.RegularExpressions;
-using System.Threading.Tasks;
 using Xunit.Abstractions;
-using Xunit.Sdk;
-using static System.Net.Mime.MediaTypeNames;

 namespace LLama.Unittest.KernelMemory
 {
-    public class LlamaSharpTextGeneratorTests : ITextTokenizerTests, IDisposable
+    public class LlamaSharpTextGeneratorTests
+        : ITextTokenizerTests, IDisposable
     {
         private readonly LlamaSharpTextGenerator _textGenerator;

-        public LlamaSharpTextGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper)
+        public LlamaSharpTextGeneratorTests(ITestOutputHelper testOutputHelper)
+            : base(testOutputHelper)
         {
             _textGenerator = new LlamaSharpTextGenerator(_lsConfig);
LLama.Unittest/SamplingTests.cs (6 changes: 1 addition & 5 deletions)

@@ -167,11 +167,7 @@ private static SafeLLamaSamplerChainHandle CreateChain(SafeLLamaContextHandle co
         var chain = SafeLLamaSamplerChainHandle.Create(LLamaSamplerChainParams.Default());

         chain.AddPenalties(
-            vocabSize: context.VocabCount,
-            eos: context.ModelHandle.Tokens.EOS,
-            newline: context.ModelHandle.Tokens.Newline ?? 0,
-            penaltyCount: 60, repeat: 1, freq: 0, presence: 0,
-            penalizeNewline: false, ignoreEOS: false
+            penaltyCount: 60, repeat: 1, freq: 0, presence: 0
         );

         if (logit_bias != null) { chain.AddLogitBias(context.VocabCount, logit_bias); }
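
With vocabulary and special-token bookkeeping gone from the penalties sampler, only the penalty parameters remain. A hedged sketch of a call against the new signature (parameter values are illustrative, not recommendations):

    using LLama.Native;

    // Build a sampler chain and attach the slimmed-down penalties sampler.
    // The old vocabSize / eos / newline / penalizeNewline / ignoreEOS
    // arguments no longer exist.
    var chain = SafeLLamaSamplerChainHandle.Create(LLamaSamplerChainParams.Default());
    chain.AddPenalties(
        penaltyCount: 64,  // how many recent tokens the penalties consider
        repeat: 1.1f,      // repetition penalty (1 = disabled)
        freq: 0.1f,        // frequency penalty
        presence: 0.1f     // presence penalty
    );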
LLama.Web/Common/ModelOptions.cs (5 changes: 4 additions & 1 deletion)

@@ -24,7 +24,7 @@ public class ModelOptions
     public int MainGpu { get; set; } = 0;

     /// <inheritdoc />
-    public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;
+    public GPUSplitMode? SplitMode { get; set; }

     /// <inheritdoc />
     public int GpuLayerCount { get; set; } = 20;
@@ -59,6 +59,9 @@ public class ModelOptions
     /// <inheritdoc />
     public TensorSplitsCollection TensorSplits { get; set; } = new();

+    /// <inheritdoc />
+    public bool CheckTensors { get; }
+
     /// <inheritdoc />
     public List<MetadataOverride> MetadataOverrides { get; } = new();
LLama/Abstractions/IModelParams.cs (7 changes: 6 additions & 1 deletion)

@@ -36,7 +36,7 @@ public interface IModelParams
     /// <summary>
     /// How to split the model across multiple GPUs
     /// </summary>
-    GPUSplitMode SplitMode { get; }
+    GPUSplitMode? SplitMode { get; }

     /// <summary>
     /// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
@@ -68,6 +68,11 @@ public interface IModelParams
     /// </summary>
     bool VocabOnly { get; }

+    /// <summary>
+    /// Validate model tensor data before loading
+    /// </summary>
+    bool CheckTensors { get; }
+
     /// <summary>
     /// Override specific metadata items in the model
     /// </summary>
LLama/Common/ModelParams.cs (5 changes: 4 additions & 1 deletion)

@@ -19,7 +19,7 @@ public record ModelParams
     public int MainGpu { get; set; } = 0;

     /// <inheritdoc />
-    public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;
+    public GPUSplitMode? SplitMode { get; set; }

     /// <inheritdoc />
     public int GpuLayerCount { get; set; } = 20;
@@ -54,6 +54,9 @@ public record ModelParams
     /// <inheritdoc />
     public TensorSplitsCollection TensorSplits { get; set; } = new();

+    /// <inheritdoc />
+    public bool CheckTensors { get; }
+
     /// <inheritdoc />
     public List<MetadataOverride> MetadataOverrides { get; set; } = new();
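
A hedged usage sketch of the now-nullable SplitMode (the model path and the GPUSplitMode.Layer member are assumptions for illustration): leaving the property unset defers to the native llama.cpp default instead of forcing GPUSplitMode.None.

    using LLama.Common;
    using LLama.Native;

    // SplitMode left null: the default from LLamaModelParams.Default() is kept.
    var defaults = new ModelParams("model.gguf");

    // SplitMode set explicitly: the native default is overridden.
    var layerSplit = new ModelParams("model.gguf")
    {
        SplitMode = GPUSplitMode.Layer,
    };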
LLama/Extensions/IModelParamsExtensions.cs (7 changes: 5 additions & 2 deletions)

@@ -1,4 +1,4 @@
-using System.IO;
+using System.IO;
 using System;
 using System.Text;
 using LLama.Abstractions;
@@ -31,11 +31,14 @@ public static IDisposable ToLlamaModelParams(this IModelParams @params, out LLam
         result = LLamaModelParams.Default();

         result.main_gpu = @params.MainGpu;
-        result.split_mode = @params.SplitMode;
         result.n_gpu_layers = @params.GpuLayerCount < 0 ? int.MaxValue : @params.GpuLayerCount;
+        if (@params.SplitMode.HasValue)
+            result.split_mode = @params.SplitMode.Value;
+
         result.use_mlock = @params.UseMemoryLock;
         result.use_mmap = @params.UseMemorymap;
         result.vocab_only = @params.VocabOnly;
+        result.check_tensors = @params.CheckTensors;

         unsafe
         {
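
The HasValue guard captures the design choice behind the nullable SplitMode: the managed value is only copied into the native struct when the caller actually set one, so a null SplitMode preserves whatever split mode LLamaModelParams.Default() supplies instead of always overwriting it with GPUSplitMode.None.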
LLama/Extensions/LLamaExecutorExtensions.cs (2 changes: 1 addition & 1 deletion)

@@ -147,7 +147,7 @@ private string CreatePrompt(IList<ChatMessage> messages)
     PreventEOS = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PreventEOS), out bool eos) is true ? eos : s_defaultPipeline.PreventEOS,
     PenalizeNewline = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PenalizeNewline), out bool pnl) is true ? pnl : s_defaultPipeline.PenalizeNewline,
     RepeatPenalty = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.RepeatPenalty), out float rp) is true ? rp : s_defaultPipeline.RepeatPenalty,
-    RepeatPenaltyCount = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.RepeatPenaltyCount), out int rpc) is true ? rpc : s_defaultPipeline.RepeatPenaltyCount,
+    PenaltyCount = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PenaltyCount), out int rpc) is true ? rpc : s_defaultPipeline.PenaltyCount,
     Grammar = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.Grammar), out Grammar? g) is true ? g : s_defaultPipeline.Grammar,
     MinKeep = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.MinKeep), out int mk) is true ? mk : s_defaultPipeline.MinKeep,
     MinP = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.MinP), out float mp) is true ? mp : s_defaultPipeline.MinP,
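
Since the DefaultSamplingPipeline property was renamed from RepeatPenaltyCount to PenaltyCount, callers passing sampling options by name must use the new key. A hedged sketch, assuming the Microsoft.Extensions.AI ChatOptions type this extension method consumes:

    using LLama.Sampling;
    using Microsoft.Extensions.AI;

    var options = new ChatOptions
    {
        AdditionalProperties = new AdditionalPropertiesDictionary
        {
            // Renamed key: was nameof(DefaultSamplingPipeline.RepeatPenaltyCount).
            [nameof(DefaultSamplingPipeline.PenaltyCount)] = 64,
        },
    };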
LLama/LLamaQuantizer.cs (3 changes: 0 additions & 3 deletions)

@@ -106,9 +106,6 @@ private static bool ValidateFtype(LLamaFtype ftype)
             case LLamaFtype.MOSTLY_IQ3_S:
             case LLamaFtype.MOSTLY_IQ3_M:

-            case LLamaFtype.MOSTLY_Q4_0_4_4:
-            case LLamaFtype.MOSTLY_Q4_0_4_8:
-            case LLamaFtype.MOSTLY_Q4_0_8_8:
                 return true;

             case LLamaFtype.GUESSED:
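
As far as we can tell, these Q4_0_4_4 / Q4_0_4_8 / Q4_0_8_8 entries correspond to the AArch64-repacked quantization formats that upstream llama.cpp removed in favour of repacking weights at load time, which is why the quantizer no longer accepts them as valid targets.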
(The remaining changed files are not shown here.)