From 02eedd9477616b11dd30ea8eee0fc5114dfe574f Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Tue, 21 Jan 2025 00:24:36 +0000 Subject: [PATCH] January 2025 Update (#1036) * code changes for december update (not working yet) * Changes to support up to https://github.com/ggerganov/llama.cpp/commit/d408bb9268a988c5a60a5746d3a6430386e7604d * Updated to latest llama.cpp binaries, this works on Windows CPU but needs more changes for other backends * Updated to latest deps, fixed kernel memory failing to load * Copy missing Mac flibraries libggml-base and libggml-cpu * Removed any mention of AVX in MacOS loading * Added file copying for some more targets (still missing macos) * Updated to latest set of binaries * Fixed copy path for CUDA12 DLLs * Compatibility with llama.cpp backend split (PR #10256) on all platforms * Restore original comment * Update the dependency loader for ggml-metal and ggml-blas * Update the runtime targets for ggml-metal and ggml-blas * Add CPU backend (fallback) dependency for the GPU backends * Fix icons for the nuget backends * Update nuspec files for the GPU backends * Update BinaryReleaseId * Update nuspec for CPU & OSX * Update CPU nuspec to use noavx folder * Update Runtime.targets to use noavx folder * Update BinaryReleaseId * CUDA & Vulkan native libraries now correctly store the detected or user defined AVX level --------- Co-authored-by: SignalRT Co-authored-by: m0nsky --- LLama.Examples/Program.cs | 8 +- .../LLamaSharpTextEmbeddingGenerator.cs | 4 +- LLama.KernelMemory/LlamaSharpTextGenerator.cs | 2 - .../LLamaSharpTextEmbeddingGeneratorTests.cs | 14 +- .../LlamaSharpTextGeneratorTests.cs | 18 +- LLama.Unittest/SamplingTests.cs | 6 +- LLama.Web/Common/ModelOptions.cs | 5 +- LLama/Abstractions/IModelParams.cs | 7 +- LLama/Common/ModelParams.cs | 5 +- LLama/Extensions/IModelParamsExtensions.cs | 7 +- LLama/Extensions/LLamaExecutorExtensions.cs | 2 +- LLama/LLamaQuantizer.cs | 3 - LLama/LLamaSharp.Runtime.targets | 191 +++++++++++++++++- LLama/LLamaSharp.csproj | 2 +- LLama/LLamaStatelessExecutor.cs | 12 +- LLama/Native/GPUSplitMode.cs | 2 +- LLama/Native/LLamaFtype.cs | 24 +-- LLama/Native/LLamaModelParams.cs | 18 +- LLama/Native/LLamaRopeType.cs | 2 + .../DefaultNativeLibrarySelectingPolicy.cs | 8 +- LLama/Native/Load/NativeLibraryConfig.cs | 4 +- LLama/Native/Load/NativeLibraryUtils.cs | 95 +++++++-- LLama/Native/Load/NativeLibraryWithCuda.cs | 4 +- .../Load/NativeLibraryWithMacOrFallback.cs | 31 +-- LLama/Native/Load/NativeLibraryWithVulkan.cs | 4 +- LLama/Native/NativeApi.cs | 9 + LLama/Native/RopeScalingType.cs | 7 +- LLama/Native/SafeLLamaContextHandle.cs | 17 +- LLama/Native/SafeLLamaSamplerHandle.cs | 94 +++++++-- LLama/Native/SafeLlamaModelHandle.cs | 3 - LLama/Sampling/DefaultSamplingPipeline.cs | 58 +----- .../build/LLamaSharp.Backend.Cpu.nuspec | 61 ++++-- .../LLamaSharp.Backend.Cuda11.Linux.nuspec | 11 +- .../LLamaSharp.Backend.Cuda11.Windows.nuspec | 11 +- .../build/LLamaSharp.Backend.Cuda11.nuspec | 1 + .../LLamaSharp.Backend.Cuda12.Linux.nuspec | 12 +- .../LLamaSharp.Backend.Cuda12.Windows.nuspec | 12 +- .../LLamaSharp.Backend.Vulkan.Linux.nuspec | 11 +- .../LLamaSharp.Backend.Vulkan.Windows.nuspec | 12 +- .../build/LLamaSharp.Backend.Vulkan.nuspec | 3 +- 40 files changed, 563 insertions(+), 237 deletions(-) diff --git a/LLama.Examples/Program.cs b/LLama.Examples/Program.cs index 63114120d..f8ef7d5aa 100644 --- a/LLama.Examples/Program.cs +++ b/LLama.Examples/Program.cs @@ -1,6 +1,5 @@ -using LLama.Native; +using LLama.Native; using 
Spectre.Console; -using System.Runtime.InteropServices; AnsiConsole.MarkupLineInterpolated( $""" @@ -18,7 +17,7 @@ __ __ ____ __ """); // Configure logging. Change this to `true` to see log messages from llama.cpp -var showLLamaCppLogs = false; +var showLLamaCppLogs = true; NativeLibraryConfig .All .WithLogCallback((level, message) => @@ -31,8 +30,7 @@ __ __ ____ __ NativeLibraryConfig .All .WithCuda() - //.WithAutoDownload() // An experimental feature - .DryRun(out var loadedllamaLibrary, out var loadedLLavaLibrary); + .WithVulkan(); // Calling this method forces loading to occur now. NativeApi.llama_empty_call(); diff --git a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs index 7f9ae1e4d..6efd44f7b 100644 --- a/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs +++ b/LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs @@ -31,11 +31,9 @@ public LLamaSharpTextEmbeddingGenerator(LLamaSharpConfig config) var @params = new ModelParams(config.ModelPath) { - ContextSize = config.ContextSize ?? 2048, + ContextSize = config.ContextSize, GpuLayerCount = config.GpuLayerCount ?? 20, Embeddings = true, - MainGpu = config.MainGpu, - SplitMode = config.SplitMode, PoolingType = LLamaPoolingType.Mean, }; _weights = LLamaWeights.LoadFromFile(@params); diff --git a/LLama.KernelMemory/LlamaSharpTextGenerator.cs b/LLama.KernelMemory/LlamaSharpTextGenerator.cs index adfc89317..3fc96db9a 100644 --- a/LLama.KernelMemory/LlamaSharpTextGenerator.cs +++ b/LLama.KernelMemory/LlamaSharpTextGenerator.cs @@ -33,8 +33,6 @@ public LlamaSharpTextGenerator(LLamaSharpConfig config) { ContextSize = config.ContextSize ?? 2048, GpuLayerCount = config.GpuLayerCount ?? 20, - MainGpu = config.MainGpu, - SplitMode = config.SplitMode }; _weights = LLamaWeights.LoadFromFile(parameters); _context = _weights.CreateContext(parameters); diff --git a/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs b/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs index 91161b72c..5c7b4213d 100644 --- a/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs +++ b/LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs @@ -1,21 +1,15 @@ -using LLama.Common; using LLamaSharp.KernelMemory; -using Microsoft.KernelMemory.AI; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Text.RegularExpressions; -using System.Threading.Tasks; using Xunit.Abstractions; namespace LLama.Unittest.KernelMemory { - public class LLamaSharpTextEmbeddingGeneratorTests : ITextTokenizerTests, IDisposable + public class LLamaSharpTextEmbeddingGeneratorTests + : ITextTokenizerTests, IDisposable { private readonly LLamaSharpTextEmbeddingGenerator _embeddingGenerator; - public LLamaSharpTextEmbeddingGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper) + public LLamaSharpTextEmbeddingGeneratorTests(ITestOutputHelper testOutputHelper) + : base(testOutputHelper) { _embeddingGenerator = new LLamaSharpTextEmbeddingGenerator(_lsConfig); diff --git a/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs b/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs index 02001f8cf..d21d7f959 100644 --- a/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs +++ b/LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs @@ -1,25 +1,15 @@ -using LLama.Common; using LLamaSharp.KernelMemory; -using Microsoft.KernelMemory.AI; -using System; -using 
System.Collections.Generic; -using System.Diagnostics; -using System.Linq; -using System.Reflection.Emit; -using System.Text; -using System.Text.RegularExpressions; -using System.Threading.Tasks; using Xunit.Abstractions; -using Xunit.Sdk; -using static System.Net.Mime.MediaTypeNames; namespace LLama.Unittest.KernelMemory { - public class LlamaSharpTextGeneratorTests : ITextTokenizerTests, IDisposable + public class LlamaSharpTextGeneratorTests + : ITextTokenizerTests, IDisposable { private readonly LlamaSharpTextGenerator _textGenerator; - public LlamaSharpTextGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper) + public LlamaSharpTextGeneratorTests(ITestOutputHelper testOutputHelper) + : base(testOutputHelper) { _textGenerator = new LlamaSharpTextGenerator(_lsConfig); diff --git a/LLama.Unittest/SamplingTests.cs b/LLama.Unittest/SamplingTests.cs index f322bc250..bae7e3dea 100644 --- a/LLama.Unittest/SamplingTests.cs +++ b/LLama.Unittest/SamplingTests.cs @@ -167,11 +167,7 @@ private static SafeLLamaSamplerChainHandle CreateChain(SafeLLamaContextHandle co var chain = SafeLLamaSamplerChainHandle.Create(LLamaSamplerChainParams.Default()); chain.AddPenalties( - vocabSize: context.VocabCount, - eos: context.ModelHandle.Tokens.EOS, - newline: context.ModelHandle.Tokens.Newline ?? 0, - penaltyCount: 60, repeat: 1, freq: 0, presence: 0, - penalizeNewline: false, ignoreEOS: false + penaltyCount: 60, repeat: 1, freq: 0, presence: 0 ); if (logit_bias != null) { chain.AddLogitBias(context.VocabCount, logit_bias); } diff --git a/LLama.Web/Common/ModelOptions.cs b/LLama.Web/Common/ModelOptions.cs index 4e002c93f..a67a11a96 100644 --- a/LLama.Web/Common/ModelOptions.cs +++ b/LLama.Web/Common/ModelOptions.cs @@ -24,7 +24,7 @@ public class ModelOptions public int MainGpu { get; set; } = 0; /// - public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None; + public GPUSplitMode? SplitMode { get; set; } /// public int GpuLayerCount { get; set; } = 20; @@ -59,6 +59,9 @@ public class ModelOptions /// public TensorSplitsCollection TensorSplits { get; set; } = new(); + /// + public bool CheckTensors { get; } + /// public List MetadataOverrides { get; } = new(); diff --git a/LLama/Abstractions/IModelParams.cs b/LLama/Abstractions/IModelParams.cs index 7dc28f671..cbbacafe5 100644 --- a/LLama/Abstractions/IModelParams.cs +++ b/LLama/Abstractions/IModelParams.cs @@ -36,7 +36,7 @@ public interface IModelParams /// /// How to split the model across multiple GPUs /// - GPUSplitMode SplitMode { get; } + GPUSplitMode? SplitMode { get; } /// /// Number of layers to run in VRAM / GPU memory (n_gpu_layers) @@ -68,6 +68,11 @@ public interface IModelParams /// bool VocabOnly { get; } + /// + /// Validate model tensor data before loading + /// + bool CheckTensors { get; } + /// /// Override specific metadata items in the model /// diff --git a/LLama/Common/ModelParams.cs b/LLama/Common/ModelParams.cs index b276ed73a..7e4b1a967 100644 --- a/LLama/Common/ModelParams.cs +++ b/LLama/Common/ModelParams.cs @@ -19,7 +19,7 @@ public record ModelParams public int MainGpu { get; set; } = 0; /// - public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None; + public GPUSplitMode? 
SplitMode { get; set; } /// public int GpuLayerCount { get; set; } = 20; @@ -54,6 +54,9 @@ public record ModelParams /// public TensorSplitsCollection TensorSplits { get; set; } = new(); + /// + public bool CheckTensors { get; } + /// public List MetadataOverrides { get; set; } = new(); diff --git a/LLama/Extensions/IModelParamsExtensions.cs b/LLama/Extensions/IModelParamsExtensions.cs index 523ec737a..588564e33 100644 --- a/LLama/Extensions/IModelParamsExtensions.cs +++ b/LLama/Extensions/IModelParamsExtensions.cs @@ -1,4 +1,4 @@ -using System.IO; +using System.IO; using System; using System.Text; using LLama.Abstractions; @@ -31,11 +31,14 @@ public static IDisposable ToLlamaModelParams(this IModelParams @params, out LLam result = LLamaModelParams.Default(); result.main_gpu = @params.MainGpu; - result.split_mode = @params.SplitMode; result.n_gpu_layers = @params.GpuLayerCount < 0 ? int.MaxValue : @params.GpuLayerCount; + if (@params.SplitMode.HasValue) + result.split_mode = @params.SplitMode.Value; + result.use_mlock = @params.UseMemoryLock; result.use_mmap = @params.UseMemorymap; result.vocab_only = @params.VocabOnly; + result.check_tensors = @params.CheckTensors; unsafe { diff --git a/LLama/Extensions/LLamaExecutorExtensions.cs b/LLama/Extensions/LLamaExecutorExtensions.cs index 19c8d33df..e38ccf98d 100644 --- a/LLama/Extensions/LLamaExecutorExtensions.cs +++ b/LLama/Extensions/LLamaExecutorExtensions.cs @@ -147,7 +147,7 @@ private string CreatePrompt(IList messages) PreventEOS = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PreventEOS), out bool eos) is true ? eos : s_defaultPipeline.PreventEOS, PenalizeNewline = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PenalizeNewline), out bool pnl) is true ? pnl : s_defaultPipeline.PenalizeNewline, RepeatPenalty = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.RepeatPenalty), out float rp) is true ? rp : s_defaultPipeline.RepeatPenalty, - RepeatPenaltyCount = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.RepeatPenaltyCount), out int rpc) is true ? rpc : s_defaultPipeline.RepeatPenaltyCount, + PenaltyCount = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PenaltyCount), out int rpc) is true ? rpc : s_defaultPipeline.PenaltyCount, Grammar = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.Grammar), out Grammar? g) is true ? g : s_defaultPipeline.Grammar, MinKeep = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.MinKeep), out int mk) is true ? mk : s_defaultPipeline.MinKeep, MinP = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.MinP), out float mp) is true ? 
mp : s_defaultPipeline.MinP, diff --git a/LLama/LLamaQuantizer.cs b/LLama/LLamaQuantizer.cs index 23f0f8b4e..9e90b732e 100644 --- a/LLama/LLamaQuantizer.cs +++ b/LLama/LLamaQuantizer.cs @@ -106,9 +106,6 @@ private static bool ValidateFtype(LLamaFtype ftype) case LLamaFtype.MOSTLY_IQ3_S: case LLamaFtype.MOSTLY_IQ3_M: - case LLamaFtype.MOSTLY_Q4_0_4_4: - case LLamaFtype.MOSTLY_Q4_0_4_8: - case LLamaFtype.MOSTLY_Q4_0_8_8: return true; case LLamaFtype.GUESSED: diff --git a/LLama/LLamaSharp.Runtime.targets b/LLama/LLamaSharp.Runtime.targets index 6466a1204..76292aaf5 100644 --- a/LLama/LLamaSharp.Runtime.targets +++ b/LLama/LLamaSharp.Runtime.targets @@ -4,14 +4,24 @@ - + PreserveNewest runtimes/win-x64/native/noavx/llama.dll - + PreserveNewest runtimes/win-x64/native/noavx/ggml.dll + + PreserveNewest + runtimes/win-x64/native/noavx/ggml-base.dll + + + PreserveNewest + runtimes/win-x64/native/noavx/ggml-cpu.dll + + + PreserveNewest runtimes/win-x64/native/avx/llama.dll @@ -20,55 +30,124 @@ PreserveNewest runtimes/win-x64/native/avx/ggml.dll + + PreserveNewest + runtimes/win-x64/native/avx/ggml-base.dll + + + PreserveNewest + runtimes/win-x64/native/avx/ggml-cpu.dll + + + PreserveNewest runtimes/win-x64/native/avx2/llama.dll + + PreserveNewest + runtimes/win-x64/native/avx2/ggml-base.dll + PreserveNewest runtimes/win-x64/native/avx2/ggml.dll + + PreserveNewest + runtimes/win-x64/native/avx2/ggml-cpu.dll + + + PreserveNewest runtimes/win-x64/native/avx512/llama.dll + + PreserveNewest + runtimes/win-x64/native/avx512/ggml-base.dll + PreserveNewest runtimes/win-x64/native/avx512/ggml.dll + + PreserveNewest + runtimes/win-x64/native/avx512/ggml-cpu.dll + + + PreserveNewest runtimes/win-x64/native/cuda11/llama.dll + + PreserveNewest + runtimes/win-x64/native/cuda11/ggml-base.dll + PreserveNewest runtimes/win-x64/native/cuda11/ggml.dll + + PreserveNewest + runtimes/win-x64/native/cuda11/ggml-cuda.dll + + + PreserveNewest runtimes/win-x64/native/cuda12/llama.dll + + PreserveNewest + runtimes/win-x64/native/cuda12/ggml-base.dll + PreserveNewest runtimes/win-x64/native/cuda12/ggml.dll + + PreserveNewest + runtimes/win-x64/native/cuda12/ggml-cuda.dll + + + PreserveNewest runtimes/win-x64/native/vulkan/llama.dll + + PreserveNewest + runtimes/win-x64/native/vulkan/ggml-base.dll + PreserveNewest runtimes/win-x64/native/vulkan/ggml.dll + + PreserveNewest + runtimes/win-x64/native/vulkan/ggml-vulkan.dll + + - + PreserveNewest runtimes/linux-x64/native/noavx/libllama.so - + PreserveNewest runtimes/linux-x64/native/noavx/libggml.so + + PreserveNewest + runtimes/linux-x64/native/noavx/libggml-base.so + + + PreserveNewest + runtimes/linux-x64/native/noavx/libggml-cpu.so + + + PreserveNewest runtimes/linux-x64/native/avx/libllama.so @@ -77,6 +156,17 @@ PreserveNewest runtimes/linux-x64/native/avx/libggml.so + + PreserveNewest + runtimes/linux-x64/native/avx/libggml-base.so + + + PreserveNewest + runtimes/linux-x64/native/avx/libggml-cpu.so + + + + PreserveNewest runtimes/linux-x64/native/avx2/libllama.so @@ -85,6 +175,15 @@ PreserveNewest runtimes/linux-x64/native/avx2/libggml.so + + PreserveNewest + runtimes/linux-x64/native/avx2/libggml-base.so + + + PreserveNewest + runtimes/linux-x64/native/avx2/libggml-cpu.so + + PreserveNewest runtimes/linux-x64/native/avx512/libllama.so @@ -93,6 +192,15 @@ PreserveNewest runtimes/linux-x64/native/avx512/libggml.so + + PreserveNewest + runtimes/linux-x64/native/avx512/libggml-base.so + + + PreserveNewest + runtimes/linux-x64/native/avx512/libggml-cpu.so + + PreserveNewest 
runtimes/linux-x64/native/cuda11/libllama.so @@ -101,6 +209,16 @@ PreserveNewest runtimes/linux-x64/native/cuda11/libggml.so + + PreserveNewest + runtimes/linux-x64/native/cuda11/libggml-base.so + + + PreserveNewest + runtimes/linux-x64/native/cuda11/libggml-cuda.so + + + PreserveNewest runtimes/linux-x64/native/cuda12/libllama.so @@ -109,6 +227,16 @@ PreserveNewest runtimes/linux-x64/native/cuda12/libggml.so + + PreserveNewest + runtimes/linux-x64/native/cuda12/libggml-base.so + + + PreserveNewest + runtimes/linux-x64/native/cuda12/libggml-cuda.so + + + PreserveNewest runtimes/linux-x64/native/vulkan/libllama.so @@ -117,7 +245,32 @@ PreserveNewest runtimes/linux-x64/native/vulkan/libggml.so + + PreserveNewest + runtimes/linux-x64/native/vulkan/libggml-base.so + + + PreserveNewest + runtimes/linux-x64/native/vulkan/libggml-vulkan.so + + + + PreserveNewest + runtimes/osx-arm64/native/libggml-base.dylib + + + PreserveNewest + runtimes/osx-arm64/native/libggml-cpu.dylib + + + PreserveNewest + runtimes/osx-arm64/native/libggml-metal.dylib + + + PreserveNewest + runtimes/osx-arm64/native/libggml-blas.dylib + PreserveNewest runtimes/osx-arm64/native/libggml.dylib @@ -134,7 +287,19 @@ PreserveNewest runtimes/osx-arm64/native/ggml-metal.metal - + + + PreserveNewest + runtimes/osx-x64/native/libggml-base.dylib + + + PreserveNewest + runtimes/osx-x64/native/libggml-cpu.dylib + + + PreserveNewest + runtimes/osx-x64/native/libggml-blas.dylib + PreserveNewest runtimes/osx-x64/native/libggml.dylib @@ -148,6 +313,18 @@ runtimes/osx-x64/native/libllava_shared.dylib + + PreserveNewest + runtimes/osx-x64/native/rosetta2/libggml-base.dylib + + + PreserveNewest + runtimes/osx-x64/native/rosetta2/libggml-cpu.dylib + + + PreserveNewest + runtimes/osx-x64/native/rosetta2/libggml-blas.dylib + PreserveNewest runtimes/osx-x64/native/rosetta2/libggml.dylib @@ -161,7 +338,7 @@ runtimes/osx-x64/native/rosetta2/libllava_shared.dylib - + PreserveNewest runtimes/win-x64/native/noavx/llava_shared.dll @@ -190,7 +367,7 @@ runtimes/win-x64/native/vulkan/llava_shared.dll - + PreserveNewest runtimes/linux-x64/native/noavx/libllava_shared.so diff --git a/LLama/LLamaSharp.csproj b/LLama/LLamaSharp.csproj index d50771f3a..a32aa7647 100644 --- a/LLama/LLamaSharp.csproj +++ b/LLama/LLamaSharp.csproj @@ -56,7 +56,7 @@ - 958367bf530d943a90 + 0827b2c1da-v6 diff --git a/LLama/LLamaStatelessExecutor.cs b/LLama/LLamaStatelessExecutor.cs index 003cf9827..ace8e2581 100644 --- a/LLama/LLamaStatelessExecutor.cs +++ b/LLama/LLamaStatelessExecutor.cs @@ -24,14 +24,14 @@ public class StatelessExecutor private readonly ILogger? _logger; private readonly LLamaBatch _batch; - // LLava Section + /// public bool IsMultiModal => false; /// - public LLavaWeights? ClipModel { get; } + public LLavaWeights? ClipModel => default; /// - public List Images { get; set; } + public List Images { get; } /// /// The context used by the executor when running the inference. 
@@ -80,7 +80,7 @@ public async IAsyncEnumerable InferAsync(string prompt, IInferenceParams Context = context; // Reset the sampling pipeline (if there is one) - inferenceParams?.SamplingPipeline?.Reset(); + inferenceParams?.SamplingPipeline.Reset(); // Sanity check inference params inferenceParams ??= new InferenceParams(); @@ -155,8 +155,8 @@ public async IAsyncEnumerable InferAsync(string prompt, IInferenceParams var n_left = n_past - tokensKeep; var n_discard = n_left / 2; - NativeApi.llama_kv_cache_seq_rm(Context.NativeHandle, (LLamaSeqId)0, tokensKeep , tokensKeep + n_discard); - NativeApi.llama_kv_cache_seq_add(Context.NativeHandle, (LLamaSeqId)0, tokensKeep + n_discard, n_past, -n_discard); + NativeApi.llama_kv_cache_seq_rm(Context.NativeHandle, LLamaSeqId.Zero, tokensKeep , tokensKeep + n_discard); + NativeApi.llama_kv_cache_seq_add(Context.NativeHandle, LLamaSeqId.Zero, tokensKeep + n_discard, n_past, -n_discard); n_past -= n_discard; } diff --git a/LLama/Native/GPUSplitMode.cs b/LLama/Native/GPUSplitMode.cs index 54fa095c1..27ee7ae49 100644 --- a/LLama/Native/GPUSplitMode.cs +++ b/LLama/Native/GPUSplitMode.cs @@ -17,7 +17,7 @@ public enum GPUSplitMode Layer = 1, /// - /// split rows across GPUs + /// split layers and KV across GPUs, use tensor parallelism if supported /// Row = 2, } \ No newline at end of file diff --git a/LLama/Native/LLamaFtype.cs b/LLama/Native/LLamaFtype.cs index 6970a4728..705f8032e 100644 --- a/LLama/Native/LLamaFtype.cs +++ b/LLama/Native/LLamaFtype.cs @@ -177,20 +177,20 @@ public enum LLamaFtype /// MOSTLY_BF16 = 32, - /// - /// except 1d tensors - /// - MOSTLY_Q4_0_4_4 = 33, + ///// + ///// except 1d tensors (no longer supported by llama.cpp) + ///// + //MOSTLY_Q4_0_4_4 = 33, - /// - /// except 1d tensors - /// - MOSTLY_Q4_0_4_8 = 34, + ///// + ///// except 1d tensors (no longer supported by llama.cpp) + ///// + //MOSTLY_Q4_0_4_8 = 34, - /// - /// except 1d tensors - /// - MOSTLY_Q4_0_8_8 = 35, + ///// + ///// except 1d tensors (no longer supported by llama.cpp) + ///// + //MOSTLY_Q4_0_8_8 = 35, /// /// except 1d tensors diff --git a/LLama/Native/LLamaModelParams.cs b/LLama/Native/LLamaModelParams.cs index c0437d9db..e16e3263e 100644 --- a/LLama/Native/LLamaModelParams.cs +++ b/LLama/Native/LLamaModelParams.cs @@ -8,6 +8,12 @@ namespace LLama.Native [StructLayout(LayoutKind.Sequential)] public unsafe struct LLamaModelParams { + /// + /// NULL-terminated list of devices to use for offloading (if NULL, all available devices are used) + /// todo: add support for llama_model_params.devices + /// + private IntPtr devices; + /// /// // number of layers to store in VRAM /// @@ -19,19 +25,19 @@ public unsafe struct LLamaModelParams public GPUSplitMode split_mode; /// - /// the GPU that is used for scratch and small tensors + /// the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE /// public int main_gpu; /// /// how to split layers across multiple GPUs (size: ) /// - public float* tensor_split; - - /// - /// comma separated list of RPC servers to use for offloading + public float* tensor_split; + + /// + /// comma separated list of RPC servers to use for offloading /// - public byte* rpc_servers; + public byte* rpc_servers; /// /// called with a progress value between 0 and 1, pass NULL to disable. 
If the provided progress_callback diff --git a/LLama/Native/LLamaRopeType.cs b/LLama/Native/LLamaRopeType.cs index ebad9e77b..3f1188112 100644 --- a/LLama/Native/LLamaRopeType.cs +++ b/LLama/Native/LLamaRopeType.cs @@ -9,4 +9,6 @@ public enum LLamaRopeType None = -1, Norm = 0, NEOX = 2,//GGML_ROPE_TYPE_NEOX, + //todo:LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE, + //todo:LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION, } \ No newline at end of file diff --git a/LLama/Native/Load/DefaultNativeLibrarySelectingPolicy.cs b/LLama/Native/Load/DefaultNativeLibrarySelectingPolicy.cs index 6f5ad35fe..36ab0c0c8 100644 --- a/LLama/Native/Load/DefaultNativeLibrarySelectingPolicy.cs +++ b/LLama/Native/Load/DefaultNativeLibrarySelectingPolicy.cs @@ -10,8 +10,6 @@ public class DefaultNativeLibrarySelectingPolicy: INativeLibrarySelectingPolicy /// public IEnumerable Apply(NativeLibraryConfig.Description description, SystemInfo systemInfo, NativeLogConfig.LLamaLogCallback? logCallback) { - List results = new(); - // Show the configuration we're working with Log(description.ToString(), LLamaLogLevel.Info, logCallback); @@ -24,12 +22,12 @@ public IEnumerable Apply(NativeLibraryConfig.Description descrip { if (description.UseCuda) { - yield return new NativeLibraryWithCuda(systemInfo.CudaMajorVersion, description.Library, description.SkipCheck); + yield return new NativeLibraryWithCuda(systemInfo.CudaMajorVersion, description.Library, description.AvxLevel, description.SkipCheck); } if (description.UseVulkan) { - yield return new NativeLibraryWithVulkan(systemInfo.VulkanVersion, description.Library, description.SkipCheck); + yield return new NativeLibraryWithVulkan(systemInfo.VulkanVersion, description.Library, description.AvxLevel, description.SkipCheck); } if((!description.UseCuda || !description.UseVulkan) || description.AllowFallback) @@ -56,7 +54,7 @@ public IEnumerable Apply(NativeLibraryConfig.Description descrip if(systemInfo.OSPlatform == OSPlatform.OSX || description.AllowFallback) { - yield return new NativeLibraryWithMacOrFallback(description.Library, description.SkipCheck); + yield return new NativeLibraryWithMacOrFallback(description.Library); } } } diff --git a/LLama/Native/Load/NativeLibraryConfig.cs b/LLama/Native/Load/NativeLibraryConfig.cs index 02e47b695..2bfa0554b 100644 --- a/LLama/Native/Load/NativeLibraryConfig.cs +++ b/LLama/Native/Load/NativeLibraryConfig.cs @@ -178,7 +178,7 @@ internal Description CheckAndGatherDescription() _avxLevel, _allowFallback, _skipCheck, - _searchDirectories.Concat(new[] { "./" }).ToArray() + _searchDirectories.Concat([ "./" ]).ToArray() ); } @@ -186,7 +186,7 @@ internal static string AvxLevelToString(AvxLevel level) { return level switch { - AvxLevel.None => string.Empty, + AvxLevel.None => "noavx", AvxLevel.Avx => "avx", AvxLevel.Avx2 => "avx2", AvxLevel.Avx512 => "avx512", diff --git a/LLama/Native/Load/NativeLibraryUtils.cs b/LLama/Native/Load/NativeLibraryUtils.cs index d5b014ce0..13e68be4d 100644 --- a/LLama/Native/Load/NativeLibraryUtils.cs +++ b/LLama/Native/Load/NativeLibraryUtils.cs @@ -45,33 +45,86 @@ internal static IntPtr TryLoadLibrary(NativeLibraryConfig config, out INativeLib { Log($"Got relative library path '{path}' from local with {library.Metadata}, trying to load it...", LLamaLogLevel.Debug, config.LogCallback); - // If we are on Linux / OSX, we need to manually load the GGML dependency - if (systemInfo.OSPlatform == OSPlatform.Linux || systemInfo.OSPlatform == OSPlatform.OSX) + // After the llama.cpp binaries have been split up (PR 
#10256), we need to load the dependencies manually. + // It can't be done automatically on Windows, because the dependencies can be in different folders (for example, ggml-cuda.dll from the cuda12 folder, and ggml-cpu.dll from the avx2 folder) + // It can't be done automatically on Linux, because Linux uses the environment variable "LD_LIBRARY_PATH" to automatically load dependencies, and LD_LIBRARY_PATH can only be + // set before running LLamaSharp, but we only know which folders to search in when running LLamaSharp (decided by the NativeLibrary). + + // Get the directory of the current runtime + string? currentRuntimeDirectory = Path.GetDirectoryName(path); + + // If we failed to get the directory of the current runtime, log it and continue on to the next library + if (currentRuntimeDirectory == null) { - // Get the directory of the library - string? libraryDirectory = Path.GetDirectoryName(path); - - if (libraryDirectory != null) + Log($"Failed to get the directory of the current runtime from path '{path}'", LLamaLogLevel.Error, config.LogCallback); + continue; + } + + // List which will hold all paths to dependencies to load + var dependencyPaths = new List(); + + // We should always load ggml-base from the current runtime directory + dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-base{ext}")); + + // If the library has metadata, we can check if we need to load additional dependencies + if (library.Metadata != null) + { + if (systemInfo.OSPlatform == OSPlatform.OSX) { - // Construct the dependency (libggml) path - string dependencyPath = Path.Combine(libraryDirectory, $"{libPrefix}ggml{ext}"); - - // Try to load the dependency - var dependencyResult = TryLoad(dependencyPath, description.SearchDirectories, config.LogCallback); + // On OSX, we should load the CPU backend from the current directory + + // ggml-cpu + dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-cpu{ext}")); + + // ggml-metal (only supported on osx-arm64) + if (os == "osx-arm64") + dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-metal{ext}")); + + // ggml-blas (osx-x64, osx-x64-rosetta2 and osx-arm64 all have blas) + dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-blas{ext}")); + } + else + { + // On other platforms (Windows, Linux), we need to load the CPU backend from the specified AVX level directory + // We are using the AVX level supplied by NativeLibraryConfig, which automatically detects the highest supported AVX level for us + + // ggml-cpu + dependencyPaths.Add(Path.Combine( + $"runtimes/{os}/native/{NativeLibraryConfig.AvxLevelToString(library.Metadata.AvxLevel)}", + $"{libPrefix}ggml-cpu{ext}" + )); + + // ggml-cuda + if (library.Metadata.UseCuda) + dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-cuda{ext}")); + + // ggml-vulkan + if (library.Metadata.UseVulkan) + dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml-vulkan{ext}")); + } + } + + // And finally, we can add ggml + dependencyPaths.Add(Path.Combine(currentRuntimeDirectory, $"{libPrefix}ggml{ext}")); + + // Now, we will loop through our dependencyPaths and try to load them one by one + foreach (var dependencyPath in dependencyPaths) + { + // Try to load the dependency + var dependencyResult = TryLoad(dependencyPath, description.SearchDirectories, config.LogCallback); - // If we successfully loaded the library, log it - if (dependencyResult != IntPtr.Zero) - { - Log($"Successfully loaded 
dependency '{dependencyPath}'", LLamaLogLevel.Info, config.LogCallback); - } - else - { - Log($"Failed loading dependency '{dependencyPath}'", LLamaLogLevel.Info, config.LogCallback); - } + // If we successfully loaded the library, log it + if (dependencyResult != IntPtr.Zero) + { + Log($"Successfully loaded dependency '{dependencyPath}'", LLamaLogLevel.Info, config.LogCallback); + } + else + { + Log($"Failed loading dependency '{dependencyPath}'", LLamaLogLevel.Info, config.LogCallback); } } - // Try to load the library + // Try to load the main library var result = TryLoad(path, description.SearchDirectories, config.LogCallback); // If we successfully loaded the library, return the handle diff --git a/LLama/Native/Load/NativeLibraryWithCuda.cs b/LLama/Native/Load/NativeLibraryWithCuda.cs index 12da095dc..36dc4ca81 100644 --- a/LLama/Native/Load/NativeLibraryWithCuda.cs +++ b/LLama/Native/Load/NativeLibraryWithCuda.cs @@ -28,11 +28,13 @@ public NativeLibraryMetadata? Metadata /// /// /// + /// /// - public NativeLibraryWithCuda(int majorCudaVersion, NativeLibraryName libraryName, bool skipCheck) + public NativeLibraryWithCuda(int majorCudaVersion, NativeLibraryName libraryName, AvxLevel avxLevel, bool skipCheck) { _majorCudaVersion = majorCudaVersion; _libraryName = libraryName; + _avxLevel = avxLevel; _skipCheck = skipCheck; } diff --git a/LLama/Native/Load/NativeLibraryWithMacOrFallback.cs b/LLama/Native/Load/NativeLibraryWithMacOrFallback.cs index 6bcd55049..59754be03 100644 --- a/LLama/Native/Load/NativeLibraryWithMacOrFallback.cs +++ b/LLama/Native/Load/NativeLibraryWithMacOrFallback.cs @@ -1,5 +1,5 @@ -using LLama.Abstractions; using System.Collections.Generic; +using LLama.Abstractions; namespace LLama.Native { @@ -7,39 +7,30 @@ namespace LLama.Native /// /// A native library compiled on Mac, or fallbacks from all other libraries in the selection. /// - public class NativeLibraryWithMacOrFallback : INativeLibrary + public class NativeLibraryWithMacOrFallback + : INativeLibrary { - private NativeLibraryName _libraryName; - private bool _skipCheck; + private readonly NativeLibraryName _libraryName; /// - public NativeLibraryMetadata? Metadata - { - get - { - return new NativeLibraryMetadata(_libraryName, false, false, AvxLevel.None); - } - } + public NativeLibraryMetadata Metadata => new(_libraryName, false, false, AvxLevel.None); /// /// /// /// - /// - public NativeLibraryWithMacOrFallback(NativeLibraryName libraryName, bool skipCheck) + public NativeLibraryWithMacOrFallback(NativeLibraryName libraryName) { _libraryName = libraryName; - _skipCheck = skipCheck; } /// public IEnumerable Prepare(SystemInfo systemInfo, NativeLogConfig.LLamaLogCallback? logCallback) { - var path = GetPath(systemInfo, AvxLevel.None, logCallback); - return path is null ?[] : [path]; + yield return GetPath(systemInfo); } - private string? GetPath(SystemInfo systemInfo, AvxLevel avxLevel, NativeLogConfig.LLamaLogCallback? 
logCallback) + private string GetPath(SystemInfo systemInfo) { NativeLibraryUtils.GetPlatformPathParts(systemInfo.OSPlatform, out var os, out var fileExtension, out var libPrefix); string relativePath; @@ -50,11 +41,7 @@ public IEnumerable Prepare(SystemInfo systemInfo, NativeLogConfig.LLamaL } else { - var avxStr = NativeLibraryConfig.AvxLevelToString(AvxLevel.None); - if (!string.IsNullOrEmpty(avxStr)) - avxStr += "/"; - - relativePath = $"runtimes/{os}/native/{avxStr}{libPrefix}{_libraryName.GetLibraryName()}{fileExtension}"; + relativePath = $"runtimes/{os}/native/{libPrefix}{_libraryName.GetLibraryName()}{fileExtension}"; } return relativePath; diff --git a/LLama/Native/Load/NativeLibraryWithVulkan.cs b/LLama/Native/Load/NativeLibraryWithVulkan.cs index fe4eef01e..c3fe94de3 100644 --- a/LLama/Native/Load/NativeLibraryWithVulkan.cs +++ b/LLama/Native/Load/NativeLibraryWithVulkan.cs @@ -28,11 +28,13 @@ public NativeLibraryMetadata? Metadata /// /// /// + /// /// - public NativeLibraryWithVulkan(string? vulkanVersion, NativeLibraryName libraryName, bool skipCheck) + public NativeLibraryWithVulkan(string? vulkanVersion, NativeLibraryName libraryName, AvxLevel avxLevel, bool skipCheck) { _vulkanVersion = vulkanVersion; _libraryName = libraryName; + _avxLevel = avxLevel; _skipCheck = skipCheck; } diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs index 041cc0dd5..0d6bc1984 100644 --- a/LLama/Native/NativeApi.cs +++ b/LLama/Native/NativeApi.cs @@ -172,6 +172,15 @@ public static unsafe int llama_chat_apply_template(SafeLlamaModelHandle? model, static extern int internal_llama_chat_apply_template(IntPtr model, byte* tmpl, LLamaChatMessage* chat, nuint n_msg, [MarshalAs(UnmanagedType.U1)] bool add_ass, byte* buf, int length); } + /// + /// Get list of built-in chat templates + /// + /// + /// + /// + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] + public static extern unsafe int llama_chat_builtin_templates(char** output, nuint len); + [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] [return: MarshalAs(UnmanagedType.U1)] public static extern bool llama_add_bos_token(SafeLlamaModelHandle model); diff --git a/LLama/Native/RopeScalingType.cs b/LLama/Native/RopeScalingType.cs index 8d4552b80..61ae82942 100644 --- a/LLama/Native/RopeScalingType.cs +++ b/LLama/Native/RopeScalingType.cs @@ -1,4 +1,4 @@ -namespace LLama.Native +namespace LLama.Native { /// /// RoPE scaling type. 
@@ -26,5 +26,10 @@ public enum RopeScalingType /// YaRN scaling: https://arxiv.org/pdf/2309.00071.pdf /// Yarn = 2, + + /// + /// LongRope scaling + /// + LongRope = 3, } } diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs index 450f4998a..19187ded9 100644 --- a/LLama/Native/SafeLLamaContextHandle.cs +++ b/LLama/Native/SafeLLamaContextHandle.cs @@ -333,6 +333,14 @@ static SafeLLamaContextHandle() [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] private static extern void llama_kv_cache_update(SafeLLamaContextHandle ctx); + /// + /// Check if the context supports KV cache shifting + /// + /// + /// + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + private static extern bool llama_kv_cache_can_shift(SafeLLamaContextHandle ctx); + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] private static extern LLamaPerfContextTimings llama_perf_context(SafeLLamaContextHandle ctx); @@ -566,7 +574,7 @@ public void Synchronize() /// internally for later use by the decoder cross-attention layers. /// /// - /// 0 = success
< 0 = error
+ /// 0 = success < 0 = error (the KV cache state is restored to the state before this call)
public DecodeResult Encode(LLamaBatch batch) { if (batch.TokenCount == 0) @@ -583,7 +591,7 @@ public DecodeResult Encode(LLamaBatch batch) /// Positive return values does not mean a fatal error, but rather a warning:
/// - 0: success
/// - 1: could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
- /// - < 0: error
+ /// - < 0: error (the KV cache state is restored to the state before this call)
///
public DecodeResult Decode(LLamaBatch batch) { @@ -746,6 +754,11 @@ public void ResetTimings() #endregion #region KV Cache Management + /// + /// Check if the context supports KV cache shifting + /// + public bool KvCacheCanShift => llama_kv_cache_can_shift(this); + /// /// Apply KV cache updates (such as K-shifts, defragmentation, etc.) /// diff --git a/LLama/Native/SafeLLamaSamplerHandle.cs b/LLama/Native/SafeLLamaSamplerHandle.cs index ef6a7ae30..9099c2f32 100644 --- a/LLama/Native/SafeLLamaSamplerHandle.cs +++ b/LLama/Native/SafeLLamaSamplerHandle.cs @@ -1,5 +1,5 @@ using System; -using System.Runtime.CompilerServices; +using System.Collections.Generic; using System.Text; namespace LLama.Native; @@ -410,40 +410,94 @@ public void AddGrammar(SafeLlamaModelHandle model, string grammar, string root) } /// - /// Create a sampler that applies various repetition penalties + /// Create a sampler that applies various repetition penalties. + /// + /// Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first. /// - /// Vocab size - /// EOS token (if this model has one) - /// Newline token /// How many tokens of history to consider when calculating penalties /// Repetition penalty /// Frequency penalty /// Presence penalty - /// Whether or not to penalize the newline token - /// Whether or not to ignore EOS token /// - public void AddPenalties( - int vocabSize, LLamaToken? eos, LLamaToken newline, int penaltyCount, float repeat, float freq, float presence, bool penalizeNewline, bool ignoreEOS - ) + public void AddPenalties(int penaltyCount, float repeat, float freq, float presence) { - llama_sampler_chain_add(this, llama_sampler_init_penalties(vocabSize, eos ?? LLamaToken.InvalidToken, newline, penaltyCount, repeat, freq, presence, penalizeNewline, ignoreEOS)); + llama_sampler_chain_add( + this, + llama_sampler_init_penalties( + penaltyCount, + repeat, + freq, + presence + ) + ); // ReSharper disable InconsistentNaming [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] static extern IntPtr llama_sampler_init_penalties( - int n_vocab, // llama_n_vocab() - LLamaToken special_eos_id, // llama_token_eos() - LLamaToken linefeed_id, // llama_token_nl() - int penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size) - float penalty_repeat, // 1.0 = disabled - float penalty_freq, // 0.0 = disabled - float penalty_present, // 0.0 = disabled - bool penalize_nl, // consider newlines as a repeatable token - bool ignore_eos // ignore the end-of-sequence token + int penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size) + float penalty_repeat, // 1.0 = disabled + float penalty_freq, // 0.0 = disabled + float penalty_present // 0.0 = disabled ); // ReSharper restore InconsistentNaming } + /// + /// DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677. 
+ /// Porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982 + /// + /// The model this sampler will be used with + /// + /// penalty multiplier, 0.0 = disabled + /// exponential base + /// repeated sequences longer than this are penalized + /// how many tokens to scan for repetitions (0 = entire context) + public void AddDry(SafeLlamaModelHandle model, ReadOnlySpan sequenceBreakers, float multiplier = 0.8f, float @base = 1.75f, int allowedLength = 2, int penaltyLastN = 0) + { + unsafe + { + // Convert strings, fix memory in place, build array of pointers + var handles = new List(); + var breakers = stackalloc byte*[sequenceBreakers.Length]; + for (var i = 0; i < sequenceBreakers.Length; i++) + { + var chars = Encoding.Default.GetBytes(sequenceBreakers[i]); + handles.Add(chars.AsMemory().Pin()); + + breakers[i] = (byte*)handles[i].Pointer; + } + + llama_sampler_chain_add( + this, + llama_sampler_init_dry( + model, + multiplier, + @base, + allowedLength, + penaltyLastN, + breakers, + (nuint)sequenceBreakers.Length + ) + ); + + // Clear up all the handles fixing the memory in place + for (var i = 0; i < handles.Count; i++) + handles[i].Dispose(); + } + + // ReSharper disable InconsistentNaming + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + static extern unsafe IntPtr llama_sampler_init_dry( + SafeLlamaModelHandle model, + float dry_multiplier, + float dry_base, + int dry_allowed_length, + int dry_penalty_last_n, + byte** seq_breakers, + nuint num_breakers + ); + } + /// /// Create a sampler that applies a bias directly to the logits /// diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs index 718b81809..303ae3352 100644 --- a/LLama/Native/SafeLlamaModelHandle.cs +++ b/LLama/Native/SafeLlamaModelHandle.cs @@ -441,9 +441,6 @@ private static int llama_model_meta_val_str(SafeLlamaModelHandle model, string k [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] private static extern LLamaTokenAttr llama_token_get_attr(SafeLlamaModelHandle model, LLamaToken token); - //[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - //private static extern GGMLTensor llama_get_model_tensor(SafeLlamaModelHandle model, string name); - /// /// Returns true if the model contains an encoder that requires llama_encode() call /// diff --git a/LLama/Sampling/DefaultSamplingPipeline.cs b/LLama/Sampling/DefaultSamplingPipeline.cs index 3d166f0c6..76404bc95 100644 --- a/LLama/Sampling/DefaultSamplingPipeline.cs +++ b/LLama/Sampling/DefaultSamplingPipeline.cs @@ -20,44 +20,6 @@ public sealed class DefaultSamplingPipeline /// public float RepeatPenalty { get; init; } = 1; - /// - /// Frequency penalty as described by OpenAI: https://platform.openai.com/docs/api-reference/chat/create
- /// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text - /// so far, decreasing the model's likelihood to repeat the same line verbatim. - ///
- [Obsolete($"Use {nameof(FrequencyPenalty)} instead.")] - public float AlphaFrequency - { - get => _frequencyPenalty; - init - { - if (value < -2) - throw new ArgumentOutOfRangeException(nameof(value), $"{nameof(AlphaFrequency)} must be greater than -2"); - if (value > 2) - throw new ArgumentOutOfRangeException(nameof(value), $"{nameof(AlphaFrequency)} must be less than 2"); - _frequencyPenalty = value; - } - } - - /// - /// Presence penalty as described by OpenAI: https://platform.openai.com/docs/api-reference/chat/create
- /// Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the - /// text so far, increasing the model's likelihood to talk about new topics. - ///
- [Obsolete($"Use {nameof(PresencePenalty)} instead.")] - public float AlphaPresence - { - get => _presencePenalty; - init - { - if (value < -2) - throw new ArgumentOutOfRangeException(nameof(value), $"{nameof(AlphaPresence)} must be greater than -2"); - if (value > 2) - throw new ArgumentOutOfRangeException(nameof(value), $"{nameof(AlphaPresence)} must be less than 2"); - _presencePenalty = value; - } - } - /// /// Frequency penalty as described by OpenAI: https://platform.openai.com/docs/api-reference/chat/create
/// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text @@ -97,21 +59,15 @@ public float PresencePenalty private readonly float _presencePenalty; /// - /// How many tokens should be considered for penalizing repetition + /// How many tokens should be considered for penalties /// - public int RepeatPenaltyCount { get; init; } = 64; + public int PenaltyCount { get; init; } = 64; /// /// Whether the newline token should be protected from being modified by penalty /// public bool PenalizeNewline { get; init; } = false; - /// - /// Whether the EOS token should be protected from being modified by penalty - /// - [Obsolete($"This doesn't do what the name implies. If you're sure you want to use it, use {nameof(PreventEOS)}.")] - public bool PenalizeEOS { get; init; } = false; - /// /// Whether the EOS token should be suppressed. Setting this to 'true' prevents EOS from being sampled /// @@ -158,7 +114,7 @@ public float PresencePenalty public uint Seed { get; set; } = GetRandomSeed(); - private static Random RandomSeedGenerator = new(); + private static readonly Random RandomSeedGenerator = new(); private static uint GetRandomSeed() { lock (RandomSeedGenerator) @@ -196,13 +152,7 @@ protected override SafeLLamaSamplerChainHandle CreateChain(SafeLLamaContextHandl if (Grammar != null) chain.AddGrammar(context.ModelHandle, Grammar.Gbnf, Grammar.Root); - chain.AddPenalties( - context.VocabCount, - context.ModelHandle.Tokens.EOS, context.ModelHandle.Tokens.Newline ?? 0, - RepeatPenaltyCount, RepeatPenalty, - FrequencyPenalty, PresencePenalty, - PenalizeNewline, PreventEOS - ); + chain.AddPenalties(PenaltyCount, RepeatPenalty, FrequencyPenalty, PresencePenalty); chain.AddTopK(TopK); chain.AddTypical(TypicalP, MinKeep); diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec index 0203aad2b..debc99506 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec @@ -18,46 +18,77 @@ - - + + + + + + + + + + + + + + + + + - - + + + + + + + + + + + + + + + + + + + + + + + - + + + + + - - - - - - - - - - + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec index 7b4f959f4..6abd16ccc 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Linux.nuspec @@ -7,18 +7,27 @@ llama.cpp Authors false MIT + icon512.png https://github.com/SciSharp/LLamaSharp LLamaSharp.Backend.Cuda11.Linux contains the Linux binaries for LLamaSharp with Cuda11 support. Copyright 2023 The llama.cpp Authors. All rights reserved. LLamaSharp LLama LLM GPT AI ChatBot SciSharp + + + + - + + + + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec index 34bc6781d..a412e2e6f 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.Windows.nuspec @@ -7,18 +7,27 @@ llama.cpp Authors false MIT + icon512.png https://github.com/SciSharp/LLamaSharp LLamaSharp.Backend.Cuda11.Windows contains the Windows binaries for LLamaSharp with Cuda11 support. Copyright 2023 The llama.cpp Authors. All rights reserved. 
LLamaSharp LLama LLM GPT AI ChatBot SciSharp + + + + - + + + + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec index 1beeeaafc..5ac473914 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec @@ -22,6 +22,7 @@ + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec index 8834ae413..687283221 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Linux.nuspec @@ -7,19 +7,27 @@ llama.cpp Authors false MIT + icon512.png https://github.com/SciSharp/LLamaSharp LLamaSharp.Backend.Cuda12.Linux contains the Linux binaries for LLamaSharp with Cuda12 support. Copyright 2023 The llama.cpp Authors. All rights reserved. LLamaSharp LLama LLM GPT AI ChatBot SciSharp + + + + - - + + + + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec index 3d37accec..1fd01edb9 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.Windows.nuspec @@ -7,19 +7,27 @@ llama.cpp Authors false MIT + icon512.png https://github.com/SciSharp/LLamaSharp LLamaSharp.Backend.Cuda12.Windows contains the Windows binaries for LLamaSharp with Cuda12 support. Copyright 2023 The llama.cpp Authors. All rights reserved. LLamaSharp LLama LLM GPT AI ChatBot SciSharp + + + + - - + + + + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Linux.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Linux.nuspec index 725764097..3f2202db4 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Linux.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Linux.nuspec @@ -7,18 +7,27 @@ llama.cpp Authors false MIT + icon512.png https://github.com/SciSharp/LLamaSharp LLamaSharp.Backend.Vulkan.Linux contains the Linux binaries for LLamaSharp with Vulkan support. Copyright 2023 The llama.cpp Authors. All rights reserved. LLamaSharp LLama LLM GPT AI ChatBot SciSharp + + + + - + + + + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Windows.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Windows.nuspec index 5c5b83f94..3f7487bcd 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Windows.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.Windows.nuspec @@ -7,18 +7,28 @@ llama.cpp Authors false MIT + icon512.png https://github.com/SciSharp/LLamaSharp LLamaSharp.Backend.Vulkan.Windows contains the Windows binaries for LLamaSharp with Vulkan support. Copyright 2023 The llama.cpp Authors. All rights reserved. LLamaSharp LLama LLM GPT AI ChatBot SciSharp + + + + - + + + + + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.nuspec index b4f26ec97..c972ad0fc 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.nuspec @@ -22,6 +22,7 @@ + - + \ No newline at end of file
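
Reference sketches for the API changes in this patch (illustrative only, appended for readers; not part of the diff to apply).

1) Native backend selection. The loader now handles the split llama.cpp binaries (ggml-base, ggml-cpu, ggml-cuda, ggml-vulkan, ...) and the CUDA/Vulkan native libraries carry an AVX level used when loading the CPU fallback backend. The sketch below uses only the NativeLibraryConfig calls visible in the LLama.Examples/Program.cs hunk; the WithAvx override mentioned in the comment is assumed to exist on the same configuration surface and is not added by this patch.

using LLama.Native;

// Configure logging for the native loader before anything else touches llama.cpp.
NativeLibraryConfig
    .All
    .WithLogCallback((level, message) => Console.Write($"{level}: {message}"));

// Enable GPU backends. With both enabled, the selection policy tries CUDA first,
// then Vulkan, and keeps the CPU build as the fallback.
NativeLibraryConfig
    .All
    .WithCuda()
    .WithVulkan();
// An explicit AVX override (for example NativeLibraryConfig.All.WithAvx(AvxLevel.Avx2))
// is assumed to be available but is not shown in this patch; by default the highest
// supported AVX level is detected automatically and stored with the selected backend.

// Force loading to happen now: ggml-base, the CPU backend for the detected (or
// configured) AVX level, and any usable GPU backend are loaded at this point.
NativeApi.llama_empty_call();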
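
2) Model parameters. IModelParams.SplitMode is now nullable: leaving it null keeps llama.cpp's default split mode, otherwise the value is copied into llama_model_params.split_mode. The new CheckTensors flag is forwarded to llama_model_params.check_tensors; it is declared get-only in this patch, so it is not set below. A minimal sketch, with "model.gguf" as a placeholder path:

using LLama;
using LLama.Common;
using LLama.Native;

var parameters = new ModelParams("model.gguf")   // placeholder path
{
    ContextSize = 2048,
    GpuLayerCount = 20,
    // Nullable: omit (null) to keep llama.cpp's default, or set it explicitly.
    SplitMode = GPUSplitMode.Layer,
    MainGpu = 0,
};

using var weights = LLamaWeights.LoadFromFile(parameters);
using var context = weights.CreateContext(parameters);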
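
3) Low-level sampler chain. AddPenalties no longer takes the vocabulary size, the EOS/newline tokens, or the penalize-newline/ignore-EOS flags, and a DRY sampler (AddDry) is new. The sketch is built only from the signatures shown in the diff; the sequence-breaker strings are illustrative rather than library defaults, and context stands for an existing SafeLLamaContextHandle created elsewhere:

using LLama.Native;

static SafeLLamaSamplerChainHandle CreateChain(SafeLLamaContextHandle context)
{
    var chain = SafeLLamaSamplerChainHandle.Create(LLamaSamplerChainParams.Default());

    // New signature: only the penalty window and the three penalty strengths.
    chain.AddPenalties(penaltyCount: 64, repeat: 1.1f, freq: 0.0f, presence: 0.0f);

    // New DRY sampler; the breaker strings here are placeholders chosen for illustration.
    chain.AddDry(context.ModelHandle, new[] { "\n", ":", "\"", "*" });

    chain.AddTopK(40);
    // A terminal sampler (for example a distribution sampler) would normally be appended before use.
    return chain;
}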
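
4) High-level pipeline. RepeatPenaltyCount is renamed to PenaltyCount, and the obsolete AlphaFrequency, AlphaPresence and PenalizeEOS members are removed (the old obsolete message pointed users at PreventEOS). Property names below are taken from the diff; the values are illustrative:

using LLama.Sampling;

var pipeline = new DefaultSamplingPipeline
{
    RepeatPenalty = 1.1f,
    FrequencyPenalty = 0.1f,   // replaces the removed AlphaFrequency
    PresencePenalty = 0.1f,    // replaces the removed AlphaPresence
    PenaltyCount = 64,         // renamed from RepeatPenaltyCount
    PreventEOS = false,        // PenalizeEOS is gone; PreventEOS is the supported option
    PenalizeNewline = false,
};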