January 2025 Update (#1036)
* Code changes for December update (not working yet)

* Changes to support up to ggerganov/llama.cpp@d408bb9

* Updated to latest llama.cpp binaries; this works on Windows CPU but needs more changes for other backends

* Updated to latest deps, fixed kernel memory failing to load

* Copy missing Mac libraries libggml-base and libggml-cpu

* Removed any mention of AVX in macOS loading

* Added file copying for some more targets (still missing macOS)

* Updated to latest set of binaries

* Fixed copy path for CUDA12 DLLs

* Compatibility with llama.cpp backend split (PR #10256) on all platforms

* Restore original comment

* Update the dependency loader for ggml-metal and ggml-blas

* Update the runtime targets for ggml-metal and ggml-blas

* Add CPU backend (fallback) dependency for the GPU backends

* Fix icons for the nuget backends

* Update nuspec files for the GPU backends

* Update BinaryReleaseId

* Update nuspec for CPU & OSX

* Update CPU nuspec to use noavx folder

* Update Runtime.targets to use noavx folder

* Update BinaryReleaseId

* CUDA & Vulkan native libraries now correctly store the detected or user defined AVX level

---------

Co-authored-by: SignalRT <[email protected]>
Co-authored-by: m0nsky <[email protected]>
3 people authored Jan 21, 2025
1 parent f55252f · commit 02eedd9
Showing 40 changed files with 563 additions and 237 deletions.
LLama.Examples/Program.cs (8 changes: 3 additions & 5 deletions)

@@ -1,6 +1,5 @@
-using LLama.Native;
+using LLama.Native;
 using Spectre.Console;
-using System.Runtime.InteropServices;

 AnsiConsole.MarkupLineInterpolated(
     $"""
@@ -18,7 +17,7 @@ __ __ ____ __
     """);

 // Configure logging. Change this to `true` to see log messages from llama.cpp
-var showLLamaCppLogs = false;
+var showLLamaCppLogs = true;
 NativeLibraryConfig
     .All
     .WithLogCallback((level, message) =>
@@ -31,8 +30,7 @@ __ __ ____ __
 NativeLibraryConfig
     .All
     .WithCuda()
-    //.WithAutoDownload() // An experimental feature
-    .DryRun(out var loadedllamaLibrary, out var loadedLLavaLibrary);
+    .WithVulkan();

 // Calling this method forces loading to occur now.
 NativeApi.llama_empty_call();
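
A minimal sketch of the resulting backend selection (hypothetical application code mirroring the diff above; per the commit notes, the CPU backend remains the fallback when no GPU backend can be loaded):

    using LLama.Native;

    // Prefer the CUDA backend, then Vulkan. If neither native library
    // loads, LLamaSharp falls back to the bundled CPU backend.
    NativeLibraryConfig
        .All
        .WithCuda()
        .WithVulkan();

    // Force native library loading to occur now rather than on first use.
    NativeApi.llama_empty_call();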
LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs (4 changes: 1 addition & 3 deletions)

@@ -31,11 +31,9 @@ public LLamaSharpTextEmbeddingGenerator(LLamaSharpConfig config)

         var @params = new ModelParams(config.ModelPath)
         {
-            ContextSize = config.ContextSize ?? 2048,
+            ContextSize = config.ContextSize,
             GpuLayerCount = config.GpuLayerCount ?? 20,
             Embeddings = true,
-            MainGpu = config.MainGpu,
-            SplitMode = config.SplitMode,
             PoolingType = LLamaPoolingType.Mean,
         };
         _weights = LLamaWeights.LoadFromFile(@params);
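
Note the dropped "?? 2048" fallback: a null config.ContextSize now flows through to ModelParams unchanged, presumably so that an unset value defers to the model's own context length instead of pinning it to 2048.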
LLama.KernelMemory/LlamaSharpTextGenerator.cs (2 changes: 0 additions & 2 deletions)

@@ -33,8 +33,6 @@ public LlamaSharpTextGenerator(LLamaSharpConfig config)
         {
             ContextSize = config.ContextSize ?? 2048,
             GpuLayerCount = config.GpuLayerCount ?? 20,
-            MainGpu = config.MainGpu,
-            SplitMode = config.SplitMode
         };
         _weights = LLamaWeights.LoadFromFile(parameters);
         _context = _weights.CreateContext(parameters);
LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs

@@ -1,21 +1,15 @@
 using LLama.Common;
 using LLamaSharp.KernelMemory;
 using Microsoft.KernelMemory.AI;
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Text;
-using System.Text.RegularExpressions;
-using System.Threading.Tasks;
 using Xunit.Abstractions;

 namespace LLama.Unittest.KernelMemory
 {
-    public class LLamaSharpTextEmbeddingGeneratorTests : ITextTokenizerTests, IDisposable
+    public class LLamaSharpTextEmbeddingGeneratorTests
+        : ITextTokenizerTests, IDisposable
     {
         private readonly LLamaSharpTextEmbeddingGenerator _embeddingGenerator;

-        public LLamaSharpTextEmbeddingGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper)
+        public LLamaSharpTextEmbeddingGeneratorTests(ITestOutputHelper testOutputHelper)
+            : base(testOutputHelper)
         {
             _embeddingGenerator = new LLamaSharpTextEmbeddingGenerator(_lsConfig);
LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs (18 changes: 4 additions & 14 deletions)

@@ -1,25 +1,15 @@
 using LLama.Common;
 using LLamaSharp.KernelMemory;
 using Microsoft.KernelMemory.AI;
-using System;
-using System.Collections.Generic;
-using System.Diagnostics;
-using System.Linq;
-using System.Reflection.Emit;
-using System.Text;
-using System.Text.RegularExpressions;
-using System.Threading.Tasks;
 using Xunit.Abstractions;
-using Xunit.Sdk;
-using static System.Net.Mime.MediaTypeNames;

 namespace LLama.Unittest.KernelMemory
 {
-    public class LlamaSharpTextGeneratorTests : ITextTokenizerTests, IDisposable
+    public class LlamaSharpTextGeneratorTests
+        : ITextTokenizerTests, IDisposable
     {
         private readonly LlamaSharpTextGenerator _textGenerator;

-        public LlamaSharpTextGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper)
+        public LlamaSharpTextGeneratorTests(ITestOutputHelper testOutputHelper)
+            : base(testOutputHelper)
         {
             _textGenerator = new LlamaSharpTextGenerator(_lsConfig);
LLama.Unittest/SamplingTests.cs (6 changes: 1 addition & 5 deletions)

@@ -167,11 +167,7 @@ private static SafeLLamaSamplerChainHandle CreateChain(SafeLLamaContextHandle co
         var chain = SafeLLamaSamplerChainHandle.Create(LLamaSamplerChainParams.Default());

         chain.AddPenalties(
-            vocabSize: context.VocabCount,
-            eos: context.ModelHandle.Tokens.EOS,
-            newline: context.ModelHandle.Tokens.Newline ?? 0,
-            penaltyCount: 60, repeat: 1, freq: 0, presence: 0,
-            penalizeNewline: false, ignoreEOS: false
+            penaltyCount: 60, repeat: 1, freq: 0, presence: 0
         );

         if (logit_bias != null) { chain.AddLogitBias(context.VocabCount, logit_bias); }
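
With vocabulary and special-token bookkeeping gone from the penalties sampler, only the penalty parameters remain. A hedged sketch of a call against the new signature (parameter values are illustrative, not recommendations):

    using LLama.Native;

    // Build a sampler chain and attach the slimmed-down penalties sampler.
    // The old vocabSize / eos / newline / penalizeNewline / ignoreEOS
    // arguments no longer exist.
    var chain = SafeLLamaSamplerChainHandle.Create(LLamaSamplerChainParams.Default());
    chain.AddPenalties(
        penaltyCount: 64,  // how many recent tokens the penalties consider
        repeat: 1.1f,      // repetition penalty (1 = disabled)
        freq: 0.1f,        // frequency penalty
        presence: 0.1f     // presence penalty
    );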
LLama.Web/Common/ModelOptions.cs (5 changes: 4 additions & 1 deletion)

@@ -24,7 +24,7 @@ public class ModelOptions
     public int MainGpu { get; set; } = 0;

     /// <inheritdoc />
-    public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;
+    public GPUSplitMode? SplitMode { get; set; }

     /// <inheritdoc />
     public int GpuLayerCount { get; set; } = 20;
@@ -59,6 +59,9 @@ public class ModelOptions
     /// <inheritdoc />
     public TensorSplitsCollection TensorSplits { get; set; } = new();

+    /// <inheritdoc />
+    public bool CheckTensors { get; }
+
     /// <inheritdoc />
     public List<MetadataOverride> MetadataOverrides { get; } = new();
LLama/Abstractions/IModelParams.cs (7 changes: 6 additions & 1 deletion)

@@ -36,7 +36,7 @@ public interface IModelParams
     /// <summary>
     /// How to split the model across multiple GPUs
     /// </summary>
-    GPUSplitMode SplitMode { get; }
+    GPUSplitMode? SplitMode { get; }

     /// <summary>
     /// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
@@ -68,6 +68,11 @@ public interface IModelParams
     /// </summary>
     bool VocabOnly { get; }

+    /// <summary>
+    /// Validate model tensor data before loading
+    /// </summary>
+    bool CheckTensors { get; }
+
     /// <summary>
     /// Override specific metadata items in the model
     /// </summary>
LLama/Common/ModelParams.cs (5 changes: 4 additions & 1 deletion)

@@ -19,7 +19,7 @@ public record ModelParams
     public int MainGpu { get; set; } = 0;

     /// <inheritdoc />
-    public GPUSplitMode SplitMode { get; set; } = GPUSplitMode.None;
+    public GPUSplitMode? SplitMode { get; set; }

     /// <inheritdoc />
     public int GpuLayerCount { get; set; } = 20;
@@ -54,6 +54,9 @@ public record ModelParams
     /// <inheritdoc />
     public TensorSplitsCollection TensorSplits { get; set; } = new();

+    /// <inheritdoc />
+    public bool CheckTensors { get; }
+
     /// <inheritdoc />
     public List<MetadataOverride> MetadataOverrides { get; set; } = new();
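
A hedged usage sketch of the now-nullable SplitMode (the model path and the GPUSplitMode.Layer member are assumptions for illustration): leaving the property unset defers to the native llama.cpp default instead of forcing GPUSplitMode.None.

    using LLama.Common;
    using LLama.Native;

    // SplitMode left null: the default from LLamaModelParams.Default() is kept.
    var defaults = new ModelParams("model.gguf");

    // SplitMode set explicitly: the native default is overridden.
    var layerSplit = new ModelParams("model.gguf")
    {
        SplitMode = GPUSplitMode.Layer,
    };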
LLama/Extensions/IModelParamsExtensions.cs (7 changes: 5 additions & 2 deletions)

@@ -1,4 +1,4 @@
-using System.IO;
+using System.IO;
 using System;
 using System.Text;
 using LLama.Abstractions;
@@ -31,11 +31,14 @@ public static IDisposable ToLlamaModelParams(this IModelParams @params, out LLam
         result = LLamaModelParams.Default();

         result.main_gpu = @params.MainGpu;
-        result.split_mode = @params.SplitMode;
         result.n_gpu_layers = @params.GpuLayerCount < 0 ? int.MaxValue : @params.GpuLayerCount;
+        if (@params.SplitMode.HasValue)
+            result.split_mode = @params.SplitMode.Value;
+
         result.use_mlock = @params.UseMemoryLock;
         result.use_mmap = @params.UseMemorymap;
         result.vocab_only = @params.VocabOnly;
+        result.check_tensors = @params.CheckTensors;

         unsafe
         {
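
The HasValue guard captures the design choice behind the nullable SplitMode: the managed value is only copied into the native struct when the caller actually set one, so a null SplitMode preserves whatever split mode LLamaModelParams.Default() supplies instead of always overwriting it with GPUSplitMode.None.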
LLama/Extensions/LLamaExecutorExtensions.cs (2 changes: 1 addition & 1 deletion)

@@ -147,7 +147,7 @@ private string CreatePrompt(IList<ChatMessage> messages)
     PreventEOS = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PreventEOS), out bool eos) is true ? eos : s_defaultPipeline.PreventEOS,
     PenalizeNewline = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PenalizeNewline), out bool pnl) is true ? pnl : s_defaultPipeline.PenalizeNewline,
     RepeatPenalty = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.RepeatPenalty), out float rp) is true ? rp : s_defaultPipeline.RepeatPenalty,
-    RepeatPenaltyCount = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.RepeatPenaltyCount), out int rpc) is true ? rpc : s_defaultPipeline.RepeatPenaltyCount,
+    PenaltyCount = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.PenaltyCount), out int rpc) is true ? rpc : s_defaultPipeline.PenaltyCount,
     Grammar = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.Grammar), out Grammar? g) is true ? g : s_defaultPipeline.Grammar,
     MinKeep = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.MinKeep), out int mk) is true ? mk : s_defaultPipeline.MinKeep,
     MinP = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.MinP), out float mp) is true ? mp : s_defaultPipeline.MinP,
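
Since the DefaultSamplingPipeline property was renamed from RepeatPenaltyCount to PenaltyCount, callers passing sampling options by name must use the new key. A hedged sketch, assuming the Microsoft.Extensions.AI ChatOptions type this extension method consumes:

    using LLama.Sampling;
    using Microsoft.Extensions.AI;

    var options = new ChatOptions
    {
        AdditionalProperties = new AdditionalPropertiesDictionary
        {
            // Renamed key: was nameof(DefaultSamplingPipeline.RepeatPenaltyCount).
            [nameof(DefaultSamplingPipeline.PenaltyCount)] = 64,
        },
    };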
LLama/LLamaQuantizer.cs (3 changes: 0 additions & 3 deletions)

@@ -106,9 +106,6 @@ private static bool ValidateFtype(LLamaFtype ftype)
             case LLamaFtype.MOSTLY_IQ3_S:
             case LLamaFtype.MOSTLY_IQ3_M:

-            case LLamaFtype.MOSTLY_Q4_0_4_4:
-            case LLamaFtype.MOSTLY_Q4_0_4_8:
-            case LLamaFtype.MOSTLY_Q4_0_8_8:
                 return true;

             case LLamaFtype.GUESSED:
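
As far as we can tell, these Q4_0_4_4 / Q4_0_4_8 / Q4_0_8_8 entries correspond to the AArch64-repacked quantization formats that upstream llama.cpp removed in favour of repacking weights at load time, which is why the quantizer no longer accepts them as valid targets.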
(The remaining changed files are not shown here.)