diff --git a/Package.swift b/Package.swift index 6371296c..bd3afa8b 100644 --- a/Package.swift +++ b/Package.swift @@ -27,12 +27,12 @@ let package = Package( .library(name: "SpeziLLMFog", targets: ["SpeziLLMFog"]) ], dependencies: [ - .package(url: "https://github.com/ml-explore/mlx-swift", branch: "0.21.2"), // Pin MLX library as it doesn't follow semantic versioning - .package(url: "https://github.com/ml-explore/mlx-swift-examples", branch: "1.16.0"), // Pin MLX library as it doesn't follow semantic versioning - .package(url: "https://github.com/huggingface/swift-transformers", .upToNextMinor(from: "0.1.12")), + .package(url: "https://github.com/ml-explore/mlx-swift", .upToNextMinor(from: "0.21.2")), + .package(url: "https://github.com/ml-explore/mlx-swift-examples", exact: "1.18.1"), // Pin MLX Swift Examples as it doesn't follow semantic versioning + .package(url: "https://github.com/huggingface/swift-transformers", .upToNextMinor(from: "0.1.14")), .package(url: "https://github.com/StanfordBDHG/OpenAI", .upToNextMinor(from: "0.2.9")), .package(url: "https://github.com/StanfordSpezi/Spezi", from: "1.2.1"), - .package(url: "https://github.com/StanfordSpezi/SpeziFoundation", from: "2.0.0-beta.3"), + .package(url: "https://github.com/StanfordSpezi/SpeziFoundation", from: "2.0.0"), .package(url: "https://github.com/StanfordSpezi/SpeziStorage", from: "1.0.2"), .package(url: "https://github.com/StanfordSpezi/SpeziOnboarding", from: "1.1.1"), .package(url: "https://github.com/StanfordSpezi/SpeziChat", .upToNextMinor(from: "0.2.1")), diff --git a/Sources/SpeziLLM/Models/LLMContextEntity.swift b/Sources/SpeziLLM/Models/LLMContextEntity.swift index 4f88e0d1..a842ad17 100644 --- a/Sources/SpeziLLM/Models/LLMContextEntity.swift +++ b/Sources/SpeziLLM/Models/LLMContextEntity.swift @@ -46,7 +46,7 @@ public struct LLMContextEntity: Codable, Equatable, Hashable, Identifiable { case tool(id: String, name: String) - var rawValue: String { + package var rawValue: String { switch self { case .user: "user" case .assistant: "assistant" diff --git a/Sources/SpeziLLMLocal/Configuration/LLMLocalContextParameters.swift b/Sources/SpeziLLMLocal/Configuration/LLMLocalContextParameters.swift deleted file mode 100644 index a707e839..00000000 --- a/Sources/SpeziLLMLocal/Configuration/LLMLocalContextParameters.swift +++ /dev/null @@ -1,27 +0,0 @@ -// -// This source file is part of the Stanford Spezi open source project -// -// SPDX-FileCopyrightText: 2024 Stanford University and the project authors (see CONTRIBUTORS.md) -// -// SPDX-License-Identifier: MIT -// - -import Foundation - - -/// Represents the context parameters of the LLM. -public struct LLMLocalContextParameters: Sendable { - /// RNG seed of the LLM - var seed: UInt64? - - /// Creates the ``LLMLocalContextParameters`` which wrap the underlying llama.cpp `llama_context_params` C struct. - /// Is passed to the underlying llama.cpp model in order to configure the context of the LLM. - /// - /// - Parameters: - /// - seed: RNG seed of the LLM, defaults to a random seed. - public init( - seed: UInt64? = nil - ) { - self.seed = seed - } -} diff --git a/Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift b/Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift index 01c9fcf3..70da6ad6 100644 --- a/Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift +++ b/Sources/SpeziLLMLocal/Configuration/LLMLocalParameters.swift @@ -23,10 +23,15 @@ public struct LLMLocalParameters: Sendable { let systemPrompt: String? 
/// Indicates the maximum output length generated by the LLM. let maxOutputLength: Int - + /// Additional End of Sequence tokens at which the generation will be stoped. let extraEOSTokens: Set /// Interval for displaying output after every N tokens generated. let displayEveryNTokens: Int + /// RNG seed of the LLM + let seed: UInt64? + /// The chat template to use for the model in the Jinja format + let chatTemplate: String? + /// Creates the ``LLMLocalParameters`` which wrap the underlying llama.cpp `llama_model_params` C struct. /// Is passed to the underlying llama.cpp model in order to configure the LLM. @@ -36,15 +41,21 @@ public struct LLMLocalParameters: Sendable { /// - maxOutputLength: The maximum output length generated by the Spezi LLM, defaults to `512`. /// - extraEOSTokens: Additional tokens to use for end of string /// - displayEveryNTokens: Interval for displaying output after every N tokens generated, defaults to `4`. + /// - seed: RNG seed of the LLM, defaults to a random seed. + /// - chatTemplate: Can be set to manually overwrite the chatTemplate within the `swift-transformers` package. public init( systemPrompt: String? = Defaults.defaultSystemPrompt, maxOutputLength: Int = 512, extraEOSTokens: Set = [], - displayEveryNTokens: Int = 4 + displayEveryNTokens: Int = 4, + seed: UInt64? = nil, + chatTemplate: String? = nil ) { self.systemPrompt = systemPrompt self.maxOutputLength = maxOutputLength self.extraEOSTokens = extraEOSTokens self.displayEveryNTokens = displayEveryNTokens + self.seed = seed + self.chatTemplate = chatTemplate } } diff --git a/Sources/SpeziLLMLocal/Helpers/LLMContext+FormattedChat.swift b/Sources/SpeziLLMLocal/Helpers/LLMContext+FormattedChat.swift new file mode 100644 index 00000000..d55594b1 --- /dev/null +++ b/Sources/SpeziLLMLocal/Helpers/LLMContext+FormattedChat.swift @@ -0,0 +1,25 @@ +// +// This source file is part of the Stanford Spezi open source project +// +// SPDX-FileCopyrightText: 2024 Stanford University and the project authors (see CONTRIBUTORS.md) +// +// SPDX-License-Identifier: MIT +// + +import SpeziLLM + +extension LLMContext { + /// Formats the current ``LLMContext`` for compatibility with Transformers-based chat models. + /// + /// - Returns: An array of dictionaries where each dictionary represents a message in the format: + /// - `role`: The role of the message (e.g., "user", "assistant"), derived from the `rawValue` of the entry's `role`. + /// - `content`: The textual content of the message. + package var formattedChat: [[String: String]] { + self.map { entry in + [ + "role": entry.role.rawValue, + "content": entry.content + ] + } + } +} diff --git a/Sources/SpeziLLMLocal/LLMLocalPlatform.swift b/Sources/SpeziLLMLocal/LLMLocalPlatform.swift index a8930de3..83a23dd1 100644 --- a/Sources/SpeziLLMLocal/LLMLocalPlatform.swift +++ b/Sources/SpeziLLMLocal/LLMLocalPlatform.swift @@ -11,7 +11,9 @@ import MLX import Spezi import SpeziFoundation import SpeziLLM - +#if targetEnvironment(simulator) +import OSLog +#endif /// LLM execution platform of an ``LLMLocalSchema``. /// @@ -58,19 +60,29 @@ public actor LLMLocalPlatform: LLMPlatform, DefaultInitializable { public nonisolated func configure() { #if targetEnvironment(simulator) - assertionFailure("SpeziLLMLocal: Code cannot be run on simulator.") -#endif + Logger( + subsystem: "Spezi", + category: "LLMLocalPlatform" + ).warning("SpeziLLMLocal is only supported on physical devices. 
Use `LLMMockPlatform` instead.") +#else if let cacheLimit = configuration.cacheLimit { MLX.GPU.set(cacheLimit: cacheLimit * 1024 * 1024) } if let memoryLimit = configuration.memoryLimit { MLX.GPU.set(memoryLimit: memoryLimit.limit, relaxed: memoryLimit.relaxed) } +#endif } +#if targetEnvironment(simulator) + public nonisolated func callAsFunction(with llmSchema: LLMLocalSchema) -> LLMLocalMockSession { + LLMLocalMockSession(self, schema: llmSchema) + } +#else public nonisolated func callAsFunction(with llmSchema: LLMLocalSchema) -> LLMLocalSession { LLMLocalSession(self, schema: llmSchema) } +#endif deinit { MLX.GPU.clearCache() diff --git a/Sources/SpeziLLMLocal/LLMLocalSchema+PromptFormatting.swift b/Sources/SpeziLLMLocal/LLMLocalSchema+PromptFormatting.swift deleted file mode 100644 index 0859cbe6..00000000 --- a/Sources/SpeziLLMLocal/LLMLocalSchema+PromptFormatting.swift +++ /dev/null @@ -1,275 +0,0 @@ -// -// This source file is part of the Stanford Spezi open source project -// -// SPDX-FileCopyrightText: 2022 Stanford University and the project authors (see CONTRIBUTORS.md) -// -// SPDX-License-Identifier: MIT -// - -import SpeziLLM - - -extension LLMLocalSchema { - /// Holds default prompt formatting strategies for [Llama2](https://ai.meta.com/llama/) as well as [Phi-2](https://www.microsoft.com/en-us/research/blog/phi-2-the-surprising-power-of-small-language-models/) models. - public enum PromptFormattingDefaults { - /// Prompt formatting closure for the [Llama3](https://ai.meta.com/llama/) model - public static let llama3: (@Sendable (LLMContext) throws -> String) = { chat in // swiftlint:disable:this closure_body_length - /// BOS token of the LLM, used at the start of each prompt passage. - let BEGINOFTEXT = "<|begin_of_text|>" - /// The system identifier. - let SYSTEM = "system" - /// The user identifier. - let USER = "user" - /// The assistant identifier. - let ASSISTANT = "assistant" - /// The start token for enclosing the role of a particular message, e.g. <|start_header_id|>{role}<|end_header_id|> - let STARTHEADERID = "<|start_header_id|>" - /// The end token for enclosing the role of a particular message, e.g. <|start_header_id|>{role}<|end_header_id|> - let ENDHEADERID = "<|end_header_id|>" - /// The token that signifies the end of the message in a turn. 
- let EOTID = "<|eot_id|>" - - guard chat.first?.role == .system else { - throw LLMLocalError.illegalContext - } - - var systemPrompts: [String] = [] - var initialUserPrompt: String = "" - - for contextEntity in chat { - if contextEntity.role != .system { - if contextEntity.role == .user { - initialUserPrompt = contextEntity.content - break - } else { - throw LLMLocalError.illegalContext - } - } - - systemPrompts.append(contextEntity.content) - } - - /// Build the initial Llama3 prompt structure - /// - /// Template of the prompt structure: - /// <|begin_of_text|> - /// <|start_header_id|>user<|end_header_id|> - /// {{ user_message }}<|eot_id|> - /// <|start_header_id|>assistant<|end_header_id|> - var prompt = """ - \(BEGINOFTEXT) - \(STARTHEADERID)\(SYSTEM)\(ENDHEADERID) - \(systemPrompts.joined(separator: " "))\(EOTID) - - \(STARTHEADERID)\(USER)\(ENDHEADERID) - \(initialUserPrompt)\(EOTID) - - """ + " " // Add a spacer to the generated output from the model - - for contextEntity in chat.dropFirst(2) { - if contextEntity.role == .assistant() { - /// Append response from assistant to the Llama3 prompt structure - prompt += """ - \(STARTHEADERID)\(ASSISTANT)\(ENDHEADERID) - \(contextEntity.content) - \(EOTID) - """ - } else if contextEntity.role == .user { - /// Append response from user to the Llama3 prompt structure - prompt += """ - \(STARTHEADERID)\(USER)\(ENDHEADERID) - \(contextEntity.content) - \(EOTID) - """ + " " // Add a spacer to the generated output from the model - } - } - - prompt += - """ - \(STARTHEADERID)\(ASSISTANT)\(ENDHEADERID) - """ - - return prompt - } - - /// Prompt formatting closure for the [Llama2](https://ai.meta.com/llama/) model - public static let llama2: (@Sendable (LLMContext) throws -> String) = { chat in // swiftlint:disable:this closure_body_length - /// BOS token of the LLM, used at the start of each prompt passage. - let BOS = "" - /// EOS token of the LLM, used at the end of each prompt passage. - let EOS = "" - /// BOSYS token of the LLM, used at the start of the system prompt. - let BOSYS = "<>" - /// EOSYS token of the LLM, used at the end of the system prompt. - let EOSYS = "<>" - /// BOINST token of the LLM, used at the start of the instruction part of the prompt. - let BOINST = "[INST]" - /// EOINST token of the LLM, used at the end of the instruction part of the prompt. 
- let EOINST = "[/INST]" - - guard chat.first?.role == .system else { - throw LLMLocalError.illegalContext - } - - var systemPrompts: [String] = [] - var initialUserPrompt: String = "" - - for contextEntity in chat { - if contextEntity.role != .system { - if contextEntity.role == .user { - initialUserPrompt = contextEntity.content - break - } else { - throw LLMLocalError.illegalContext - } - } - - systemPrompts.append(contextEntity.content) - } - - /// Build the initial Llama2 prompt structure - /// - /// A template of the prompt structure looks like: - /// """ - /// [INST] <> - /// {your_system_prompt} - /// <> - /// - /// {user_message_1} [/INST] - /// """ - var prompt = """ - \(BOS)\(BOINST) \(BOSYS) - \(systemPrompts.joined(separator: " ")) - \(EOSYS) - - \(initialUserPrompt) \(EOINST) - """ + " " // Add a spacer to the generated output from the model - - for contextEntity in chat.dropFirst(2) { - if contextEntity.role == .assistant() { - /// Append response from assistant to the Llama2 prompt structure - /// - /// A template for appending an assistant response to the overall prompt looks like: - /// {user_message_1} [/INST]){model_reply_1} - prompt += """ - \(contextEntity.content)\(EOS) - """ - } else if contextEntity.role == .user { - /// Append response from user to the Llama2 prompt structure - /// - /// A template for appending an assistant response to the overall prompt looks like: - /// [INST] {user_message_2} [/INST] - prompt += """ - \(BOS)\(BOINST) \(contextEntity.content) \(EOINST) - """ + " " // Add a spacer to the generated output from the model - } - } - - return prompt - } - - /// Prompt formatting closure for the [Phi-2](https://www.microsoft.com/en-us/research/blog/phi-2-the-surprising-power-of-small-language-models/) model - public static let phi2: (@Sendable (LLMContext) throws -> String) = { chat in - guard chat.first?.role == .system else { - throw LLMLocalError.illegalContext - } - - var systemPrompts: [String] = [] - var initialUserPrompt: String = "" - - for contextEntity in chat { - if contextEntity.role != .system { - if contextEntity.role == .user { - initialUserPrompt = contextEntity.content - break - } else { - throw LLMLocalError.illegalContext - } - } - - systemPrompts.append(contextEntity.content) - } - - /// Build the initial Phi-2 prompt structure - /// - /// A template of the prompt structure looks like: - /// """ - /// System: {your_system_prompt} - /// Instruct: {model_reply_1} - /// Output: {model_reply_1} - /// """ - var prompt = """ - System: \(systemPrompts.joined(separator: " ")) - Instruct: \(initialUserPrompt)\n - """ - - for contextEntity in chat.dropFirst(2) { - if contextEntity.role == .assistant() { - /// Append response from assistant to the Phi-2 prompt structure - prompt += """ - Output: \(contextEntity.content)\n - """ - } else if contextEntity.role == .user { - /// Append response from assistant to the Phi-2 prompt structure - prompt += """ - Instruct: \(contextEntity.content)\n - """ - } - } - - /// Model starts responding after - if chat.last?.role == .user { - prompt += "Output: " - } - - return prompt - } - - /// Prompt formatting closure for the [Gemma](https://ai.google.dev/gemma/docs/formatting) models - /// - Important: System prompts are ignored as Gemma doesn't support them - public static let gemma: (@Sendable (LLMContext) throws -> String) = { chat in - /// Start token of Gemma - let startToken = "" - /// End token of Gemma - let endToken = "" - - /// Build the initial Gemma prompt structure - /// - /// A template of the 
prompt structure looks like: - /// """ - /// user - /// knock knock - /// model - /// who is there - /// user - /// Gemma - /// model - /// Gemma who? - /// """ - var prompt = "" - - for contextEntity in chat { - if contextEntity.role == .assistant() { - /// Append response from assistant to the Gemma prompt structure - prompt += """ - \(startToken)model - \(contextEntity.content)\(endToken)\n - """ - } else if contextEntity.role == .user { - /// Append response from assistant to the Gemma prompt structure - prompt += """ - \(startToken)user - \(contextEntity.content)\(endToken)\n - """ - } - } - - /// Model starts responding after - if chat.last?.role == .user { - prompt += "\(startToken)model\n" - } - - return prompt - } - } -} diff --git a/Sources/SpeziLLMLocal/LLMLocalSchema.swift b/Sources/SpeziLLMLocal/LLMLocalSchema.swift index 3bf3ca24..a17b2605 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSchema.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSchema.swift @@ -23,12 +23,8 @@ public struct LLMLocalSchema: LLMSchema { /// Closure to properly format the ``LLMLocal/context`` to a `String` which is tokenized and passed to the LLM. let parameters: LLMLocalParameters - /// Context parameters of the llama.cpp LLM. - let contextParameters: LLMLocalContextParameters - /// Sampling parameters of the llama.cpp LLM. + /// Sampling parameters of the LLM. let samplingParameters: LLMLocalSamplingParameters - /// Closure to properly format the ``LLMLocal/context`` to a `String` which is tokenized and passed to the LLM. - let formatChat: (@Sendable (LLMContext) throws -> String) /// Indicates if the inference output by the ``LLMLocalSession`` should automatically be inserted into the ``LLMLocalSession/context``. public let injectIntoContext: Bool /// The models configuration which is based on `mlx-libraries` @@ -37,25 +33,32 @@ public struct LLMLocalSchema: LLMSchema { /// Creates an instance of the ``LLMLocalSchema`` containing all necessary configuration for local LLM inference. /// /// - Parameters: - /// - configuration: A local `URL` where the LLM file is stored. The format of the LLM must be in the llama.cpp `.gguf` format. - /// - generateParameters: Parameters controlling the LLM generation process. - /// - maxTokens: Maximum number of tokens to generate in a single output, defaults to 2048. - /// - displayEveryNTokens: Interval for displaying output after every N tokens generated, defaults to 4 (improve by ~15% compared to update at every token). + /// - model: The `LLMLocalModel` to be used by the schema. + /// - parameters: Parameters controlling the LLM generation process. + /// - samplingParameters: Represents the sampling parameters of the LLM. /// - injectIntoContext: Indicates if the inference output by the ``LLMLocalSession`` should automatically be inserted into the ``LLMLocalSession/context``, defaults to false. - /// - formatChat: Closure to properly format the ``LLMLocalSession/context`` to a `String` which is tokenized and passed to the LLM, defaults to Llama2 prompt format. 
public init( model: LLMLocalModel, parameters: LLMLocalParameters = .init(), - contextParameters: LLMLocalContextParameters = .init(), samplingParameters: LLMLocalSamplingParameters = .init(), - injectIntoContext: Bool = false, - formatChat: @escaping (@Sendable (LLMContext) throws -> String) + injectIntoContext: Bool = false ) { self.parameters = parameters - self.contextParameters = contextParameters self.samplingParameters = samplingParameters - self.formatChat = formatChat self.injectIntoContext = injectIntoContext self.configuration = .init(id: model.hubID) } + + @_disfavoredOverload + internal init( + configuration: ModelConfiguration, + parameters: LLMLocalParameters = .init(), + samplingParameters: LLMLocalSamplingParameters = .init(), + injectIntoContext: Bool = false + ) { + self.configuration = configuration + self.parameters = parameters + self.samplingParameters = samplingParameters + self.injectIntoContext = injectIntoContext + } } diff --git a/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift b/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift index 8f9d6e31..c7785d72 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSession+Generate.swift @@ -24,22 +24,25 @@ extension LLMLocalSession { return } - let modelConfiguration = self.schema.configuration + let messages = if await !self.customContext.isEmpty { + await self.customContext + } else { + await self.context.formattedChat + } - guard let formattedChat = try? await schema.formatChat(self.context) else { + guard let promptTokens = try? await modelContainer.perform({ _, tokenizer in + if let chatTemplate = self.schema.parameters.chatTemplate { + return try tokenizer.applyChatTemplate(messages: messages, chatTemplate: chatTemplate) + } else { + return try tokenizer.applyChatTemplate(messages: messages) + } + }) else { Self.logger.error("SpeziLLMLocal: Failed to format chat with given context") await finishGenerationWithError(LLMLocalError.illegalContext, on: continuation) return } - let prompt = modelConfiguration.prepare(prompt: formattedChat) - let promptTokens = await modelContainer.perform { _, tokenizer in - tokenizer.encode(text: prompt) - } - - MLXRandom.seed(self.schema.contextParameters.seed ?? UInt64(Date.timeIntervalSinceReferenceDate * 1000)) - - let extraEOSTokens = modelConfiguration.extraEOSTokens + MLXRandom.seed(self.schema.parameters.seed ??
UInt64(Date.timeIntervalSinceReferenceDate * 1000)) guard await !checkCancellation(on: continuation) else { return @@ -52,13 +55,14 @@ extension LLMLocalSession { repetitionContextSize: schema.samplingParameters.repetitionContextSize ) - let (result, tokenizer) = await modelContainer.perform { model, tokenizer in + // swiftlint:disable:next closure_body_length + let result = await modelContainer.perform { model, tokenizer in let result = MLXLLM.generate( promptTokens: promptTokens, parameters: parameters, model: model, tokenizer: tokenizer, - extraEOSTokens: extraEOSTokens + extraEOSTokens: schema.parameters.extraEOSTokens ) { tokens in if Task.isCancelled { return .stop @@ -66,25 +70,40 @@ } if tokens.count >= self.schema.parameters.maxOutputLength { Self.logger.debug("SpeziLLMLocal: Max output length exceeded.") - continuation.finish() - Task { @MainActor in - self.state = .ready - } return .stop } - if schema.injectIntoContext && tokens.count.isMultiple(of: schema.parameters.displayEveryNTokens) { + if tokens.count.isMultiple(of: schema.parameters.displayEveryNTokens) { let lastTokens = Array(tokens.suffix(schema.parameters.displayEveryNTokens)) let text = tokenizer.decode(tokens: lastTokens) Self.logger.debug("SpeziLLMLocal: Yielded token: \(text, privacy: .public)") continuation.yield(text) + + if schema.injectIntoContext { + Task { @MainActor in + context.append(assistantOutput: text) + } + } } return .more } - return (result, tokenizer) + // Yielding every Nth token may result in missing the final tokens. + let remainingTokens = result.tokens.count % schema.parameters.displayEveryNTokens + let lastTokens = Array(result.tokens.suffix(remainingTokens)) + let text = tokenizer.decode(tokens: lastTokens) + continuation.yield(text) + + if schema.injectIntoContext { + Task { @MainActor in + context.append(assistantOutput: text) + context.completeAssistantStreaming() + } + } + + return result } Self.logger.debug( @@ -96,16 +115,6 @@ ) await MainActor.run { - if schema.injectIntoContext { - // Yielding every Nth token may result in missing the final tokens.
- let reaminingTokens = result.tokens.count % schema.parameters.displayEveryNTokens - let lastTokens = Array(result.tokens.suffix(reaminingTokens)) - let text = tokenizer.decode(tokens: lastTokens) - continuation.yield(text) - } - - context.append(assistantOutput: result.output, complete: true) - context.completeAssistantStreaming() continuation.finish() state = .ready } diff --git a/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift b/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift index 4ebb4573..715ba732 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSession+Setup.swift @@ -58,6 +58,8 @@ extension LLMLocalSession { Self.logger.error("SpeziLLMLocal: Failed to load local `modelContainer`") return false } + + Self.logger.debug("SpeziLLMLocal: Local LLM has finished initializing") return true } } diff --git a/Sources/SpeziLLMLocal/LLMLocalSession+Update.swift b/Sources/SpeziLLMLocal/LLMLocalSession+Update.swift new file mode 100644 index 00000000..a935d046 --- /dev/null +++ b/Sources/SpeziLLMLocal/LLMLocalSession+Update.swift @@ -0,0 +1,38 @@ +// +// This source file is part of the Stanford Spezi open source project +// +// SPDX-FileCopyrightText: 2024 Stanford University and the project authors (see CONTRIBUTORS.md) +// +// SPDX-License-Identifier: MIT +// + +import Foundation +import SpeziLLM + +extension LLMLocalSession { + /// Updates the existing instance of the ``LLMLocalSchema`` with new parameters. + /// + /// - Parameters: + /// - model: An instance of `LLMLocalModel` to be used by the schema. + /// - parameters: A dictionary or object containing parameters that control the LLM generation process. + /// - samplingParameters: An object representing the sampling parameters for the LLM. + /// - injectIntoContext: A Boolean value indicating whether the inference output from the ``LLMLocalSession`` + /// should be automatically inserted into the ``LLMLocalSession/context``. Defaults to `false`. + /// + /// - Important: Calling this method automatically invokes `cancel()`, stopping all running tasks associated + /// with the current session. + public func update( + parameters: LLMLocalParameters? = nil, + samplingParameters: LLMLocalSamplingParameters? = nil, + injectIntoContext: Bool? = nil // swiftlint:disable:this discouraged_optional_boolean + ) { + cancel() + + self.schema = .init( + configuration: self.schema.configuration, + parameters: parameters ?? self.schema.parameters, + samplingParameters: samplingParameters ?? self.schema.samplingParameters, + injectIntoContext: injectIntoContext ?? self.schema.injectIntoContext + ) + } +} diff --git a/Sources/SpeziLLMLocal/LLMLocalSession.swift b/Sources/SpeziLLMLocal/LLMLocalSession.swift index 026859f7..93bab019 100644 --- a/Sources/SpeziLLMLocal/LLMLocalSession.swift +++ b/Sources/SpeziLLMLocal/LLMLocalSession.swift @@ -67,7 +67,7 @@ public final class LLMLocalSession: LLMSession, @unchecked Sendable { static let logger = Logger(subsystem: "edu.stanford.spezi", category: "SpeziLLMLocal") let platform: LLMLocalPlatform - let schema: LLMLocalSchema + var schema: LLMLocalSchema @ObservationIgnored private var modelExist: Bool { false @@ -78,6 +78,9 @@ public final class LLMLocalSession: LLMSession, @unchecked Sendable { @MainActor public var state: LLMState = .uninitialized @MainActor public var context: LLMContext = [] + /// Overrides the `context` with a custom highly customizable context in the `swift-transformers` format. 
+ /// - Important: When using the `customContext`, `injectToContext` will have no effect, and the assistant output will **not** be added to the `customContext` + @MainActor public var customContext: [[String: String]] = [] @MainActor public var numParameters: Int? @MainActor public var modelConfiguration: ModelConfiguration? diff --git a/Sources/SpeziLLMLocal/Mock/LLMLocalMockSession.swift b/Sources/SpeziLLMLocal/Mock/LLMLocalMockSession.swift new file mode 100644 index 00000000..690c425b --- /dev/null +++ b/Sources/SpeziLLMLocal/Mock/LLMLocalMockSession.swift @@ -0,0 +1,111 @@ +// +// This source file is part of the Stanford Spezi open source project +// +// SPDX-FileCopyrightText: 2024 Stanford University and the project authors (see CONTRIBUTORS.md) +// +// SPDX-License-Identifier: MIT +// + +import Foundation +import Observation +import SpeziLLM + + +/// A mock ``LLMLocalMockSession``, used for testing purposes. +/// +/// See `LLMMockSession` for more details +@Observable +public final class LLMLocalMockSession: LLMSession, @unchecked Sendable { + let platform: LLMLocalPlatform + let schema: LLMLocalSchema + + @ObservationIgnored private var task: Task<(), Never>? + + @MainActor public var state: LLMState = .uninitialized + @MainActor public var context: LLMContext = [] + + + /// Initializer for the ``LLMMockSession``. + /// + /// - Parameters: + /// - platform: The mock LLM platform. + /// - schema: The mock LLM schema. + init(_ platform: LLMLocalPlatform, schema: LLMLocalSchema) { + self.platform = platform + self.schema = schema + } + + + @discardableResult + public func generate() async throws -> AsyncThrowingStream { + let (stream, continuation) = AsyncThrowingStream.makeStream(of: String.self) + + // swiftlint:disable:next closure_body_length + task = Task { + await MainActor.run { + self.state = .loading + } + try? await Task.sleep(for: .seconds(1)) + guard await !checkCancellation(on: continuation) else { + return + } + + /// Generate mock messages + await MainActor.run { + self.state = .generating + } + await injectAndYield("Mock ", on: continuation) + + try? await Task.sleep(for: .milliseconds(500)) + guard await !checkCancellation(on: continuation) else { + return + } + await injectAndYield("Message ", on: continuation) + + try? await Task.sleep(for: .milliseconds(500)) + guard await !checkCancellation(on: continuation) else { + return + } + await injectAndYield("from ", on: continuation) + + try? await Task.sleep(for: .milliseconds(500)) + guard await !checkCancellation(on: continuation) else { + return + } + await injectAndYield("SpeziLLM! ", on: continuation) + + try? 
await Task.sleep(for: .milliseconds(500)) + guard await !checkCancellation(on: continuation) else { + return + } + await injectAndYield("Using SpeziLLMLocal only works on physical devices.", on: continuation) + + + continuation.finish() + await MainActor.run { + context.completeAssistantStreaming() + self.state = .ready + } + } + + return stream + } + + public func cancel() { + task?.cancel() + } + + private func injectAndYield(_ piece: String, on continuation: AsyncThrowingStream.Continuation) async { + continuation.yield(piece) + if schema.injectIntoContext { + await MainActor.run { + context.append(assistantOutput: piece) + } + } + } + + + deinit { + cancel() + } +} diff --git a/Sources/SpeziLLMLocal/SpeziLLMLocal.docc/SpeziLLMLocal.md b/Sources/SpeziLLMLocal/SpeziLLMLocal.docc/SpeziLLMLocal.md index c6823d3f..641f8b3f 100644 --- a/Sources/SpeziLLMLocal/SpeziLLMLocal.docc/SpeziLLMLocal.md +++ b/Sources/SpeziLLMLocal/SpeziLLMLocal.docc/SpeziLLMLocal.md @@ -4,7 +4,7 @@ # # This source file is part of the Stanford Spezi open source project # -# SPDX-FileCopyrightText: 2023 Stanford University and the project authors (see CONTRIBUTORS.md) +# SPDX-FileCopyrightText: 2024 Stanford University and the project authors (see CONTRIBUTORS.md) # # SPDX-License-Identifier: MIT # @@ -14,7 +14,8 @@ Provides local Language Model execution capabilities on-device. ## Overview -The ``SpeziLLMLocal`` target enables the usage of locally executed Language Models (LLMs) directly on-device, without the need for any kind of internet connection and no data every leaving the local device. The underlying technology used for the LLM inference is [llama.cpp](https://github.com/ggerganov/llama.cpp), a C/C++ library for executing [LLaMa models](https://ai.meta.com/llama/). ``SpeziLLMLocal`` provides a pure Swift-based API for interacting with the locally executed model, building on top of the infrastructure of the [SpeziLLM target](https://swiftpackageindex.com/stanfordspezi/spezillm/documentation/spezillm). + +The ``SpeziLLMLocal`` target enables the usage of locally executed Language Models (LLMs) directly on-device, without the need for any kind of internet connection and no data every leaving the local device. The underlying technology used for the LLM inference is [`mlx-swift`](https://github.com/ml-explore/mlx-swift). ``SpeziLLMLocal`` provides a pure Swift-based API for interacting with the locally executed model, building on top of the infrastructure of the [SpeziLLM target](https://swiftpackageindex.com/stanfordspezi/spezillm/documentation/spezillm). ## Setup @@ -26,45 +27,19 @@ You need to add the SpeziLLM Swift package to > Important: If your application is not yet configured to use Spezi, follow the [Spezi setup article](https://swiftpackageindex.com/stanfordspezi/spezi/documentation/spezi/initial-setup) to set up the core Spezi infrastructure. -> Important: In order to use the LLM local target, one needs to set build parameters in the consuming Xcode project or the consuming SPM package to enable the [Swift / C++ Interop](https://www.swift.org/documentation/cxx-interop/), introduced in Xcode 15 and Swift 5.9. Keep in mind that this is true for nested dependencies, one needs to set this configuration recursivly for the entire dependency tree towards the llama.cpp SPM package. -> -> **For Xcode projects:** -> - Open your [build settings in Xcode](https://developer.apple.com/documentation/xcode/configuring-the-build-settings-of-a-target/) by selecting *PROJECT_NAME > TARGET_NAME > Build Settings*. 
-> - Within the *Build Settings*, search for the `C++ and Objective-C Interoperability` setting and set it to `C++ / Objective-C++`. This enables the project to use the C++ headers from llama.cpp. -> -> **For SPM packages:** -> - Open the `Package.swift` file of your [SPM package]((https://www.swift.org/documentation/package-manager/)) -> - Within the package `target` that consumes the llama.cpp package, add the `interoperabilityMode(_:)` Swift build setting like that: -> ```swift -> /// Adds the dependency to the Spezi LLM SPM package -> dependencies: [ -> .package(url: "https://github.com/StanfordSpezi/SpeziLLM", .upToNextMinor(from: "0.6.0")) -> ], -> targets: [ -> .target( -> name: "ExampleConsumingTarget", -> /// State the dependence of the target to SpeziLLMLocal -> dependencies: [ -> .product(name: "SpeziLLMLocal", package: "SpeziLLM") -> ], -> /// Important: Configure the `.interoperabilityMode(_:)` within the `swiftSettings` -> swiftSettings: [ -> .interoperabilityMode(.Cxx) -> ] -> ) -> ] ->``` +> Important: Spezi LLM Local is not compatible with simulators. The underlying [`mlx-swift`](https://github.com/ml-explore/mlx-swift) requires a modern Metal MTLGPUFamily and the simulator does not provide that. + +> Important: To use the LLM local target, some LLMs require adding the [Increase Memory Limit](https://developer.apple.com/documentation/bundleresources/entitlements/com_apple_developer_kernel_increased-memory-limit) entitlement to the project. ## Spezi LLM Local Components -The core components of the ``SpeziLLMLocal`` target are ``LLMLocalSchema``, ``LLMLocalSession`` as well as ``LLMLocalPlatform``. They heavily utilize the [llama.cpp library](https://github.com/ggerganov/llama.cpp) to perform the inference of the Language Model. ``LLMLocalSchema`` defines the type and configuration of the LLM, ``LLMLocalSession`` represents the ``LLMLocalSchema`` in execution while ``LLMLocalPlatform`` is the LLM execution platform. +The core components of the ``SpeziLLMLocal`` target are ``LLMLocalSchema``, ``LLMLocalSession`` as well as ``LLMLocalPlatform``. They heavily utilize [mlx-swift-examples](https://github.com/ml-explore/mlx-swift-examples) to perform the inference of the Language Model. ``LLMLocalSchema`` defines the type and configuration of the LLM, ``LLMLocalSession`` represents the ``LLMLocalSchema`` in execution while ``LLMLocalPlatform`` is the LLM execution platform. -> Important: To execute a LLM locally, the model file must be present on the local device. -> The model must be in the popular `.gguf` format introduced by the [llama.cpp library](https://github.com/ggerganov/llama.cpp) +> Important: To execute a LLM locally, the model file must be present on the local device. > Tip: In order to download the model file of the Language model to the local device, SpeziLLM provides the [SpeziLLMLocalDownload](https://swiftpackageindex.com/stanfordspezi/spezillm/documentation/spezillmlocaldownload) target which provides model download and storage functionalities. -``LLMLocalSchema`` offers a variety of configuration possibilities, such as the used model file, the context window, the maximum output size or the batch size. These options can be set via the ``LLMLocalSchema/init(modelPath:parameters:contextParameters:samplingParameters:injectIntoContext:formatChat:)`` initializer and the ``LLMLocalParameters``, ``LLMLocalContextParameters``, and ``LLMLocalSamplingParameters`` types. Keep in mind that the model file must be in the popular `.gguf` format! 
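For illustration, a minimal configuration sketch using the initializer described above; the chosen model and parameter values are examples only, not recommendations:

```swift
// Sketch only: model choice and parameter values are illustrative.
let schema = LLMLocalSchema(
    model: .llama3_8B_4bit,                        // any `LLMLocalModel` case
    parameters: LLMLocalParameters(
        systemPrompt: "You are a helpful assistant.",
        maxOutputLength: 512,                      // cap on generated tokens
        displayEveryNTokens: 4,                    // stream output every 4 tokens
        seed: 42,                                  // fixed RNG seed for reproducible sampling
        chatTemplate: nil                          // optionally override the model's Jinja chat template
    ),
    samplingParameters: LLMLocalSamplingParameters(),
    injectIntoContext: true                        // append the assistant output to the session context
)
```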
+``LLMLocalSchema`` offers a variety of configuration possibilities, such as the used model file, the context window, the maximum output size or the batch size. These options can be set via the ``LLMLocalSchema/init(model:parameters:samplingParameters:injectIntoContext:)`` initializer and the ``LLMLocalParameters``, and ``LLMLocalSamplingParameters`` types. - Important: ``LLMLocalSchema``, ``LLMLocalSession`` as well as ``LLMLocalPlatform`` shouldn't be used on it's own but always used together with the Spezi `LLMRunner`! @@ -107,7 +82,7 @@ struct LLMLocalDemoView: View { // Instantiate the `LLMLocalSchema` to an `LLMLocalSession` via the `LLMRunner`. let llmSession: LLMLocalSession = runner( with: LLMLocalSchema( - modelPath: URL(string: "URL to the local model file")! + model: .llama3_1_8B_4bit ) ) diff --git a/Tests/UITests/TestApp/LLMFog/Account/AccountSetupHeader.swift b/Tests/UITests/TestApp/LLMFog/Account/AccountSetupHeader.swift index 2536ca9f..cf1ea4dd 100644 --- a/Tests/UITests/TestApp/LLMFog/Account/AccountSetupHeader.swift +++ b/Tests/UITests/TestApp/LLMFog/Account/AccountSetupHeader.swift @@ -13,8 +13,7 @@ import SwiftUI struct AccountSetupHeader: View { @Environment(Account.self) private var account - @Environment(\.accountSetupState) private var setupState - + @Environment(\.accountSetupState) var setupState var body: some View { VStack { @@ -25,7 +24,7 @@ struct AccountSetupHeader: View { .padding(.top, 30) Text("ACCOUNT_SUBTITLE") .padding(.bottom, 8) - if account.signedIn, case .presentingExistingAccount = setupState { + if account.signedIn, case .loadingExistingAccount = setupState { Text("ACCOUNT_SIGNED_IN_DESCRIPTION") } else { Text("ACCOUNT_SETUP_DESCRIPTION") diff --git a/Tests/UITests/TestApp/LLMLocal/LLMLocalChatTestView.swift b/Tests/UITests/TestApp/LLMLocal/LLMLocalChatTestView.swift index ad6999cf..03b7881d 100644 --- a/Tests/UITests/TestApp/LLMLocal/LLMLocalChatTestView.swift +++ b/Tests/UITests/TestApp/LLMLocal/LLMLocalChatTestView.swift @@ -25,8 +25,7 @@ struct LLMLocalChatTestView: View { } else { LLMChatViewSchema( with: LLMLocalSchema( - model: .llama3_8B_4bit, - formatChat: LLMLocalSchema.PromptFormattingDefaults.llama3 + model: .llama3_8B_4bit ) ) } diff --git a/Tests/UITests/TestAppUITests/TestAppLLMOpenAIUITests.swift b/Tests/UITests/TestAppUITests/TestAppLLMOpenAIUITests.swift index aa2b2e77..e4c2fc3f 100644 --- a/Tests/UITests/TestAppUITests/TestAppLLMOpenAIUITests.swift +++ b/Tests/UITests/TestAppUITests/TestAppLLMOpenAIUITests.swift @@ -29,9 +29,11 @@ class TestAppLLMOpenAIUITests: XCTestCase { func testSpeziLLMOpenAIOnboarding() throws { // swiftlint:disable:this function_body_length let app = XCUIApplication() + #if canImport(UIKit) if UIDevice.current.userInterfaceIdiom == .pad { throw XCTSkip("Skipped on iPad, see: https://github.com/StanfordBDHG/XCTestExtensions/issues/27") } + #endif XCTAssert(app.buttons["LLMOpenAI"].waitForExistence(timeout: 2)) app.buttons["LLMOpenAI"].tap() @@ -145,9 +147,11 @@ class TestAppLLMOpenAIUITests: XCTestCase { func testSpeziLLMOpenAIChat() throws { let app = XCUIApplication() + #if canImport(UIKit) if UIDevice.current.userInterfaceIdiom == .pad { throw XCTSkip("Skipped on iPad, see: https://github.com/StanfordBDHG/XCTestExtensions/issues/27") } + #endif XCTAssert(app.buttons["LLMOpenAI"].waitForExistence(timeout: 2)) app.buttons["LLMOpenAI"].tap() diff --git a/Tests/UITests/UITests.xcodeproj/project.pbxproj b/Tests/UITests/UITests.xcodeproj/project.pbxproj index 020fccc8..c66a1500 100644 --- 
a/Tests/UITests/UITests.xcodeproj/project.pbxproj +++ b/Tests/UITests/UITests.xcodeproj/project.pbxproj @@ -733,7 +733,7 @@ repositoryURL = "https://github.com/StanfordBDHG/XCTestExtensions.git"; requirement = { kind = upToNextMajorVersion; - minimumVersion = 1.0.0; + minimumVersion = 1.1.0; }; }; 9770F28F2BB3C40C00478571 /* XCRemoteSwiftPackageReference "SpeziFirebase" */ = { @@ -741,7 +741,7 @@ repositoryURL = "https://github.com/StanfordSpezi/SpeziFirebase"; requirement = { kind = upToNextMajorVersion; - minimumVersion = "2.0.0-beta.4"; + minimumVersion = 2.0.0; }; }; 979D418E2BB3EBD8001953BD /* XCRemoteSwiftPackageReference "SpeziAccount" */ = { @@ -749,7 +749,7 @@ repositoryURL = "https://github.com/StanfordSpezi/SpeziAccount"; requirement = { kind = upToNextMajorVersion; - minimumVersion = "2.0.0-beta.8"; + minimumVersion = 2.1.0; }; }; /* End XCRemoteSwiftPackageReference section */
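Beyond the schema changes, this diff also introduces session-level controls. The following sketch (not part of the diff) shows `LLMLocalSession/update(parameters:samplingParameters:injectIntoContext:)` together with the new `customContext` override; the helper function name, prompts, and parameter values are assumptions for illustration:

```swift
import SpeziLLMLocal

// Hypothetical helper: reconfigures a running session and regenerates from a raw chat array.
func regenerate(on session: LLMLocalSession) async throws {
    // `update(...)` cancels any in-flight generation before applying the new parameters.
    session.update(
        parameters: LLMLocalParameters(maxOutputLength: 256, seed: 42),
        injectIntoContext: false
    )

    // `customContext` bypasses the managed `context`; the assistant output is
    // not appended back to it automatically.
    await MainActor.run {
        session.customContext = [
            ["role": "system", "content": "You are a concise assistant."],
            ["role": "user", "content": "Summarize the conversation so far in one sentence."]
        ]
    }

    var response = ""
    for try await token in try await session.generate() {
        response.append(token)
    }
    print(response)
}
```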