feat: token prediction (speculative decoding) (#405)
* feat: token prediction (speculative decoding)
* feat: `DraftSequenceTokenPredictor`
* feat: `InputLookupTokenPredictor`
* feat: `controlledEvaluate`
* feat: reranking (`LlamaRankingContext`)
* feat: `evaluateWithMetadata`
* feat: token confidence
* feat: `experimentalChunkDocument`
* feat: build on arm64 using LLVM, use Visual Studio's CMake when available
* feat: try compiling with LLVM on Windows x64 when available
* feat(minor): dynamically load `llama.cpp` backends
* feat(minor): more token values support in `SpecialToken`
* feat(minor): improve memory usage estimation
* fix: check for Rosetta usage on macOS x64 when using the `inspect gpu` command
* fix: detect running under Rosetta on Apple Silicon and show an error message instead of crashing
* fix: switch from `"nextTick"` to `"nextCycle"` for the default batch dispatcher
* fix: remove deprecated CLS token
* fix: pipe error logs in `inspect gpu` command
* docs: improve building from source
* docs: CUDA in Docker troubleshooting
* docs: reranking
* docs: context shift strategy
* docs: remove Intel AMX trick, since it's being automatically used in the prebuilt binaries now
Showing 123 changed files with 7,863 additions and 938 deletions.
# Chat Context Shift Strategy {#background}
When the chat history gets longer than the sequence's context size, we have to remove the oldest tokens from the context state to make room for new tokens to be generated.
This is called a context shift.

`node-llama-cpp` has a smart mechanism to handle context shifts on the chat level, so the oldest messages are truncated (from their beginning) or removed from the context state, while keeping the system prompt in place to ensure the model follows the guidelines you set for it.

You can override `node-llama-cpp`'s default context shift strategy
when using [`LlamaChatSession`](../api/classes/LlamaChatSession.md) or [`LlamaChat`](../api/classes/LlamaChat.md)
by providing a custom context shift strategy.

## The Default Context Shift Strategy {#default-strategy}
The [default context shift strategy](../api/type-aliases/LLamaChatContextShiftOptions.md#strategy) is `eraseFirstResponseAndKeepFirstSystem`.

This strategy attempts to truncate the oldest model responses (from their beginning) or remove them completely from the chat history while keeping the first system prompt in place.
If a response is completely removed, the prompt that came before it will be removed as well.
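The default strategy can also be set explicitly. A minimal sketch, assuming the [`strategy` option](../api/type-aliases/LLamaChatContextShiftOptions.md#strategy) accepts the built-in strategy name as a string, and that `context` is an existing context as in the full example below:

```typescript
const session = new LlamaChatSession({
    contextSequence: context.getSequence(),
    contextShift: {
        // the built-in default strategy, spelled out explicitly
        strategy: "eraseFirstResponseAndKeepFirstSystem"
    }
});
```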
## Implementing a Custom Context Shift Strategy {#custom-strategy}
A [custom context shift strategy](../api/type-aliases/LLamaChatContextShiftOptions.md#strategy) is a function that receives the full chat history as input and
returns a new chat history that, when tokenized, results in fewer tokens than the desired maximum size.

The context shift strategy will be called only when the context state needs to be shifted.

If the context shift strategy returns an invalid chat history (e.g., a chat history that is too long),
the prompting function will abort the evaluation and throw an error.

A custom context shift strategy can be simple logic that prioritizes which data to remove,
or it can even use a language model to summarize information to shorten the chat history.

It's important to keep the last user prompt and model response as-is to prevent infinite generation loops.
```typescript
import {fileURLToPath} from "url";
import path from "path";
import {getLlama, LlamaChatSession} from "node-llama-cpp";

const __dirname = path.dirname(fileURLToPath(import.meta.url));

const llama = await getLlama();
const model = await llama.loadModel({
    modelPath: path.join(__dirname, "models", "Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf")
});
const context = await model.createContext();

// ---cut---
const session = new LlamaChatSession({
    contextSequence: context.getSequence(),
    contextShift: {
        strategy({
            chatHistory, chatWrapper, maxTokensCount, tokenizer,
            lastShiftMetadata
        }) {
            // clone the chat history to not mutate the original
            const newChatHistory = chatHistory.map(
                (item) => structuredClone(item)
            );

            function getTokensLeftToRemove() {
                // measure the truncated history,
                // not the original one
                const {
                    contextText
                } = chatWrapper.generateContextState({
                    chatHistory: newChatHistory
                });
                const tokenUsage = contextText.tokenize(tokenizer).length;

                return Math.max(0, tokenUsage - maxTokensCount);
            }

            while (getTokensLeftToRemove() > 0 && newChatHistory.length > 2) {
                let removedItems = false;

                // never touch the last user prompt and model response,
                // to prevent infinite generation loops
                for (let i = 0; i < newChatHistory.length - 2; i++) {
                    const chatItem = newChatHistory[i]!;

                    if (i === 0 && chatItem.type === "system")
                        // don't remove the first system message
                        continue;
                    else if (chatItem.type === "model") {
                        // remove the model response
                        newChatHistory.splice(i, 1);
                        i--;
                        removedItems = true;

                        // remove the user messages that
                        // came before the model response
                        while (
                            i > 0 &&
                            newChatHistory[i - 1]?.type === "user"
                        ) {
                            newChatHistory.splice(i - 1, 1);
                            i--;
                        }
                    } else if (chatItem.type === "system") {
                        // don't remove system messages on their own
                        continue;
                    } else if (chatItem.type === "user") {
                        // don't remove user messages on their own
                        continue;
                    } else {
                        // ensure we handle all message types.
                        // otherwise, this will error
                        void (chatItem satisfies never);
                    }
                }

                if (!removedItems)
                    // nothing removable is left,
                    // so stop to avoid an infinite loop
                    break;
            }

            return {
                chatHistory: newChatHistory,

                // this metadata will be passed to the next context shift
                // strategy call as the `lastShiftMetadata` argument
                metadata: {}
            };
        }
    }
});
```
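Once configured, the session is used as usual; the custom strategy only runs when the context state actually needs to be shifted. A minimal usage sketch (the prompt text is illustrative):

```typescript
const response = await session.prompt("Hi there, how are you?");
console.log("AI: " + response);
```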