feat: token prediction (speculative decoding) (#405)
* feat: token prediction (speculative decoding); a usage sketch follows this list
* feat: `DraftSequenceTokenPredictor`
* feat: `InputLookupTokenPredictor`
* feat: `controlledEvaluate`
* feat: reranking (`LlamaRankingContext`)
* feat: `evaluateWithMetadata`
* feat: token confidence
* feat: `experimentalChunkDocument`
* feat: build on arm64 using LLVM, use Visual Studio's CMake when available
* feat: try compiling with LLVM on Windows x64 when available
* feat(minor): dynamically load `llama.cpp` backends
* feat(minor): more token values support in `SpecialToken`
* feat(minor): improve memory usage estimation
* fix: check for Rosetta usage on macOS x64 when using the `inspect gpu` command
* fix: detect running under Rosetta on Apple Silicon and show an error message instead of crashing
* fix: switch from `"nextTick"` to `"nextCycle"` for the default batch dispatcher
* fix: remove deprecated CLS token
* fix: pipe error logs in `inspect gpu` command
* docs: improve building from source
* docs: CUDA in Docker troubleshooting
* docs: reranking
* docs: context shift strategy
* docs: remove Intel AMX trick, since it's being automatically used in the prebuilt binaries now
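
A minimal usage sketch of the headline feature, draft-model token prediction (speculative decoding): a small draft model proposes the next tokens and the main model verifies them in a single evaluation, trading a little extra compute for lower latency. The model file names are placeholders and the exact option names are assumptions based on the identifiers above; the Token Prediction guide added in this commit documents the authoritative API.

```typescript
import {fileURLToPath} from "url";
import path from "path";
import {
    getLlama, LlamaChatSession, DraftSequenceTokenPredictor
} from "node-llama-cpp";

const __dirname = path.dirname(fileURLToPath(import.meta.url));
const llama = await getLlama();

// the main model that produces the final output
const model = await llama.loadModel({
    modelPath: path.join(__dirname, "models", "Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf")
});

// a small, fast draft model that predicts the next tokens,
// which the main model then verifies in a single evaluation
const draftModel = await llama.loadModel({
    modelPath: path.join(__dirname, "models", "Llama-3.2-1B-Instruct.Q4_K_M.gguf")
});

const draftContext = await draftModel.createContext();
const context = await model.createContext();

// assumption: the predictor is attached to the main model's context sequence
const contextSequence = context.getSequence({
    tokenPredictor: new DraftSequenceTokenPredictor(draftContext.getSequence())
});

const session = new LlamaChatSession({contextSequence});
console.log(await session.prompt("Where do llamas come from?"));
```
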
giladgd authored Jan 7, 2025
1 parent e2c5c3f commit 632a7bf
Showing 123 changed files with 7,863 additions and 938 deletions.
2 changes: 1 addition & 1 deletion .config/typedoc.json
@@ -27,6 +27,6 @@
"interfacePropertiesFormat": "list",
"sort": ["source-order"],
"docsRoot": "../docs",
"intentionallyNotExported": ["MergeOptionalUnionTypes", "GbnfJsonSchemaToTSType", "_LlamaText"],
"intentionallyNotExported": ["MergeOptionalUnionTypes", "PickOptions", "GbnfJsonSchemaToTSType", "_LlamaText"],
"useHTMLEncodedBrackets": true
}
3 changes: 1 addition & 2 deletions .github/workflows/build.yml
@@ -23,8 +23,7 @@ jobs:
- name: Download latest llama.cpp release
env:
CI: true
# pinned to `b4291` temporarily until the Windows on Arm64 build is fixed
run: node ./dist/cli/cli.js source download --release b4291 --skipBuild --noBundle --noUsageExample --updateBinariesReleaseMetadataAndSaveGitBundle
run: node ./dist/cli/cli.js source download --release latest --skipBuild --noBundle --noUsageExample --updateBinariesReleaseMetadataAndSaveGitBundle
- name: Upload build artifact
uses: actions/upload-artifact@v4
with:
11 changes: 10 additions & 1 deletion .vitepress/config.ts
@@ -132,13 +132,16 @@ export default defineConfig({
item.lastmod = new Date(buildDate);
item.changefreq = "daily";
item.priority = 0.9;
} else if (item.url === "guide/") {
item.changefreq = "daily";
item.priority = 0.7;
} else if (item.url.startsWith("api/") || item.url.startsWith("cli/")) {
item = {
...item,
lastmod: new Date(buildDate),
changefreq: "weekly",
priority: item.url.startsWith("cli/")
? 0.7
? 0.6
: 0.5
};
} else if (item.lastmod == null && item.url.startsWith("blog/")) {
@@ -358,6 +361,9 @@
}
},
markdown: {
languageAlias: {
"js-highlight": "javascript"
},
codeTransformers: [
transformerTwoslash({
explicitTrigger: false,
@@ -482,7 +488,10 @@ export default defineConfig({
{text: "External Chat State", link: "/external-chat-state"},
{text: "Token Bias", link: "/token-bias"},
{text: "Objects Lifecycle", link: "/objects-lifecycle"},
{text: "Chat Context Shift", link: "/chat-context-shift"},
{text: "Batching", link: "/batching"},
{text: "Token Prediction", link: "/token-prediction"},
{text: "Low Level API", link: "/low-level-api"},
{text: "Awesome List", link: "/awesome"},
{text: "Troubleshooting", link: "/troubleshooting"},
{text: "Tips and Tricks", link: "/tips-and-tricks"}
3 changes: 2 additions & 1 deletion .vitepress/config/apiReferenceSidebar.ts
@@ -1,6 +1,6 @@
import {DefaultTheme} from "vitepress";
/* eslint import/no-unresolved: "off" */
import typedocSidebar from "../../docs/api/typedoc-sidebar.json"; // if this import fails, run `npm run docs:generateTypedoc`
import typedocSidebar from "../../docs/api/typedoc-sidebar.json";

const categoryOrder = [
"Functions",
@@ -28,6 +28,7 @@ const classesOrder = [
"LlamaCompletion",
"LlamaEmbeddingContext",
"LlamaEmbedding",
"LlamaRankingContext",
"LlamaGrammar",
"LlamaJsonSchemaGrammar",
"LlamaText",
3 changes: 2 additions & 1 deletion .vitepress/theme/style.css
@@ -354,7 +354,8 @@ div.search-keyboard-shortcuts[class] kbd:last-of-type {
}

.language-ts > .lang,
.language-shell > .lang {
.language-shell > .lang,
.language-js-highlight > .lang {
display: none;
}

6 changes: 3 additions & 3 deletions .vitepress/utils/parseCmakeListsTxtOptions.ts
@@ -1,5 +1,7 @@
const maxLinesSpan = 10;

const cmakeOptionRegex =
/^\s*option\([\s\t\n\r]*(?<key>\S+)[\s\t\n\r]+"(?<description>(?:\\"|[^"])*)"[\s\t\n\r]+(?<defaultValue>\S+)[\s\t\n\r]*\)/;
export function parseCmakeListsTxtOptions(cmakeListsTxtString: string) {
const lines = cmakeListsTxtString.split("\n");

@@ -8,9 +10,7 @@ export function parseCmakeListsTxtOptions(cmakeListsTxtString: string) {
const match = lines
.slice(index, index + maxLinesSpan)
.join("\n")
.match(
/^option\([\s\t\n\r]*(?<key>\S+)[\s\t\n\r]+"(?<description>(?:\\"|[^"])*)"[\s\t\n\r]+(?<defaultValue>\S+)[\s\t\n\r]*\)/
);
.match(cmakeOptionRegex);
if (match == null || match.groups == null || match?.index !== 0)
return null;

3 changes: 2 additions & 1 deletion README.md
@@ -26,8 +26,9 @@
* [Use the CLI to chat with a model without writing any code](#try-it-without-installing)
* Up-to-date with the latest `llama.cpp`. Download and compile the latest release with a [single CLI command](https://node-llama-cpp.withcat.ai//guide/building-from-source#downloading-a-release)
* Enforce a model to generate output in a parseable format, [like JSON](https://node-llama-cpp.withcat.ai/guide/chat-session#json-response), or even force it to [follow a specific JSON schema](https://node-llama-cpp.withcat.ai/guide/chat-session#response-json-schema)
* [Provide a model with functions it can call on demand](https://node-llama-cpp.withcat.ai/guide/chat-session#function-calling) to retrieve information of perform actions
* [Provide a model with functions it can call on demand](https://node-llama-cpp.withcat.ai/guide/chat-session#function-calling) to retrieve information or perform actions
* [Embedding support](https://node-llama-cpp.withcat.ai/guide/embedding)
* [Safe against special token injection attacks](https://node-llama-cpp.withcat.ai/guide/llama-text#input-safety-in-node-llama-cpp)
* Great developer experience with full TypeScript support, and [complete documentation](https://node-llama-cpp.withcat.ai/guide/)
* Much more

53 changes: 51 additions & 2 deletions docs/guide/building-from-source.md
@@ -25,13 +25,62 @@ This is useful for building from source on machines that aren't connected to the
:::

::: info

If `cmake` is not installed on your machine, `node-llama-cpp` will automatically download `cmake` to an internal directory and try to use it to build `llama.cpp` from source.

If the build fails, make sure you have the required dependencies of `cmake` installed on your machine. More info is available [here](https://github.com/cmake-js/cmake-js#:~:text=projectRoot/build%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%20%5Bstring%5D-,Requirements%3A,-CMake) (you don't have to install `cmake` or `cmake-js`, just the dependencies).
:::

::: details Dependencies for macOS
If the build fails on macOS with the error `"/usr/bin/cc" is not able to compile a simple test program`,
try running this command to install the Xcode command line tools:
```shell
xcode-select --install
```
:::

::: details Dependencies for Windows x64
If the build fails on your machine, ensure you have all the necessary build tools installed.

You can install all the dependencies via [WinGet](https://learn.microsoft.com/en-us/windows/package-manager/winget/) using this command:
```shell
winget install --id Microsoft.VisualStudio.2022.BuildTools --force --override "--add Microsoft.VisualStudio.Component.VC.CMake.Project Microsoft.VisualStudio.Component.VC.CoreBuildTools Microsoft.VisualStudio.Component.VC.Tools.x86.x64 Microsoft.VisualStudio.Component.VC.ATL Microsoft.VisualStudio.Component.VC.ATLMFC Microsoft.VisualStudio.Component.VC.Llvm.ClangToolset Microsoft.VisualStudio.Component.VC.Llvm.Clang Microsoft.VisualStudio.Component.VC.Redist.14.Latest Microsoft.Component.VC.Runtime.UCRTSDK Microsoft.VisualStudio.Component.Windows10SDK Microsoft.VisualStudio.Component.Windows10SDK.20348"
```
> WinGet is built-in on Windows 11 and modern Windows 10 versions
---

You can also install all the dependencies manually using the [Visual C++ Build Tools installer](https://visualstudio.microsoft.com/visual-cpp-build-tools/):
* **`Workloads` tab:** select `Desktop development with C++`
* **`Individual components` tab**: select the following:
  * C++ ATL for latest v143 build tools (x86 & x64)
  * C++ MFC for latest v143 build tools (x86 & x64)
  * C++ CMake tools for Windows
  * C++ Clang Compiler for Windows
  * MSBuild support for LLVM (clang-cl) toolset
  * Windows Universal CRT SDK
:::

::: details Dependencies for Windows on Arm
On Windows on Arm you need to install additional build tools to build `llama.cpp` from source.

You can install all the dependencies via [WinGet](https://learn.microsoft.com/en-us/windows/package-manager/winget/) using this command:
```shell
winget install --id Microsoft.VisualStudio.2022.BuildTools --force --override "--add Microsoft.VisualStudio.Component.VC.CMake.Project Microsoft.VisualStudio.Component.VC.CoreBuildTools Microsoft.VisualStudio.Component.VC.Tools.x86.x64 Microsoft.VisualStudio.Component.VC.Tools.ARM64 Microsoft.VisualStudio.Component.VC.ATL Microsoft.VisualStudio.Component.VC.ATL.ARM64 Microsoft.VisualStudio.Component.VC.ATLMFC Microsoft.VisualStudio.Component.VC.MFC.ARM64 Microsoft.VisualStudio.Component.VC.Llvm.ClangToolset Microsoft.VisualStudio.Component.VC.Llvm.Clang Microsoft.VisualStudio.Component.VC.Redist.14.Latest Microsoft.Component.VC.Runtime.UCRTSDK Microsoft.VisualStudio.Component.Windows10SDK Microsoft.VisualStudio.Component.Windows10SDK.20348"
```
> WinGet is built-in on Windows 11 and modern Windows 10 versions
---

You can also install all the dependencies manually using the [Visual C++ Build Tools installer](https://visualstudio.microsoft.com/visual-cpp-build-tools/):
* **`Workloads` tab:** select `Desktop development with C++`
* **`Individual components` tab**: select the following:
  * MSVC v143 - VS 2022 C++ ARM64 build tools (latest)
  * C++ ATL for latest v143 build tools (ARM64/ARM64EC)
  * C++ MFC for latest v143 build tools (ARM64/ARM64EC)
  * C++ CMake tools for Windows
  * C++ Clang Compiler for Windows
  * MSBuild support for LLVM (clang-cl) toolset
  * Windows Universal CRT SDK
:::

## `source download` and `source build` Commands
111 changes: 111 additions & 0 deletions docs/guide/chat-context-shift.md
@@ -0,0 +1,111 @@
# Chat Context Shift Strategy {#background}
When the chat history gets longer than the sequence's context size, we have to remove the oldest tokens from the context state to make room for new tokens to be generated.
This is called a context shift.

`node-llama-cpp` has a smart mechanism to handle context shifts on the chat level, so the oldest messages are truncated (from their beginning) or removed from the context state, while keeping the system prompt in place to ensure the model follows the guidelines you set for it.

You can override `node-llama-cpp`'s default context shift strategy
when using [`LlamaChatSession`](../api/classes/LlamaChatSession.md) or [`LlamaChat`](../api/classes/LlamaChat.md)
by providing a custom context shift strategy.

## The Default Context Shift Strategy {#default-strategy}
The [default context shift strategy](../api/type-aliases/LLamaChatContextShiftOptions.md#strategy) is `eraseFirstResponseAndKeepFirstSystem`.

This strategy attempts to truncate the oldest model responses (from their beginning) or remove them completely from the chat history while keeping the first system prompt in place.
If a response is completely removed, the prompt that came before it will be removed as well.
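
For example, the default strategy can also be selected explicitly (a minimal sketch; the string form of the option is an assumption based on the options type linked above):

```typescript
import {fileURLToPath} from "url";
import path from "path";
import {getLlama, LlamaChatSession} from "node-llama-cpp";

const __dirname = path.dirname(fileURLToPath(import.meta.url));

const llama = await getLlama();
const model = await llama.loadModel({
    modelPath: path.join(__dirname, "models", "Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf")
});
const context = await model.createContext();

const session = new LlamaChatSession({
    contextSequence: context.getSequence(),
    contextShift: {
        // the built-in default, spelled out explicitly
        strategy: "eraseFirstResponseAndKeepFirstSystem"
    }
});
```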

## Implementing a Custom Context Shift Strategy {#custom-strategy}
A [custom context shift strategy](../api/type-aliases/LLamaChatContextShiftOptions.md#strategy) is a function that receives the full chat history as input and
returns a new chat history that, when tokenized, results in fewer tokens than the desired maximum size.

The context shift strategy will be called only when the context state needs to be shifted.

If the context shift strategy returns an invalid chat history (e.g., a chat history that is too long),
the prompting function will abort the evaluation and throw an error.

A custom context shift strategy can be simple logic that prioritizes which data to remove,
or it can even use a language model to summarize information and shorten the chat history.

It's important to keep the last user prompt and model response as-is to prevent infinite generation loops.

```typescript
import {fileURLToPath} from "url";
import path from "path";
import {getLlama, LlamaChatSession} from "node-llama-cpp";

const __dirname = path.dirname(fileURLToPath(import.meta.url));

const llama = await getLlama();
const model = await llama.loadModel({
    modelPath: path.join(__dirname, "models", "Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf")
});
const context = await model.createContext();

// ---cut---
const session = new LlamaChatSession({
    contextSequence: context.getSequence(),
    contextShift: {
        strategy({
            chatHistory, chatWrapper, maxTokensCount, tokenizer,
            lastShiftMetadata
        }) {
            // clone the chat history to not mutate the original
            const newChatHistory = chatHistory.map(
                (item) => structuredClone(item)
            );

            function getTokensLeftToRemove() {
                // measure the token usage of the updated history
                const {
                    contextText
                } = chatWrapper.generateContextState({
                    chatHistory: newChatHistory
                });
                const tokenUsage = contextText.tokenize(tokenizer).length;

                return Math.max(0, tokenUsage - maxTokensCount);
            }

            while (getTokensLeftToRemove() > 0 && newChatHistory.length > 2) {
                for (let i = 0; i < newChatHistory.length - 2; i++) {
                    const chatItem = newChatHistory[i]!;

                    if (i === 0 && chatItem.type === "system")
                        // don't remove the first system message
                        continue;
                    else if (chatItem.type === "model") {
                        // remove the model response
                        newChatHistory.splice(i, 1);
                        i--;

                        // remove the user messages that
                        // came before the model response
                        while (
                            i > 0 &&
                            newChatHistory[i - 1]?.type === "user"
                        ) {
                            newChatHistory.splice(i - 1, 1);
                            i--;
                        }
                    } else if (chatItem.type === "system") {
                        // don't remove system messages on their own
                        continue;
                    } else if (chatItem.type === "user") {
                        // don't remove user messages on their own
                        continue;
                    } else {
                        // ensure we handle all message types.
                        // otherwise, this will error
                        void (chatItem satisfies never);
                    }
                }
            }

            return {
                chatHistory: newChatHistory,

                // this metadata will be passed to the next context shift
                // strategy call as the `lastShiftMetadata` argument
                metadata: {}
            };
        }
    }
});
```
14 changes: 14 additions & 0 deletions docs/guide/choosing-a-model.md
@@ -124,6 +124,20 @@ Here are a few concepts to be aware of when choosing a model:

Many embedding models include terms like `embed` in their name.

* **Reranking models** - models that are trained to rerank (sort) a list of documents
based on their relevance to a given query.

Reranking models are often significantly smaller (sometimes as small as 500MB), faster,
and consume less memory than general-purpose models,
making them more efficient and practical for reranking tasks.

While general-purpose models can also be used for reranking,
doing so requires prompting the model, which is more cumbersome and inefficient than
using a specialized model with a [ranking context](./embedding.md#reranking) for this task (see the sketch below).

Many reranking models include terms like `rerank` or `reranker` in their name.
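
A minimal sketch of reranking a few documents with a ranking context (the model file name is a placeholder and the exact method names may differ; see the [reranking guide](./embedding.md#reranking) for the full API):

```typescript
import {fileURLToPath} from "url";
import path from "path";
import {getLlama} from "node-llama-cpp";

const __dirname = path.dirname(fileURLToPath(import.meta.url));

const llama = await getLlama();
// a dedicated reranking model (file name is a placeholder)
const model = await llama.loadModel({
    modelPath: path.join(__dirname, "models", "bge-reranker-v2-m3-Q8_0.gguf")
});
const rankingContext = await model.createRankingContext();

const documents = [
    "The sky is clear and blue today",
    "Llamas are domesticated camelids from South America",
    "I bought a new laptop yesterday"
];

// sort the documents by their relevance to the query, most relevant first
const rankedDocuments = await rankingContext.rankAndSort(
    "Tell me about llamas",
    documents
);
console.log(rankedDocuments);
```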

### How much data do you plan to feed the model at once?
If you plan to feed the model with a lot of data at once, you'll need a model that supports a large context size.
The larger the context size is, the more data the model can process at once.
6 changes: 5 additions & 1 deletion docs/guide/cmakeOptions.data.ts
@@ -68,12 +68,16 @@ function parseCmakeOptions(cmakeListsTxt: string, optionFilter: ((key: string) =
for (let i = 0; i < cmakeOptions.length; i++) {
const option = cmakeOptions[i]!;

if (!optionFilter(option.key) || option.key === "GGML_LLAMAFILE" || option.key === "GGML_CURL" || option.key === "GGML_RPC") {
if (!optionFilter(option.key) || option.key === "GGML_LLAMAFILE" || option.key === "GGML_CURL" || option.key === "GGML_RPC" ||
option.key === "GGML_WASM_SINGLE_FILE" || option.key === "BUILD_SHARED_LIBS" || option.key === "GGML_BACKEND_DL"
) {
cmakeOptions.splice(i, 1);
i--;
continue;
} else if (option.key === "GGML_METAL" && option.defaultValue === "${GGML_METAL_DEFAULT}")
option.defaultValue = htmlEscapeWithCodeMarkdown("`ON` on macOS on Apple Silicon, `OFF` otherwise");
else if (option.key === "GGML_BLAS" && option.defaultValue === "${GGML_BLAS_DEFAULT}")
option.defaultValue = htmlEscapeWithCodeMarkdown("`ON` on macOS, `OFF` otherwise");
else if (option.key === "GGML_METAL_EMBED_LIBRARY" && option.defaultValue === "${GGML_METAL}")
option.defaultValue = htmlEscapeWithCodeMarkdown("`ON` on macOS, `OFF` otherwise");
else if (option.defaultValue === "${GGML_STANDALONE}") {
8 changes: 7 additions & 1 deletion docs/guide/docker.md
@@ -34,7 +34,7 @@ FROM node:22

# Replace `x86_64` with `sbsa` for ARM64
ENV NVARCH=x86_64
ENV INSTALL_CUDA_VERSION=12.6
ENV INSTALL_CUDA_VERSION=12.5

SHELL ["/bin/bash", "-c"]
RUN apt-get update && \
@@ -172,3 +172,9 @@ docker run --rm -it --runtime=nvidia --gpus=all my-image:tag
podman run --rm -it --device nvidia.com/gpu=all --security-opt=label=disable --gpus=all my-image:tag
```
:::

### Getting a `system has unsupported display driver / cuda driver combination` Error
Ensure that the `INSTALL_CUDA_VERSION` in the Dockerfile matches
or is older than the CUDA version installed on the host machine.

> You can check the installed CUDA version with `nvidia-smi --version`.