Commit 8d06175
Merge branch 'sgl-project:main' into main

stbaione authored Nov 21, 2024
2 parents 0eba024 + f35cb46

Showing 100 changed files with 3,442 additions and 969 deletions.
7 changes: 6 additions & 1 deletion .github/workflows/nightly-eval.yml
@@ -27,9 +27,14 @@ jobs:
           bash scripts/ci_install_dependency.sh
           pip install --upgrade "evalplus[vllm] @ git+https://github.com/evalplus/evalplus"
-      - name: Nightly gsm8k and human eval Accuracy
+      - name: Test human eval
         timeout-minutes: 120
         run: |
           cd test/srt
           python3 test_nightly_human_eval.py
+      - name: Test gsm8k
+        timeout-minutes: 120
+        run: |
+          cd test/srt
+          python3 test_nightly_gsm8k_eval.py
1 change: 1 addition & 0 deletions .github/workflows/pr-test.yml
@@ -244,6 +244,7 @@ jobs:
           cd test/srt
           python3 test_mla.py
           python3 test_mla_fp8.py
+          python3 test_dp_attention.py

       - name: Evaluate data parallelism accuracy (DP=2)
         timeout-minutes: 10
12 changes: 12 additions & 0 deletions Makefile
@@ -0,0 +1,12 @@
.PHONY: check-deps install-deps format

check-deps:
	@command -v isort >/dev/null 2>&1 || (echo "Installing isort..." && pip install isort)
	@command -v black >/dev/null 2>&1 || (echo "Installing black..." && pip install black)

install-deps:
	pip install isort black

format: check-deps
	@echo "Formatting modified Python files..."
	git diff --name-only --diff-filter=M | grep '\.py$$' | xargs -I {} sh -c 'isort {} && black {}'
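
A typical workflow with these targets might look like the following (assuming a git checkout with locally modified `.py` files):

```bash
# Install the formatters up front (check-deps would also install them on demand)
make install-deps

# Re-format only the Python files modified in the working tree
make format
```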
15 changes: 15 additions & 0 deletions benchmark/json_schema/README.md
@@ -0,0 +1,15 @@
## Run benchmark

### Benchmark sglang

Launch a server with Llama-3.1-8B-Instruct:

```bash
python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --port 30000
```

Run the benchmark:

```bash
python3 bench_sglang.py
```
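
`bench_sglang.py` also accepts additional flags. For example, a run limited to 100 schemas with 16 concurrent requests might look like this (`--num-jsons` is defined in the script; `--parallel` is assumed to come from the shared sglang benchmark argument parser):

```bash
python3 bench_sglang.py --num-jsons 100 --parallel 16
```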
146 changes: 146 additions & 0 deletions benchmark/json_schema/bench_sglang.py
@@ -0,0 +1,146 @@
import argparse
import json
import time
from typing import List, Tuple

import jsonschema
from datasets import load_dataset

import sglang as sgl
from sglang.global_config import global_config
from sglang.srt.hf_transformers_utils import get_tokenizer
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import dump_state_text


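# sglang program: send the system and user messages, then generate a response
# constrained to the given JSON schema.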
@sgl.function
def schema_gen(s, message: Tuple[str, str], json_schema: str):
system, user = message
s += sgl.system(system)
s += sgl.user(user)
s += sgl.assistant(
sgl.gen("json_output", temperature=0, max_tokens=256, json_schema=json_schema)
)


def contains_formats(schema, formats: List[str]):
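    """Return True if any nested "format" value in the schema is one of `formats`."""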
if isinstance(schema, dict):
if schema.get("format", None) in formats:
return True
for value in schema.values():
if contains_formats(value, formats):
return True
elif isinstance(schema, list):
for item in schema:
if contains_formats(item, formats):
return True
return False


def convert_dataset(path: str):
raw_dataset = load_dataset(path)
dataset = []
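    # Each row provides "prompt" (a [system, user] message pair) and
    # "schema" (a JSON schema as a string).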
for data in raw_dataset["train"]:
messages = data["prompt"]
schema = data["schema"]
obj = json.loads(schema)

# skip some corrupted examples
if obj.get("type", None) is None:
continue

# skip schema with format "email"
# which is not supported by outlines for now
if contains_formats(obj, ["email"]):
continue

system = messages[0]
user = messages[1]
assert system["role"] == "system", "invalid role"
assert user["role"] == "user", "invalid role"
assert len(messages) == 2, "invalid message length"
message = json.dumps(system["content"]), json.dumps(user["content"])
dataset.append(
{
"message": message,
"json_schema": schema,
}
)

return dataset


def bench_schema(args):
arguments = convert_dataset(args.data_path)

if args.num_jsons < 0 or args.num_jsons > len(arguments):
args.num_jsons = len(arguments)
arguments = arguments[: args.num_jsons]

# Select backend
backend = select_sglang_backend(args)
sgl.set_default_backend(backend)

# Run requests
tic = time.time()
states = schema_gen.run_batch(
arguments,
temperature=0,
num_threads=args.parallel,
progress_bar=True,
)
latency = time.time() - tic

    # Check that each output parses as JSON and validates against its schema.
    # jsonschema.validate() raises on failure and returns None on success.
    indexes = []  # indices of outputs that failed to parse or validate
    for i, state in enumerate(states):
        try:
            schema = json.loads(arguments[i]["json_schema"])
            obj = json.loads(state["json_output"])
            jsonschema.validate(obj, schema)
        except Exception as e:
            print(e)
            indexes.append(i)
    print(f"Invalid outputs: {len(indexes)} / {len(states)}")

return states, latency


def main(args):
states, latency = bench_schema(args)

    # Compute throughput
tokenizer = get_tokenizer(
global_config.default_backend.get_server_args()["tokenizer_path"]
)
output_jsons = [state["json_output"] for state in states]
num_output_tokens = sum(len(tokenizer.encode(x)) for x in output_jsons)
print(f"Latency: {latency:.3f}")
print(f"Output throughput: {num_output_tokens / latency:.3f} token/s")
print(f"#output tokens: {num_output_tokens}")

# Write results
dump_state_text(f"tmp_output_{args.backend}.txt", states)
with open(f"{args.backend}.jsonl", "w") as fout:
for state in states:
fout.write(state["json_output"] + "\n")

with open(args.result_file, "a") as fout:
value = {
"task": "json_schema",
"backend": args.backend,
"latency": round(latency, 3),
"num_jsons": args.num_jsons,
"parallel": args.parallel,
}
fout.write(json.dumps(value) + "\n")


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--data-path", type=str, default="NousResearch/json-mode-eval")
parser.add_argument("--num-jsons", type=int, default=-1)
args = add_common_sglang_args_and_parse(parser)
main(args)
2 changes: 1 addition & 1 deletion docker/Dockerfile.rocm
@@ -1,5 +1,5 @@
 # Usage (to build SGLang ROCm docker image):
-#   docker build --build-arg SGL_BRANCH=v0.3.5 -t testImage -f Dockerfile.rocm .
+#   docker build --build-arg SGL_BRANCH=v0.3.5.post2 -t testImage -f Dockerfile.rocm .

 # default base image
 ARG BASE_IMAGE="rocm/vllm-dev:20241022"
4 changes: 2 additions & 2 deletions docs/backend/backend.md
@@ -79,8 +79,8 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
 ```
-- To enable the experimental overlapped scheduler, add `--enable-overlap-scheduler`. It overlaps CPU scheduler with GPU computation and can accelerate almost all workloads. This does not work for constrained decoding currenly.
-- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes. This does not work for FP8 currenly.
+- To enable the experimental overlapped scheduler, add `--enable-overlap-schedule`. It overlaps CPU scheduler with GPU computation and can accelerate almost all workloads. This does not work for constrained decoding currently.
+- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes. This does not work for FP8 currently.
 - To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
4 changes: 3 additions & 1 deletion docs/developer/setup_github_runner.md
@@ -11,7 +11,9 @@ docker pull nvidia/cuda:12.1.1-devel-ubuntu22.04
 # Nvidia
 docker run --shm-size 128g -it -v /tmp/huggingface:/hf_home --gpus all nvidia/cuda:12.1.1-devel-ubuntu22.04 /bin/bash
 # AMD
-docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home henryx/haisgl:sgl0.3.1.post3_vllm0.6.0_triton3.0.0_rocm6.2.1 /bin/bash
+docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.3.5.post2-rocm620 /bin/bash
+# AMD just the last 2 GPUs
+docker run --rm --device=/dev/kfd --device=/dev/dri/renderD176 --device=/dev/dri/renderD184 --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.3.5.post2-rocm620 /bin/bash
```

### Step 2: Configure the runner by `config.sh`
2 changes: 1 addition & 1 deletion docs/frontend/frontend.md
@@ -1,5 +1,5 @@
 # Frontend: Structured Generation Language (SGLang)
-The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may found it easier to use for complex prompting workflow.
+The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may find it easier to use for complex prompting workflow.

 ## Quick Start
 The example below shows how to use SGLang to answer a multi-turn question.
4 changes: 2 additions & 2 deletions docs/references/hyperparameter_tuning.md
@@ -31,8 +31,8 @@ If you see out of memory (OOM) errors, you can try to tune the following parameters.
 - You can also try to decrease `--mem-fraction-static`, which reduces the memory usage of the KV cache memory pool and helps both prefill and decoding.

 ### Try Advanced Options
-- To enable the experimental overlapped scheduler, add `--enable-overlap-scheduler`. It overlaps CPU scheduler with GPU computation and can accelerate almost all workloads. This does not work for constrained decoding currenly.
-- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes. This does not work for FP8 currenly.
+- To enable the experimental overlapped scheduler, add `--enable-overlap-schedule`. It overlaps CPU scheduler with GPU computation and can accelerate almost all workloads. This does not work for constrained decoding currently.
+- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes. This does not work for FP8 currently.
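
For example, an illustrative launch command that combines the two options above (the model path is a placeholder):

```bash
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct \
  --enable-overlap-schedule --enable-torch-compile
```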

### Tune `--schedule-policy`
If the workload has many shared prefixes, use the default `--schedule-policy lpm`. `lpm` stands for longest prefix match.
1 change: 1 addition & 0 deletions docs/references/supported_models.md
@@ -28,6 +28,7 @@
 - XVERSE / XVERSE MoE
 - SmolLM
 - GLM-4
+- Phi-3-Small

 ## Embedding Models

2 changes: 1 addition & 1 deletion docs/references/troubleshooting.md
@@ -11,4 +11,4 @@ If you see out of memory (OOM) errors, you can try to tune the following parameters.
 ## CUDA error: an illegal memory access was encountered
 This error may be due to kernel errors or out-of-memory issues.
 - If it is a kernel error, it is not easy to fix. Please file an issue on the GitHub.
-- If it is out-of-memory, sometimes it will report this error instead of "Out-of-memory." Please refer to the above seciton to avoid the OOM.
+- If it is out-of-memory, sometimes it will report this error instead of "Out-of-memory." Please refer to the above section to avoid the OOM.
8 changes: 4 additions & 4 deletions docs/start/install.md
@@ -16,7 +16,7 @@ Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/
 ## Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.5 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.5.post2 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip

@@ -46,7 +46,7 @@ docker run --gpus all \
 Note: To AMD ROCm system with Instinct/MI GPUs, it is recommended to use `docker/Dockerfile.rocm` to build images, example and usage as below:

 ```bash
-docker build --build-arg SGL_BRANCH=v0.3.5 -t v0.3.5-rocm620 -f Dockerfile.rocm .
+docker build --build-arg SGL_BRANCH=v0.3.5.post2 -t v0.3.5.post2-rocm620 -f Dockerfile.rocm .

 alias drun='docker run -it --rm --network=host --device=/dev/kfd --device=/dev/dri --ipc=host \
     --shm-size 16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
@@ -55,11 +55,11 @@ alias drun='docker run -it --rm --network=host --device=/dev/kfd --device=/dev/d
 drun -p 30000:30000 \
     -v ~/.cache/huggingface:/root/.cache/huggingface \
     --env "HF_TOKEN=<secret>" \
-    v0.3.5-rocm620 \
+    v0.3.5.post2-rocm620 \
     python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000

 # Till flashinfer backend available, --attention-backend triton --sampling-backend pytorch are set by default
-drun v0.3.5-rocm620 python3 -m sglang.bench_latency --batch-size 32 --input 1024 --output 128 --model amd/Meta-Llama-3.1-8B-Instruct-FP8-KV --tp 8 --quantization fp8
+drun v0.3.5.post2-rocm620 python3 -m sglang.bench_latency --batch-size 32 --input 1024 --output 128 --model amd/Meta-Llama-3.1-8B-Instruct-FP8-KV --tp 8 --quantization fp8
 ```

 ## Method 4: Using docker compose
6 changes: 3 additions & 3 deletions python/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "sglang"
-version = "0.3.5"
+version = "0.3.5.post2"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -19,8 +19,8 @@ dependencies = ["requests", "tqdm", "numpy", "IPython"]
 runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
     "orjson", "packaging", "pillow", "prometheus-client>=0.20.0", "psutil", "pydantic", "python-multipart",
     "torchao", "uvicorn", "uvloop", "pyzmq>=25.1.2",
-    "outlines>=0.0.44", "modelscope"]
-srt = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post1"]
+    "outlines>=0.0.44,<0.1.0", "modelscope"]
+srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1"]

 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20241022, not from public vllm whl
4 changes: 2 additions & 2 deletions python/sglang/bench_latency.py
@@ -220,7 +220,7 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
     return reqs


-@torch.inference_mode()
+@torch.no_grad
 def extend(reqs, model_runner):
     batch = ScheduleBatch.init_new(
         reqs=reqs,
@@ -237,7 +237,7 @@ def extend(reqs, model_runner):
     return next_token_ids, logits_output.next_token_logits, batch


-@torch.inference_mode()
+@torch.no_grad
 def decode(input_token_ids, batch, model_runner):
     batch.output_ids = input_token_ids
     batch.prepare_for_decode()
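
For context on this change: `torch.no_grad` only disables gradient recording, while `torch.inference_mode` additionally marks the tensors it produces as inference tensors, which cannot later be used in autograd-tracking code. A minimal sketch of the difference (illustrative, standalone PyTorch):

```python
import torch

x = torch.ones(3, requires_grad=True)

with torch.no_grad():
    y = x * 2  # plain tensor: can still participate in autograd later

with torch.inference_mode():
    z = x * 2  # inference tensor: reusing it under autograd raises a RuntimeError

(y + x).sum().backward()    # works; gradients flow to x
# (z + x).sum().backward()  # would raise: inference tensors cannot be saved for backward
```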