Commit 8d06175
Merge branch 'sgl-project:main' into main

stbaione authored Nov 21, 2024
2 parents 0eba024 + f35cb46

Showing 100 changed files with 3,442 additions and 969 deletions.
7 changes: 6 additions & 1 deletion .github/workflows/nightly-eval.yml
@@ -27,9 +27,14 @@ jobs:
           bash scripts/ci_install_dependency.sh
           pip install --upgrade "evalplus[vllm] @ git+https://github.com/evalplus/evalplus"
-      - name: Nightly gsm8k and human eval Accuracy
+      - name: Test human eval
         timeout-minutes: 120
         run: |
           cd test/srt
           python3 test_nightly_human_eval.py
+      - name: Test gsm8k
+        timeout-minutes: 120
+        run: |
+          cd test/srt
+          python3 test_nightly_gsm8k_eval.py
1 change: 1 addition & 0 deletions .github/workflows/pr-test.yml
@@ -244,6 +244,7 @@ jobs:
           cd test/srt
           python3 test_mla.py
           python3 test_mla_fp8.py
+          python3 test_dp_attention.py

       - name: Evaluate data parallelism accuracy (DP=2)
         timeout-minutes: 10
12 changes: 12 additions & 0 deletions Makefile
@@ -0,0 +1,12 @@
.PHONY: check-deps install-deps format

check-deps:
	@command -v isort >/dev/null 2>&1 || (echo "Installing isort..." && pip install isort)
	@command -v black >/dev/null 2>&1 || (echo "Installing black..." && pip install black)

install-deps:
	pip install isort black

format: check-deps
	@echo "Formatting modified Python files..."
	git diff --name-only --diff-filter=M | grep '\.py$$' | xargs -I {} sh -c 'isort {} && black {}'
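
A typical workflow with these targets might look like the following (assuming a git checkout with locally modified `.py` files):

```bash
# Install the formatters up front (check-deps would also install them on demand)
make install-deps

# Re-format only the Python files modified in the working tree
make format
```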
15 changes: 15 additions & 0 deletions benchmark/json_schema/README.md
@@ -0,0 +1,15 @@
## Run benchmark

### Benchmark sglang

Launch a server with Llama-3.1-8B-Instruct:

```bash
python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --port 30000
```

Run the benchmark:

```bash
python3 bench_sglang.py
```
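
`bench_sglang.py` also accepts additional flags. For example, a run limited to 100 schemas with 16 concurrent requests might look like this (`--num-jsons` is defined in the script; `--parallel` is assumed to come from the shared sglang benchmark argument parser):

```bash
python3 bench_sglang.py --num-jsons 100 --parallel 16
```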
146 changes: 146 additions & 0 deletions benchmark/json_schema/bench_sglang.py
@@ -0,0 +1,146 @@
import argparse
import json
import time
from typing import List, Tuple

import jsonschema
from datasets import load_dataset

import sglang as sgl
from sglang.global_config import global_config
from sglang.srt.hf_transformers_utils import get_tokenizer
from sglang.test.test_utils import (
add_common_sglang_args_and_parse,
select_sglang_backend,
)
from sglang.utils import dump_state_text


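# sglang program: send the system and user messages, then generate a response
# constrained to the given JSON schema.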
@sgl.function
def schema_gen(s, message: Tuple[str, str], json_schema: str):
system, user = message
s += sgl.system(system)
s += sgl.user(user)
s += sgl.assistant(
sgl.gen("json_output", temperature=0, max_tokens=256, json_schema=json_schema)
)


def contains_formats(schema, formats: List[str]):
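    """Return True if any nested "format" value in the schema is one of `formats`."""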
if isinstance(schema, dict):
if schema.get("format", None) in formats:
return True
for value in schema.values():
if contains_formats(value, formats):
return True
elif isinstance(schema, list):
for item in schema:
if contains_formats(item, formats):
return True
return False


def convert_dataset(path: str):
raw_dataset = load_dataset(path)
dataset = []
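    # Each row provides "prompt" (a [system, user] message pair) and
    # "schema" (a JSON schema as a string).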
for data in raw_dataset["train"]:
messages = data["prompt"]
schema = data["schema"]
obj = json.loads(schema)

# skip some corrupted examples
if obj.get("type", None) is None:
continue

# skip schema with format "email"
# which is not supported by outlines for now
if contains_formats(obj, ["email"]):
continue

system = messages[0]
user = messages[1]
assert system["role"] == "system", "invalid role"
assert user["role"] == "user", "invalid role"
assert len(messages) == 2, "invalid message length"
message = json.dumps(system["content"]), json.dumps(user["content"])
dataset.append(
{
"message": message,
"json_schema": schema,
}
)

return dataset


def bench_schema(args):
arguments = convert_dataset(args.data_path)

if args.num_jsons < 0 or args.num_jsons > len(arguments):
args.num_jsons = len(arguments)
arguments = arguments[: args.num_jsons]

# Select backend
backend = select_sglang_backend(args)
sgl.set_default_backend(backend)

# Run requests
tic = time.time()
states = schema_gen.run_batch(
arguments,
temperature=0,
num_threads=args.parallel,
progress_bar=True,
)
latency = time.time() - tic

    # Check that each output parses as JSON and validates against its schema.
    # jsonschema.validate() raises on failure and returns None on success.
    indexes = []  # indices of outputs that failed to parse or validate
    for i, state in enumerate(states):
        try:
            schema = json.loads(arguments[i]["json_schema"])
            obj = json.loads(state["json_output"])
            jsonschema.validate(obj, schema)
        except Exception as e:
            print(e)
            indexes.append(i)
    print(f"Invalid outputs: {len(indexes)} / {len(states)}")

return states, latency


def main(args):
states, latency = bench_schema(args)

    # Compute throughput
tokenizer = get_tokenizer(
global_config.default_backend.get_server_args()["tokenizer_path"]
)
output_jsons = [state["json_output"] for state in states]
num_output_tokens = sum(len(tokenizer.encode(x)) for x in output_jsons)
print(f"Latency: {latency:.3f}")
print(f"Output throughput: {num_output_tokens / latency:.3f} token/s")
print(f"#output tokens: {num_output_tokens}")

# Write results
dump_state_text(f"tmp_output_{args.backend}.txt", states)
with open(f"{args.backend}.jsonl", "w") as fout:
for state in states:
fout.write(state["json_output"] + "\n")

with open(args.result_file, "a") as fout:
value = {
"task": "json_schema",
"backend": args.backend,
"latency": round(latency, 3),
"num_jsons": args.num_jsons,
"parallel": args.parallel,
}
fout.write(json.dumps(value) + "\n")


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--data-path", type=str, default="NousResearch/json-mode-eval")
parser.add_argument("--num-jsons", type=int, default=-1)
args = add_common_sglang_args_and_parse(parser)
main(args)
2 changes: 1 addition & 1 deletion docker/Dockerfile.rocm
@@ -1,5 +1,5 @@
 # Usage (to build SGLang ROCm docker image):
-#   docker build --build-arg SGL_BRANCH=v0.3.5 -t testImage -f Dockerfile.rocm .
+#   docker build --build-arg SGL_BRANCH=v0.3.5.post2 -t testImage -f Dockerfile.rocm .

 # default base image
 ARG BASE_IMAGE="rocm/vllm-dev:20241022"
4 changes: 2 additions & 2 deletions docs/backend/backend.md
@@ -79,8 +79,8 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
 ```
-- To enable the experimental overlapped scheduler, add `--enable-overlap-scheduler`. It overlaps CPU scheduler with GPU computation and can accelerate almost all workloads. This does not work for constrained decoding currenly.
-- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes. This does not work for FP8 currenly.
+- To enable the experimental overlapped scheduler, add `--enable-overlap-schedule`. It overlaps CPU scheduler with GPU computation and can accelerate almost all workloads. This does not work for constrained decoding currently.
+- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes. This does not work for FP8 currently.
 - To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
4 changes: 3 additions & 1 deletion docs/developer/setup_github_runner.md
@@ -11,7 +11,9 @@ docker pull nvidia/cuda:12.1.1-devel-ubuntu22.04
 # Nvidia
 docker run --shm-size 128g -it -v /tmp/huggingface:/hf_home --gpus all nvidia/cuda:12.1.1-devel-ubuntu22.04 /bin/bash
 # AMD
-docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home henryx/haisgl:sgl0.3.1.post3_vllm0.6.0_triton3.0.0_rocm6.2.1 /bin/bash
+docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.3.5.post2-rocm620 /bin/bash
+# AMD just the last 2 GPUs
+docker run --rm --device=/dev/kfd --device=/dev/dri/renderD176 --device=/dev/dri/renderD184 --group-add video --shm-size 128g -it -v /tmp/huggingface:/hf_home lmsysorg/sglang:v0.3.5.post2-rocm620 /bin/bash
```

### Step 2: Configure the runner by `config.sh`
2 changes: 1 addition & 1 deletion docs/frontend/frontend.md
@@ -1,5 +1,5 @@
 # Frontend: Structured Generation Language (SGLang)
-The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may found it easier to use for complex prompting workflow.
+The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may find it easier to use for complex prompting workflow.

 ## Quick Start
 The example below shows how to use SGLang to answer a multi-turn question.
4 changes: 2 additions & 2 deletions docs/references/hyperparameter_tuning.md
@@ -31,8 +31,8 @@ If you see out of memory (OOM) errors, you can try to tune the following parameters.
 - You can also try to decrease `--mem-fraction-static`, which reduces the memory usage of the KV cache memory pool and helps both prefill and decoding.

 ### Try Advanced Options
-- To enable the experimental overlapped scheduler, add `--enable-overlap-scheduler`. It overlaps CPU scheduler with GPU computation and can accelerate almost all workloads. This does not work for constrained decoding currenly.
-- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes. This does not work for FP8 currenly.
+- To enable the experimental overlapped scheduler, add `--enable-overlap-schedule`. It overlaps CPU scheduler with GPU computation and can accelerate almost all workloads. This does not work for constrained decoding currently.
+- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes. This does not work for FP8 currently.
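
For example, an illustrative launch command that combines the two options above (the model path is a placeholder):

```bash
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct \
  --enable-overlap-schedule --enable-torch-compile
```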

### Tune `--schedule-policy`
If the workload has many shared prefixes, use the default `--schedule-policy lpm`. `lpm` stands for longest prefix match.
1 change: 1 addition & 0 deletions docs/references/supported_models.md
@@ -28,6 +28,7 @@
 - XVERSE / XVERSE MoE
 - SmolLM
 - GLM-4
+- Phi-3-Small

 ## Embedding Models

2 changes: 1 addition & 1 deletion docs/references/troubleshooting.md
@@ -11,4 +11,4 @@ If you see out of memory (OOM) errors, you can try to tune the following parameters.
 ## CUDA error: an illegal memory access was encountered
 This error may be due to kernel errors or out-of-memory issues.
 - If it is a kernel error, it is not easy to fix. Please file an issue on the GitHub.
-- If it is out-of-memory, sometimes it will report this error instead of "Out-of-memory." Please refer to the above seciton to avoid the OOM.
+- If it is out-of-memory, sometimes it will report this error instead of "Out-of-memory." Please refer to the above section to avoid the OOM.
8 changes: 4 additions & 4 deletions docs/start/install.md
@@ -16,7 +16,7 @@ Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/
 ## Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.5 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.5.post2 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip

@@ -46,7 +46,7 @@ docker run --gpus all \
 Note: To AMD ROCm system with Instinct/MI GPUs, it is recommended to use `docker/Dockerfile.rocm` to build images, example and usage as below:

 ```bash
-docker build --build-arg SGL_BRANCH=v0.3.5 -t v0.3.5-rocm620 -f Dockerfile.rocm .
+docker build --build-arg SGL_BRANCH=v0.3.5.post2 -t v0.3.5.post2-rocm620 -f Dockerfile.rocm .

 alias drun='docker run -it --rm --network=host --device=/dev/kfd --device=/dev/dri --ipc=host \
     --shm-size 16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
@@ -55,11 +55,11 @@ alias drun='docker run -it --rm --network=host --device=/dev/kfd --device=/dev/d
 drun -p 30000:30000 \
     -v ~/.cache/huggingface:/root/.cache/huggingface \
     --env "HF_TOKEN=<secret>" \
-    v0.3.5-rocm620 \
+    v0.3.5.post2-rocm620 \
     python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000

 # Till flashinfer backend available, --attention-backend triton --sampling-backend pytorch are set by default
-drun v0.3.5-rocm620 python3 -m sglang.bench_latency --batch-size 32 --input 1024 --output 128 --model amd/Meta-Llama-3.1-8B-Instruct-FP8-KV --tp 8 --quantization fp8
+drun v0.3.5.post2-rocm620 python3 -m sglang.bench_latency --batch-size 32 --input 1024 --output 128 --model amd/Meta-Llama-3.1-8B-Instruct-FP8-KV --tp 8 --quantization fp8
 ```

 ## Method 4: Using docker compose
6 changes: 3 additions & 3 deletions python/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "sglang"
-version = "0.3.5"
+version = "0.3.5.post2"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -19,8 +19,8 @@ dependencies = ["requests", "tqdm", "numpy", "IPython"]
 runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
     "orjson", "packaging", "pillow", "prometheus-client>=0.20.0", "psutil", "pydantic", "python-multipart",
     "torchao", "uvicorn", "uvloop", "pyzmq>=25.1.2",
-    "outlines>=0.0.44", "modelscope"]
-srt = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post1"]
+    "outlines>=0.0.44,<0.1.0", "modelscope"]
+srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1"]

 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20241022, not from public vllm whl
4 changes: 2 additions & 2 deletions python/sglang/bench_latency.py
@@ -220,7 +220,7 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
     return reqs


-@torch.inference_mode()
+@torch.no_grad
 def extend(reqs, model_runner):
     batch = ScheduleBatch.init_new(
         reqs=reqs,
@@ -237,7 +237,7 @@ def extend(reqs, model_runner):
     return next_token_ids, logits_output.next_token_logits, batch


-@torch.inference_mode()
+@torch.no_grad
 def decode(input_token_ids, batch, model_runner):
     batch.output_ids = input_token_ids
     batch.prepare_for_decode()
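
For context on this change: `torch.no_grad` only disables gradient recording, while `torch.inference_mode` additionally marks the tensors it produces as inference tensors, which cannot later be used in autograd-tracking code. A minimal sketch of the difference (illustrative, standalone PyTorch):

```python
import torch

x = torch.ones(3, requires_grad=True)

with torch.no_grad():
    y = x * 2  # plain tensor: can still participate in autograd later

with torch.inference_mode():
    z = x * 2  # inference tensor: reusing it under autograd raises a RuntimeError

(y + x).sum().backward()    # works; gradients flow to x
# (z + x).sum().backward()  # would raise: inference tensors cannot be saved for backward
```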