Merge branch 'main' into dev/onevision

kcz358 committed Aug 14, 2024
2 parents 59f2a1c + 67c0d83 commit b5ed5c6
Showing 23 changed files with 882 additions and 543 deletions.
3 changes: 2 additions & 1 deletion .github/ISSUE_TEMPLATE/1-bug-report.yml
@@ -12,6 +12,7 @@ body:
- label: 2. The bug has not been fixed in the latest version.
- label: 3. Please note that if the bug-related issue you submitted lacks corresponding environment info and a minimal reproducible demo, it will be challenging for us to reproduce and resolve the issue, reducing the likelihood of receiving feedback.
  - label: 4. If the issue you raised is not a bug but a question, please raise a discussion at https://github.com/sgl-project/sglang/discussions/new/choose. Otherwise, it will be closed.
- label: 5. Please use English, otherwise it will be closed.
- type: textarea
attributes:
label: Describe the bug
@@ -31,7 +32,7 @@ body:
attributes:
label: Environment
description: |
Please provide necessary environment information here with `python3 -m sglang.check_env`.
Please provide necessary environment information here with `python3 -m sglang.check_env`. Otherwise the issue will be closed.
placeholder: Environment here.
validations:
required: true
6 changes: 6 additions & 0 deletions .github/ISSUE_TEMPLATE/2-feature-request.yml
@@ -3,6 +3,12 @@ description: Suggest an idea for this project
title: "[Feature] "

body:
- type: checkboxes
attributes:
label: Checklist
options:
  - label: 1. If the issue you raised is not a feature request but a question, please raise a discussion at https://github.com/sgl-project/sglang/discussions/new/choose. Otherwise, it will be closed.
- label: 2. Please use English, otherwise it will be closed.
- type: textarea
attributes:
label: Motivation
13 changes: 7 additions & 6 deletions .github/pull_request_template.md
@@ -1,15 +1,16 @@
Thank you for your contribution, we really appreciate it. The following instructions will help improve your pull request and make it easier to receive feedback. If there are any items you don't understand, don't worry. Just submit the pull request and ask the maintainers for help.
<!-- Thank you for your contribution, we really appreciate it. The following instructions will help improve your pull request and make it easier to receive feedback. If there are any items you don't understand, don't worry. Just submit the pull request and ask the maintainers for help. -->

## Motivation

Please explain the motivation behind this PR and the goal you aim to achieve with it.
<!-- Please explain the motivation behind this PR and the goal you aim to achieve with it. -->

## Modification

Briefly describe the changes made in this PR.
<!-- Briefly describe the changes made in this PR. -->

## Checklist

1. Ensure pre-commit `pre-commit run --all-files` or other linting tools are used to fix potential lint issues.
2. Confirm that modifications are covered by complete unit tests. If not, please add more unit tests for correctness.
3. Modify documentation as needed, such as docstrings or example tutorials.
- [ ] Before submitting a PR for review, make sure it has **at least** passed verification in your local development environment.
- [ ] Ensure pre-commit `pre-commit run --all-files` or other linting tools are used to fix potential lint issues.
- [ ] Confirm that modifications are covered by complete unit tests. If not, please add more unit tests for correctness.
- [ ] Modify documentation as needed, such as docstrings or example tutorials.
2 changes: 1 addition & 1 deletion .github/workflows/accuracy-test.yml
@@ -43,4 +43,4 @@ jobs:
run: |
cd test/srt
python3 test_eval_accuracy_large.py
timeout-minutes: 20
timeout-minutes: 10
3 changes: 3 additions & 0 deletions .github/workflows/e2e-test.yml
@@ -39,13 +39,16 @@ jobs:
run: |
cd test/srt
python3 -m unittest test_serving_throughput.TestServingThroughput.test_default
timeout-minutes: 10

- name: Benchmark Serving Throughput (w/o RadixAttention)
run: |
cd test/srt
python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_radix_cache
timeout-minutes: 10

- name: Benchmark Serving Throughput (w/ ChunkedPrefill)
run: |
cd test/srt
python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_with_chunked_prefill
timeout-minutes: 10
13 changes: 9 additions & 4 deletions .github/workflows/moe-test.yml
@@ -36,7 +36,12 @@ jobs:
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

- name: Benchmark MOE Serving Throughput
run: |
cd test/srt
python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default
python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache
uses: nick-fields/retry@v3
with:
timeout_minutes: 15
max_attempts: 2
retry_on: error
command: |
cd test/srt
python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default
python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache
2 changes: 2 additions & 0 deletions .github/workflows/unit-test.yml
@@ -41,8 +41,10 @@ jobs:
run: |
cd test/srt
python3 run_suite.py --suite minimal
timeout-minutes: 15

- name: Test Frontend Language
run: |
cd test/lang
python3 run_suite.py --suite minimal
timeout-minutes: 10
2 changes: 1 addition & 1 deletion README.md
@@ -88,7 +88,7 @@ docker run --gpus all \
2. Execute the command `docker compose up -d` in your terminal.

### Common Notes
- If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently a required dependency for SGLang. If you are using NVIDIA GPUs below sm80, such as the T4, you cannot use SGLang for the time being. We expect to resolve this issue soon, so please stay tuned. If you encounter any FlashInfer-related issues on sm80+ devices (e.g., A100, L40S, H100), consider using the Triton kernels via `--disable-flashinfer --disable-flashinfer-sampling` and raising an issue; see the capability-check sketch after this list.
- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
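
A minimal sketch of the sm80 check mentioned above, assuming PyTorch is installed and a CUDA device is visible; the sm80 threshold and the two flags come from the note above, while everything else is illustrative:

```python
import torch

# Hedged sketch: FlashInfer needs compute capability sm80 or newer (per the note above).
major, minor = torch.cuda.get_device_capability(0)
if major < 8:
    print(f"Compute capability {major}.{minor} is below sm80; FlashInfer will not work on this GPU.")
    print("Fall back to the Triton kernels by launching the server with "
          "--disable-flashinfer --disable-flashinfer-sampling")
else:
    print(f"Compute capability {major}.{minor} detected; FlashInfer should be supported.")
```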

## Backend: SGLang Runtime (SRT)
3 changes: 3 additions & 0 deletions benchmark/gsm8k/bench_sglang.py
@@ -88,6 +88,9 @@ def few_shot_gsm8k(s, question):
for i in range(len(states)):
preds.append(get_answer_value(states[i]["answer"]))

# print(f"{preds=}")
# print(f"{labels=}")

# Compute accuracy
acc = np.mean(np.array(preds) == np.array(labels))
invalid = np.mean(np.array(preds) == INVALID)
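
As a quick illustration of the two metrics computed above (toy values, not benchmark output): predictions that cannot be parsed into a number are recorded as `INVALID`, so they count against accuracy and are also reported separately.

```python
import numpy as np

INVALID = -9999999  # sentinel for unparseable answers; the exact value here is illustrative

preds = np.array([42, 7, INVALID, 13])
labels = np.array([42, 8, 5, 13])

acc = np.mean(preds == labels)       # 2 of 4 correct -> 0.5
invalid = np.mean(preds == INVALID)  # 1 of 4 unparseable -> 0.25
print(f"{acc=}, {invalid=}")
```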
1 change: 1 addition & 0 deletions python/sglang/bench_latency.py
@@ -221,6 +221,7 @@ def correctness_test(

# Prepare inputs
input_ids, reqs = prepare_inputs_for_correctness_test(bench_args, tokenizer)
rank_print(f"{input_ids=}")

if bench_args.cut_len > 0:
# Prefill
15 changes: 13 additions & 2 deletions python/sglang/srt/constrained/jump_forward.py
@@ -62,16 +62,22 @@ def _init_state_to_jump_forward(regex_string):
id_to_symbol.setdefault(id_, []).append(symbol)

transitions = fsm_info.transitions

outgoings_ct = defaultdict(int)
state_to_jump_forward = {}
# NOTE(lsyin): Final states can lead to termination, so they naturally have one outgoing edge
for s in fsm_info.finals:
outgoings_ct[s] = 1

state_to_jump_forward = {}
for (state, id_), next_state in transitions.items():
if id_ == fsm_info.alphabet_anything_value:
# An arbitrary (match-anything) symbol cannot be recognized as jump forward
continue

symbols = id_to_symbol[id_]
for c in symbols:
if len(c) > 1:
# Skip byte level transitions
# Skip byte level transitions like c = "5E"
continue

outgoings_ct[state] += 1
@@ -87,6 +93,9 @@ def _init_state_to_jump_forward(regex_string):

# Process the byte level jump forward
outgoings_ct = defaultdict(int)
for s in fsm_info.finals:
outgoings_ct[s] = 1

for (state, id_), next_state in transitions.items():
if id_ == fsm_info.alphabet_anything_value:
continue
@@ -177,3 +186,5 @@ def test_main(regex_string):
test_main(r"霍格沃茨特快列车|霍比特人比尔博")
# 霍格: \xe9\x9c\x8d \xe6\xa0\xbc ...
# 霍比: \xe9\x9c\x8d \xe6\xaf\x94 ...

test_main(r"[-+]?[0-9]+[ ]*")
1 change: 0 additions & 1 deletion python/sglang/srt/layers/activation.py
@@ -14,7 +14,6 @@
"""Fused operators for activation layers."""

import torch
import torch.nn as nn
import torch.nn.functional as F
from flashinfer.activation import silu_and_mul
from vllm.model_executor.custom_op import CustomOp
1 change: 1 addition & 0 deletions python/sglang/srt/layers/fused_moe/__init__.py
@@ -0,0 +1 @@
from sglang.srt.layers.fused_moe.layer import FusedMoE, FusedMoEMethodBase