feat: Add log probabilities and number of input tokens to additional outputs #75

Merged (5 commits) on Dec 5, 2024
ci/L0_additional_outputs_vllm/additional_outputs_test.py (115 changes: 93 additions & 22 deletions)
@@ -37,13 +37,21 @@ class TestAdditionalOutputs:
_sampling_parameters = {"temperature": "0", "top_p": "1"}
_prompt = "In this example,"

def _get_sampling_parameters(self, logprobs=None):
sampling_parameters = self._sampling_parameters.copy()
if logprobs is not None:
sampling_parameters["logprobs"] = logprobs
return sampling_parameters

def _get_inputs(
self,
prompt,
stream=True,
sampling_parameters=None,
return_finish_reason=None,
return_cumulative_logprob=None,
return_logprobs=None,
return_num_input_tokens=None,
return_num_output_tokens=None,
):
inputs = []
Expand Down Expand Up @@ -76,6 +84,16 @@ def _get_inputs(
np.array([return_cumulative_logprob], dtype=bool)
)

if return_logprobs is not None:
inputs.append(grpcclient.InferInput("return_logprobs", [1], "BOOL"))
inputs[-1].set_data_from_numpy(np.array([return_logprobs], dtype=bool))

if return_num_input_tokens is not None:
inputs.append(grpcclient.InferInput("return_num_input_tokens", [1], "BOOL"))
inputs[-1].set_data_from_numpy(
np.array([return_num_input_tokens], dtype=bool)
)

if return_num_output_tokens is not None:
inputs.append(
grpcclient.InferInput("return_num_output_tokens", [1], "BOOL")
@@ -89,12 +107,12 @@ def _get_inputs(
def _callback(self, result, error):
self._responses.append({"result": result, "error": error})

def _llm_infer(self, inputs):
def _llm_infer(self, inputs, sampling_parameters):
self._responses = []
with grpcclient.InferenceServerClient(self._grpc_url) as client:
client.start_stream(self._callback)
client.async_stream_infer(
self._model_name, inputs=inputs, parameters=self._sampling_parameters
self._model_name, inputs=inputs, parameters=sampling_parameters
)
client.stop_stream()
assert len(self._responses) > 0
@@ -135,6 +153,63 @@ def _assert_cumulative_logprob(self, return_cumulative_logprob):
assert cumulative_logprob != prev_cumulative_logprob
prev_cumulative_logprob = cumulative_logprob

def _assert_logprobs(
self, stream, sampling_parameters, return_logprobs, return_num_output_tokens
):
for response in self._responses:
result, error = response["result"], response["error"]
assert error is None
logprobs_np = result.as_numpy(name="logprobs")
if return_logprobs is None or return_logprobs == False:
assert logprobs_np is None
continue
logprobs = json.loads(logprobs_np[0].decode("utf-8"))
if "logprobs" not in sampling_parameters:
assert logprobs is None
continue
assert isinstance(logprobs, list)
assert len(logprobs) >= 1
if return_num_output_tokens == True:
num_output_tokens = result.as_numpy(name="num_output_tokens")[0].astype(
int
)
assert len(logprobs) == num_output_tokens
text_output_logprobs = ""
for logprobs_d in logprobs:
assert isinstance(logprobs_d, dict)
assert len(logprobs_d) >= 1
assert len(logprobs_d) <= sampling_parameters["logprobs"] + 1
rank_one_found = False
for token_id, logprob_d in logprobs_d.items():
assert isinstance(token_id, str)
assert len(logprob_d) == 3
assert isinstance(logprob_d["logprob"], float)
assert isinstance(logprob_d["rank"], int)
assert isinstance(logprob_d["decoded_token"], str)
if logprob_d["rank"] == 1:
assert not rank_one_found
rank_one_found = True
text_output_logprobs += logprob_d["decoded_token"]
assert rank_one_found
text_output = result.as_numpy(name="text_output")[0].decode("utf-8")
if not stream:
# given exclude_input_in_output is not set, prepend_input is True if not
# streaming and False if streaming
text_output_logprobs = self._prompt + text_output_logprobs
assert text_output_logprobs == text_output

def _assert_num_input_tokens(self, return_num_input_tokens):
for response in self._responses:
result, error = response["result"], response["error"]
assert error is None
num_input_tokens_np = result.as_numpy(name="num_input_tokens")
if return_num_input_tokens is None or return_num_input_tokens == False:
assert num_input_tokens_np is None
continue
num_input_tokens = num_input_tokens_np.astype(int)
assert num_input_tokens > 0
assert num_input_tokens <= len(self._prompt)

def _assert_num_output_tokens(self, return_num_output_tokens):
for response in self._responses:
result, error = response["result"], response["error"]
@@ -144,46 +219,42 @@ def _assert_num_output_tokens(self, return_num_output_tokens):
assert num_output_tokens_np is None
continue
num_output_tokens = num_output_tokens_np[0].astype(int)
# TODO: vLLM may return token ids identical to the previous one when
# streaming, for example:
#
# prev: None
# curr: text=' the', token_ids=array('l', [5])
#
# prev: text=' the', token_ids=array('l', [5, 1385])
# curr: text=' the term', token_ids=array('l', [5, 1385])
#
# prev: text=' the term', token_ids=array('l', [5, 1385, 44])
# curr: text=' the term', token_ids=array('l', [5, 1385, 44])
#
# prev: text=' the term', token_ids=array('l', [5, 1385, 44, 48])
# curr: text=' the term “', token_ids=array('l', [5, 1385, 44, 48])
#
# If this is no longer the case in a future release, change the assert
# to assert num_output_tokens > 0.
assert num_output_tokens >= 0
assert num_output_tokens > 0

@pytest.mark.parametrize("stream", [True, False])
@pytest.mark.parametrize("return_finish_reason", [None, True, False])
@pytest.mark.parametrize("return_cumulative_logprob", [None, True, False])
@pytest.mark.parametrize("logprobs", [None, 0, 2])
@pytest.mark.parametrize("return_logprobs", [None, True, False])
@pytest.mark.parametrize("return_num_input_tokens", [None, True, False])
@pytest.mark.parametrize("return_num_output_tokens", [None, True, False])
def test_additional_outputs(
self,
stream,
return_finish_reason,
return_cumulative_logprob,
logprobs,
return_logprobs,
return_num_input_tokens,
return_num_output_tokens,
):
sampling_parameters = self._get_sampling_parameters(logprobs=logprobs)
inputs = self._get_inputs(
self._prompt,
stream=stream,
sampling_parameters=self._sampling_parameters,
sampling_parameters=sampling_parameters,
return_finish_reason=return_finish_reason,
return_cumulative_logprob=return_cumulative_logprob,
return_logprobs=return_logprobs,
return_num_input_tokens=return_num_input_tokens,
return_num_output_tokens=return_num_output_tokens,
)
self._llm_infer(inputs)
self._llm_infer(inputs, sampling_parameters)
self._assert_text_output_valid()
self._assert_finish_reason(return_finish_reason)
self._assert_cumulative_logprob(return_cumulative_logprob)
self._assert_logprobs(
stream, sampling_parameters, return_logprobs, return_num_output_tokens
)
self._assert_num_input_tokens(return_num_input_tokens)
self._assert_num_output_tokens(return_num_output_tokens)
ci/L0_additional_outputs_vllm/test.sh (2 changes: 1 addition & 1 deletion)
@@ -40,7 +40,7 @@ sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.3/' models/v
RET=0

# Test
SERVER_LOG="vllm_opt.server.log"
SERVER_LOG="additional_outputs_test.server.log"
SERVER_ARGS="--model-repository=models"
run_server
if [ "$SERVER_PID" == "0" ]; then
docs/additional_outputs.md (25 changes: 20 additions & 5 deletions)
@@ -46,8 +46,6 @@ for more details.
To enable, set `return_finish_reason` input tensor to `True`. The reason will be
sent as a string on the `finish_reason` output tensor.

Supported since r24.12.

### Cumulative Log Probabilities

The cumulative log probability of the generated output text. See
@@ -57,7 +55,26 @@ for more details.
To enable, set `return_cumulative_logprob` input tensor to `True`. The floating
point value will be sent on the `cumulative_logprob` output tensor.

Supported since r24.12.
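
For illustration, a minimal streaming client that requests this output could
look like the sketch below. The model name `vllm_opt`, the endpoint
`localhost:8001`, and the prompt are assumptions taken from this repository's
CI setup; adjust them for your deployment.

```python
import numpy as np
import tritonclient.grpc as grpcclient

responses = []

def callback(result, error):
    # Collect every streamed response (or error) from the decoupled model.
    responses.append((result, error))

inputs = [grpcclient.InferInput("text_input", [1], "BYTES")]
inputs[-1].set_data_from_numpy(np.array([b"In this example,"], dtype=np.object_))
inputs.append(grpcclient.InferInput("stream", [1], "BOOL"))
inputs[-1].set_data_from_numpy(np.array([True], dtype=bool))
inputs.append(grpcclient.InferInput("return_cumulative_logprob", [1], "BOOL"))
inputs[-1].set_data_from_numpy(np.array([True], dtype=bool))

with grpcclient.InferenceServerClient("localhost:8001") as client:
    client.start_stream(callback)
    client.async_stream_infer(
        "vllm_opt", inputs=inputs, parameters={"temperature": "0", "top_p": "1"}
    )
    client.stop_stream()

for result, error in responses:
    assert error is None
    text = result.as_numpy(name="text_output")[0].decode("utf-8")
    cumulative_logprob = result.as_numpy(name="cumulative_logprob")[0]
    print(text, cumulative_logprob)
```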
### Log Probabilities

The log probabilities of the top probability tokens at each position of the
output text, returned when
[logprobs](https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/sampling_params.py#L146-L152)
is requested in the sampling parameters. Only the log probabilities of the new
tokens generated since the last response are returned on each new response. See
[here](https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/outputs.py#L24-L25)
for more details on the log probabilities.

To enable, set `return_logprobs` input tensor to `True`. The log probabilities
will be sent on the `logprobs` output tensor as a serialized JSON string.
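
A sketch of requesting per-token log probabilities and decoding the JSON
payload follows; the model name `vllm_opt` and endpoint `localhost:8001` are
assumptions.

```python
import json

import numpy as np
import tritonclient.grpc as grpcclient

responses = []

def callback(result, error):
    responses.append((result, error))

inputs = [grpcclient.InferInput("text_input", [1], "BYTES")]
inputs[-1].set_data_from_numpy(np.array([b"In this example,"], dtype=np.object_))
inputs.append(grpcclient.InferInput("stream", [1], "BOOL"))
inputs[-1].set_data_from_numpy(np.array([True], dtype=bool))
inputs.append(grpcclient.InferInput("return_logprobs", [1], "BOOL"))
inputs[-1].set_data_from_numpy(np.array([True], dtype=bool))

# "logprobs": 2 asks vLLM for the top-2 tokens per position; the sampled token
# is always included, so each position carries at most 3 entries.
sampling_parameters = {"temperature": "0", "top_p": "1", "logprobs": 2}

with grpcclient.InferenceServerClient("localhost:8001") as client:
    client.start_stream(callback)
    client.async_stream_infer(
        "vllm_opt", inputs=inputs, parameters=sampling_parameters
    )
    client.stop_stream()

for result, error in responses:
    assert error is None
    # One JSON string per response, covering only the newly generated tokens.
    logprobs = json.loads(result.as_numpy(name="logprobs")[0].decode("utf-8"))
    for position in logprobs:
        for token_id, info in position.items():
            print(token_id, info["decoded_token"], info["logprob"], info["rank"])
```

Each element of the decoded list corresponds to one generated token position
and maps token IDs (as strings) to a dictionary with `logprob`, `rank`, and
`decoded_token` fields.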

### Number of Input Tokens

The number of token IDs of the prompt. See
[here](https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/outputs.py#L79-L81)
for more details.

To enable, set `return_num_input_tokens` input tensor to `True`. The unsigned
integer value will be sent on the `num_input_tokens` output tensor.
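
A compact sketch of requesting the prompt token count, again assuming a model
named `vllm_opt` served at `localhost:8001`:

```python
import numpy as np
import tritonclient.grpc as grpcclient

responses = []

inputs = [grpcclient.InferInput("text_input", [1], "BYTES")]
inputs[-1].set_data_from_numpy(np.array([b"In this example,"], dtype=np.object_))
inputs.append(grpcclient.InferInput("stream", [1], "BOOL"))
inputs[-1].set_data_from_numpy(np.array([True], dtype=bool))
inputs.append(grpcclient.InferInput("return_num_input_tokens", [1], "BOOL"))
inputs[-1].set_data_from_numpy(np.array([True], dtype=bool))

with grpcclient.InferenceServerClient("localhost:8001") as client:
    client.start_stream(lambda result, error: responses.append((result, error)))
    client.async_stream_infer(
        "vllm_opt", inputs=inputs, parameters={"temperature": "0", "top_p": "1"}
    )
    client.stop_stream()

for result, error in responses:
    assert error is None
    # The prompt token count is reported on every streamed response.
    print("num_input_tokens:", result.as_numpy(name="num_input_tokens").item())
```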

### Number of Output Tokens

@@ -71,8 +88,6 @@ for more details on the token IDs of the generated output text.
To enable, set `return_num_output_tokens` input tensor to `True`. The unsigned
integer value will be sent on the `num_output_tokens` output tensor.

Supported since r24.12.
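
When streaming, each response reports only the tokens generated since the
previous response, so the per-response counts can be summed for a total, as in
this sketch (model name and endpoint are assumptions):

```python
import numpy as np
import tritonclient.grpc as grpcclient

responses = []

inputs = [grpcclient.InferInput("text_input", [1], "BYTES")]
inputs[-1].set_data_from_numpy(np.array([b"In this example,"], dtype=np.object_))
inputs.append(grpcclient.InferInput("stream", [1], "BOOL"))
inputs[-1].set_data_from_numpy(np.array([True], dtype=bool))
inputs.append(grpcclient.InferInput("return_num_output_tokens", [1], "BOOL"))
inputs[-1].set_data_from_numpy(np.array([True], dtype=bool))

with grpcclient.InferenceServerClient("localhost:8001") as client:
    client.start_stream(lambda result, error: responses.append((result, error)))
    client.async_stream_infer(
        "vllm_opt", inputs=inputs, parameters={"temperature": "0", "top_p": "1"}
    )
    client.stop_stream()

# Sum the per-response counts to get the total number of generated tokens.
total = 0
for result, error in responses:
    assert error is None
    total += result.as_numpy(name="num_output_tokens").item()
print("total output tokens:", total)
```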

## Examples

### Add Finish Reason to Outputs