Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add log probabilities and number of input tokens to additional outputs #75

Merged
merged 5 commits into from
Dec 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 93 additions & 22 deletions ci/L0_additional_outputs_vllm/additional_outputs_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,21 @@ class TestAdditionalOutputs:
_sampling_parameters = {"temperature": "0", "top_p": "1"}
_prompt = "In this example,"

def _get_sampling_parameters(self, logprobs=None):
sampling_parameters = self._sampling_parameters.copy()
if logprobs is not None:
sampling_parameters["logprobs"] = logprobs
return sampling_parameters

def _get_inputs(
self,
prompt,
stream=True,
sampling_parameters=None,
return_finish_reason=None,
return_cumulative_logprob=None,
return_logprobs=None,
return_num_input_tokens=None,
return_num_output_tokens=None,
):
inputs = []
Expand Down Expand Up @@ -76,6 +84,16 @@ def _get_inputs(
np.array([return_cumulative_logprob], dtype=bool)
)

if return_logprobs is not None:
inputs.append(grpcclient.InferInput("return_logprobs", [1], "BOOL"))
inputs[-1].set_data_from_numpy(np.array([return_logprobs], dtype=bool))

if return_num_input_tokens is not None:
inputs.append(grpcclient.InferInput("return_num_input_tokens", [1], "BOOL"))
inputs[-1].set_data_from_numpy(
np.array([return_num_input_tokens], dtype=bool)
)

if return_num_output_tokens is not None:
inputs.append(
grpcclient.InferInput("return_num_output_tokens", [1], "BOOL")
Expand All @@ -89,12 +107,12 @@ def _get_inputs(
def _callback(self, result, error):
self._responses.append({"result": result, "error": error})

def _llm_infer(self, inputs):
def _llm_infer(self, inputs, sampling_parameters):
self._responses = []
with grpcclient.InferenceServerClient(self._grpc_url) as client:
client.start_stream(self._callback)
client.async_stream_infer(
self._model_name, inputs=inputs, parameters=self._sampling_parameters
self._model_name, inputs=inputs, parameters=sampling_parameters
)
client.stop_stream()
assert len(self._responses) > 0
Expand Down Expand Up @@ -135,6 +153,63 @@ def _assert_cumulative_logprob(self, return_cumulative_logprob):
assert cumulative_logprob != prev_cumulative_logprob
prev_cumulative_logprob = cumulative_logprob

def _assert_logprobs(
self, stream, sampling_parameters, return_logprobs, return_num_output_tokens
):
for response in self._responses:
result, error = response["result"], response["error"]
assert error is None
logprobs_np = result.as_numpy(name="logprobs")
if return_logprobs is None or return_logprobs == False:
assert logprobs_np is None
continue
logprobs = json.loads(logprobs_np[0].decode("utf-8"))
if "logprobs" not in sampling_parameters:
assert logprobs is None
continue
assert isinstance(logprobs, list)
assert len(logprobs) >= 1
if return_num_output_tokens == True:
num_output_tokens = result.as_numpy(name="num_output_tokens")[0].astype(
int
)
assert len(logprobs) == num_output_tokens
text_output_logprobs = ""
for logprobs_d in logprobs:
assert isinstance(logprobs_d, dict)
assert len(logprobs_d) >= 1
assert len(logprobs_d) <= sampling_parameters["logprobs"] + 1
rank_one_found = False
for token_id, logprob_d in logprobs_d.items():
assert isinstance(token_id, str)
assert len(logprob_d) == 3
assert isinstance(logprob_d["logprob"], float)
assert isinstance(logprob_d["rank"], int)
assert isinstance(logprob_d["decoded_token"], str)
if logprob_d["rank"] == 1:
assert not rank_one_found
rank_one_found = True
text_output_logprobs += logprob_d["decoded_token"]
assert rank_one_found
text_output = result.as_numpy(name="text_output")[0].decode("utf-8")
if not stream:
# given exclude_input_in_output is not set, prepend_input is True if not
# streaming and False if streaming
text_output_logprobs = self._prompt + text_output_logprobs
assert text_output_logprobs == text_output

def _assert_num_input_tokens(self, return_num_input_tokens):
for response in self._responses:
result, error = response["result"], response["error"]
assert error is None
num_input_tokens_np = result.as_numpy(name="num_input_tokens")
if return_num_input_tokens is None or return_num_input_tokens == False:
assert num_input_tokens_np is None
continue
num_input_tokens = num_input_tokens_np.astype(int)
assert num_input_tokens > 0
assert num_input_tokens <= len(self._prompt)

def _assert_num_output_tokens(self, return_num_output_tokens):
for response in self._responses:
result, error = response["result"], response["error"]
Expand All @@ -144,46 +219,42 @@ def _assert_num_output_tokens(self, return_num_output_tokens):
assert num_output_tokens_np is None
continue
num_output_tokens = num_output_tokens_np[0].astype(int)
# TODO: vLLM may return token ids identical to the previous one when
# streaming, for example:
#
# prev: None
# curr: text=' the', token_ids=array('l', [5])
#
# prev: text=' the', token_ids=array('l', [5, 1385])
# curr: text=' the term', token_ids=array('l', [5, 1385])
#
# prev: text=' the term', token_ids=array('l', [5, 1385, 44])
# curr: text=' the term', token_ids=array('l', [5, 1385, 44])
#
# prev: text=' the term', token_ids=array('l', [5, 1385, 44, 48])
# curr: text=' the term “', token_ids=array('l', [5, 1385, 44, 48])
#
# If this is no longer the case in a future release, change the assert
# to assert num_output_tokens > 0.
assert num_output_tokens >= 0
assert num_output_tokens > 0

@pytest.mark.parametrize("stream", [True, False])
@pytest.mark.parametrize("return_finish_reason", [None, True, False])
@pytest.mark.parametrize("return_cumulative_logprob", [None, True, False])
@pytest.mark.parametrize("logprobs", [None, 0, 2])
@pytest.mark.parametrize("return_logprobs", [None, True, False])
@pytest.mark.parametrize("return_num_input_tokens", [None, True, False])
@pytest.mark.parametrize("return_num_output_tokens", [None, True, False])
def test_additional_outputs(
    self,
    stream,
    return_finish_reason,
    return_cumulative_logprob,
    logprobs,
    return_logprobs,
    return_num_input_tokens,
    return_num_output_tokens,
):
    """Run one inference per parameter combination and check every output.

    The sampling parameters are built once and passed both to the request
    inputs and to the inference call, so the assertions see the same
    "logprobs" setting that was sent.
    """
    sampling_parameters = self._get_sampling_parameters(logprobs=logprobs)
    inputs = self._get_inputs(
        self._prompt,
        stream=stream,
        sampling_parameters=sampling_parameters,
        return_finish_reason=return_finish_reason,
        return_cumulative_logprob=return_cumulative_logprob,
        return_logprobs=return_logprobs,
        return_num_input_tokens=return_num_input_tokens,
        return_num_output_tokens=return_num_output_tokens,
    )
    self._llm_infer(inputs, sampling_parameters)
    self._assert_text_output_valid()
    self._assert_finish_reason(return_finish_reason)
    self._assert_cumulative_logprob(return_cumulative_logprob)
    self._assert_logprobs(
        stream, sampling_parameters, return_logprobs, return_num_output_tokens
    )
    self._assert_num_input_tokens(return_num_input_tokens)
    self._assert_num_output_tokens(return_num_output_tokens)
2 changes: 1 addition & 1 deletion ci/L0_additional_outputs_vllm/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.3/' models/v
RET=0

# Test
SERVER_LOG="vllm_opt.server.log"
SERVER_LOG="additional_outputs_test.server.log"
SERVER_ARGS="--model-repository=models"
run_server
if [ "$SERVER_PID" == "0" ]; then
Expand Down
25 changes: 20 additions & 5 deletions docs/additional_outputs.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,6 @@ for more details.
To enable, set `return_finish_reason` input tensor to `True`. The reason will be
sent as a string on the `finish_reason` output tensor.

Supported since r24.12.

### Cumulative Log Probabilities

The cumulative log probability of the generated output text. See
Expand All @@ -57,7 +55,26 @@ for more details.
To enable, set `return_cumulative_logprob` input tensor to `True`. The floating
point value will be sent on the `cumulative_logprob` output tensor.

Supported since r24.12.
### Log Probabilities

The log probabilities of the top probability tokens at each position of the
output, returned when
[logprobs](https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/sampling_params.py#L146-L152)
is requested in the sampling parameters. Only the log probabilities of the new
tokens generated since the last response are returned on each new response. See
[here](https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/outputs.py#L24-L25)
for more details on the log probabilities.

To enable, set `return_logprobs` input tensor to `True`. The log probabilities
will be sent on the `logprobs` output tensor as a serialized JSON string.

### Number of Input Tokens

The number of token IDs of the prompt. See
[here](https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/outputs.py#L79-L81)
for more details.

To enable, set `return_num_input_tokens` input tensor to `True`. The unsigned
integer value will be sent on the `num_input_tokens` output tensor.

### Number of Output Tokens

Expand All @@ -71,8 +88,6 @@ for more details on the token IDs of the generated output text.
To enable, set `return_num_output_tokens` input tensor to `True`. The unsigned
integer value will be sent on the `num_output_tokens` output tensor.

Supported since r24.12.

## Examples

### Add Finish Reason to Outputs
Expand Down
Loading
Loading