diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py
index 386b08cc7813..83201e78283d 100644
--- a/nemo/collections/llm/api.py
+++ b/nemo/collections/llm/api.py
@@ -337,6 +337,7 @@ def deploy(
     max_input_len: int = 256,
     max_output_len: int = 256,
     max_batch_size: int = 8,
+    output_context_logits: bool = True,
     output_generation_logits: bool = True,
 ):
     """
@@ -364,8 +365,11 @@ def deploy(
             Needs to be True to be able to run evaluation. Default: True.
         openai_format_response (bool): Return the response from PyTriton server in OpenAI compatible format.
             Needs to be True while running evaluation. Default: True.
+        output_context_logits (bool): If True builds trtllm engine with gather_context_logits set to True. context_logits
+            are used to compute the logProb of the output token for multi-token prediction benchmarks. Default: True.
         output_generation_logits (bool): If True builds trtllm engine with gather_generation_logits set to True.
-            generation_logits are used to compute the logProb of the output token. Default: True.
+            generation_logits are used to compute the logProb of the output token for single-token prediction
+            benchmarks (like MMLU, lambada). Default: True.
     """
     from nemo.collections.llm.deploy.base import get_trtllm_deployable, unset_environment_variables
     from nemo.deploy import DeployPyTriton
@@ -383,6 +387,7 @@ def deploy(
         max_output_len,
         max_batch_size,
         dtype,
+        output_context_logits,
         output_generation_logits,
     )

@@ -425,7 +430,6 @@ def evaluate(
     limit: Optional[Union[int, float]] = None,
     bootstrap_iters: int = 100000,
     # inference params
-    max_tokens_to_generate: Optional[int] = 256,
     temperature: Optional[float] = 0.000000001,
     top_p: Optional[float] = 0.0,
     top_k: Optional[int] = 1,
@@ -454,7 +458,6 @@ def evaluate(
         bootstrap_iters (int): Number of iterations for bootstrap statistics, used when calculating stderrs. Set to 0
             for no stderr calculations to be performed. Default: 100000.
         # inference params
-        max_tokens_to_generate (int): max tokens to generate. Default: 256.
         temperature: Optional[float]: float value between 0 and 1. A temperature of 0 indicates greedy decoding, where
             the token with the highest prob is chosen. Temperature can't be set to 0.0 currently, due to a bug with
             TRTLLM (# TODO to be investigated). Hence a very small value is used as the default. Default: 0.000000001.
@@ -480,9 +483,7 @@ def evaluate(
     # Wait for server to be ready before starting evaluation
     evaluation.wait_for_server_ready(url=url, triton_http_port=triton_http_port, model_name=model_name)
     # Create an object of the NeMoFWLM which is passed as a model to evaluator.simple_evaluate
-    model = evaluation.NeMoFWLMEval(
-        model_name, url, tokenizer, max_tokens_to_generate, temperature, top_p, top_k, add_bos
-    )
+    model = evaluation.NeMoFWLMEval(model_name, url, tokenizer, temperature, top_p, top_k, add_bos)
     results = evaluator.simple_evaluate(
         model=model,
         tasks=eval_task,
diff --git a/nemo/collections/llm/deploy/base.py b/nemo/collections/llm/deploy/base.py
index 4b0065271604..fd82c94effb3 100644
--- a/nemo/collections/llm/deploy/base.py
+++ b/nemo/collections/llm/deploy/base.py
@@ -63,6 +63,7 @@ def get_trtllm_deployable(
     max_output_len,
     max_batch_size,
     dtype,
+    output_context_logits,
     output_generation_logits,
 ):
     """
@@ -109,6 +110,7 @@ def get_trtllm_deployable(
             max_output_len=max_output_len,
             max_batch_size=max_batch_size,
             dtype=dtype,
+            gather_context_logits=output_context_logits,
             gather_generation_logits=output_generation_logits,
         )
     except Exception as error:
diff --git a/nemo/collections/llm/evaluation/base.py b/nemo/collections/llm/evaluation/base.py
index aa415cb1022a..96d99b445433 100644
--- a/nemo/collections/llm/evaluation/base.py
+++ b/nemo/collections/llm/evaluation/base.py
@@ -33,38 +33,51 @@ class NeMoFWLMEval(LM):
     Created based on: https://github.com/EleutherAI/lm-evaluation-harness/blob/v0.4.4/docs/model_guide.md
     """

-    def __init__(self, model_name, api_url, tokenizer, max_tokens_to_generate, temperature, top_p, top_k, add_bos):
+    def __init__(self, model_name, api_url, tokenizer, temperature, top_p, top_k, add_bos):
         self.model_name = model_name
         self.api_url = api_url
         self.tokenizer = tokenizer
-        self.max_tokens_to_generate = max_tokens_to_generate
         self.temperature = temperature
         self.top_p = top_p
         self.top_k = top_k
         self.add_bos = add_bos
         super().__init__()

-    def _generate_tokens_logits(self, payload, return_text: bool = False, return_logits: bool = False):
+    def _generate_tokens_logits(
+        self, payload, single_prediction_token, return_text: bool = False, return_logits: bool = False
+    ):
         """
         A private method that sends a post request to the model on the PyTriton server and returns either generated
         text or logits.
         """
         nq = NemoQueryLLM(url=self.api_url, model_name=payload['model'])
+        output_context_logits = False
+        output_generation_logits = False
+        if single_prediction_token:
+            # In case of single-token prediction, return the generation logits
+            output_generation_logits = True
+        else:
+            # In case of multi-token prediction, return the context logits
+            output_context_logits = True
         response = nq.query_llm(
             prompts=payload['prompt'] if isinstance(payload['prompt'], list) else [payload['prompt']],
             max_output_len=payload['max_tokens'],
             top_k=payload['top_k'],
             top_p=payload['top_p'],
             temperature=payload['temperature'],
-            output_generation_logits=True,
+            output_context_logits=output_context_logits,
+            output_generation_logits=output_generation_logits,
             openai_format_response=True,
         )
         if return_text:
             return response["choices"][0]["text"]  # shape[batch_size, 1]
-        if return_logits:
-            return response["choices"][0]["generation_logits"]  # shape[batch_size, 1, num_tokens, vocab_size]
+        elif return_logits:
+            if output_context_logits:
+                return response["choices"][0]["context_logits"]
+            else:
+                return response["choices"][0]["generation_logits"]

     def tokenizer_type(self, tokenizer):
         """
@@ -93,6 +106,16 @@ def loglikelihood(self, requests: list[Instance]):
         elif tokenizer_type == "AutoTokenizer":
             special_tokens_kwargs['add_special_tokens'] = self.add_bos

+        single_prediction_token = False
+        # Assuming we evaluate only one benchmark/task at a time, all instances in requests belong to the same
+        # task.
+        mmlu_regex_pattern = r"^mmlu_"
+        lambada_regex_pattern = r"^lambada_"
+        if re.match(mmlu_regex_pattern, requests[0].task_name) or re.match(
+            lambada_regex_pattern, requests[0].task_name
+        ):
+            single_prediction_token = True
+
         results = []
         for request in tqdm(requests):
             # get the input prompt from the request
@@ -105,31 +128,39 @@ def loglikelihood(self, requests: list[Instance]):
             if self.tokenizer_type(self.tokenizer) == "SentencePieceTokenizer":
                 continuation_enc = continuation_enc[1:]
             num_cont_tokens = len(continuation_enc)
-            # Update self.max_tokens_to_generate with number of continuation tokens (or output tokens) in the request
-            self.max_tokens_to_generate = num_cont_tokens
+            # Hard-code max_tokens_to_generate to 1 so that only a single token is ever generated
+            self.max_tokens_to_generate = 1
+            # Drop the last continuation token before building the input prompt, by replacing it with an empty string
+            prompt = context + continuation.replace(self.tokenizer.tokenizer.decode(continuation_enc[-1]), "")
             # Create payload to query the model deployed on PyTriton server
             payload = {
                 "model": self.model_name,
-                "prompt": context,
+                "prompt": prompt,
                 "max_tokens": self.max_tokens_to_generate,
                 "temperature": self.temperature,
                 "top_p": self.top_p,
                 "top_k": self.top_k,
             }
             # Get the logits from the model
-            generation_logits = self._generate_tokens_logits(payload, return_logits=True)
-            # Convert generation_logits to torch tensor to easily get logprobs wo manual implementation of log_softmax
-            multi_logits = F.log_softmax(torch.tensor(generation_logits[0]), dim=-1)
+            logits = self._generate_tokens_logits(payload, single_prediction_token, return_logits=True)
+            # In case of multi-token prediction, where full context logits are returned, keep only the logits
+            # corresponding to the continuation tokens from the context logits tensor. context_logits contains logits
+            # for all tokens in the input prompt along with the logit of the next-token prediction after the final
+            # token in the prompt. Shape of context_logits: [1, #tokens_in_prompt+1, vocab_size]
+            if not single_prediction_token:
+                logits = logits[:, -num_cont_tokens:, :]
+            # Convert logits to torch tensor to easily get logprobs without a manual implementation of log_softmax
+            logProbs = F.log_softmax(torch.tensor(logits), dim=-1)
             # Convert encoded continuation tokens to torch tensor
             cont_toks = torch.tensor(continuation_enc, dtype=torch.long).unsqueeze(0)
             # Get the greedy token from the logits (i.e. the token with the highest prob)
-            greedy_tokens = multi_logits.argmax(dim=-1)
+            greedy_tokens = logProbs.argmax(dim=-1)
             # Check if all greedy_tokens match the actual continuation tokens
             is_greedy = (greedy_tokens == cont_toks).all()
             # Get the logits corresponding to the actual continuation tokens
-            logits = torch.gather(multi_logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1)
+            logProbs_actual = torch.gather(logProbs, 2, cont_toks.unsqueeze(-1)).squeeze(-1)
             # result is a tuple of the logProb of generating the continuation tokens and is_greedy
-            result = (float(logits.sum()), bool(is_greedy))
+            result = (float(logProbs_actual.sum()), bool(is_greedy))
             results.append(result)
diff --git a/nemo/deploy/nlp/query_llm.py b/nemo/deploy/nlp/query_llm.py
index e91754297aa7..097445888726 100644
--- a/nemo/deploy/nlp/query_llm.py
+++ b/nemo/deploy/nlp/query_llm.py
@@ -198,6 +198,7 @@ def query_llm(
         end_strings=None,
         init_timeout=60.0,
         openai_format_response: bool = False,
+        output_context_logits: bool = False,
         output_generation_logits: bool = False,
     ):
         """
@@ -275,6 +276,9 @@ def query_llm(
         if end_strings is not None:
             inputs["end_strings"] = str_list2numpy(end_strings)

+        if output_context_logits is not None:
+            inputs["output_context_logits"] = np.full(prompts.shape, output_context_logits, dtype=np.bool_)
+
         if output_generation_logits is not None:
             inputs["output_generation_logits"] = np.full(prompts.shape, output_generation_logits, dtype=np.bool_)

@@ -301,6 +305,8 @@ def query_llm(
             }
             if output_generation_logits:
                 openai_response["choices"][0]["generation_logits"] = result_dict["generation_logits"]
+            if output_context_logits:
+                openai_response["choices"][0]["context_logits"] = result_dict["context_logits"]
             return openai_response
         else:
             return sentences
diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py
index 3c6946f129a7..c4ec99dcabd9 100644
--- a/nemo/export/tensorrt_llm.py
+++ b/nemo/export/tensorrt_llm.py
@@ -959,6 +959,7 @@ def forward(
         prompt_embeddings_checkpoint_path: str = None,
         streaming: bool = False,
         output_log_probs: bool = False,
+        output_context_logits: bool = False,
         output_generation_logits: bool = False,
         **sampling_kwargs,
     ):
@@ -1049,6 +1050,7 @@ def forward(
             no_repeat_ngram_size=no_repeat_ngram_size,
             output_log_probs=output_log_probs,
             multiprocessed_env=multiprocessed_env,
+            output_context_logits=output_context_logits,
             output_generation_logits=output_generation_logits,
             **sampling_kwargs,
         )
@@ -1133,6 +1135,7 @@ def get_triton_input(self):
             Tensor(name="no_repeat_ngram_size", shape=(-1,), dtype=np.single, optional=True),
             Tensor(name="task_id", shape=(-1,), dtype=bytes, optional=True),
             Tensor(name="lora_uids", shape=(-1,), dtype=bytes, optional=True),
+            Tensor(name="output_context_logits", shape=(-1,), dtype=np.bool_, optional=False),
             Tensor(name="output_generation_logits", shape=(-1,), dtype=np.bool_, optional=False),
         )
         return inputs
@@ -1142,6 +1145,7 @@ def get_triton_output(self):
         outputs = (
             Tensor(name="outputs", shape=(-1,), dtype=bytes),
             Tensor(name="generation_logits", shape=(-1,), dtype=np.single),
+            Tensor(name="context_logits", shape=(-1,), dtype=np.single),
         )
         return outputs

@@ -1149,6 +1153,7 @@ def get_triton_output(self):
     def triton_infer_fn(self, **inputs: np.ndarray):
         """Triton infer function for streaming"""
         output_dict = {}
+        context_logits_available = False
         generation_logits_available = False
         try:
             infer_input = {"input_texts": str_ndarray2list(inputs.pop("prompts"))}
@@ -1179,10 +1184,21 @@ def triton_infer_fn(self, **inputs: np.ndarray):
             if "output_generation_logits" in inputs:
                 generation_logits_available = inputs["output_generation_logits"]
                 infer_input["output_generation_logits"] = inputs.pop("output_generation_logits")[0][0]
+            if "output_context_logits" in inputs:
+                context_logits_available = inputs["output_context_logits"]
+                infer_input["output_context_logits"] = inputs.pop("output_context_logits")[0][0]

             if generation_logits_available:
                 output_texts, generation_logits = self.forward(**infer_input)
-                output_dict["generation_logits"] = np.array(generation_logits.cpu().numpy())
+                # generation_logits is a 4d tensor of shape [1, 1, #generated_tokens, vocab_size]; return just the
+                # 3d tensor in the output dict.
+                output_dict["generation_logits"] = np.array(generation_logits[0].cpu().numpy())
+            elif context_logits_available:
+                output_texts, context_logits = self.forward(**infer_input)
+                # Convert the context logits to a 3d tensor, since they are available as a list of tensors shaped
+                # [#tokens, vocab_size]
+                context_logits = context_logits[0].unsqueeze(0)
+                output_dict["context_logits"] = np.array(context_logits.cpu().numpy())
             else:
                 output_texts = self.forward(**infer_input)
             output_dict["outputs"] = cast_output(output_texts, np.bytes_)
diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py
index ef67c918290f..8be537f840e8 100644
--- a/nemo/export/trt_llm/tensorrt_llm_run.py
+++ b/nemo/export/trt_llm/tensorrt_llm_run.py
@@ -647,6 +647,7 @@ def generate(
     streaming: bool = False,
     output_log_probs=False,
     multiprocessed_env=False,
+    output_context_logits=False,
     output_generation_logits=False,
     **sampling_kwargs,
 ) -> Optional[List[List[str]]]:
@@ -709,6 +710,8 @@ def generate(
     if output_generation_logits:
         return output_lines_list, outputs['generation_logits']
+    elif output_context_logits:
+        return output_lines_list, outputs['context_logits']

     return output_lines_list
diff --git a/requirements/requirements_eval.txt b/requirements/requirements_eval.txt
new file mode 100644
index 000000000000..60828395c199
--- /dev/null
+++ b/requirements/requirements_eval.txt
@@ -0,0 +1,2 @@
+# Installs EleutherAI's lm-evaluation-harness: https://github.com/EleutherAI/lm-evaluation-harness/tree/main
+lm-eval
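Usage sketch (not part of the patch): how the new `output_context_logits` flag on `NemoQueryLLM.query_llm` might be exercised against an already-deployed model. The Triton URL, model name, and prompt are placeholder assumptions; the keyword arguments mirror the call made in `nemo/collections/llm/evaluation/base.py`, and the import path for `NemoQueryLLM` is assumed.

```python
from nemo.deploy.nlp import NemoQueryLLM  # assumed import path

# Placeholder deployment details -- substitute the actual Triton URL and served model name.
nq = NemoQueryLLM(url="localhost:8000", model_name="triton_model")

response = nq.query_llm(
    prompts=["NVIDIA is headquartered in Santa Clara,"],
    max_output_len=1,                 # evaluation now always generates a single token
    top_k=1,
    top_p=0.0,
    temperature=0.000000001,
    output_context_logits=True,       # new flag: return logits for every prompt token
    openai_format_response=True,
)

# With openai_format_response=True, context logits are attached to the first choice,
# shaped [1, #tokens_in_prompt + 1, vocab_size] as documented in evaluation/base.py.
context_logits = response["choices"][0]["context_logits"]
```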