diff --git a/examples/llamacpp_processor.py b/examples/llamacpp_processor.py
index f09b95da0..33ec50da2 100644
--- a/examples/llamacpp_processor.py
+++ b/examples/llamacpp_processor.py
@@ -1,11 +1,9 @@
 from enum import Enum
 
-import numpy as np
-from llama_cpp import Llama, LogitsProcessorList, StoppingCriteria, StoppingCriteriaList
-from numpy.typing import NDArray
+from llama_cpp import Llama, LogitsProcessorList
 from pydantic import BaseModel, constr
 
-from outlines.generate.processors.llamacpp import JSONLogitsProcessor
+from outlines.models.llamacpp import JSONLogitsProcessor
 
 
 class Weapon(str, Enum):
@@ -31,38 +29,20 @@ class Character(BaseModel):
     strength: int
 
 
-# TODO: why do we need this?
-class EosCriteria(StoppingCriteria):
-    def __init__(self, eos_token_id):
-        self.eos_token_id = eos_token_id
-
-    def __call__(self, input_ids: NDArray[np.intc], logits: NDArray[np.single]):
-        if self.eos_token_id in input_ids[1:]:
-            return True
-
-
 if __name__ == "__main__":
     llama = Llama("./phi-2.Q4_K_M.gguf")
 
-    prompt = b"Instruct: You are a leading role play gamer. You have seen thousands of different characters and their attributes.\nPlease return a JSON object with common attributes of an RPG character. Give me a character description\nOutput:"
+    prompt = "Instruct: You are a leading role play gamer. You have seen thousands of different characters and their attributes.\nPlease return a JSON object with common attributes of an RPG character. Give me a character description\nOutput:"
 
     logits_processor = JSONLogitsProcessor(Character, llama)
-    stopping_criteria_list = StoppingCriteriaList([EosCriteria(llama.token_eos())])
 
-    json_str = ""
-    tokens = llama.tokenize(prompt)
-    for token in llama.generate(
-        tokens,
+    json_str = llama.create_completion(
+        prompt,
         top_k=40,
         top_p=0.95,
-        temp=0.7,
+        temperature=0.7,
+        max_tokens=120,
         logits_processor=LogitsProcessorList([logits_processor]),
-        stopping_criteria=stopping_criteria_list,
-    ):
-        d = llama.detokenize([token])
-        try:
-            json_str += d.decode("utf-8")
-        except UnicodeDecodeError:
-            continue
+    )["choices"][0]["text"]
 
     print(json_str)
diff --git a/outlines/generate/processors/__init__.py b/outlines/generate/processors/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/outlines/generate/processors/llamacpp.py b/outlines/generate/processors/llamacpp.py
deleted file mode 100644
index a8c5f218a..000000000
--- a/outlines/generate/processors/llamacpp.py
+++ /dev/null
@@ -1,133 +0,0 @@
-import json
-import math
-from collections import defaultdict
-from typing import DefaultDict, List, Tuple, Union
-
-import numpy as np
-import torch
-from numpy.typing import NDArray
-
-from outlines.fsm.fsm import RegexFSM
-from outlines.fsm.json_schema import build_regex_from_object
-from outlines.models.tokenizer import Tokenizer
-
-
-class LlamaCppTokenizer(Tokenizer):
-    def __init__(self, llama_instance, **kwargs):
-        self.model_name = "llama"
-        self.llama_instance = llama_instance
-        self.is_llama = False
-
-        self.n_vocab = llama_instance.n_vocab()
-
-        self.eos_token_id = llama_instance.token_eos()
-        self.eos_token = llama_instance.detokenize([self.eos_token_id])
-        self.pad_token_id = -1
-        self.bos_token_id = llama_instance.token_bos()
-        self.nl_token_id = 0
-        self.vocabulary = {}
-        self._create_vocabulary()
-
-        self.special_tokens = {}
-
-    def _create_vocabulary(self):
-        for t in range(self.n_vocab):
-            token_piece = ""
-            try:
-                token_piece = self.llama_instance.detokenize([t]).decode("utf-8")
-                self.vocabulary[token_piece] = t
-            except Exception as e:
-                print(f"Failed to convert token ({token_piece}): {e}")
-                continue
-
-    def encode(
-        self, prompt: Union[str, List[str]]
-    ) -> Tuple[NDArray[np.int64], NDArray[np.int64]]:
-        token_ids = self.llama_instance.tokenize(prompt)
-        return token_ids, torch.ones_like(token_ids)
-
-    def decode(self, token_ids: NDArray[np.int64]) -> List[str]:
-        if isinstance(token_ids, list):
-            token_ids = np.array(token_ids)
-        if token_ids.ndim == 1:
-            token_ids = [token_ids]
-
-        decoded = self.llama_instance.detokenize(token_ids)
-
-        return decoded
-
-    def convert_token_to_string(self, token: str) -> str:
-        return token
-
-    def __eq__(self, other):
-        if isinstance(other, type(self)):
-            return other.model_name == self.model_name and other.kwargs == self.kwargs
-        return NotImplemented
-
-    def __hash__(self):
-        return hash(self.model_name)
-
-
-class RegexLogitsProcessor:
-    def __init__(self, regex_string, llama):
-        """Compile the FSM that drives the regex-guided generation.
-
-        Parameters
-        ----------
-        regex_string
-            A string that represents a regular expression
-        llm
-            An instance of `vllm.LLM`
-
-        """
-
-        self.tokenizer = LlamaCppTokenizer(llama)
-
-        fsm = RegexFSM(regex_string, self.tokenizer)
-        self.fsm = fsm
-
-        self.fsm_state = None
-
-    def __call__(
-        self, input_ids: NDArray[np.int64], scores: NDArray[np.float32]
-    ) -> NDArray[np.float32]:
-        """Use the FSM to bias the logits before sampling the next token."""
-
-        # TODO: sequence id handling
-        seq_id = 0
-
-        if len(input_ids) == 0 or self.fsm_state is None:  # Initialize the fsm states
-            self.fsm_state: DefaultDict[int, int] = defaultdict(int)  # type: ignore
-        else:
-            last_token = input_ids[-1]
-            self.fsm_state[seq_id] = self.fsm.next_state(
-                self.fsm_state[seq_id], last_token
-            )
-
-        allowed_tokens = self.fsm.allowed_token_ids(self.fsm_state[seq_id])
-
-        mask = torch.full((scores.shape[-1],), -math.inf, device="cpu").numpy()
-        mask[allowed_tokens] = 0
-        biased_scores = scores + mask
-
-        biased_scores[self.tokenizer.eos_token_id] = 0
-
-        return biased_scores
-
-
-class JSONLogitsProcessor(RegexLogitsProcessor):
-    def __init__(self, schema, llm):
-        """Compile the FSM that drives the JSON-guided generation.
-
-        Parameters
-        ----------
-        schema
-            A JSON schema that encodes the structure we want the model to generate
-        llm
-            An instance of `vllm.LLM`
-
-        """
-        if isinstance(schema, dict):
-            schema = json.dumps(schema)
-        regex_string = build_regex_from_object(schema)
-        super().__init__(regex_string, llm)
diff --git a/outlines/models/llamacpp.py b/outlines/models/llamacpp.py
index c51f600f8..2bd55b275 100644
--- a/outlines/models/llamacpp.py
+++ b/outlines/models/llamacpp.py
@@ -1,79 +1,94 @@
-import ctypes
-from typing import List, Optional, Tuple, Union
+import json
+import math
+from collections import defaultdict
+from typing import DefaultDict, List, Optional, Tuple, Union
 
 import numpy as np
 import torch
+
+# TODO: in order to make sub classing work we need to move the installation check here
+from llama_cpp import Llama, LlamaGrammar, LogitsProcessorList
 from numpy.typing import NDArray
 
+from outlines.fsm.fsm import RegexFSM
+from outlines.fsm.json_schema import build_regex_from_object
 from outlines.models.tokenizer import Tokenizer
 
 
-class LlamaCpp:
-    """Represents a `llama_cpp` model."""
+class RegexLogitsProcessor:
+    def __init__(self, regex_string, llama):
+        """Compile the FSM that drives the regex-guided generation.
 
-    def __init__(
-        self, llama_instance, model, tokenizer, device, context_params, **kwargs
-    ):
-        self.device = device
-        self.llama_instance = llama_instance
-        self.tokenizer = tokenizer
-
-        # Note: the concept of padding does not exist in llama.cpp as a batched sequence is just
-        # a flat array of tokens that can be assigned to one or more sequences.
-        # To make it compatible with the transformers inspired tokenizer interface
-        # we need a padding token to homogenize to token_ids tensor.
-        self.pad_token_id = -1
-
-        self.n_past = 0
-        self.n_vocab = kwargs.pop("n_vocab")
-
-        self.ctx = llama_instance.llama_new_context_with_model(model, context_params)
-
-    def forward(self, input_ids: torch.LongTensor, *_):
-        """Compute a forward pass through the llama_cpp model."""
-        if input_ids.ndim == 2:
-            seq_tensor = input_ids[:, self.n_past :]
-        elif input_ids.ndim == 1:
-            seq_tensor = input_ids.view(1, -1)[:, self.n_past :]
+        Parameters
+        ----------
+        regex_string
+            A string that represents a regular expression
+        llama
+            An instance of `llama_cpp.Llama`
+
+        """
+
+        self.tokenizer = LlamaCppTokenizer(llama)
+
+        fsm = RegexFSM(regex_string, self.tokenizer)
+        self.fsm = fsm
+
+        self.fsm_state = None
+
+    def __call__(
+        self, input_ids: NDArray[np.int64], scores: NDArray[np.float32]
+    ) -> NDArray[np.float32]:
+        """Use the FSM to bias the logits before sampling the next token."""
+
+        # TODO: sequence id handling
+        seq_id = 0
+
+        if len(input_ids) == 0 or self.fsm_state is None:  # Initialize the fsm states
+            self.fsm_state: DefaultDict[int, int] = defaultdict(int)  # type: ignore
         else:
-            raise Exception("Only one and two dimensional inputs allowed.")
-
-        tokens_total = torch.numel(seq_tensor[seq_tensor != self.pad_token_id])
-        batch = self.llama_instance.llama_batch_init(tokens_total, 0, 1)
-
-        seq_token_ids = []
-        for seq_idx, seq in enumerate(seq_tensor):
-            for token_pos, token_id in enumerate(seq):
-                if token_id == self.pad_token_id:
-                    break
-                batch.token[batch.n_tokens] = token_id.item()
-                batch.pos[batch.n_tokens] = token_pos
-                batch.seq_id[batch.n_tokens][0] = seq_idx
-                batch.n_seq_id[batch.n_tokens] = 1
-                batch.logits[batch.n_tokens] = False
-
-                batch.n_tokens += 1
-                self.n_past += 1
-
-            batch.logits[batch.n_tokens - 1] = True
-            seq_token_ids.append(batch.n_tokens - 1)
-
-        if self.llama_instance.llama_decode(self.ctx, batch) != 0:
-            print("Error decoding")
-
-        all_logits = []
-        for seq_token in seq_token_ids:
-            logits = self.llama_instance.llama_get_logits_ith(self.ctx, seq_token)
-            logits_list = (ctypes.c_float * self.n_vocab)(
-                *[logits[token_id] for token_id in range(self.n_vocab)]
+            last_token = input_ids[-1]
+            self.fsm_state[seq_id] = self.fsm.next_state(
+                self.fsm_state[seq_id], last_token
             )
-            logits_tensor = torch.tensor(logits_list)
-            all_logits.append(logits_tensor)
 
-        self.llama_instance.llama_batch_free(batch)
+        allowed_tokens = self.fsm.allowed_token_ids(self.fsm_state[seq_id])
+
+        mask = torch.full((scores.shape[-1],), -math.inf, device="cpu").numpy()
+        mask[allowed_tokens] = 0
+        biased_scores = scores + mask
+
+        biased_scores[self.tokenizer.eos_token_id] = 0
+
+        return biased_scores
 
-        stacked_logits = torch.stack(all_logits)
-        return stacked_logits, None
+
+class JSONLogitsProcessor(RegexLogitsProcessor):
+    def __init__(self, schema, llm):
+        """Compile the FSM that drives the JSON-guided generation.
+
+        Parameters
+        ----------
+        schema
+            A JSON schema that encodes the structure we want the model to generate
+        llm
+            An instance of `llama_cpp.Llama`
+
+        """
+        if isinstance(schema, dict):
+            schema = json.dumps(schema)
+        regex_string = build_regex_from_object(schema)
+        super().__init__(regex_string, llm)
+
+
+class LlamaCpp(Llama):
+    """Represents a `llama_cpp` model."""
+
+    def __init__(self, model, **kwargs):
+        super().__init__(model, **kwargs)
+
+        self.device = "cpu"
+
+        self.tokenizer = LlamaCppTokenizer(self)
 
     def __call__(
         self,
@@ -81,51 +96,77 @@ def __call__( self,
         attention_mask: torch.LongTensor,
         past_key_values: Optional[Tuple] = None,
     ) -> torch.FloatTensor:
-        logits, kv_cache = self.forward(input_ids, attention_mask, past_key_values)
-        next_token_logits = logits
+        super().eval(input_ids[0, self.n_tokens :])
+
+        logits = super().eval_logits
+        logits_tensor = torch.FloatTensor(list(logits))
+
+        return logits_tensor, None
 
-        return next_token_logits, kv_cache
+    def sample(
+        self,
+        top_k: int = 40,
+        top_p: float = 0.95,
+        min_p: float = 0.05,
+        typical_p: float = 1.0,
+        temp: float = 0.80,
+        repeat_penalty: float = 1.1,
+        frequency_penalty: float = 0.0,
+        presence_penalty: float = 0.0,
+        tfs_z: float = 1.0,
+        mirostat_mode: int = 0,
+        mirostat_eta: float = 0.1,
+        mirostat_tau: float = 5.0,
+        penalize_nl: bool = True,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        grammar: Optional[LlamaGrammar] = None,
+    ):
+        return super().sample(
+            top_k=top_k,
+            top_p=top_p,
+            min_p=min_p,
+            typical_p=typical_p,
+            temp=temp,
+            repeat_penalty=repeat_penalty,
+            frequency_penalty=frequency_penalty,
+            presence_penalty=presence_penalty,
+            tfs_z=tfs_z,
+            mirostat_mode=mirostat_mode,
+            mirostat_eta=mirostat_eta,
+            mirostat_tau=mirostat_tau,
+            penalize_nl=penalize_nl,
+            logits_processor=logits_processor,
+            grammar=grammar,
+        )
 
 
 class LlamaCppTokenizer(Tokenizer):
-    def __init__(self, llama_instance, model, model_name: str, **kwargs):
-        self.model_name = model_name
-        self.llama_instance = llama_instance
+    def __init__(self, model, **kwargs):
+        self.model = model
         self.is_llama = False
+        self.model_name = self.model.model_path
 
-        self.model = model
-        self.n_vocab = kwargs.pop("n_vocab")
+        self.tokenizer = self.model.tokenizer()
 
-        self.eos_token_id = llama_instance.llama_token_eos(model)
-        self.eos_token = self._get_eos_token()
-        self.pad_token_id = -1
-        self.bos_token_id = llama_instance.llama_token_eos(model)
-        self.nl_token_id = llama_instance.llama_token_nl(model)
+        self.eos_token_id = self.model.token_eos()
+        self.bos_token_id = self.model.token_bos()
+        self.pad_token_id = 0
+        self.eos_token = self.tokenizer.decode([self.eos_token_id])
+        self.bos_token = self.tokenizer.decode([self.bos_token_id])
+
+        self.n_vocab = self.model.n_vocab()
         self.vocabulary = {}
         self._create_vocabulary()
 
-        self.n_past = 0
-
-        self.special_tokens = {
-            self.eos_token_id,
-            self.pad_token_id,
-            self.bos_token_id,
-            self.nl_token_id,
-        }
+        self.special_tokens = {}
 
     def _create_vocabulary(self):
        for t in range(self.n_vocab):
-            size = 32
-            buffer = (ctypes.c_char * size)()
-            n = self.llama_instance.llama_token_to_piece(
-                self.model, self.llama_instance.llama_token(t), buffer, size
-            )
-
             try:
-                token_piece = buffer[:n].decode("utf-8")
+                token_piece = self.tokenizer.decode([t])
                 self.vocabulary[token_piece] = t
             except Exception as e:
-                print(f"Failed to convert token ({buffer[:n]}): {e}")
+                print(f"Failed to convert token: {e}")
                 continue
 
     def encode(
@@ -136,27 +177,9 @@ def encode( self, prompt: Union[str, List[str]]
         else:
             prompts = [prompt]
 
-        max_len = 0
         token_ids = []
         for p in prompts:
-            embd_inp = (self.llama_instance.llama_token * (len(p) + 1))()
-
-            n_of_tok = self.llama_instance.llama_tokenize(
-                model=self.model,
-                text=bytes(str(p), "utf-8"),
-                text_len=len(embd_inp),
-                tokens=embd_inp,
-                n_max_tokens=len(embd_inp),
-                add_bos=self.n_past == 0,
-                special=False,
-            )
-
-            self.n_past += n_of_tok
-
-            if n_of_tok > max_len:
-                max_len = n_of_tok
-
-            embd_inp = embd_inp[:n_of_tok]
+            embd_inp = self.tokenizer.encode(p)
             token_ids.append(np.array(embd_inp))
 
         max_len = np.max([len(a) for a in token_ids])
@@ -176,39 +199,7 @@ def encode(
         return token_ids, torch.ones_like(token_ids)
 
     def decode(self, token_ids: NDArray[np.int64]) -> List[str]:
-        if isinstance(token_ids, list):
-            token_ids = np.array(token_ids)
-        if token_ids.ndim == 1:
-            token_ids = [token_ids]
-
-        pieces = []
-        for tokens in token_ids:
-            seq = []
-            for id in tokens:
-                size = 32
-                buffer = (ctypes.c_char * size)()
-                n = self.llama_instance.llama_token_to_piece(
-                    self.model, self.llama_instance.llama_token(id), buffer, size
-                )
-
-                token_piece = buffer[:n].decode("utf-8")  # type: ignore
-
-                seq.append(token_piece)
-
-            pieces.append("".join(seq))
-
-        return pieces
-
-    def _get_eos_token(self):
-        size = 32
-        buffer = (ctypes.c_char * size)()
-        n = self.llama_instance.llama_token_to_piece(
-            self.model, self.llama_instance.llama_token(self.eos_token_id), buffer, size
-        )
-
-        token_piece = buffer[:n].decode("utf-8")
-
-        return token_piece
+        return [self.tokenizer.decode(t.tolist()) for t in token_ids]
 
     def convert_token_to_string(self, token: str) -> str:
         return token
@@ -228,77 +219,4 @@ def llamacpp(
     model_kwargs: dict = {},
     tokenizer_kwargs: dict = {},
 ):
-    try:
-        import llama_cpp
-    except ImportError:
-        raise ImportError(
-            "The `llama-cpp-python` library needs to be installed in order to use LlamaCpp."
-        )
-
-    if device is None:
-        device = "cpu"
-
-    llama_cpp.llama_backend_init(numa=False)
-
-    model_params = llama_cpp.llama_model_default_params()
-
-    if "cuda" in device:
-        model_params.n_gpu_layers = 999
-    else:
-        model_params.n_gpu_layers = model_kwargs.pop(
-            "n_gpu_layers", model_params.n_gpu_layers
-        )
-
-    if "tensor_split" in model_kwargs.keys():
-        tensor_split = model_kwargs.get("tensor_split")
-        if isinstance(tensor_split, list):
-            tensor_split_arr = (ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES)(
-                *[t for t in tensor_split]
-            )
-            model_params.tensor_split = tensor_split_arr
-
-    context_params = llama_cpp.llama_context_default_params()
-    context_params.n_batch = model_kwargs.pop("n_batch", context_params.n_batch)
-    context_params.n_ctx = model_kwargs.pop("n_ctx", context_params.n_ctx)
-    context_params.n_threads = model_kwargs.pop("n_threads", context_params.n_threads)
-    context_params.n_threads_batch = model_kwargs.pop(
-        "n_threads_batch", context_params.n_threads_batch
-    )
-    context_params.rope_scaling_type = model_kwargs.pop(
-        "rope_scaling_type", context_params.rope_scaling_type
-    )
-    context_params.rope_freq_base = model_kwargs.pop(
-        "rope_freq_base", context_params.rope_freq_base
-    )
-    context_params.rope_freq_scale = model_kwargs.pop(
-        "rope_freq_scale", context_params.rope_freq_scale
-    )
-    context_params.yarn_ext_factor = model_kwargs.pop(
-        "yarn_ext_factor", context_params.yarn_ext_factor
-    )
-    context_params.yarn_attn_factor = model_kwargs.pop(
-        "yarn_attn_factor", context_params.yarn_attn_factor
-    )
-    context_params.yarn_beta_fast = model_kwargs.pop(
-        "yarn_beta_fast", context_params.yarn_beta_fast
-    )
-    context_params.yarn_beta_slow = model_kwargs.pop(
-        "yarn_beta_slow", context_params.yarn_beta_slow
-    )
-    context_params.yarn_orig_ctx = model_kwargs.pop(
-        "yarn_orig_ctx", context_params.yarn_orig_ctx
-    )
-    context_params.offload_kqv = model_kwargs.pop(
-        "offload_kqv", context_params.offload_kqv
-    )
-
-    model = llama_cpp.llama_load_model_from_file(
-        model_name.encode("utf-8"), model_params
-    )
-
-    model_kwargs["n_vocab"] = llama_cpp.llama_n_vocab(model)
-    tokenizer_kwargs["n_vocab"] = model_kwargs.get("n_vocab")
-
-    tokenizer = LlamaCppTokenizer(llama_cpp, model, model_name, **tokenizer_kwargs)
-
-    return LlamaCpp(llama_cpp, model, tokenizer, "cpu", context_params, **model_kwargs)
+    return LlamaCpp(model_name, **model_kwargs)
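A short usage sketch of the refactored integration follows. It mirrors the updated examples/llamacpp_processor.py in this diff; the schema fields, prompt text, and local ./phi-2.Q4_K_M.gguf path are illustrative assumptions, not part of the library.

# Minimal sketch: JSON-constrained generation through llama_cpp's
# create_completion API, as exercised by the example in this diff.
from enum import Enum

from llama_cpp import Llama, LogitsProcessorList
from pydantic import BaseModel

from outlines.models.llamacpp import JSONLogitsProcessor


class Armor(str, Enum):
    leather = "leather"
    plate = "plate"


class Character(BaseModel):
    name: str
    armor: Armor
    strength: int


llama = Llama("./phi-2.Q4_K_M.gguf")

# Biases every sampling step so the completion can only form JSON that
# matches the Character schema.
logits_processor = JSONLogitsProcessor(Character, llama)

json_str = llama.create_completion(
    "Return a JSON object describing an RPG character.\nOutput:",
    max_tokens=120,
    temperature=0.7,
    logits_processor=LogitsProcessorList([logits_processor]),
)["choices"][0]["text"]

print(json_str)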
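For reference, the bias that RegexLogitsProcessor.__call__ applies above is a plain additive mask: -inf for every token id the FSM disallows, 0 for allowed ids, with the EOS id always left reachable. Below is a self-contained numpy illustration of that masking step on a made-up five-token vocabulary; the allowed ids and EOS id are assumptions for the sketch, not values from the library.

# Toy illustration of the masking arithmetic used by RegexLogitsProcessor
# in this diff; vocabulary size, allowed ids, and eos id are made up.
import math

import numpy as np

scores = np.array([1.2, -0.3, 0.7, 2.1, 0.0], dtype=np.float32)  # raw logits
allowed_tokens = [1, 3]  # ids permitted by the FSM in the current state
eos_token_id = 4

mask = np.full(scores.shape[-1], -math.inf, dtype=np.float32)
mask[allowed_tokens] = 0.0           # allowed ids keep their original score
biased_scores = scores + mask        # every other id becomes -inf
biased_scores[eos_token_id] = 0.0    # EOS stays sampleable, as in __call__

print(biased_scores)  # [-inf, -0.3, -inf, 2.1, 0.0]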