diff --git a/examples/llamacpp_processor.py b/examples/llamacpp_processor.py
index f09b95da0..33ec50da2 100644
--- a/examples/llamacpp_processor.py
+++ b/examples/llamacpp_processor.py
@@ -1,11 +1,9 @@
 from enum import Enum
 
-import numpy as np
-from llama_cpp import Llama, LogitsProcessorList, StoppingCriteria, StoppingCriteriaList
-from numpy.typing import NDArray
+from llama_cpp import Llama, LogitsProcessorList
 from pydantic import BaseModel, constr
 
-from outlines.generate.processors.llamacpp import JSONLogitsProcessor
+from outlines.models.llamacpp import JSONLogitsProcessor
 
 
 class Weapon(str, Enum):
@@ -31,38 +29,20 @@ class Character(BaseModel):
     strength: int
 
 
-# TODO: why do we need this?
-class EosCriteria(StoppingCriteria):
-    def __init__(self, eos_token_id):
-        self.eos_token_id = eos_token_id
-
-    def __call__(self, input_ids: NDArray[np.intc], logits: NDArray[np.single]):
-        if self.eos_token_id in input_ids[1:]:
-            return True
-
-
 if __name__ == "__main__":
     llama = Llama("./phi-2.Q4_K_M.gguf")
 
-    prompt = b"Instruct: You are a leading role play gamer. You have seen thousands of different characters and their attributes.\nPlease return a JSON object with common attributes of an RPG character. Give me a character description\nOutput:"
+    prompt = "Instruct: You are a leading role play gamer. You have seen thousands of different characters and their attributes.\nPlease return a JSON object with common attributes of an RPG character. Give me a character description\nOutput:"
 
     logits_processor = JSONLogitsProcessor(Character, llama)
-    stopping_criteria_list = StoppingCriteriaList([EosCriteria(llama.token_eos())])
 
-    json_str = ""
-    tokens = llama.tokenize(prompt)
-    for token in llama.generate(
-        tokens,
+    json_str = llama.create_completion(
+        prompt,
         top_k=40,
         top_p=0.95,
-        temp=0.7,
+        temperature=0.7,
+        max_tokens=120,
         logits_processor=LogitsProcessorList([logits_processor]),
-        stopping_criteria=stopping_criteria_list,
-    ):
-        d = llama.detokenize([token])
-        try:
-            json_str += d.decode("utf-8")
-        except UnicodeDecodeError:
-            continue
+    )["choices"][0]["text"]
 
     print(json_str)
diff --git a/outlines/generate/processors/__init__.py b/outlines/generate/processors/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/outlines/generate/processors/llamacpp.py b/outlines/generate/processors/llamacpp.py
deleted file mode 100644
index a8c5f218a..000000000
--- a/outlines/generate/processors/llamacpp.py
+++ /dev/null
@@ -1,133 +0,0 @@
-import json
-import math
-from collections import defaultdict
-from typing import DefaultDict, List, Tuple, Union
-
-import numpy as np
-import torch
-from numpy.typing import NDArray
-
-from outlines.fsm.fsm import RegexFSM
-from outlines.fsm.json_schema import build_regex_from_object
-from outlines.models.tokenizer import Tokenizer
-
-
-class LlamaCppTokenizer(Tokenizer):
-    def __init__(self, llama_instance, **kwargs):
-        self.model_name = "llama"
-        self.llama_instance = llama_instance
-        self.is_llama = False
-
-        self.n_vocab = llama_instance.n_vocab()
-
-        self.eos_token_id = llama_instance.token_eos()
-        self.eos_token = llama_instance.detokenize([self.eos_token_id])
-        self.pad_token_id = -1
-        self.bos_token_id = llama_instance.token_bos()
-        self.nl_token_id = 0
-        self.vocabulary = {}
-        self._create_vocabulary()
-
-        self.special_tokens = {}
-
-    def _create_vocabulary(self):
-        for t in range(self.n_vocab):
-            token_piece = ""
-            try:
-                token_piece = self.llama_instance.detokenize([t]).decode("utf-8")
-                self.vocabulary[token_piece] = t
-            except Exception as e:
-                print(f"Failed to convert token ({token_piece}): {e}")
-                continue
-
-    def encode(
-        self, prompt: Union[str, List[str]]
-    ) -> Tuple[NDArray[np.int64], NDArray[np.int64]]:
-        token_ids = self.llama_instance.tokenize(prompt)
-        return token_ids, torch.ones_like(token_ids)
-
-    def decode(self, token_ids: NDArray[np.int64]) -> List[str]:
-        if isinstance(token_ids, list):
-            token_ids = np.array(token_ids)
-        if token_ids.ndim == 1:
-            token_ids = [token_ids]
-
-        decoded = self.llama_instance.detokenize(token_ids)
-
-        return decoded
-
-    def convert_token_to_string(self, token: str) -> str:
-        return token
-
-    def __eq__(self, other):
-        if isinstance(other, type(self)):
-            return other.model_name == self.model_name and other.kwargs == self.kwargs
-        return NotImplemented
-
-    def __hash__(self):
-        return hash(self.model_name)
-
-
-class RegexLogitsProcessor:
-    def __init__(self, regex_string, llama):
-        """Compile the FSM that drives the regex-guided generation.
-
-        Parameters
-        ----------
-        regex_string
-            A string that represents a regular expression
-        llm
-            An instance of `vllm.LLM`
-
-        """
-
-        self.tokenizer = LlamaCppTokenizer(llama)
-
-        fsm = RegexFSM(regex_string, self.tokenizer)
-        self.fsm = fsm
-
-        self.fsm_state = None
-
-    def __call__(
-        self, input_ids: NDArray[np.int64], scores: NDArray[np.float32]
-    ) -> NDArray[np.float32]:
-        """Use the FSM to bias the logits before sampling the next token."""
-
-        # TODO: sequence id handling
-        seq_id = 0
-
-        if len(input_ids) == 0 or self.fsm_state is None:  # Initialize the fsm states
-            self.fsm_state: DefaultDict[int, int] = defaultdict(int)  # type: ignore
-        else:
-            last_token = input_ids[-1]
-            self.fsm_state[seq_id] = self.fsm.next_state(
-                self.fsm_state[seq_id], last_token
-            )
-
-        allowed_tokens = self.fsm.allowed_token_ids(self.fsm_state[seq_id])
-
-        mask = torch.full((scores.shape[-1],), -math.inf, device="cpu").numpy()
-        mask[allowed_tokens] = 0
-        biased_scores = scores + mask
-
-        biased_scores[self.tokenizer.eos_token_id] = 0
-
-        return biased_scores
-
-
-class JSONLogitsProcessor(RegexLogitsProcessor):
-    def __init__(self, schema, llm):
-        """Compile the FSM that drives the JSON-guided generation.
-
-        Parameters
-        ----------
-        schema
-            A JSON schema that encodes the structure we want the model to generate
-        llm
-            An instance of `vllm.LLM`
-
-        """
-        if isinstance(schema, dict):
-            schema = json.dumps(schema)
-        regex_string = build_regex_from_object(schema)
-        super().__init__(regex_string, llm)
diff --git a/outlines/models/llamacpp.py b/outlines/models/llamacpp.py
index c51f600f8..2bd55b275 100644
--- a/outlines/models/llamacpp.py
+++ b/outlines/models/llamacpp.py
@@ -1,79 +1,94 @@
-import ctypes
-from typing import List, Optional, Tuple, Union
+import json
+import math
+from collections import defaultdict
+from typing import DefaultDict, List, Optional, Tuple, Union
 
 import numpy as np
 import torch
+
+# TODO: in order to make sub classing work we need to move the installation check here
+from llama_cpp import Llama, LlamaGrammar, LogitsProcessorList
 from numpy.typing import NDArray
 
+from outlines.fsm.fsm import RegexFSM
+from outlines.fsm.json_schema import build_regex_from_object
 from outlines.models.tokenizer import Tokenizer
 
 
-class LlamaCpp:
-    """Represents a `llama_cpp` model."""
+class RegexLogitsProcessor:
+    def __init__(self, regex_string, llama):
+        """Compile the FSM that drives the regex-guided generation.
 
-    def __init__(
-        self, llama_instance, model, tokenizer, device, context_params, **kwargs
-    ):
-        self.device = device
-        self.llama_instance = llama_instance
-        self.tokenizer = tokenizer
-
-        # Note: the concept of padding does not exist in llama.cpp as a batched sequence is just
-        # a flat array of tokens that can be assigned to one or more sequences.
-        # To make it compatible with the transformers inspired tokenizer interface
-        # we need a padding token to homogenize to token_ids tensor.
-        self.pad_token_id = -1
-
-        self.n_past = 0
-        self.n_vocab = kwargs.pop("n_vocab")
-
-        self.ctx = llama_instance.llama_new_context_with_model(model, context_params)
-
-    def forward(self, input_ids: torch.LongTensor, *_):
-        """Compute a forward pass through the llama_cpp model."""
-        if input_ids.ndim == 2:
-            seq_tensor = input_ids[:, self.n_past :]
-        elif input_ids.ndim == 1:
-            seq_tensor = input_ids.view(1, -1)[:, self.n_past :]
+        Parameters
+        ----------
+        regex_string
+            A string that represents a regular expression
+        llama
+            An instance of `llama_cpp.Llama`
+
+        """
+
+        self.tokenizer = LlamaCppTokenizer(llama)
+
+        fsm = RegexFSM(regex_string, self.tokenizer)
+        self.fsm = fsm
+
+        self.fsm_state = None
+
+    def __call__(
+        self, input_ids: NDArray[np.int64], scores: NDArray[np.float32]
+    ) -> NDArray[np.float32]:
+        """Use the FSM to bias the logits before sampling the next token."""
+
+        # TODO: sequence id handling
+        seq_id = 0
+
+        if len(input_ids) == 0 or self.fsm_state is None:  # Initialize the fsm states
+            self.fsm_state: DefaultDict[int, int] = defaultdict(int)  # type: ignore
         else:
-            raise Exception("Only one and two dimensional inputs allowed.")
-
-        tokens_total = torch.numel(seq_tensor[seq_tensor != self.pad_token_id])
-        batch = self.llama_instance.llama_batch_init(tokens_total, 0, 1)
-
-        seq_token_ids = []
-        for seq_idx, seq in enumerate(seq_tensor):
-            for token_pos, token_id in enumerate(seq):
-                if token_id == self.pad_token_id:
-                    break
-                batch.token[batch.n_tokens] = token_id.item()
-                batch.pos[batch.n_tokens] = token_pos
-                batch.seq_id[batch.n_tokens][0] = seq_idx
-                batch.n_seq_id[batch.n_tokens] = 1
-                batch.logits[batch.n_tokens] = False
-
-                batch.n_tokens += 1
-                self.n_past += 1
-
-            batch.logits[batch.n_tokens - 1] = True
-            seq_token_ids.append(batch.n_tokens - 1)
-
-        if self.llama_instance.llama_decode(self.ctx, batch) != 0:
-            print("Error decoding")
-
-        all_logits = []
-        for seq_token in seq_token_ids:
-            logits = self.llama_instance.llama_get_logits_ith(self.ctx, seq_token)
-            logits_list = (ctypes.c_float * self.n_vocab)(
-                *[logits[token_id] for token_id in range(self.n_vocab)]
+            last_token = input_ids[-1]
+            self.fsm_state[seq_id] = self.fsm.next_state(
+                self.fsm_state[seq_id], last_token
             )
-            logits_tensor = torch.tensor(logits_list)
-            all_logits.append(logits_tensor)
 
-        self.llama_instance.llama_batch_free(batch)
+        allowed_tokens = self.fsm.allowed_token_ids(self.fsm_state[seq_id])
+
+        mask = torch.full((scores.shape[-1],), -math.inf, device="cpu").numpy()
+        mask[allowed_tokens] = 0
+        biased_scores = scores + mask
+
+        biased_scores[self.tokenizer.eos_token_id] = 0
+
+        return biased_scores
 
-        stacked_logits = torch.stack(all_logits)
-        return stacked_logits, None
+
+class JSONLogitsProcessor(RegexLogitsProcessor):
+    def __init__(self, schema, llm):
+        """Compile the FSM that drives the JSON-guided generation.
+
+        Parameters
+        ----------
+        schema
+            A JSON schema that encodes the structure we want the model to generate
+        llm
+            An instance of `llama_cpp.Llama`
+
+        """
+        if isinstance(schema, dict):
+            schema = json.dumps(schema)
+        regex_string = build_regex_from_object(schema)
+        super().__init__(regex_string, llm)
+
+
+class LlamaCpp(Llama):
+    """Represents a `llama_cpp` model."""
+
+    def __init__(self, model, **kwargs):
+        super().__init__(model, **kwargs)
+
+        self.device = "cpu"
+
+        self.tokenizer = LlamaCppTokenizer(self)
 
     def __call__(
         self,
@@ -81,51 +96,77 @@ def __call__( self,
         attention_mask: torch.LongTensor,
         past_key_values: Optional[Tuple] = None,
     ) -> torch.FloatTensor:
-        logits, kv_cache = self.forward(input_ids, attention_mask, past_key_values)
-        next_token_logits = logits
+        super().eval(input_ids[0, self.n_tokens :])
+
+        logits = super().eval_logits
+        logits_tensor = torch.FloatTensor(list(logits))
+
+        return logits_tensor, None
 
-        return next_token_logits, kv_cache
+    def sample(
+        self,
+        top_k: int = 40,
+        top_p: float = 0.95,
+        min_p: float = 0.05,
+        typical_p: float = 1.0,
+        temp: float = 0.80,
+        repeat_penalty: float = 1.1,
+        frequency_penalty: float = 0.0,
+        presence_penalty: float = 0.0,
+        tfs_z: float = 1.0,
+        mirostat_mode: int = 0,
+        mirostat_eta: float = 0.1,
+        mirostat_tau: float = 5.0,
+        penalize_nl: bool = True,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        grammar: Optional[LlamaGrammar] = None,
+    ):
+        return super().sample(
+            top_k=top_k,
+            top_p=top_p,
+            min_p=min_p,
+            typical_p=typical_p,
+            temp=temp,
+            repeat_penalty=repeat_penalty,
+            frequency_penalty=frequency_penalty,
+            presence_penalty=presence_penalty,
+            tfs_z=tfs_z,
+            mirostat_mode=mirostat_mode,
+            mirostat_eta=mirostat_eta,
+            mirostat_tau=mirostat_tau,
+            penalize_nl=penalize_nl,
+            logits_processor=logits_processor,
+            grammar=grammar,
+        )
 
 
 class LlamaCppTokenizer(Tokenizer):
-    def __init__(self, llama_instance, model, model_name: str, **kwargs):
-        self.model_name = model_name
-        self.llama_instance = llama_instance
+    def __init__(self, model, **kwargs):
+        self.model = model
         self.is_llama = False
+        self.model_name = self.model.model_path
 
-        self.model = model
-        self.n_vocab = kwargs.pop("n_vocab")
+        self.tokenizer = self.model.tokenizer()
 
-        self.eos_token_id = llama_instance.llama_token_eos(model)
-        self.eos_token = self._get_eos_token()
-        self.pad_token_id = -1
-        self.bos_token_id = llama_instance.llama_token_eos(model)
-        self.nl_token_id = llama_instance.llama_token_nl(model)
+        self.eos_token_id = self.model.token_eos()
+        self.bos_token_id = self.model.token_bos()
+        self.pad_token_id = 0
+        self.eos_token = self.tokenizer.decode([self.eos_token_id])
+        self.bos_token = self.tokenizer.decode([self.bos_token_id])
+
+        self.n_vocab = self.model.n_vocab()
         self.vocabulary = {}
         self._create_vocabulary()
 
-        self.n_past = 0
-
-        self.special_tokens = {
-            self.eos_token_id,
-            self.pad_token_id,
-            self.bos_token_id,
-            self.nl_token_id,
-        }
+        self.special_tokens = {}
 
     def _create_vocabulary(self):
        for t in range(self.n_vocab):
-            size = 32
-            buffer = (ctypes.c_char * size)()
-            n = self.llama_instance.llama_token_to_piece(
-                self.model, self.llama_instance.llama_token(t), buffer, size
-            )
-
             try:
-                token_piece = buffer[:n].decode("utf-8")
+                token_piece = self.tokenizer.decode([t])
                 self.vocabulary[token_piece] = t
             except Exception as e:
-                print(f"Failed to convert token ({buffer[:n]}): {e}")
+                print(f"Failed to convert token: {e}")
                 continue
 
     def encode(
@@ -136,27 +177,9 @@ def encode( self, prompt: Union[str, List[str]]
         else:
             prompts = [prompt]
 
-        max_len = 0
         token_ids = []
         for p in prompts:
-            embd_inp = (self.llama_instance.llama_token * (len(p) + 1))()
-
-            n_of_tok = self.llama_instance.llama_tokenize(
-                model=self.model,
-                text=bytes(str(p), "utf-8"),
-                text_len=len(embd_inp),
-                tokens=embd_inp,
-                n_max_tokens=len(embd_inp),
-                add_bos=self.n_past == 0,
-                special=False,
-            )
-
-            self.n_past += n_of_tok
-
-            if n_of_tok > max_len:
-                max_len = n_of_tok
-
-            embd_inp = embd_inp[:n_of_tok]
+            embd_inp = self.tokenizer.encode(p)
             token_ids.append(np.array(embd_inp))
 
         max_len = np.max([len(a) for a in token_ids])
@@ -176,39 +199,7 @@ def encode(
         return token_ids, torch.ones_like(token_ids)
 
     def decode(self, token_ids: NDArray[np.int64]) -> List[str]:
-        if isinstance(token_ids, list):
-            token_ids = np.array(token_ids)
-        if token_ids.ndim == 1:
-            token_ids = [token_ids]
-
-        pieces = []
-        for tokens in token_ids:
-            seq = []
-            for id in tokens:
-                size = 32
-                buffer = (ctypes.c_char * size)()
-                n = self.llama_instance.llama_token_to_piece(
-                    self.model, self.llama_instance.llama_token(id), buffer, size
-                )
-
-                token_piece = buffer[:n].decode("utf-8")  # type: ignore
-
-                seq.append(token_piece)
-
-            pieces.append("".join(seq))
-
-        return pieces
-
-    def _get_eos_token(self):
-        size = 32
-        buffer = (ctypes.c_char * size)()
-        n = self.llama_instance.llama_token_to_piece(
-            self.model, self.llama_instance.llama_token(self.eos_token_id), buffer, size
-        )
-
-        token_piece = buffer[:n].decode("utf-8")
-
-        return token_piece
+        return [self.tokenizer.decode(t.tolist()) for t in token_ids]
 
     def convert_token_to_string(self, token: str) -> str:
         return token
@@ -228,77 +219,4 @@ def llamacpp(
     model_kwargs: dict = {},
     tokenizer_kwargs: dict = {},
 ):
-    try:
-        import llama_cpp
-    except ImportError:
-        raise ImportError(
-            "The `llama-cpp-python` library needs to be installed in order to use LlamaCpp."
-        )
-
-    if device is None:
-        device = "cpu"
-
-    llama_cpp.llama_backend_init(numa=False)
-
-    model_params = llama_cpp.llama_model_default_params()
-
-    if "cuda" in device:
-        model_params.n_gpu_layers = 999
-    else:
-        model_params.n_gpu_layers = model_kwargs.pop(
-            "n_gpu_layers", model_params.n_gpu_layers
-        )
-
-    if "tensor_split" in model_kwargs.keys():
-        tensor_split = model_kwargs.get("tensor_split")
-        if isinstance(tensor_split, list):
-            tensor_split_arr = (ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES)(
-                *[t for t in tensor_split]
-            )
-            model_params.tensor_split = tensor_split_arr
-
-    context_params = llama_cpp.llama_context_default_params()
-    context_params.n_batch = model_kwargs.pop("n_batch", context_params.n_batch)
-    context_params.n_ctx = model_kwargs.pop("n_ctx", context_params.n_ctx)
-    context_params.n_threads = model_kwargs.pop("n_threads", context_params.n_threads)
-    context_params.n_threads_batch = model_kwargs.pop(
-        "n_threads_batch", context_params.n_threads_batch
-    )
-    context_params.rope_scaling_type = model_kwargs.pop(
-        "rope_scaling_type", context_params.rope_scaling_type
-    )
-    context_params.rope_freq_base = model_kwargs.pop(
-        "rope_freq_base", context_params.rope_freq_base
-    )
-    context_params.rope_freq_scale = model_kwargs.pop(
-        "rope_freq_scale", context_params.rope_freq_scale
-    )
-    context_params.yarn_ext_factor = model_kwargs.pop(
-        "yarn_ext_factor", context_params.yarn_ext_factor
-    )
-    context_params.yarn_attn_factor = model_kwargs.pop(
-        "yarn_attn_factor", context_params.yarn_attn_factor
-    )
-    context_params.yarn_beta_fast = model_kwargs.pop(
-        "yarn_beta_fast", context_params.yarn_beta_fast
-    )
-    context_params.yarn_beta_slow = model_kwargs.pop(
-        "yarn_beta_slow", context_params.yarn_beta_slow
-    )
-    context_params.yarn_orig_ctx = model_kwargs.pop(
-        "yarn_orig_ctx", context_params.yarn_orig_ctx
-    )
-    context_params.offload_kqv = model_kwargs.pop(
-        "offload_kqv", context_params.offload_kqv
-    )
-
-    model = llama_cpp.llama_load_model_from_file(
-        model_name.encode("utf-8"), model_params
-    )
-
-    model_kwargs["n_vocab"] = llama_cpp.llama_n_vocab(model)
-    tokenizer_kwargs["n_vocab"] = model_kwargs.get("n_vocab")
-
-    tokenizer = LlamaCppTokenizer(llama_cpp, model, model_name, **tokenizer_kwargs)
-
-    return LlamaCpp(llama_cpp, model, tokenizer, "cpu", context_params, **model_kwargs)
+    return LlamaCpp(model_name, **model_kwargs)
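A short usage sketch of the refactored integration follows. It mirrors the updated examples/llamacpp_processor.py in this diff; the schema fields, prompt text, and local ./phi-2.Q4_K_M.gguf path are illustrative assumptions, not part of the library.

# Minimal sketch: JSON-constrained generation through llama_cpp's
# create_completion API, as exercised by the example in this diff.
from enum import Enum

from llama_cpp import Llama, LogitsProcessorList
from pydantic import BaseModel

from outlines.models.llamacpp import JSONLogitsProcessor


class Armor(str, Enum):
    leather = "leather"
    plate = "plate"


class Character(BaseModel):
    name: str
    armor: Armor
    strength: int


llama = Llama("./phi-2.Q4_K_M.gguf")

# Biases every sampling step so the completion can only form JSON that
# matches the Character schema.
logits_processor = JSONLogitsProcessor(Character, llama)

json_str = llama.create_completion(
    "Return a JSON object describing an RPG character.\nOutput:",
    max_tokens=120,
    temperature=0.7,
    logits_processor=LogitsProcessorList([logits_processor]),
)["choices"][0]["text"]

print(json_str)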
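For reference, the bias that RegexLogitsProcessor.__call__ applies above is a plain additive mask: -inf for every token id the FSM disallows, 0 for allowed ids, with the EOS id always left reachable. Below is a self-contained numpy illustration of that masking step on a made-up five-token vocabulary; the allowed ids and EOS id are assumptions for the sketch, not values from the library.

# Toy illustration of the masking arithmetic used by RegexLogitsProcessor
# in this diff; vocabulary size, allowed ids, and eos id are made up.
import math

import numpy as np

scores = np.array([1.2, -0.3, 0.7, 2.1, 0.0], dtype=np.float32)  # raw logits
allowed_tokens = [1, 3]  # ids permitted by the FSM in the current state
eos_token_id = 4

mask = np.full(scores.shape[-1], -math.inf, dtype=np.float32)
mask[allowed_tokens] = 0.0           # allowed ids keep their original score
biased_scores = scores + mask        # every other id becomes -inf
biased_scores[eos_token_id] = 0.0    # EOS stays sampleable, as in __call__

print(biased_scores)  # [-inf, -0.3, -inf, 2.1, 0.0]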