From e40c7fd362dbc029841d98e3d60bb5231c3555cd Mon Sep 17 00:00:00 2001 From: Andrew Lapp Date: Mon, 22 Jul 2024 16:24:16 -0500 Subject: [PATCH] Restructure documentation, use phi-3-mini, include structured generation example, add feature matrix to models --- .../images/logits_processing_diagram.svg | 157 +++++++++++++ docs/quickstart.md | 214 +++--------------- docs/reference/{ => generation}/cfg.md | 0 docs/reference/{ => generation}/choices.md | 0 .../{ => generation}/custom_fsm_ops.md | 0 docs/reference/{ => generation}/format.md | 0 docs/reference/generation/generation.md | 210 +++++++++++++++++ docs/reference/{ => generation}/json.md | 3 +- docs/reference/{ => generation}/regex.md | 0 .../structured_generation_explanation.md | 55 +++++ docs/reference/{ => generation}/types.md | 0 docs/reference/json_mode.md | 17 -- docs/reference/models/models.md | 93 ++++++++ docs/reference/models/transformers_vision.md | 1 + mkdocs.yml | 25 +- requirements-doc.txt | 1 + 16 files changed, 564 insertions(+), 212 deletions(-) create mode 100644 docs/assets/images/logits_processing_diagram.svg rename docs/reference/{ => generation}/cfg.md (100%) rename docs/reference/{ => generation}/choices.md (100%) rename docs/reference/{ => generation}/custom_fsm_ops.md (100%) rename docs/reference/{ => generation}/format.md (100%) create mode 100644 docs/reference/generation/generation.md rename docs/reference/{ => generation}/json.md (98%) rename docs/reference/{ => generation}/regex.md (100%) create mode 100644 docs/reference/generation/structured_generation_explanation.md rename docs/reference/{ => generation}/types.md (100%) delete mode 100644 docs/reference/json_mode.md create mode 100644 docs/reference/models/models.md diff --git a/docs/assets/images/logits_processing_diagram.svg b/docs/assets/images/logits_processing_diagram.svg new file mode 100644 index 000000000..49d4d3af8 --- /dev/null +++ b/docs/assets/images/logits_processing_diagram.svg @@ -0,0 +1,157 @@ + + + + + + +%3 + + + 
+inputTokens + +Input Tokens + +7 + +4 + +8 + + + +TransformerDecoder + +Transformer Decoder Pass + + + +inputTokens->TransformerDecoder + + + + + +logitsTable + +Model Output Logits + +Token + +Probability + ++ + +3% + +foo + +4% + +. + +7% + +1 + +11% + +2 + +13% + +3 + +17% + + + +TransformerDecoder->logitsTable + + + + + +OutlinesLogitsProcessor + +Outlines Regex Logits Processor + + + +logitsTable->OutlinesLogitsProcessor + + + + + +logitsProcessorTable + +Processed Logits + +Token + +Probability + ++ + +0% + +number + +0% + +. + +7% + +1 + +11% + +2 + +13% + +3 + +17% + + + +OutlinesLogitsProcessor->logitsProcessorTable + + + + + +sampler + +Sampler + + + +logitsProcessorTable->sampler + + + + + +sampledTokenTable + +Sampled Token + +Token + +3 + + + +sampler->sampledTokenTable + + + + + diff --git a/docs/quickstart.md b/docs/quickstart.md index 7d4e86fbc..fcb524d8a 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -6,6 +6,7 @@ title: Quickstart After [installing Outlines](installation.md), the fastest way to get to up to speed with the library is to get acquainted with its few core elements. We advise you to take a quick look at this page to see everything Outlines has to offer before diving in the [documentation](reference/index.md). + ## Core elements ### Models @@ -15,88 +16,57 @@ The first step when writing a program with Outlines is to initialize a model. We ```python import outlines -model = outlines.models.transformers("mistralai/Mistral-7B-Instruct-v0.2") +model = outlines.models.transformers( + "microsoft/Phi-3-mini-4k-instruct", + device="cuda" # optional device argument, default is cpu +) ``` -### Text generator - -Once the model is initialized you can build a text generator. This generator can be called with a prompt directly, or you can use the `stream` method to generate text token by token: - -=== "Generate" - - ```python - import outlines +Outlines supports a wide variety of inference engines and model weight types. 
More details on different models can be found in the [Outlines Models](./reference/models/models.md) documentation page. - model = outlines.models.transformers("mistralai/Mistral-7B-Instruct-v0.2") - generator = outlines.generate.text(model) +### Generation - result = generator("What's 2+2?", max_tokens=100) +Once the model is initialized you can build an `outlines.generate` generator. This generator can be called with a prompt directly. - print(result) - # That's right, it's 4! But remember, a delicious and nutrient dense 4, - # according to YEARS BUILT ON SOLID SCIENCE. This column presents additional - # findings from the fifteen-year study that produced the 2+2=4 conclusion. - ``` +([Outlines Structured Generation Full Documentation](./reference/generation/generation.md)) -=== "Stream" +=== "Text" ```python - import outlines - - model = outlines.models.transformers("mistralai/Mistral-7B-Instruct-v0.2") generator = outlines.generate.text(model) + result = generator("Question: What's 2+2? Answer:", max_tokens=100) + print(result) + # The answer is 4 + + # Outlines also supports streaming output stream = generator.stream("What's 2+2?", max_tokens=4) for i in range(5): token = next(stream) - print(token) - # ['Is'] - # [' this'] - # [' even'] - # [' a'] - # [' question'] + print(repr(token)) + # '2' + # '+' + # '2' + # ' equals' + # '4' ``` -### Multi-label classification - -Outlines allows you to do multi-label classification by guiding the model so it can only output either of the specified choices: - -```python -import outlines - -model = outlines.models.transformers("mistralai/Mistral-7B-Instruct-v0.2") -generator = outlines.generate.choice(model, ["Blue", "Red", "Yellow"]) - -color = generator("What is the closest color to Indigo? 
") -print(color) -# Blue -``` +=== "Structured" -### JSON-structured generation + Along with typical language model generation behavior via, `outlines.generate.text`, Outlines supports structured generation, which guarantees the tokens generated by the model will follow a predefined structure. Structures can be defined by a regex pattern, JSON schema, python object type, or a Lark grammar defining a parsable language such as SQL or Python. -Outlines can guide models so that they output valid JSON **100%** of the time. You can either specify the structure using [Pydantic][pydantic]{:target="_blank"} or a string that contains a [JSON Schema][jsonschema]{:target="_blank"}: - -=== "Pydantic" + Example: using pydantic to enforce a JSON schema ```python from enum import Enum from pydantic import BaseModel, constr, conint - import outlines - - class Armor(str, Enum): - leather = "leather" - chainmail = "chainmail" - plate = "plate" - - class Character(BaseModel): name: constr(max_length=10) age: conint(gt=18, lt=99) - armor: Armor + armor: (Enum('Armor', {'leather': 'leather', 'chainmail': 'chainmail', 'plate': 'plate'})) strength: conint(gt=1, lt=100) - model = outlines.models.transformers("mistralai/Mistral-7B-Instruct-v0.2") generator = outlines.generate.json(model, Character) character = generator( @@ -104,135 +74,10 @@ Outlines can guide models so that they output valid JSON **100%** of the time. Y + "name, age (between 1 and 99), armor and strength. 
" ) print(character) - # name='Orla' age=21 armor= strength=8 - ``` - -=== "JSON Schema" - - ```python - import outlines - - schema = """{ - "$defs": { - "Armor": { - "enum": ["leather", "chainmail", "plate"], - "title": "Armor", - "type": "string" - } - }, - "properties": { - "name": {"maxLength": 10, "title": "Name", "type": "string"}, - "age": {"title": "Age", "type": "integer"}, - "armor": {"$ref": "#/$defs/Armor"}, - "strength": {"title": "Strength", "type": "integer"}\ - }, - "required": ["name", "age", "armor", "strength"], - "title": "Character", - "type": "object" - }""" - - model = outlines.models.transformers("mistralai/Mistral-7B-Instruct-v0.2") - generator = outlines.generate.json(model, schema) - character = generator( - "Generate a new character for my awesome game: " - + "name, age (between 1 and 99), armor and strength. " - ) - print(character) - # {'name': 'Yuki', 'age': 24, 'armor': 'plate', 'strength': 3} - ``` - -!!! Note - - We advise you to constrain the length of the strings fields when first testing your schema, especially with small models. + # Character(name='Zara', age=25, armor=, strength=85) + ``` -### Grammar-structured generation - -Outlines also allows to generate text that is valid to any [context-free grammar][cfg]{:target="_blank"} (CFG) in the [EBNF format][ebnf]{:target="_blank"}. Grammars can be intimidating, but they are a very powerful tool! Indeed, they determine the syntax of every programming language, valid chess moves, molecule structure, can help with procedural graphics generation, etc. 
- -Here we show a simple example of a grammar that defines arithmetic operations: - -```python -from outlines import models, generate - -arithmetic_grammar = """ - ?start: sum - - ?sum: product - | sum "+" product -> add - | sum "-" product -> sub - - ?product: atom - | product "*" atom -> mul - | product "/" atom -> div - - ?atom: NUMBER -> number - | "-" atom -> neg - | "(" sum ")" - - %import common.NUMBER - %import common.WS_INLINE - - %ignore WS_INLINE -""" - -model = models.transformers("mistralai/Mistral-7B-Instruct-v0.2") -generator = generate.cfg(model, arithmetic_grammar, max_tokens=100) - -result = generator("Question: How can you write 5*5 using addition?\nAnswer:") -print(result) -# 5+5+5+5+5 -``` - - -EBNF grammars can be cumbersome to write. This is why Outlines provides grammar definitions in the `outlines.grammars.` module - -```python -from outlines import models, generate, grammars - -model = models.transformers("mistralai/Mistral-7B-Instruct-v0.2") -generator = generate.cfg(model, grammars.arithmetic, max_tokens=100) - -result = generator("Question: How can you write 5*5 using addition?\nAnswer:") -print(result) -# 5+5+5+5+5 -``` - -The available grammars are listed [here](https://github.com/outlines-dev/outlines/tree/main/outlines/grammars). - - -### Regex-structured generation - -Slightly simpler, but no less useful, Outlines can generate text that is in the language of a [regular expression](https://www.regular-expressions.info/tutorial.html). 
For instance to force the model to generate IP addresses: - -```python -from outlines import models, generate - -model = models.transformers("mistralai/Mistral-7B-Instruct-v0.2") - -regex_str = r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)" -generator = generate.regex(model, regex_str) - -result = generator("What is the IP address of localhost?\nIP: ") -print(result) -# 127.0.0.100 -``` - -### Generate a given Python type - -We provide a shortcut to regex-structured generation for simple use cases. Pass a Python type to the `outlines.generate.format` function and the LLM will output text that matches this type: - -```python -from outlines import models, generate - -model = models.transformers("mistralai/Mistral-7B-Instruct-v0.2") -generator = generate.format(model, int) - -result = generator("What is 2+2?") -print(result) -# 4 -``` - -## Deploy using vLLM and FastAPI +## [Deploy using vLLM and FastAPI](./reference/serve/vllm.md) Outlines can be deployed as a LLM service using [vLLM][vllm]{:target="_blank"} and [FastAPI][fastapi]{:target="_blank"}. The server supports asynchronous processing of incoming requests, and benefits from the performance of vLLM. @@ -265,7 +110,7 @@ Or use the [requests][requests]{:target="_blank"} library from another python pr ## Utilities -### Prompt templates +### [Prompt templates](./reference/prompting.md) Prompting can lead to messy code. Outlines' prompt functions are python functions that contain a template for the prompt in their docstring. We use a powerful templating language to allow you to loop over lists, dictionaries, add conditionals, etc. directly from the prompt. When called, a prompt function returns the rendered template: @@ -368,6 +213,7 @@ Once you are done experimenting with a prompt and an output structure, it is use ``` It make it easier for the community to collaborate on the infinite number of use cases enabled by these models! 
+ ## Going further If you need more inspiration you can take a look at the [cookbook](cookbook/index.md). If you have any question, or requests for documentation please reach out to us on [GitHub](https://github.com/outlines-dev/outlines/discussions), [Twitter](https://twitter.com/remilouf) or [Discord](https://discord.gg/UppQmhEpe8). diff --git a/docs/reference/cfg.md b/docs/reference/generation/cfg.md similarity index 100% rename from docs/reference/cfg.md rename to docs/reference/generation/cfg.md diff --git a/docs/reference/choices.md b/docs/reference/generation/choices.md similarity index 100% rename from docs/reference/choices.md rename to docs/reference/generation/choices.md diff --git a/docs/reference/custom_fsm_ops.md b/docs/reference/generation/custom_fsm_ops.md similarity index 100% rename from docs/reference/custom_fsm_ops.md rename to docs/reference/generation/custom_fsm_ops.md diff --git a/docs/reference/format.md b/docs/reference/generation/format.md similarity index 100% rename from docs/reference/format.md rename to docs/reference/generation/format.md diff --git a/docs/reference/generation/generation.md b/docs/reference/generation/generation.md new file mode 100644 index 000000000..c74cf5593 --- /dev/null +++ b/docs/reference/generation/generation.md @@ -0,0 +1,210 @@ +--- +title: Generation +--- + +# Generation + +Once an [`outlines.model`](../models) is constructed you can use `outlines.generate` to generate text. Standard LLM generation is possible via `outlines.generate.text`, along with a variety of structured generation methods described below. (For a detailed technical explanation of how structured generation works, you may review the [Structured Generation Explanation](./structured_generation_explanation.md) page) + +Before generating text, you must construct an `outlines.model`. 
Example: + +```python +import outlines + +model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct", device="cuda") +``` +### Text generator + +```python +generator = outlines.generate.text(model) + +result = generator("Question: What's 2+2? Answer:", max_tokens=100) +print(result) +# The answer is 4 + +# Outlines also supports streaming output +stream = generator.stream("What's 2+2?", max_tokens=4) +for i in range(5): + token = next(stream) + print(repr(token)) +# '2' +# '+' +# '2' +# ' equals' +# '4' +``` + +### [Multi-label classification](./choices.md) + +Outlines allows you to do multi-label classification by guiding the model so it can only output either of the specified choices: + +```python +import outlines + +model = outlines.models.transformers("microsoft/Phi-3-mini-128k-instruct") +generator = outlines.generate.choice(model, ["Blue", "Red", "Yellow"]) + +color = generator("What is the closest color to Indigo? ") +print(color) +# Blue +``` + +### [JSON-structured generation](./json.md) + +Outlines can guide models so that they output valid JSON **100%** of the time. You can either specify the structure using [Pydantic][pydantic]{:target="_blank"} or a string that contains a [JSON Schema][jsonschema]{:target="_blank"}: + +=== "Pydantic" + + ```python + from enum import Enum + from pydantic import BaseModel, constr, conint + + import outlines + + class Armor(str, Enum): + leather = "leather" + chainmail = "chainmail" + plate = "plate" + + + class Character(BaseModel): + name: constr(max_length=10) + age: conint(gt=18, lt=99) + armor: Armor + strength: conint(gt=1, lt=100) + + model = outlines.models.transformers("microsoft/Phi-3-mini-128k-instruct") + generator = outlines.generate.json(model, Character) + + character = generator( + "Generate a new character for my awesome game: " + + "name, age (between 1 and 99), armor and strength. 
" + ) + print(character) + # name='Orla' age=21 armor= strength=8 + ``` + +=== "JSON Schema" + + ```python + import outlines + + schema = """{ + "$defs": { + "Armor": { + "enum": ["leather", "chainmail", "plate"], + "title": "Armor", + "type": "string" + } + }, + "properties": { + "name": {"maxLength": 10, "title": "Name", "type": "string"}, + "age": {"title": "Age", "type": "integer"}, + "armor": {"$ref": "#/$defs/Armor"}, + "strength": {"title": "Strength", "type": "integer"}\ + }, + "required": ["name", "age", "armor", "strength"], + "title": "Character", + "type": "object" + }""" + + model = outlines.models.transformers("microsoft/Phi-3-mini-128k-instruct") + generator = outlines.generate.json(model, schema) + character = generator( + "Generate a new character for my awesome game: " + + "name, age (between 1 and 99), armor and strength. " + ) + print(character) + # {'name': 'Yuki', 'age': 24, 'armor': 'plate', 'strength': 3} + ``` + +!!! Note + + We advise you to constrain the length of the strings fields when first testing your schema, especially with small models. + +### [Grammar-structured generation](./cfg.md) + +Outlines also allows to generate text that is valid to any [context-free grammar][cfg]{:target="_blank"} (CFG) in the [EBNF format][ebnf]{:target="_blank"}. Grammars can be intimidating, but they are a very powerful tool! Indeed, they determine the syntax of every programming language, valid chess moves, molecule structure, can help with procedural graphics generation, etc. 
+ +Here we show a simple example of a grammar that defines arithmetic operations: + +```python +from outlines import models, generate + +arithmetic_grammar = """ + ?start: sum + + ?sum: product + | sum "+" product -> add + | sum "-" product -> sub + + ?product: atom + | product "*" atom -> mul + | product "/" atom -> div + + ?atom: NUMBER -> number + | "-" atom -> neg + | "(" sum ")" + + %import common.NUMBER + %import common.WS_INLINE + + %ignore WS_INLINE +""" + +model = models.transformers("microsoft/Phi-3-mini-128k-instruct") +generator = generate.cfg(model, arithmetic_grammar, max_tokens=100) + +result = generator("Question: How can you write 5*5 using addition?\nAnswer:") +print(result) +# 5+5+5+5+5 +``` + + +EBNF grammars can be cumbersome to write. This is why Outlines provides grammar definitions in the `outlines.grammars.` module + +```python +from outlines import models, generate, grammars + +model = models.transformers("microsoft/Phi-3-mini-128k-instruct") +generator = generate.cfg(model, grammars.arithmetic, max_tokens=100) + +result = generator("Question: How can you write 5*5 using addition?\nAnswer:") +print(result) +# 5+5+5+5+5 +``` + +The available grammars are listed [here](https://github.com/outlines-dev/outlines/tree/main/outlines/grammars). + + +### [Regex-structured generation](./regex.md) + +Slightly simpler, but no less useful, Outlines can generate text that is in the language of a [regular expression](https://www.regular-expressions.info/tutorial.html). 
For instance to force the model to generate IP addresses: + +```python +from outlines import models, generate + +model = models.transformers("microsoft/Phi-3-mini-128k-instruct") + +regex_str = r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)" +generator = generate.regex(model, regex_str) + +result = generator("What is the IP address of localhost?\nIP: ") +print(result) +# 127.0.0.100 +``` + +### [Generate a given Python type](./types.md) + +We provide a shortcut to regex-structured generation for simple use cases. Pass a Python type to the `outlines.generate.format` function and the LLM will output text that matches this type: + +```python +from outlines import models, generate + +model = models.transformers("microsoft/Phi-3-mini-128k-instruct") +generator = generate.format(model, int) + +result = generator("What is 2+2?") +print(result) +# 4 +``` diff --git a/docs/reference/json.md b/docs/reference/generation/json.md similarity index 98% rename from docs/reference/json.md rename to docs/reference/generation/json.md index 85e1a846a..a342395bd 100644 --- a/docs/reference/json.md +++ b/docs/reference/generation/json.md @@ -70,7 +70,8 @@ schema = """ "name": {"type": "string"}, "last_name": {"type": "string"}, "id": {"type": "integer"} - } + }, + "required": ["name", "last_name", "id"] } """ diff --git a/docs/reference/regex.md b/docs/reference/generation/regex.md similarity index 100% rename from docs/reference/regex.md rename to docs/reference/generation/regex.md diff --git a/docs/reference/generation/structured_generation_explanation.md b/docs/reference/generation/structured_generation_explanation.md new file mode 100644 index 000000000..8212a231e --- /dev/null +++ b/docs/reference/generation/structured_generation_explanation.md @@ -0,0 +1,55 @@ +--- +title: Structured Generation Explanation +--- + +# Structured Generation Explanation + + +Language models generate text token by token, using the previous token sequence as input and sampled logits 
as output. This document explains the structured generation process, where only legal tokens are considered for the next step based on a predefined automaton, e.g. a regex-defined [finite-state machine](https://en.wikipedia.org/wiki/Finite-state_machine) (FSM) or [Lark](https://lark-parser.readthedocs.io/en/stable/) grammar.
+
+
+## Worked Example
+
+Let's consider a worked example with a pattern for whole and decimal numbers: `^\d*(\.\d+)?$`.
+
+### Creating Automata
+
+The pattern is first converted into an automaton. Below is a brief explanation of the automata conversion and its representation.
+
+**Automata Diagram:**
+
+```mermaid
+graph LR
+    node0("1-9") --> node1("1-9")
+    node1 --> node1
+    node1 --> nodeEND{{END}}
+    node1 --> nodePeriod(".")
+    nodePeriod --> node2("1-9")
+    node2 --> node2
+    node2 --> nodeEND{{END}}
+```
+
+### Generating a Token
+
+Let's assume that we're in the middle of generation, and so far "748" has been generated. Here is the automaton with the current state highlighted in green, with the legal next characters being another number (1-9), a dot (.), or end of sequence.
+
+```mermaid
+graph LR
+    node0("1-9") --> node1("1-9")
+    node1 --> node1
+    node1 --> nodeEND{{END}}
+    node1 --> nodePeriod(".")
+    nodePeriod --> node2("1-9")
+    node2 --> node2
+    node2 --> nodeEND{{END}}
+
+    style node1 fill:#090
+```
+
+Generating a token requires the following steps:
+- Feed the previous input sequence ("748") into the language model.
+- Language model runs a forward pass and produces token logits.
+- Outlines logits processor sets the probability of illegal tokens to 0%.
+- A token is sampled from the set of legal tokens. 
+ +![Generation and Logits Processing Flow Chart](../../assets/images/logits_processing_diagram.svg) diff --git a/docs/reference/types.md b/docs/reference/generation/types.md similarity index 100% rename from docs/reference/types.md rename to docs/reference/generation/types.md diff --git a/docs/reference/json_mode.md b/docs/reference/json_mode.md deleted file mode 100644 index bb1c1c7a8..000000000 --- a/docs/reference/json_mode.md +++ /dev/null @@ -1,17 +0,0 @@ -# JSON mode - -Outlines can guarantee that the LLM will generate valid JSON, using [Grammar-structured generation](cfg.md): - -```python -from outlines import models, generate - -json_grammar = outlines.grammars.json - -model = models.transformers("mistralai/Mistral-7b-v0.1") -generator = generate.cfg(model, json_grammar) -sequence = generator("Generate valid JSON") -``` - -!!! Note "JSON that follows a schema" - - If you want to guarantee that the generated JSON follows a given schema, consult [this section](json.md) instead. diff --git a/docs/reference/models/models.md b/docs/reference/models/models.md new file mode 100644 index 000000000..dadfd34ad --- /dev/null +++ b/docs/reference/models/models.md @@ -0,0 +1,93 @@ +--- +title: Models +--- + +# Models + +Outlines supports generation using a number of inference engines (`outlines.models`) + +Loading a model using outlines follows a similar interface between inference engines. + +```python +import outlines +``` + +## [Transformers](./transformers.md) + +```python +model = outlines.models.transformers("microsoft/Phi-3-mini-128k-instruct", model_kwargs={}) +``` + +For additional arguments and use of other Huggingface Transformers model types see [Outlines' Transformers documentation](./transformers.md). 
+
+
+## [Transformers Vision](./transformers_vision.md)
+
+```python
+model = outlines.models.transformers_vision("llava-hf/llava-v1.6-mistral-7b-hf")
+```
+
+For examples of generation and other details, see [Outlines' Transformers Vision documentation](./transformers_vision.md).
+
+## [vLLM](./vllm.md)
+
+```python
+model = outlines.models.vllm("microsoft/Phi-3-mini-128k-instruct")
+```
+
+## [llama.cpp](./llamacpp.md)
+
+```python
+model = outlines.models.llamacpp("microsoft/Phi-3-mini-4k-instruct-gguf", "Phi-3-mini-4k-instruct-q4.gguf")
+```
+
+Additional llama.cpp parameters can be found in the [Outlines' llama.cpp documentation](./llamacpp.md).
+
+## [ExLlamaV2](./exllamav2.md)
+
+```python
+model = outlines.models.exllamav2("bartowski/Phi-3-mini-128k-instruct-exl2")
+```
+
+## [MLXLM](./mlxlm.md)
+
+```python
+model = outlines.models.mlxlm("mlx-community/Phi-3-mini-4k-instruct-4bit")
+```
+
+## [OpenAI](./openai.md)
+
+```python
+model = outlines.models.openai(
+    "gpt-4o-mini",
+    api_key=os.environ["OPENAI_API_KEY"]
+)
+```
+
+
+# Feature Matrix
+| | Transformers | Transformers Vision | vLLM | llama.cpp | ExLlamaV2 | MLXLM | OpenAI* |
+|-------------------|--------------|---------------------|------|-----------|-----------|-------|---------|
+| **Device** | | | | | | | |
+| Cuda | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | N/A |
+| Apple Silicon | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | N/A |
+| x86 / AMD64 | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | N/A |
+| **Sampling** | | | | | | | |
+| Greedy | ✅ | ✅ | ✅ | ✅* | ✅ | ✅ | ❌ |
+| Multinomial | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| Multiple Samples | ✅ | ✅ | | ❌ | | ❌ | ✅ |
+| Beam Search | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ |
+| **Generation** | | | | | | | |
+| Batch | ✅ | ✅ | ✅ | ❌ | ? | ❌ | ❌ |
+| Stream | ✅ | ❌ | ❌ | ✅ | ? 
| ✅ | ❌ | +| **`outlines.generate`** | | | | | | | | +| Text | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| Structured* | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | + + +## Caveats + +- OpenAI doesn't support structured generation due to limitations in their API and server implementation. +- `outlines.generate` ["Structured"](../generation/generation.md) includes methods such as `outlines.generate.regex`, `outlines.generate.json`, `outlines.generate.cfg`, etc. +- MLXLM only supports Apple Silicon. +- llama.cpp greedy sampling available via multinomial with `temperature = 0.0`. diff --git a/docs/reference/models/transformers_vision.md b/docs/reference/models/transformers_vision.md index b0ad6533b..6b80fb463 100644 --- a/docs/reference/models/transformers_vision.md +++ b/docs/reference/models/transformers_vision.md @@ -5,6 +5,7 @@ Outlines allows seamless use of [vision models](https://huggingface.co/learn/com `outlines.models.transformers_vision` has shares interfaces with, and is based on [`outlines.models.transformers`](./transformers.md). 
Tasks supported include + - image + text -> text - video + text -> text diff --git a/mkdocs.yml b/mkdocs.yml index 7de4a6350..117e99911 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -58,7 +58,11 @@ markdown_extensions: pygments_lang_class: true noclasses: True pygments_style: nord - - pymdownx.superfences + - pymdownx.superfences: + custom_fences: + - name: mermaid + class: mermaid + format: !!python/name:pymdownx.superfences.fence_code_format - pymdownx.tabbed: alternate_style: true - pymdownx.inlinehilite @@ -115,22 +119,24 @@ nav: - Docs: - reference/index.md - Generation: + - Generation Overview: reference/generation/generation.md - Text: reference/text.md - Samplers: reference/samplers.md - - Structured generation: - - Classification: reference/choices.md - - Regex: reference/regex.md - - Type constraints: reference/format.md - - JSON (function calling): reference/json.md - - JSON mode: reference/json_mode.md - - Grammar: reference/cfg.md - - Custom FSM operations: reference/custom_fsm_ops.md + - Structured generation: + - Classification: reference/generation/choices.md + - Regex: reference/generation/regex.md + - Type constraints: reference/generation/format.md + - JSON (function calling): reference/generation/json.md + - Grammar: reference/generation/cfg.md + - Custom FSM operations: reference/generation/custom_fsm_ops.md + - Structured Generation Technical Explanation: reference/generation/structured_generation_explanation.md - Utilities: - Serve with vLLM: reference/serve/vllm.md - Custom types: reference/types.md - Prompt templating: reference/prompting.md - Outlines functions: reference/functions.md - Models: + - Models Overview: reference/models/models.md - Open source: - Transformers: reference/models/transformers.md - Llama.cpp: reference/models/llamacpp.md @@ -138,7 +144,6 @@ nav: - TGI: reference/models/tgi.md - ExllamaV2: reference/models/exllamav2.md - MLX: reference/models/mlxlm.md - - Mamba: reference/models/mamba.md - API: - OpenAI: 
reference/models/openai.md - API Reference: diff --git a/requirements-doc.txt b/requirements-doc.txt index b7bd2efbb..cf8c674a7 100644 --- a/requirements-doc.txt +++ b/requirements-doc.txt @@ -1,6 +1,7 @@ mkdocs mkdocs-material mkdocs-material[imaging] +mkdocs-mermaid2-plugin mkdocs-section-index mkdocstrings[python] mkdocs-git-committers-plugin-2