From 70f40195af9007c68070d8b0effa25bda34bce53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Mon, 16 Sep 2024 17:03:44 +0000 Subject: [PATCH 01/22] first modifications in the documentation --- docs/source/dataset_formats.mdx | 2 +- docs/source/online_dpo_trainer.md | 89 ++++++++++++++++++------------- 2 files changed, 53 insertions(+), 38 deletions(-) diff --git a/docs/source/dataset_formats.mdx b/docs/source/dataset_formats.mdx index 9b9dda92e8..bad094d277 100644 --- a/docs/source/dataset_formats.mdx +++ b/docs/source/dataset_formats.mdx @@ -209,7 +209,7 @@ Choosing the right dataset format depends on the task you are working on and the -TRL trainers only support standard dataset formats. If you have a conversational dataset, you must first convert it into a standard format. +TRL trainers only support standard dataset formats, [for now](https://github.com/huggingface/trl/issues/2071). If you have a conversational dataset, you must first convert it into a standard format. For more information on how to work with conversational datasets, refer to the [Working with conversational datasets in TRL](#working-with-conversational-datasets-in-trl) section. diff --git a/docs/source/online_dpo_trainer.md b/docs/source/online_dpo_trainer.md index 7b46c7d6fe..ec163e8220 100644 --- a/docs/source/online_dpo_trainer.md +++ b/docs/source/online_dpo_trainer.md @@ -17,44 +17,75 @@ This post-training method was contributed by [Michael Noukhovitch](https://huggi > [!WARNING] > Make sure that the SFT model and reward model use the _same_ chat template. Otherwise, you may find the model completions are scored incorrectly during training. +## Expected dataset format + +Online DPO only requires a [prompt-only dataset](dataset_format#preference) (unlike offline DPO, that expects [preference dataset](dataset_format#preference)). The [`OnlineDPOTrainer`] supports both [conversational](dataset_format#conversational-dataset-format) and [standard](dataset_format#standard-dataset-format) dataset format. + +## Quick start + +This example demonstrates how to train a model using the online DPO method. We use the [Qwen 0.5B model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) as the base model and the [Qwen 0.5B reward model](https://huggingface.co/trl-lib/Qwen2-0.5B-Reward) as the reward model. We use the prompts from the [UltraFeedback dataset](https://huggingface.co/datasets/openbmb/UltraFeedback): + + + The basic API is as follows: ```python -from datasets import Dataset +# train_online_dpo.py +from datasets import load_dataset from trl import OnlineDPOConfig, OnlineDPOTrainer -from transformers import ( - AutoModelForCausalLM, - AutoModelForSequenceClassification, - AutoTokenizer, -) -NUM_DUMMY_SAMPLES = 100 +from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer -tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct") -# The model to optimise model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct") -# The reference model to calculate the KL divergence against -ref_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct") -# The model to score completions with. +tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct") reward_model = AutoModelForSequenceClassification.from_pretrained("trl-lib/Qwen2-0.5B-Reward", num_labels=1) +train_dataset = load_dataset("trl-lib/ultrafeedback-prompt", split="train") -train_dataset = Dataset.from_dict( - {"prompt": ["Q: Hi how are you? 
A:"] * NUM_DUMMY_SAMPLES}) -eval_dataset = Dataset.from_dict( - {"prompt": ["Q: What do you like to eat A:"] * NUM_DUMMY_SAMPLES}) - -args = OnlineDPOConfig(output_dir="online-dpo-model") +args = OnlineDPOConfig(output_dir="online-dpo-qwen2", logging_steps=10) trainer = OnlineDPOTrainer( model=model, - ref_model=ref_model, reward_model=reward_model, args=args, tokenizer=tokenizer, train_dataset=train_dataset, - eval_dataset=eval_dataset, ) trainer.train() ``` +We run this script with the following command: + +```bash +accelerate launch train_online_dpo.py +``` + +After approximately 1 hour of training, the model is trained and we can demonstrate the completions: + +````python +>>> from transformers import pipeline +>>> generator = pipeline("text-generation", model="online-dpo-qwen2/checkpoint-500", device="cuda") +>>> question = "Can you tell me which shell command can be used to display the CPU usage of a specific process in Linux? And what is the syntax for this command?" +>>> output = generator([{"role": "user", "content": question}], max_new_tokens=200, return_full_text=False)[0] +>>> print(output["generated_text"]) +Yes, you can use the `top` command in Linux to display the CPU usage of a specific process. +The syntax for the `top` command depends on your version of Linux. Here's an example of how to run `top` with the `-b` option: +``` +top -b +``` + +This will display the CPU usage of all processes in the system at the top level. You can also specify additional options by adding them after the `-b` option using square brackets (`[]`). For example, to display only the running processes and their CPU usage, you would add the following options to the command: +``` +top -b --running=1 +``` + +Note that some versions of Linux may require you to set up a user account or enable logging before running the `top` command. +```` + +### Example script + To test the online DPO script with 1B parameter models, run: ```bash @@ -78,18 +109,7 @@ Tips: * `objective/rlhf_reward` is the ultimate objective of online DPO training. If training works as intended, this metric should keep going up. * We recommend using the "EOS trick" via the `--missing_eos_penalty` argument, which subtracts from the rewards a fixed scalar penalty for completions that do not end with an EOS token. This can help the model learn to generate more coherent completions. -### Expected dataset format -Unlike offline DPO, where one provides a dataset with chosen and rejected columns, online DPO only requires a dataset of prompts to generate the completions from. The [`OnlineDPOTrainer`] assumes that the dataset is preprocessed for model inference, so typically you will need to wrap your prompts in the messages format and then apply the chat template as follows: - -```python -def prepare_dataset(row): - """Apply chat template to messages""" - row["prompt"] = tokenizer.apply_chat_template(row["prompt"], tokenize=False, add_generation_prompt=True) - return row - -dataset = prepare_dataset(dataset) -``` ### Explanation of the logged metrics @@ -112,12 +132,7 @@ The logged metrics are as follows. Here is an example [tracked run at Weights an ## What is my model doing exactly? -To help you understand what your model is doing, we periodically log some sample completions from the model via [`LogCompletionsCallback`]. You can find an example [tracked run at Weights and Biases](https://wandb.ai/huggingface/trl/runs/hlzevfro?nw=nwuserlewtun), which allows you to see the model's response at different stages of training. 
By default we generate during training, but you can customize the number of prompts to generate completions for in [`LogCompletionsCallback`]. - - -## Implementation details - -Many online implementation details are borrowed from the [`PPOv2Trainer`], which is itself based on the [The N+ Implementation Details of RLHF with PPO: A Case Study on TL;DR Summarization](https://huggingface.co/papers/2403.17031). +To help you understand what your model is doing, we periodically log some sample completions from the model via [`LogCompletionsCallback`]. You can find an example [tracked run at Weights and Biases](https://wandb.ai/huggingface/trl/runs/hlzevfro), which allows you to see the model's response at different stages of training. By default we generate during training, but you can customize the number of prompts to generate completions for in [`LogCompletionsCallback`]. ## Benchmark experiments From 73a185d217f43c8cbe76bd6af3520f5a6355412d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Mon, 16 Sep 2024 17:04:05 +0000 Subject: [PATCH 02/22] Add script for processing ultrafeedback prompt dataset --- examples/datasets/ultrafeedback-prompt.py | 67 +++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 examples/datasets/ultrafeedback-prompt.py diff --git a/examples/datasets/ultrafeedback-prompt.py b/examples/datasets/ultrafeedback-prompt.py new file mode 100644 index 0000000000..308b8af160 --- /dev/null +++ b/examples/datasets/ultrafeedback-prompt.py @@ -0,0 +1,67 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Optional + +from datasets import load_dataset +from transformers import HfArgumentParser + + +@dataclass +class ScriptArguments: + r""" + Arguments for the script. + + Args: + push_to_hub (`bool`, *optional*, defaults to `False`): + Whether to push the dataset to the Hugging Face Hub. + repo_id (`str`, *optional*, defaults to `"trl-lib/ultrafeedback-prompt"`): + Hugging Face repository ID to push the dataset to. + dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + Number of workers to use for dataset processing. 
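+
+    Example:
+        One way to regenerate the dataset and push it to the Hub (the CLI flags are
+        derived from the fields above by `HfArgumentParser`; the repository ID shown
+        is simply the default defined below):
+
+        python examples/datasets/ultrafeedback-prompt.py --push_to_hub --repo_id trl-lib/ultrafeedback-prompt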
+ """ + + push_to_hub: bool = False + repo_id: str = "trl-lib/ultrafeedback-prompt" + dataset_num_proc: Optional[int] = None + + +def to_unpaired_preference(example): + prompt = [{"role": "user", "content": example["instruction"]}] + return {"prompt": prompt} + +def drop_long_prompt(example): + if len(example["prompt"][0]["content"]) > 768: + return False + else: + return True + + +if __name__ == "__main__": + parser = HfArgumentParser(ScriptArguments) + args = parser.parse_args_into_dataclasses()[0] + + dataset = load_dataset("openbmb/UltraFeedback", split="train") + + dataset = dataset.map( + to_unpaired_preference, + remove_columns=["source", "instruction", "models", "completions", "correct_answers", "incorrect_answers"], + num_proc=args.dataset_num_proc, + ) + dataset = dataset.filter(drop_long_prompt) + dataset = dataset.train_test_split(test_size=0.05, seed=42) + + if args.push_to_hub: + dataset.push_to_hub(args.repo_id) From 98e0bde910e5e33a2731e22594ae593961520ee6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Mon, 16 Sep 2024 17:04:10 +0000 Subject: [PATCH 03/22] Remove unused variable in ultrafeedback.py --- examples/datasets/ultrafeedback.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/datasets/ultrafeedback.py b/examples/datasets/ultrafeedback.py index 82a0fb5e07..c29553b04b 100644 --- a/examples/datasets/ultrafeedback.py +++ b/examples/datasets/ultrafeedback.py @@ -70,7 +70,6 @@ class ScriptArguments: def to_unpaired_preference(example, model_name, aspect): - model_index = example["models"].index(model_name) prompt = [{"role": "user", "content": example["instruction"]}] model_index = example["models"].index(model_name) response_content = example["completions"][model_index]["response"] From 1d2c868511716fe0120bee1b51a7ad2e59ed2f39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Mon, 16 Sep 2024 17:04:46 +0000 Subject: [PATCH 04/22] style --- examples/datasets/ultrafeedback-prompt.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/datasets/ultrafeedback-prompt.py b/examples/datasets/ultrafeedback-prompt.py index 308b8af160..10cf8e066e 100644 --- a/examples/datasets/ultrafeedback-prompt.py +++ b/examples/datasets/ultrafeedback-prompt.py @@ -42,6 +42,7 @@ def to_unpaired_preference(example): prompt = [{"role": "user", "content": example["instruction"]}] return {"prompt": prompt} + def drop_long_prompt(example): if len(example["prompt"][0]["content"]) > 768: return False From faffc117d4d43fe2b7f206e9604a89667f4be5c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Mon, 16 Sep 2024 17:05:08 +0000 Subject: [PATCH 05/22] apply chat template within the init --- examples/scripts/dpo_online.py | 7 ------- trl/trainer/online_dpo_trainer.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/examples/scripts/dpo_online.py b/examples/scripts/dpo_online.py index aba216f674..5c7d89d372 100644 --- a/examples/scripts/dpo_online.py +++ b/examples/scripts/dpo_online.py @@ -43,7 +43,6 @@ import torch from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, GenerationConfig -from accelerate import PartialState from trl import ( DPOScriptArguments, ModelConfig, @@ -52,7 +51,6 @@ get_kbit_device_map, get_peft_config, get_quantization_config, - maybe_apply_chat_template, LogCompletionsCallback, ) @@ -103,11 +101,6 @@ dataset = load_dataset(args.dataset_name) - with 
PartialState().local_main_process_first(): - dataset = dataset.map( - maybe_apply_chat_template, num_proc=training_args.dataset_num_proc, fn_kwargs={"tokenizer": tokenizer} - ) - trainer = OnlineDPOTrainer( model=model, reward_model=reward_model, diff --git a/trl/trainer/online_dpo_trainer.py b/trl/trainer/online_dpo_trainer.py index 8c4d031765..0da7c69253 100644 --- a/trl/trainer/online_dpo_trainer.py +++ b/trl/trainer/online_dpo_trainer.py @@ -38,6 +38,7 @@ from transformers.training_args import OptimizerNames from transformers.utils import is_peft_available, is_sagemaker_mp_enabled, logging +from ..data_utils import maybe_apply_chat_template from ..models import create_reference_model from ..models.utils import unwrap_model_for_generation from .judges import BasePairwiseJudge @@ -193,6 +194,15 @@ def __init__( # Compute that only on the main process for faster data processing. # see: https://github.com/huggingface/trl/pull/1255 with PartialState().local_main_process_first(): + # Apply the chat template if needed + train_dataset = train_dataset.map( + maybe_apply_chat_template, fn_kwargs={"tokenizer": tokenizer}, num_proc=args.dataset_num_proc + ) + if eval_dataset is not None: + eval_dataset = eval_dataset.map( + maybe_apply_chat_template, fn_kwargs={"tokenizer": tokenizer}, num_proc=args.dataset_num_proc + ) + # Tokenize the dataset fn_kwargs = {"is_encoder_decoder": model.config.is_encoder_decoder, "tokenizer": tokenizer} train_dataset = train_dataset.map(self.tokenize_row, fn_kwargs=fn_kwargs, num_proc=args.dataset_num_proc) From 121ec6b6e9e082a352f2e9f98a7f8b1fbba7b7f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Mon, 16 Sep 2024 17:05:20 +0000 Subject: [PATCH 06/22] extend test --- tests/test_online_dpo_trainer.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test_online_dpo_trainer.py b/tests/test_online_dpo_trainer.py index 64796a8628..ac53711ec5 100644 --- a/tests/test_online_dpo_trainer.py +++ b/tests/test_online_dpo_trainer.py @@ -15,6 +15,7 @@ import unittest from datasets import load_dataset +from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer from transformers.testing_utils import require_peft from transformers.utils import is_peft_available @@ -35,7 +36,8 @@ def setUp(self): self.tokenizer = AutoTokenizer.from_pretrained(self.model_id) self.tokenizer.pad_token = self.tokenizer.eos_token - def test_training(self): + @parameterized.expand([("standard_prompt_only",), ("conversational_prompt_only",)]) + def test_training(self, config_name): with tempfile.TemporaryDirectory() as tmp_dir: training_args = OnlineDPOConfig( output_dir=tmp_dir, @@ -45,7 +47,7 @@ def test_training(self): eval_strategy="steps", report_to="none", ) - dummy_dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only") + dummy_dataset = load_dataset("trl-internal-testing/zen", config_name) trainer = OnlineDPOTrainer( model=self.model, @@ -145,6 +147,7 @@ def test_training_with_peft_and_ref_model(self): # Check if training loss is available self.assertIn("train_loss", trainer.state.log_history[-1]) + @require_peft def test_training_with_peft_model_and_peft_config(self): model_lora_config = LoraConfig(r=8, lora_alpha=16, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM") model = get_peft_model(self.model, model_lora_config) From 997c9c3c50c42fce18fd2e6c5ab183d3e722bd14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: 
Mon, 16 Sep 2024 17:05:32 +0000 Subject: [PATCH 07/22] new default lr --- trl/trainer/online_dpo_config.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/trl/trainer/online_dpo_config.py b/trl/trainer/online_dpo_config.py index c4554a6a55..ead18fcd13 100644 --- a/trl/trainer/online_dpo_config.py +++ b/trl/trainer/online_dpo_config.py @@ -28,6 +28,9 @@ class OnlineDPOConfig(TrainingArguments): command line. Parameters: + learning_rate (`float`, *optional*, defaults to `5e-7`): + Initial learning rate for [`AdamW`] optimizer. The default value replaces that of + [`~transformers.TrainingArguments`]. reward_model_path (`Optional[str]`, *optional*, defaults to `None`): Path to the reward model. max_new_tokens (`int`, *optional*, defaults to `64`): @@ -54,6 +57,7 @@ class OnlineDPOConfig(TrainingArguments): Whether to disable dropout in the model. """ + learning_rate: float = 5e-7 reward_model_path: Optional[str] = None max_new_tokens: int = 64 temperature: float = 0.9 From a2aeec63ac8b06f39c310c48e6c3de521bf8036f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Mon, 16 Sep 2024 18:58:29 +0000 Subject: [PATCH 08/22] nash md and xpo conv test --- tests/test_nash_md_trainer.py | 6 ++++-- tests/test_xpo_trainer.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/test_nash_md_trainer.py b/tests/test_nash_md_trainer.py index a7e9f685fa..470e3eacc5 100644 --- a/tests/test_nash_md_trainer.py +++ b/tests/test_nash_md_trainer.py @@ -15,6 +15,7 @@ import unittest from datasets import load_dataset +from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer from transformers.testing_utils import require_peft from transformers.utils import is_peft_available @@ -35,7 +36,8 @@ def setUp(self): self.tokenizer = AutoTokenizer.from_pretrained(self.model_id) self.tokenizer.pad_token = self.tokenizer.eos_token - def test_nash_md_trainer_training(self): + @parameterized.expand([("standard_prompt_only",), ("conversational_prompt_only",)]) + def test_nash_md_trainer_training(self, config_name): with tempfile.TemporaryDirectory() as tmp_dir: training_args = NashMDConfig( output_dir=tmp_dir, @@ -47,7 +49,7 @@ def test_nash_md_trainer_training(self): eval_strategy="steps", report_to="none", ) - dummy_dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only") + dummy_dataset = load_dataset("trl-internal-testing/zen", config_name) trainer = NashMDTrainer( model=self.model, diff --git a/tests/test_xpo_trainer.py b/tests/test_xpo_trainer.py index 7b098e88de..952ecdcbc7 100644 --- a/tests/test_xpo_trainer.py +++ b/tests/test_xpo_trainer.py @@ -15,6 +15,7 @@ import unittest from datasets import load_dataset +from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer from transformers.testing_utils import require_peft from transformers.utils import is_peft_available @@ -35,7 +36,8 @@ def setUp(self): self.tokenizer = AutoTokenizer.from_pretrained(self.model_id) self.tokenizer.pad_token = self.tokenizer.eos_token - def test_xpo_trainer_training(self): + @parameterized.expand([("standard_prompt_only",), ("conversational_prompt_only",)]) + def test_xpo_trainer_training(self, config_name): with tempfile.TemporaryDirectory() as tmp_dir: training_args = XPOConfig( output_dir=tmp_dir, @@ -47,7 +49,7 @@ def test_xpo_trainer_training(self): eval_strategy="steps", report_to="none", ) - dummy_dataset = 
load_dataset("trl-internal-testing/zen", "standard_prompt_only") + dummy_dataset = load_dataset("trl-internal-testing/zen", config_name) trainer = XPOTrainer( model=self.model, From 30f36c09a9b1332f8e9408cd7f9c3156259225ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Mon, 16 Sep 2024 18:58:37 +0000 Subject: [PATCH 09/22] Update prompt length check to 512 characters --- examples/datasets/ultrafeedback-prompt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/datasets/ultrafeedback-prompt.py b/examples/datasets/ultrafeedback-prompt.py index 10cf8e066e..8149bae168 100644 --- a/examples/datasets/ultrafeedback-prompt.py +++ b/examples/datasets/ultrafeedback-prompt.py @@ -44,7 +44,7 @@ def to_unpaired_preference(example): def drop_long_prompt(example): - if len(example["prompt"][0]["content"]) > 768: + if len(example["prompt"][0]["content"]) > 512: return False else: return True From 075faa9bb5e9d8368ce2241859004b7db413bb82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Tue, 17 Sep 2024 11:33:44 +0000 Subject: [PATCH 10/22] remove `maybe_apply_chat_template` in XPO and Nash examples --- examples/scripts/nash_md.py | 7 ------- examples/scripts/xpo.py | 7 ------- 2 files changed, 14 deletions(-) diff --git a/examples/scripts/nash_md.py b/examples/scripts/nash_md.py index 6ad8068db9..ec0bf1b1d9 100644 --- a/examples/scripts/nash_md.py +++ b/examples/scripts/nash_md.py @@ -50,7 +50,6 @@ import torch from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, GenerationConfig -from accelerate import PartialState from trl import ( DPOScriptArguments, ModelConfig, @@ -58,7 +57,6 @@ NashMDTrainer, get_kbit_device_map, get_quantization_config, - maybe_apply_chat_template, LogCompletionsCallback, ) from trl.commands.cli_utils import TrlParser @@ -106,11 +104,6 @@ dataset = load_dataset(args.dataset_name) - with PartialState().local_main_process_first(): - dataset = dataset.map( - maybe_apply_chat_template, num_proc=training_args.dataset_num_proc, fn_kwargs={"tokenizer": tokenizer} - ) - trainer = NashMDTrainer( model=model, ref_model=ref_model, diff --git a/examples/scripts/xpo.py b/examples/scripts/xpo.py index af3448f412..22c1722ecc 100644 --- a/examples/scripts/xpo.py +++ b/examples/scripts/xpo.py @@ -33,7 +33,6 @@ import torch from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, GenerationConfig -from accelerate import PartialState from trl import ( DPOScriptArguments, ModelConfig, @@ -41,7 +40,6 @@ XPOTrainer, get_kbit_device_map, get_quantization_config, - maybe_apply_chat_template, LogCompletionsCallback, ) from trl.commands.cli_utils import TrlParser @@ -89,11 +87,6 @@ dataset = load_dataset(args.dataset_name) - with PartialState().local_main_process_first(): - dataset = dataset.map( - maybe_apply_chat_template, num_proc=training_args.dataset_num_proc, fn_kwargs={"tokenizer": tokenizer} - ) - trainer = XPOTrainer( model=model, ref_model=ref_model, From e2cac3ffb20293241184729dac55b5e358a445ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Tue, 17 Sep 2024 13:20:28 +0000 Subject: [PATCH 11/22] polish online dpo doc --- docs/source/online_dpo_trainer.md | 94 +++++++++++++++++-------------- 1 file changed, 51 insertions(+), 43 deletions(-) diff --git a/docs/source/online_dpo_trainer.md b/docs/source/online_dpo_trainer.md index 
ec163e8220..2933e01250 100644 --- a/docs/source/online_dpo_trainer.md +++ b/docs/source/online_dpo_trainer.md @@ -12,18 +12,9 @@ The current implementation uses reward models for scoring completions -- see [Re This post-training method was contributed by [Michael Noukhovitch](https://huggingface.co/mnoukhov), [Shengyi Costa Huang](https://huggingface.co/vwxyzjn), [Quentin Gallouédec](https://huggingface.co/qgallouedec), and [Edward Beeching](https://huggingface.co/edbeeching). -## Usage tips - -> [!WARNING] -> Make sure that the SFT model and reward model use the _same_ chat template. Otherwise, you may find the model completions are scored incorrectly during training. - -## Expected dataset format - -Online DPO only requires a [prompt-only dataset](dataset_format#preference) (unlike offline DPO, that expects [preference dataset](dataset_format#preference)). The [`OnlineDPOTrainer`] supports both [conversational](dataset_format#conversational-dataset-format) and [standard](dataset_format#standard-dataset-format) dataset format. - ## Quick start -This example demonstrates how to train a model using the online DPO method. We use the [Qwen 0.5B model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) as the base model and the [Qwen 0.5B reward model](https://huggingface.co/trl-lib/Qwen2-0.5B-Reward) as the reward model. We use the prompts from the [UltraFeedback dataset](https://huggingface.co/datasets/openbmb/UltraFeedback): +This example demonstrates how to train a model using the online DPO method. We use the [Qwen 0.5B model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) as the base model and the [Qwen 0.5B reward model](https://huggingface.co/trl-lib/Qwen2-0.5B-Reward) as the reward model. We use the prompts from the [UltraFeedback dataset](https://huggingface.co/datasets/openbmb/UltraFeedback). You can view the prompts in the dataset here: -The basic API is as follows: +Below is the script to train the model: ```python # train_online_dpo.py @@ -56,37 +47,67 @@ trainer = OnlineDPOTrainer( trainer.train() ``` -We run this script with the following command: +Execute the script using the following command: ```bash accelerate launch train_online_dpo.py ``` -After approximately 1 hour of training, the model is trained and we can demonstrate the completions: +Distributed across 8 GPUs, the training takes approximately 1 hour. You can verify the training progress by checking the reward graph. An increasing trend in both the reward for rejected and chosen completions indicates that the model is improving and generating better responses over time. + +![](https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/online-dpo-qwen2-reward.png) -````python +To see how the trained model performs, use the following code to generate completions: + +```python >>> from transformers import pipeline ->>> generator = pipeline("text-generation", model="online-dpo-qwen2/checkpoint-500", device="cuda") ->>> question = "Can you tell me which shell command can be used to display the CPU usage of a specific process in Linux? And what is the syntax for this command?" +>>> generator = pipeline("text-generation", model="online-dpo-qwen2/checkpoint-1773", device="cuda") +>>> question = "Why is the problem always DNS?" >>> output = generator([{"role": "user", "content": question}], max_new_tokens=200, return_full_text=False)[0] >>> print(output["generated_text"]) -Yes, you can use the `top` command in Linux to display the CPU usage of a specific process. 
-The syntax for the `top` command depends on your version of Linux. Here's an example of how to run `top` with the `-b` option:
-```
-top -b
+The reason why the problem of DNS (Domain Name System) can always be encountered is that it is designed to provide reliable and accurate information about the availability, ownership, or expiration of domain names. However, there may be some circumstances where the system fails to resolve an IP address correctly, leading to the problem of DNS.
+For example, if the server hosting the domain name does not have the correct IP address associated with it, or if the IP address is incorrectly formatted, then the DNS system will fail to resolve the domain name correctly. Additionally, if the server hosting the domain name has been compromised, then the DNS system may also fail to resolve the domain name correctly.
+It's worth noting that the exact cause of DNS failure can vary depending on the specific situation, so it's important to carefully check all relevant factors before attempting to resolve the issue. If you suspect that your DNS problem may be caused by a bug in the system, you should report it to the DNS provider directly for further investigation.
 ```
 
-This will display the CPU usage of all processes in the system at the top level. You can also specify additional options by adding them after the `-b` option using square brackets (`[]`). For example, to display only the running processes and their CPU usage, you would add the following options to the command:
-```
-top -b --running=1
-```
+## Expected dataset format
 
-Note that some versions of Linux may require you to set up a user account or enable logging before running the `top` command.
-````
+Online DPO only requires a [prompt-only dataset](dataset_format#preference) (unlike offline DPO, which expects a [preference dataset](dataset_format#preference)). The [`OnlineDPOTrainer`] supports both the [conversational](dataset_format#conversational-dataset-format) and [standard](dataset_format#standard-dataset-format) dataset formats. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.
+
+## Usage tips
+
+### ⚠️ Use the same chat template
+
+Make sure that the SFT model and reward model use the _same_ chat template. Otherwise, you may find the model completions are scored incorrectly during training.
+
+### Encourage the model to generate finish the completion within a given length
+
+We may want the model to generate completions within a given length. During training, the model will generate completions up to the maximum completion length specified in the `max_new_tokens` argument of [`OnlineDPOConfig`]. If you want to penalize the model for not generating an EOS token before the maximum completion length, you can use the `missing_eos_penalty` argument of [`OnlineDPOConfig`]:
+
+```python
+args = OnlineDPOConfig(..., max_new_tokens=128, missing_eos_penalty=1.0)
+```
 
-### Example script
+### Logging Completions
 
-To test the online DPO script with 1B parameter models, run:
+To better understand your model’s behavior during training, you can log sample completions periodically using the [`LogCompletionsCallback`].
+
+```python
+trainer = OnlineDPOTrainer(..., eval_dataset=eval_dataset)
+completions_callback = LogCompletionsCallback(trainer, num_prompts=8)
+trainer.add_callback(completions_callback)
+```
+
+This callback logs the model's generated completions directly to Weights & Biases.
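+
+If you want more control over how the logged completions are sampled, you can also pass a [`~transformers.GenerationConfig`]. The snippet below is a sketch that builds on the previous example and assumes the callback's optional `generation_config` argument (see `trl/trainer/callbacks.py`):
+
+```python
+from transformers import GenerationConfig
+
+# Assumed: `trainer` is the OnlineDPOTrainer created above.
+# Sampling settings used only when logging completions (illustrative values).
+generation_config = GenerationConfig(max_new_tokens=64, temperature=0.9, do_sample=True)
+completions_callback = LogCompletionsCallback(trainer, generation_config=generation_config, num_prompts=8)
+trainer.add_callback(completions_callback)
+```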
+ +![Logged Completions](https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/wandb_completions.png) + + +## Example script -### Example script +We provide an example script to train a model using the online DPO method. The script is available in [`examples/scripts/dpo_online.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/dpo_online.py) -To test the online DPO script with 1B parameter models, run: +To test the online DPO script with the [Pythia 1B model](https://huggingface.co/trl-lib/pythia-1b-deduped-tldr-sft) on the TL;DR summarization task, run the following command: ```bash python examples/scripts/dpo_online.py \ @@ -104,22 +125,15 @@ python examples/scripts/dpo_online.py \ --push_to_hub ``` -Tips: - -* `objective/rlhf_reward` is the ultimate objective of online DPO training. If training works as intended, this metric should keep going up. -* We recommend using the "EOS trick" via the `--missing_eos_penalty` argument, which subtracts from the rewards a fixed scalar penalty for completions that do not end with an EOS token. This can help the model learn to generate more coherent completions. - - - -### Explanation of the logged metrics +## Logged metrics The logged metrics are as follows. Here is an example [tracked run at Weights and Biases](https://wandb.ai/huggingface/trl/runs/dd2o3g35) * `objective/kl`: The mean Kullback-Leibler (KL) divergence between the current model and reference model. * `objective/entropy`: The mean entropy of the model, indicating the randomness of the actions chosen by the model. * `objective/non_score_reward`: The mean reward from non-score-related sources, basically `beta * kl.sum(1)`, where `beta` is the KL penalty coefficient and `kl` is the per-token KL divergence. -* `objective/rlhf_reward`: The mean RLHF reward, which is `score - non_score_reward`. -* `objective/scores`: The mean scores returned by the reward model / environment. +* `objective/rlhf_reward`: The mean RLHF reward, which is `scores - non_score_reward`. The `rlhf_reward` is the ultimate objective of online DPO training. If training works as intended, this metric should keep going up. +* `objective/scores`: The mean scores returned by the reward mode. * `objective/scores_margin`: The mean score margin (according to the external reward model) between the chosen and rejected completions. * `rewards/accuracies`: The accuracies of the online DPO's implicit reward model. * `rewards/chosen`: The mean reward (according to online DPO's implicit reward model)of the chosen completions. @@ -129,12 +143,6 @@ The logged metrics are as follows. Here is an example [tracked run at Weights an * `logps/rejected`: The mean log probabilities of the rejected completions. * `val/contain_eos_token`: The fraction of completions which contain an EOS token. - -## What is my model doing exactly? - -To help you understand what your model is doing, we periodically log some sample completions from the model via [`LogCompletionsCallback`]. You can find an example [tracked run at Weights and Biases](https://wandb.ai/huggingface/trl/runs/hlzevfro), which allows you to see the model's response at different stages of training. By default we generate during training, but you can customize the number of prompts to generate completions for in [`LogCompletionsCallback`]. - - ## Benchmark experiments To validate the online DPO implementation works, we ran experiments with the Pythia 1B, 2.8B, and 6.9B models on a single node of 8 x H100s. 
Here are the commands we used to run the experiments. We take the SFT / RM models directly from [The N+ Implementation Details of RLHF with PPO: A Case Study on TL;DR Summarization](https://huggingface.co/papers/2403.17031). @@ -178,7 +186,7 @@ accelerate launch --config_file examples/accelerate_configs/deepspeed_zero2.yaml --bf16 \ --logging_steps 20 \ --save_steps 0.1 \ - --push_to_hub \ + --push_to_hub # 6.9B Online DPO experiment accelerate launch --config_file examples/accelerate_configs/deepspeed_zero2.yaml \ From f7fa597a30e15482a620928e7ca2878c56938626 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Tue, 17 Sep 2024 15:03:00 +0000 Subject: [PATCH 12/22] better section name --- docs/source/online_dpo_trainer.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/online_dpo_trainer.md b/docs/source/online_dpo_trainer.md index 2933e01250..aa1cf91537 100644 --- a/docs/source/online_dpo_trainer.md +++ b/docs/source/online_dpo_trainer.md @@ -80,7 +80,7 @@ Online DPO only requires a [prompt-only dataset](dataset_format#preference) (unl Make sure that the SFT model and reward model use the _same_ chat template. Otherwise, you may find the model completions are scored incorrectly during training. -### Encourage the model to generate finish the completion within a given length +### Encourage EOS token generation We can want the model to generate completion within a given length. During the learning, the model will generate completion up to the maximum completion length specified in the `max_new_tokens` argument of [`OnlineDPOConfig`]. I you want to penalize for not generating an EOS token before the maximum completion length, you can use the `missing_eos_penalty` argument of [`OnlineDPOConfig`]: From a54b1a1b8dd6d6165301dfd50a624360b3934d14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Tue, 17 Sep 2024 15:10:38 +0000 Subject: [PATCH 13/22] LogCompletionsCallback doc --- docs/source/callbacks.mdx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/source/callbacks.mdx b/docs/source/callbacks.mdx index e4d26797c2..dfcf4fd8b7 100644 --- a/docs/source/callbacks.mdx +++ b/docs/source/callbacks.mdx @@ -11,3 +11,7 @@ ## WinRateCallback [[autodoc]] WinRateCallback + +## LogCompletionsCallback + +[[autodoc]] LogCompletionsCallback \ No newline at end of file From d40cd756579c482b45f78d6e9188a5ff4f533d68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Tue, 17 Sep 2024 15:36:42 +0000 Subject: [PATCH 14/22] optional generation config --- trl/trainer/callbacks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/trl/trainer/callbacks.py b/trl/trainer/callbacks.py index 86d6d435b4..baf340c269 100644 --- a/trl/trainer/callbacks.py +++ b/trl/trainer/callbacks.py @@ -47,7 +47,7 @@ def _generate_completions( model: PreTrainedModel, tokenizer: PreTrainedTokenizerBase, accelerator: Accelerator, - generation_config: GenerationConfig, + generation_config: Optional[GenerationConfig], batch_size: int = 1, ) -> List[str]: """ From 1ba0ca8ac9572a25ea96d9519f62330a42f7d99d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Tue, 17 Sep 2024 21:28:37 +0000 Subject: [PATCH 15/22] reorder stats (consistency with online dpo) --- trl/trainer/nash_md_trainer.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/trl/trainer/nash_md_trainer.py b/trl/trainer/nash_md_trainer.py index b0118c237f..87aeaef539 100644 --- 
a/trl/trainer/nash_md_trainer.py +++ b/trl/trainer/nash_md_trainer.py @@ -111,15 +111,17 @@ def __init__( # Overwrite the stats dictionary to include NashMD specific statistics self.stats = { - "logps/chosen": [], - "logps/rejected": [], + # Remove "non_score_reward", "rlhf_reward", "scores_margin" + # Add "mixture_coef" + "loss/kl": [], + "objective/entropy": [], + "loss/score": [], "rewards/chosen": [], "rewards/rejected": [], - "loss/score": [], - "loss/kl_div": [], - "objective/entropy": [], - "rewards/margins": [], "rewards/accuracies": [], + "rewards/margins": [], + "logps/chosen": [], + "logps/rejected": [], "val/model_contain_eos_token": [], "val/ref_contain_eos_token": [], "beta": [], @@ -273,7 +275,7 @@ def gather_mean(tensor): # Log score self.stats["loss/score"].append(gather_mean(score)) # Log KL divergence - self.stats["loss/kl_div"].append(gather_mean(kl_div)) + self.stats["loss/kl"].append(gather_mean(kl_div)) # Log logprobs model_logprobs_model_data_sum = model_logprobs_model_data.sum(1) From a2ea4d1070ba37e89a48f751127bb186d7db57bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Tue, 17 Sep 2024 21:37:26 +0000 Subject: [PATCH 16/22] update online dpo doc --- docs/source/online_dpo_trainer.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/online_dpo_trainer.md b/docs/source/online_dpo_trainer.md index aa1cf91537..8ba99a6ed0 100644 --- a/docs/source/online_dpo_trainer.md +++ b/docs/source/online_dpo_trainer.md @@ -135,13 +135,14 @@ The logged metrics are as follows. Here is an example [tracked run at Weights an * `objective/rlhf_reward`: The mean RLHF reward, which is `scores - non_score_reward`. The `rlhf_reward` is the ultimate objective of online DPO training. If training works as intended, this metric should keep going up. * `objective/scores`: The mean scores returned by the reward mode. * `objective/scores_margin`: The mean score margin (according to the external reward model) between the chosen and rejected completions. -* `rewards/accuracies`: The accuracies of the online DPO's implicit reward model. * `rewards/chosen`: The mean reward (according to online DPO's implicit reward model)of the chosen completions. * `rewards/rejected`: The mean reward (according to online DPO's implicit reward model) of the rejected completions. +* `rewards/accuracies`: The accuracies of the online DPO's implicit reward model. * `rewards/margins`: The mean reward margin (according to online DPO's implicit reward model) between the chosen and rejected completions. * `logps/chosen`: The mean log probabilities of the chosen completions. * `logps/rejected`: The mean log probabilities of the rejected completions. * `val/contain_eos_token`: The fraction of completions which contain an EOS token. 
+* `beta`: ## Benchmark experiments @@ -266,7 +267,6 @@ The online DPO checkpoint gets increasingly more win rate as we scale up the mod [[autodoc]] OnlineDPOTrainer - ## OnlineDPOConfig [[autodoc]] OnlineDPOConfig \ No newline at end of file From 32d85d3e859d584513036bf895bdfa04124127ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Tue, 17 Sep 2024 21:37:41 +0000 Subject: [PATCH 17/22] format online dpo config --- trl/trainer/online_dpo_config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/trl/trainer/online_dpo_config.py b/trl/trainer/online_dpo_config.py index ead18fcd13..10c9cb7708 100644 --- a/trl/trainer/online_dpo_config.py +++ b/trl/trainer/online_dpo_config.py @@ -44,7 +44,8 @@ class OnlineDPOConfig(TrainingArguments): beta (`float` or `list[float]`, *optional*, defaults to `0.1`): Parameter controlling the deviation from the reference model. Higher β means less deviation from the reference model. For the IPO loss (`loss_type="ipo"`), β is the regularization parameter denoted by τ in - the [paper](https://huggingface.co/papers/2310.12036). If a list of floats is provided then the β is selected for each new epoch and the last β is used for the rest of the epochs. + the [paper](https://huggingface.co/papers/2310.12036). If a list of floats is provided then the β is + selected for each new epoch and the last β is used for the rest of the epochs. loss_type (`str`, *optional*, defaults to `"sigmoid"`): Type of loss to use. Possible values are: From 8281af10a6a6c4bac0fc15a69d2b7494e2d65df3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Tue, 17 Sep 2024 21:37:55 +0000 Subject: [PATCH 18/22] format nash_md config --- trl/trainer/nash_md_config.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/trl/trainer/nash_md_config.py b/trl/trainer/nash_md_config.py index 7a5a7c2ff5..0f344ec271 100644 --- a/trl/trainer/nash_md_config.py +++ b/trl/trainer/nash_md_config.py @@ -27,8 +27,9 @@ class NashMDConfig(OnlineDPOConfig): Parameters: mixture_coef (`float` or `list[float]`, *optional*, defaults to `0.5`): - Logit mixture coefficient for the model and reference model. - If a list of floats is provided then the mixture coefficient is selected for each new epoch and the last coefficient is used for the rest of the epochs. + Logit mixture coefficient for the model and reference model. If a list of floats is provided then the + mixture coefficient is selected for each new epoch and the last coefficient is used for the rest of the + epochs. """ mixture_coef: Union[float, List[float]] = 0.5 From 78a02945263926145a899a88c921d8f45302eb42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Tue, 17 Sep 2024 21:42:51 +0000 Subject: [PATCH 19/22] update nash md --- docs/source/nash_md_trainer.md | 96 ++++++++++++++++++++++++++++++---- 1 file changed, 87 insertions(+), 9 deletions(-) diff --git a/docs/source/nash_md_trainer.md b/docs/source/nash_md_trainer.md index 5ff03d1e20..b959ccf767 100644 --- a/docs/source/nash_md_trainer.md +++ b/docs/source/nash_md_trainer.md @@ -1,18 +1,93 @@ # Nash MD Trainer -## Overview +## Overview + Nash-MD was proposed in the paper [Nash Learning from Human Feedback](https://huggingface.co/papers/2312.00886) by Rémi Munos, [Michal Valko](https://huggingface.co/misovalko), Daniele Calandriello, Mohammad Gheshlaghi Azar, Mark Rowland, Daniel Guo, Yunhao Tang, Matthieu Geist, Thomas Mésnard, and Andrea Michi. 
The abstract from the paper is the following: > Reinforcement learning from human feedback (RLHF) has emerged as the main paradigm for aligning large language models (LLMs) with human preferences. Typically, RLHF involves the initial step of learning a reward model from human feedback, often expressed as preferences between pairs of text generations produced by a pre-trained LLM. Subsequently, the LLM's policy is fine-tuned by optimizing it to maximize the reward model through a reinforcement learning algorithm. However, an inherent limitation of current reward models is their inability to fully represent the richness of human preferences and their dependency on the sampling distribution. In this study, we introduce an alternative pipeline for the fine-tuning of LLMs using pairwise human feedback. Our approach entails the initial learning of a preference model, which is conditioned on two inputs given a prompt, followed by the pursuit of a policy that consistently generates responses preferred over those generated by any competing policy, thus defining the Nash equilibrium of this preference model. We term this approach Nash learning from human feedback (NLHF). In the context of a tabular policy representation, we present a novel algorithmic solution, Nash-MD, founded on the principles of mirror descent. This algorithm produces a sequence of policies, with the last iteration converging to the regularized Nash equilibrium. Additionally, we explore parametric representations of policies and introduce gradient descent algorithms for deep-learning architectures. To demonstrate the effectiveness of our approach, we present experimental results involving the fine-tuning of a LLM for a text summarization task. We believe NLHF offers a compelling avenue for preference learning and policy optimization with the potential of advancing the field of aligning LLMs with human preferences. - This post-training method was contributed by [Kashif Rasul](https://huggingface.co/kashif) and [Daniil Tiapkin](https://huggingface.co/dtiapkin), [Pierre Ménard](https://huggingface.co/menardprr), Daniele Calandriello and [Quentin Gallouédec](https://huggingface.co/qgallouedec). -## Get started +## Quick start + +This example demonstrates how to train a model using the Nash MD method. We use the [Qwen 0.5B model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) as the base model and the [Qwen 0.5B reward model](https://huggingface.co/trl-lib/Qwen2-0.5B-Reward) as the reward model. We use the prompts from the [UltraFeedback dataset](https://huggingface.co/datasets/openbmb/UltraFeedback). 
You can view the prompts in the dataset here: + + + +Below is the script to train the model: + +```python +# train_nash_md.py +from datasets import load_dataset +from trl import NashMDConfig, NashMDTrainer +from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer + +model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct") +tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct") +reward_model = AutoModelForSequenceClassification.from_pretrained("trl-lib/Qwen2-0.5B-Reward", num_labels=1) +train_dataset = load_dataset("trl-lib/ultrafeedback-prompt", split="train") + +args = NashMDConfig(output_dir="nash-md-qwen2", logging_steps=10) +trainer = NashMDTrainer( + model=model, + reward_model=reward_model, + args=args, + tokenizer=tokenizer, + train_dataset=train_dataset, +) +trainer.train() +``` + +Execute the script using the following command: + +```bash +accelerate launch train_nash_md.py +``` + +## Expected dataset format + +Nash MD requires a [prompt-only dataset](dataset_format#preference). The [`NashMDTrainer`] supports both [conversational](dataset_format#conversational-dataset-format) and [standard](dataset_format#standard-dataset-format) dataset format. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset. + +## Usage tips + +### ⚠️ Use the same chat template -To just run the Nash MD script to make sure this trainer can run, you can run the following command to train a Nash MD model with a dummy reward model. +Make sure that the SFT model and reward model use the _same_ chat template. Otherwise, you may find the model completions are scored incorrectly during training. + +### Encourage EOS token generation + +We can want the model to generate completion within a given length. During the learning, the model will generate completion up to the maximum completion length specified in the `max_new_tokens` argument of [`NashMDConfig`]. I you want to penalize for not generating an EOS token before the maximum completion length, you can use the `missing_eos_penalty` argument of [`NashMDConfig`]: + +```python +args = NashMDConfig(..., max_new_tokens=128, missing_eos_penalty=1.0) +``` + +### Logging Completions + +To better understand your model’s behavior during training, you can log sample completions periodically using the [`LogCompletionsCallback`]. + +```python +trainer = NashMDTrainer(..., eval_dataset=eval_dataset) +completions_callback = LogCompletionsCallback(trainer, num_prompts=8) +trainer.add_callback(completions_callback) +``` + +This callback logs the model's generated completions directly to Weights & Biases. + +![Logged Completions](https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/wandb_completions.png) + +## Example script + +We provide an example script to train a model using the Nash MD method. 
The script is available in [`examples/scripts/nash_md.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/nash_md.py) + +To test the Nash MD script with the [Pythia 14M model](https://huggingface.co/EleutherAI/pythia-14m) on the TL;DR summarization task, run the following command: ```bash python examples/scripts/nash_md.py \ @@ -26,24 +101,27 @@ python examples/scripts/nash_md.py \ --num_train_epochs 3 \ --max_new_tokens 64 \ --warmup_ratio 0.1 \ - --missing_eos_penalty 1.0 + --missing_eos_penalty 1.0 \ + --push_to_hub ``` -## Explanation of the logged metrics +## Logged metrics The logged metrics are as follows: -* `loss/score`: The mean reinforce score loss. -* `loss/kl_div`: The mean kl divergence loss. +* `loss/kl`: The mean KL divergence between the model and reference data. * `objective/entropy`: The mean entropy of the model and reference data. -* `rewards/accuracies`: The accuracies of the Nash MD's implicit reward model. +* `loss/score`: The mean reinforce score loss. * `rewards/chosen`: The mean scores (according to the reward model) of the model completions. * `rewards/rejected`: The mean scores (according to the reward model) of the mixture completions. +* `rewards/accuracies`: The accuracies of the Nash MD's implicit reward model. * `rewards/margins`: The mean reward margin (according to reward model) between the chosen and mixture completions. * `logps/chosen`: The mean log probabilities of the chosen completions. * `logps/rejected`: The mean log probabilities of the reference completions. * `val/model_contain_eos_token`: The amount of times the model's output contains the eos token. * `val/ref_contain_eos_token`: The amount of times the mixture's output contains the eos token. +* `beta`: +* `mixture_coef`: ## NashMDTrainer From 4474c7c5d4c1c77d62171796374af7694323648c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Wed, 18 Sep 2024 07:44:08 +0000 Subject: [PATCH 20/22] Nash MD -> Nash-MD --- docs/source/_toctree.yml | 2 +- docs/source/nash_md_trainer.md | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 2f90f9c473..c6d7ef6826 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -37,7 +37,7 @@ - local: kto_trainer title: KTO - local: nash_md_trainer - title: Nash MD + title: Nash-MD - local: orpo_trainer title: ORPO - local: ppo_trainer diff --git a/docs/source/nash_md_trainer.md b/docs/source/nash_md_trainer.md index b959ccf767..761cdecbd9 100644 --- a/docs/source/nash_md_trainer.md +++ b/docs/source/nash_md_trainer.md @@ -1,4 +1,4 @@ -# Nash MD Trainer +# Nash-MD Trainer ## Overview @@ -12,7 +12,7 @@ This post-training method was contributed by [Kashif Rasul](https://huggingface. ## Quick start -This example demonstrates how to train a model using the Nash MD method. We use the [Qwen 0.5B model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) as the base model and the [Qwen 0.5B reward model](https://huggingface.co/trl-lib/Qwen2-0.5B-Reward) as the reward model. We use the prompts from the [UltraFeedback dataset](https://huggingface.co/datasets/openbmb/UltraFeedback). You can view the prompts in the dataset here: +This example demonstrates how to train a model using the Nash-MD method. We use the [Qwen 0.5B model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) as the base model and the [Qwen 0.5B reward model](https://huggingface.co/trl-lib/Qwen2-0.5B-Reward) as the reward model. 
We use the prompts from the [UltraFeedback dataset](https://huggingface.co/datasets/openbmb/UltraFeedback). You can view the prompts in the dataset here: + +Below is the script to train the model: + +```python +# train_xpo.py +from datasets import load_dataset +from trl import XPOConfig, XPOTrainer +from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer + +model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct") +tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct") +reward_model = AutoModelForSequenceClassification.from_pretrained("trl-lib/Qwen2-0.5B-Reward", num_labels=1) +train_dataset = load_dataset("trl-lib/ultrafeedback-prompt", split="train") + +args = XPOConfig(output_dir="nash-md-qwen2", logging_steps=10) +trainer = XPOTrainer( + model=model, + reward_model=reward_model, + args=args, + tokenizer=tokenizer, + train_dataset=train_dataset, +) +trainer.train() +``` + +Execute the script using the following command: + +```bash +accelerate launch train_xpo.py +``` + +## Expected dataset format + +XPO requires a [prompt-only dataset](dataset_format#preference). The [`XPOTrainer`] supports both [conversational](dataset_format#conversational-dataset-format) and [standard](dataset_format#standard-dataset-format) dataset format. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset. + +## Usage tips + +### ⚠️ Use the same chat template + +Make sure that the SFT model and reward model use the _same_ chat template. Otherwise, you may find the model completions are scored incorrectly during training. -To just run the XPO script to make sure this trainer can run, you can run the following command to train an XPO model with a dummy reward model. +### Encourage EOS token generation + +We can want the model to generate completion within a given length. During the learning, the model will generate completion up to the maximum completion length specified in the `max_new_tokens` argument of [`XPOConfig`]. I you want to penalize for not generating an EOS token before the maximum completion length, you can use the `missing_eos_penalty` argument of [`XPOConfig`]: + +```python +args = XPOConfig(..., max_new_tokens=128, missing_eos_penalty=1.0) +``` + +### Logging Completions + +To better understand your model’s behavior during training, you can log sample completions periodically using the [`LogCompletionsCallback`]. + +```python +trainer = XPOTrainer(..., eval_dataset=eval_dataset) +completions_callback = LogCompletionsCallback(trainer, num_prompts=8) +trainer.add_callback(completions_callback) +``` + +This callback logs the model's generated completions directly to Weights & Biases. + +![Logged Completions](https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/wandb_completions.png) + +## Example script + +We provide an example script to train a model using the XPO method. 
The script is available in [`examples/scripts/nash_md.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/nash_md.py) + +To test the XPO script with the [Pythia 14M model](https://huggingface.co/EleutherAI/pythia-14m) on the TL;DR summarization task, run the following command: ```bash python examples/scripts/xpo.py \ @@ -26,28 +101,31 @@ python examples/scripts/xpo.py \ --num_train_epochs 3 \ --max_new_tokens 64 \ --warmup_ratio 0.1 \ - --missing_eos_penalty 1.0 + --missing_eos_penalty 1.0 \ + --push_to_hub ``` -## Explanation of the logged metrics +## Logged metrics The logged metrics are as follows: * `loss/xpo`: The mean xpo part of the full loss. * `loss/dpo`: The mean dpo part of the full loss. +* `objective/kl`: The mean KL divergence between the model and reference data. +* `objective/entropy`: The mean entropy of the model and reference data. * `objective/model_scores`: The mean scores (according to the reward model) of the model completions. * `objective/ref_scores`: The mean scores (according to the reward model) of the reference completions. * `objective/scores_margin`: The mean score margin (according to the external reward model) between the chosen and rejected completions. -* `objective/kl`: The mean KL divergence between the model and reference data. -* `objective/entropy`: The mean entropy of the model and reference data. -* `rewards/accuracies`: The accuracies of the XPO's implicit reward model. * `rewards/chosen`: The mean reward (according to XPO's DPO implicit reward model) of the chosen completions. * `rewards/rejected`: The mean reward (according to XPO's DPO implicit reward model) of the rejected completions. +* `rewards/accuracies`: The accuracies of the XPO's implicit reward model. * `rewards/margins`: The mean reward margin (according to online DPO's implicit reward model) between the chosen and rejected completions. * `logps/chosen`: The mean log probabilities of the chosen completions. * `logps/rejected`: The mean log probabilities of the rejected completions. * `val/model_contain_eos_token`: The amount of times the model's output contains the eos token. * `val/ref_contain_eos_token`: The amount of times the reference's output contains the eos token. +* `beta`: +* `mixture_coef`: ## XPOTrainer @@ -55,4 +133,4 @@ The logged metrics are as follows: ## XPOConfig -[[autodoc]] XPOConfig \ No newline at end of file +[[autodoc]] XPOConfig From 131798b9c92f2eb9cca524296355e6f3df40bcb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Wed, 18 Sep 2024 08:11:00 +0000 Subject: [PATCH 22/22] doc --- docs/source/nash_md_trainer.md | 4 ++-- docs/source/online_dpo_trainer.md | 2 +- docs/source/xpo_trainer.mdx | 5 +++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/source/nash_md_trainer.md b/docs/source/nash_md_trainer.md index 761cdecbd9..e0d931d187 100644 --- a/docs/source/nash_md_trainer.md +++ b/docs/source/nash_md_trainer.md @@ -120,8 +120,8 @@ The logged metrics are as follows: * `logps/rejected`: The mean log probabilities of the reference completions. * `val/model_contain_eos_token`: The amount of times the model's output contains the eos token. * `val/ref_contain_eos_token`: The amount of times the mixture's output contains the eos token. -* `beta`: -* `mixture_coef`: +* `beta`: The parameter that controls the weight of the loss term representing the deviation from the reference model. Typically fixed, but can be made dynamic by passing a list to [`NashMDConfig`]. 
+* `mixture_coef`: Logit mixture coefficient for the model and reference model. Typically fixed, but can be made dynamic by passing a list to [`NashMDConfig`]. ## NashMDTrainer diff --git a/docs/source/online_dpo_trainer.md b/docs/source/online_dpo_trainer.md index 8ba99a6ed0..3dfca04053 100644 --- a/docs/source/online_dpo_trainer.md +++ b/docs/source/online_dpo_trainer.md @@ -142,7 +142,7 @@ The logged metrics are as follows. Here is an example [tracked run at Weights an * `logps/chosen`: The mean log probabilities of the chosen completions. * `logps/rejected`: The mean log probabilities of the rejected completions. * `val/contain_eos_token`: The fraction of completions which contain an EOS token. -* `beta`: +* `beta`: The parameter that controls the weight of the loss term representing the deviation from the reference model. Typically fixed, but can be made dynamic by passing a list to [`OnlineDPOConfig`]. ## Benchmark experiments diff --git a/docs/source/xpo_trainer.mdx b/docs/source/xpo_trainer.mdx index fbf29a8a94..ef23da1db8 100644 --- a/docs/source/xpo_trainer.mdx +++ b/docs/source/xpo_trainer.mdx @@ -124,8 +124,9 @@ The logged metrics are as follows: * `logps/rejected`: The mean log probabilities of the rejected completions. * `val/model_contain_eos_token`: The amount of times the model's output contains the eos token. * `val/ref_contain_eos_token`: The amount of times the reference's output contains the eos token. -* `beta`: -* `mixture_coef`: +* `alpha`: The weight of the XPO loss term. Typically fixed, but can be made dynamic by passing a list to [`XPOConfig`]. +* `beta`: The parameter that controls the weight of the loss term representing the deviation from the reference model. Typically fixed, but can be made dynamic by passing a list to [`XPOConfig`]. + ## XPOTrainer