diff --git a/examples/datasets/zen.py b/scripts/generate_zen_dataset.py
similarity index 97%
rename from examples/datasets/zen.py
rename to scripts/generate_zen_dataset.py
index 7aa9b64ea7..e599e6c1ce 100644
--- a/examples/datasets/zen.py
+++ b/scripts/generate_zen_dataset.py
@@ -28,13 +28,13 @@ class ScriptArguments:
             Fraction of the dataset to include in the test split.
         push_to_hub (`bool`, *optional*, defaults to `False`):
             Whether to push the dataset to the Hugging Face Hub.
-        repo_id (`str`, *optional*, defaults to `"trl-lib/zen"`):
+        repo_id (`str`, *optional*, defaults to `"trl-internal-testing/zen"`):
             Hugging Face repository ID to push the dataset to.
     """

     test_size: float = 0.1
     push_to_hub: bool = False
-    repo_id: str = "trl-lib/zen"
+    repo_id: str = "trl-internal-testing/zen"


 def main(test_size, push_to_hub, repo_id):
@@ -62,7 +62,7 @@ def main(test_size, push_to_hub, repo_id):
             "Namespaces are one honking great idea -- let's do more of those!",
         ],
     })
-    standard_language_modeling_dataset = standard_language_modeling_dataset.train_test_split(test_size=test_size)
+    standard_language_modeling_dataset = standard_language_modeling_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         standard_language_modeling_dataset.push_to_hub(repo_id, config_name="standard_language_modeling")

@@ -89,7 +89,7 @@ def main(test_size, push_to_hub, repo_id):
             "Namespaces are one honking great",
         ],
     })
-    standard_prompt_only_dataset = standard_prompt_only_dataset.train_test_split(test_size=test_size)
+    standard_prompt_only_dataset = standard_prompt_only_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         standard_prompt_only_dataset.push_to_hub(repo_id, config_name="standard_prompt_only")

@@ -137,7 +137,7 @@ def main(test_size, push_to_hub, repo_id):
             " idea -- let's do more of those!",
         ],
     })
-    standard_prompt_completion_dataset = standard_prompt_completion_dataset.train_test_split(test_size=test_size)
+    standard_prompt_completion_dataset = standard_prompt_completion_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         standard_prompt_completion_dataset.push_to_hub(repo_id, config_name="standard_prompt_completion")

@@ -206,7 +206,7 @@ def main(test_size, push_to_hub, repo_id):
             " watermelon -- let's plant some!",
         ],
     })
-    standard_preference_dataset = standard_preference_dataset.train_test_split(test_size=test_size)
+    standard_preference_dataset = standard_preference_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         standard_preference_dataset.push_to_hub(repo_id, config_name="standard_preference")

@@ -254,7 +254,7 @@ def main(test_size, push_to_hub, repo_id):
             "Namespaces are one honking great watermelon -- let's plant some!",
         ],
     })
-    standard_implicit_prompt_preference_dataset = standard_implicit_prompt_preference_dataset.train_test_split(test_size=test_size)
+    standard_implicit_prompt_preference_dataset = standard_implicit_prompt_preference_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         standard_implicit_prompt_preference_dataset.push_to_hub(repo_id, config_name="standard_implicit_prompt_preference")

@@ -303,11 +303,11 @@ def main(test_size, push_to_hub, repo_id):
         ],
         "label": [True, False, False, True, True, False, True, False, True, True, False, True, True, False, True, False, True, False, False],
     })
-    standard_unpaired_preference_dataset = standard_unpaired_preference_dataset.train_test_split(test_size=test_size)
+    standard_unpaired_preference_dataset = standard_unpaired_preference_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         standard_unpaired_preference_dataset.push_to_hub(repo_id, config_name="standard_unpaired_preference")

-    standard_step_dataset = Dataset.from_dict({
+    standard_stepwise_supervision_dataset = Dataset.from_dict({
         "prompt": [
             "Beautiful is better than",
             "Explicit is better than",
@@ -350,7 +350,7 @@ def main(test_size, push_to_hub, repo_id):
             [" of those great ideas,", " that solve many problems."],
             [" the code should still aim for balance."],
         ],
-        "label": [
+        "labels": [
             [False, True],
             [False, True, False],
             [False, True],
@@ -371,9 +371,9 @@ def main(test_size, push_to_hub, repo_id):
             [False]
         ]
     })
-    standard_step_dataset = standard_step_dataset.train_test_split(test_size=test_size)
+    standard_stepwise_supervision_dataset = standard_stepwise_supervision_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
-        standard_step_dataset.push_to_hub(repo_id, config_name="standard_step")
+        standard_stepwise_supervision_dataset.push_to_hub(repo_id, config_name="standard_stepwise_supervision")

     conversational_language_modeling_dataset = Dataset.from_dict({
         "messages": [
@@ -398,7 +398,7 @@ def main(test_size, push_to_hub, repo_id):
             [{"role": "user", "content": "Any great ideas?"}, {"role": "assistant", "content": "Namespaces are one honking great idea."}],
         ],
     })
-    conversational_language_modeling_dataset = conversational_language_modeling_dataset.train_test_split(test_size=test_size)
+    conversational_language_modeling_dataset = conversational_language_modeling_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         conversational_language_modeling_dataset.push_to_hub(repo_id, config_name="conversational_language_modeling")

@@ -425,7 +425,7 @@ def main(test_size, push_to_hub, repo_id):
             [{"role": "user", "content": "Any great ideas?"}],
         ],
     })
-    conversational_prompt_only_dataset = conversational_prompt_only_dataset.train_test_split(test_size=test_size)
+    conversational_prompt_only_dataset = conversational_prompt_only_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         conversational_prompt_only_dataset.push_to_hub(repo_id, config_name="conversational_prompt_only")

@@ -473,7 +473,7 @@ def main(test_size, push_to_hub, repo_id):
             [{"role": "assistant", "content": "Namespaces are one honking great idea."}],
         ],
     })
-    conversational_prompt_completion_dataset = conversational_prompt_completion_dataset.train_test_split(test_size=test_size)
+    conversational_prompt_completion_dataset = conversational_prompt_completion_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         conversational_prompt_completion_dataset.push_to_hub(repo_id, config_name="conversational_prompt_completion")

@@ -542,7 +542,7 @@ def main(test_size, push_to_hub, repo_id):
             [{"role": "assistant", "content": "Recursion."}],
         ],
     })
-    conversational_preference_dataset = conversational_preference_dataset.train_test_split(test_size=test_size)
+    conversational_preference_dataset = conversational_preference_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         conversational_preference_dataset.push_to_hub(repo_id, config_name="conversational_preference")

@@ -590,7 +590,7 @@ def main(test_size, push_to_hub, repo_id):
             [{"role": "user", "content": "Any great ideas?"}, {"role": "assistant", "content": "Recursion."}],
         ],
     })
-    conversational_implicit_prompt_preference_dataset = conversational_implicit_prompt_preference_dataset.train_test_split(test_size=test_size)
+    conversational_implicit_prompt_preference_dataset = conversational_implicit_prompt_preference_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         conversational_implicit_prompt_preference_dataset.push_to_hub(repo_id, config_name="conversational_implicit_prompt_preference")

@@ -639,7 +639,7 @@ def main(test_size, push_to_hub, repo_id):
         ],
         "label": [True, True, True, False, True, True, True, False, True, False, True, False, True, False, False, True, True, True, True],
     })
-    conversational_unpaired_preference_dataset = conversational_unpaired_preference_dataset.train_test_split(test_size=test_size)
+    conversational_unpaired_preference_dataset = conversational_unpaired_preference_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         conversational_unpaired_preference_dataset.push_to_hub(repo_id, config_name="conversational_unpaired_preference")
     # fmt: on
diff --git a/tests/test_bco_trainer.py b/tests/test_bco_trainer.py
index f53a62048c..09f274ef65 100644
--- a/tests/test_bco_trainer.py
+++ b/tests/test_bco_trainer.py
@@ -160,10 +160,10 @@ def test_tokenize_and_process_tokens(self):
         self.assertListEqual(tokenized_dataset["prompt"], train_dataset["prompt"])
         self.assertListEqual(tokenized_dataset["completion"], train_dataset["completion"])
         self.assertListEqual(tokenized_dataset["label"], train_dataset["label"])
-        self.assertListEqual(tokenized_dataset["prompt_input_ids"][0], [31137])
-        self.assertListEqual(tokenized_dataset["prompt_attention_mask"][0], [1])
-        self.assertListEqual(tokenized_dataset["answer_input_ids"][0], [374, 2664, 1091, 16965, 13])
-        self.assertListEqual(tokenized_dataset["answer_attention_mask"][0], [1, 1, 1, 1, 1])
+        self.assertListEqual(tokenized_dataset["prompt_input_ids"][0], [46518, 374, 2664, 1091])
+        self.assertListEqual(tokenized_dataset["prompt_attention_mask"][0], [1, 1, 1, 1])
+        self.assertListEqual(tokenized_dataset["answer_input_ids"][0], [27261, 13])
+        self.assertListEqual(tokenized_dataset["answer_attention_mask"][0], [1, 1])

         fn_kwargs = {
             "prefix": "",
@@ -178,13 +178,15 @@ def test_tokenize_and_process_tokens(self):
         self.assertListEqual(processed_dataset["prompt"], train_dataset["prompt"])
         self.assertListEqual(processed_dataset["completion"], train_dataset["completion"])
         self.assertListEqual(processed_dataset["label"], train_dataset["label"])
-        self.assertListEqual(processed_dataset["prompt_input_ids"][0], [31137])
-        self.assertListEqual(processed_dataset["prompt_attention_mask"][0], [1])
+        self.assertListEqual(processed_dataset["prompt_input_ids"][0], [46518, 374, 2664, 1091])
+        self.assertListEqual(processed_dataset["prompt_attention_mask"][0], [1, 1, 1, 1])
         self.assertListEqual(
-            processed_dataset["completion_input_ids"][0], [31137, 374, 2664, 1091, 16965, 13, 151645]
+            processed_dataset["completion_input_ids"][0], [46518, 374, 2664, 1091, 27261, 13, 151645]
         )
         self.assertListEqual(processed_dataset["completion_attention_mask"][0], [1, 1, 1, 1, 1, 1, 1])
-        self.assertListEqual(processed_dataset["completion_labels"][0], [-100, 374, 2664, 1091, 16965, 13, 151645])
+        self.assertListEqual(
+            processed_dataset["completion_labels"][0], [-100, -100, -100, -100, 27261, 13, 151645]
+        )

     @require_sklearn
     def test_bco_trainer_without_providing_ref_model(self):
diff --git a/tests/test_kto_trainer.py b/tests/test_kto_trainer.py
index 66e9117e3a..b705d0bc5a 100644
--- a/tests/test_kto_trainer.py
+++ b/tests/test_kto_trainer.py
@@ -156,10 +156,10 @@ def test_tokenize_and_process_tokens(self):
         self.assertListEqual(tokenized_dataset["prompt"], train_dataset["prompt"])
         self.assertListEqual(tokenized_dataset["completion"], train_dataset["completion"])
         self.assertListEqual(tokenized_dataset["label"], train_dataset["label"])
-        self.assertListEqual(tokenized_dataset["prompt_input_ids"][0], [31137])
-        self.assertListEqual(tokenized_dataset["prompt_attention_mask"][0], [1])
-        self.assertListEqual(tokenized_dataset["answer_input_ids"][0], [374, 2664, 1091, 16965, 13])
-        self.assertListEqual(tokenized_dataset["answer_attention_mask"][0], [1, 1, 1, 1, 1])
+        self.assertListEqual(tokenized_dataset["prompt_input_ids"][0], [46518, 374, 2664, 1091])
+        self.assertListEqual(tokenized_dataset["prompt_attention_mask"][0], [1, 1, 1, 1])
+        self.assertListEqual(tokenized_dataset["answer_input_ids"][0], [27261, 13])
+        self.assertListEqual(tokenized_dataset["answer_attention_mask"][0], [1, 1])

         # Test corruption of (prompt, completion) pairs for KL dataset
         for batch_size in [2, 3]:
@@ -196,13 +196,15 @@ def test_tokenize_and_process_tokens(self):
         self.assertListEqual(processed_dataset["prompt"], train_dataset["prompt"])
         self.assertListEqual(processed_dataset["completion"], train_dataset["completion"])
         self.assertListEqual(processed_dataset["label"], train_dataset["label"])
-        self.assertListEqual(processed_dataset["prompt_input_ids"][0], [31137])
-        self.assertListEqual(processed_dataset["prompt_attention_mask"][0], [1])
+        self.assertListEqual(processed_dataset["prompt_input_ids"][0], [46518, 374, 2664, 1091])
+        self.assertListEqual(processed_dataset["prompt_attention_mask"][0], [1, 1, 1, 1])
         self.assertListEqual(
-            processed_dataset["completion_input_ids"][0], [31137, 374, 2664, 1091, 16965, 13, 151645]
+            processed_dataset["completion_input_ids"][0], [46518, 374, 2664, 1091, 27261, 13, 151645]
         )
         self.assertListEqual(processed_dataset["completion_attention_mask"][0], [1, 1, 1, 1, 1, 1, 1])
-        self.assertListEqual(processed_dataset["completion_labels"][0], [-100, 374, 2664, 1091, 16965, 13, 151645])
+        self.assertListEqual(
+            processed_dataset["completion_labels"][0], [-100, -100, -100, -100, 27261, 13, 151645]
+        )

     def test_kto_trainer_without_providing_ref_model(self):
         with tempfile.TemporaryDirectory() as tmp_dir:
diff --git a/tests/test_sft_trainer.py b/tests/test_sft_trainer.py
index f4e46803f1..d8a3706050 100644
--- a/tests/test_sft_trainer.py
+++ b/tests/test_sft_trainer.py
@@ -1172,7 +1172,7 @@ def test_sft_trainer_eval_packing(self):
         )

         self.assertEqual(len(trainer.train_dataset["input_ids"]), 46)  # w/ this dataset, we end up with 46 seqs
-        self.assertEqual(len(trainer.eval_dataset["input_ids"]), 5)  # w/ this dataset, we end up with 5 seqs
+        self.assertEqual(len(trainer.eval_dataset["input_ids"]), 6)  # w/ this dataset, we end up with 6 seqs

     def test_sft_trainer_no_packing(self):
         with tempfile.TemporaryDirectory() as tmp_dir:
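Note: a minimal sketch of the `datasets` behavior the repeated `shuffle=False` change relies on; the toy data and split size below are illustrative, not taken from the patch. With `shuffle=False`, `Dataset.train_test_split` slices rows in order -- the first rows form the train split and the last rows form the test split -- so regenerating and pushing the dataset yields the same splits every run:

from datasets import Dataset

# Ten toy rows; with test_size=0.2 and shuffle=False the last two rows
# always land in the test split, run after run.
dataset = Dataset.from_dict({"text": [f"example {i}" for i in range(10)]})
splits = dataset.train_test_split(test_size=0.2, shuffle=False)

print(splits["train"]["text"][:2])  # ['example 0', 'example 1']
print(splits["test"]["text"])       # ['example 8', 'example 9']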