diff --git a/examples/datasets/zen.py b/scripts/generate_zen_dataset.py
similarity index 97%
rename from examples/datasets/zen.py
rename to scripts/generate_zen_dataset.py
index 7aa9b64ea7..e599e6c1ce 100644
--- a/examples/datasets/zen.py
+++ b/scripts/generate_zen_dataset.py
@@ -28,13 +28,13 @@ class ScriptArguments:
             Fraction of the dataset to include in the test split.
         push_to_hub (`bool`, *optional*, defaults to `False`):
             Whether to push the dataset to the Hugging Face Hub.
-        repo_id (`str`, *optional*, defaults to `"trl-lib/zen"`):
+        repo_id (`str`, *optional*, defaults to `"trl-internal-testing/zen"`):
             Hugging Face repository ID to push the dataset to.
     """

     test_size: float = 0.1
     push_to_hub: bool = False
-    repo_id: str = "trl-lib/zen"
+    repo_id: str = "trl-internal-testing/zen"


 def main(test_size, push_to_hub, repo_id):
@@ -62,7 +62,7 @@ def main(test_size, push_to_hub, repo_id):
             "Namespaces are one honking great idea -- let's do more of those!",
         ],
     })
-    standard_language_modeling_dataset = standard_language_modeling_dataset.train_test_split(test_size=test_size)
+    standard_language_modeling_dataset = standard_language_modeling_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         standard_language_modeling_dataset.push_to_hub(repo_id, config_name="standard_language_modeling")

@@ -89,7 +89,7 @@ def main(test_size, push_to_hub, repo_id):
             "Namespaces are one honking great",
         ],
     })
-    standard_prompt_only_dataset = standard_prompt_only_dataset.train_test_split(test_size=test_size)
+    standard_prompt_only_dataset = standard_prompt_only_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         standard_prompt_only_dataset.push_to_hub(repo_id, config_name="standard_prompt_only")

@@ -137,7 +137,7 @@ def main(test_size, push_to_hub, repo_id):
             " idea -- let's do more of those!",
         ],
     })
-    standard_prompt_completion_dataset = standard_prompt_completion_dataset.train_test_split(test_size=test_size)
+    standard_prompt_completion_dataset = standard_prompt_completion_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         standard_prompt_completion_dataset.push_to_hub(repo_id, config_name="standard_prompt_completion")

@@ -206,7 +206,7 @@ def main(test_size, push_to_hub, repo_id):
             " watermelon -- let's plant some!",
         ],
     })
-    standard_preference_dataset = standard_preference_dataset.train_test_split(test_size=test_size)
+    standard_preference_dataset = standard_preference_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         standard_preference_dataset.push_to_hub(repo_id, config_name="standard_preference")

@@ -254,7 +254,7 @@ def main(test_size, push_to_hub, repo_id):
             "Namespaces are one honking great watermelon -- let's plant some!",
         ],
     })
-    standard_implicit_prompt_preference_dataset = standard_implicit_prompt_preference_dataset.train_test_split(test_size=test_size)
+    standard_implicit_prompt_preference_dataset = standard_implicit_prompt_preference_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         standard_implicit_prompt_preference_dataset.push_to_hub(repo_id, config_name="standard_implicit_prompt_preference")

@@ -303,11 +303,11 @@ def main(test_size, push_to_hub, repo_id):
         ],
         "label": [True, False, False, True, True, False, True, False, True, True, False, True, True, False, True, False, True, False, False],
     })
-    standard_unpaired_preference_dataset = standard_unpaired_preference_dataset.train_test_split(test_size=test_size)
+    standard_unpaired_preference_dataset = standard_unpaired_preference_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         standard_unpaired_preference_dataset.push_to_hub(repo_id, config_name="standard_unpaired_preference")

-    standard_step_dataset = Dataset.from_dict({
+    standard_stepwise_supervision_dataset = Dataset.from_dict({
         "prompt": [
             "Beautiful is better than",
             "Explicit is better than",
@@ -350,7 +350,7 @@ def main(test_size, push_to_hub, repo_id):
             [" of those great ideas,", " that solve many problems."],
             [" the code should still aim for balance."],
         ],
-        "label": [
+        "labels": [
             [False, True],
             [False, True, False],
             [False, True],
@@ -371,9 +371,9 @@ def main(test_size, push_to_hub, repo_id):
             [False]
         ]
     })
-    standard_step_dataset = standard_step_dataset.train_test_split(test_size=test_size)
+    standard_stepwise_supervision_dataset = standard_stepwise_supervision_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
-        standard_step_dataset.push_to_hub(repo_id, config_name="standard_step")
+        standard_stepwise_supervision_dataset.push_to_hub(repo_id, config_name="standard_stepwise_supervision")

     conversational_language_modeling_dataset = Dataset.from_dict({
         "messages": [
@@ -398,7 +398,7 @@ def main(test_size, push_to_hub, repo_id):
             [{"role": "user", "content": "Any great ideas?"}, {"role": "assistant", "content": "Namespaces are one honking great idea."}],
         ],
     })
-    conversational_language_modeling_dataset = conversational_language_modeling_dataset.train_test_split(test_size=test_size)
+    conversational_language_modeling_dataset = conversational_language_modeling_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         conversational_language_modeling_dataset.push_to_hub(repo_id, config_name="conversational_language_modeling")

@@ -425,7 +425,7 @@ def main(test_size, push_to_hub, repo_id):
             [{"role": "user", "content": "Any great ideas?"}],
         ],
     })
-    conversational_prompt_only_dataset = conversational_prompt_only_dataset.train_test_split(test_size=test_size)
+    conversational_prompt_only_dataset = conversational_prompt_only_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         conversational_prompt_only_dataset.push_to_hub(repo_id, config_name="conversational_prompt_only")

@@ -473,7 +473,7 @@ def main(test_size, push_to_hub, repo_id):
             [{"role": "assistant", "content": "Namespaces are one honking great idea."}],
         ],
     })
-    conversational_prompt_completion_dataset = conversational_prompt_completion_dataset.train_test_split(test_size=test_size)
+    conversational_prompt_completion_dataset = conversational_prompt_completion_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         conversational_prompt_completion_dataset.push_to_hub(repo_id, config_name="conversational_prompt_completion")

@@ -542,7 +542,7 @@ def main(test_size, push_to_hub, repo_id):
             [{"role": "assistant", "content": "Recursion."}],
         ],
     })
-    conversational_preference_dataset = conversational_preference_dataset.train_test_split(test_size=test_size)
+    conversational_preference_dataset = conversational_preference_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         conversational_preference_dataset.push_to_hub(repo_id, config_name="conversational_preference")

@@ -590,7 +590,7 @@ def main(test_size, push_to_hub, repo_id):
             [{"role": "user", "content": "Any great ideas?"}, {"role": "assistant", "content": "Recursion."}],
         ],
     })
-    conversational_implicit_prompt_preference_dataset = conversational_implicit_prompt_preference_dataset.train_test_split(test_size=test_size)
+    conversational_implicit_prompt_preference_dataset = conversational_implicit_prompt_preference_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         conversational_implicit_prompt_preference_dataset.push_to_hub(repo_id, config_name="conversational_implicit_prompt_preference")

@@ -639,7 +639,7 @@ def main(test_size, push_to_hub, repo_id):
         ],
         "label": [True, True, True, False, True, True, True, False, True, False, True, False, True, False, False, True, True, True, True],
     })
-    conversational_unpaired_preference_dataset = conversational_unpaired_preference_dataset.train_test_split(test_size=test_size)
+    conversational_unpaired_preference_dataset = conversational_unpaired_preference_dataset.train_test_split(test_size=test_size, shuffle=False)
     if push_to_hub:
         conversational_unpaired_preference_dataset.push_to_hub(repo_id, config_name="conversational_unpaired_preference")
     # fmt: on
diff --git a/tests/test_bco_trainer.py b/tests/test_bco_trainer.py
index f53a62048c..09f274ef65 100644
--- a/tests/test_bco_trainer.py
+++ b/tests/test_bco_trainer.py
@@ -160,10 +160,10 @@ def test_tokenize_and_process_tokens(self):
         self.assertListEqual(tokenized_dataset["prompt"], train_dataset["prompt"])
         self.assertListEqual(tokenized_dataset["completion"], train_dataset["completion"])
         self.assertListEqual(tokenized_dataset["label"], train_dataset["label"])
-        self.assertListEqual(tokenized_dataset["prompt_input_ids"][0], [31137])
-        self.assertListEqual(tokenized_dataset["prompt_attention_mask"][0], [1])
-        self.assertListEqual(tokenized_dataset["answer_input_ids"][0], [374, 2664, 1091, 16965, 13])
-        self.assertListEqual(tokenized_dataset["answer_attention_mask"][0], [1, 1, 1, 1, 1])
+        self.assertListEqual(tokenized_dataset["prompt_input_ids"][0], [46518, 374, 2664, 1091])
+        self.assertListEqual(tokenized_dataset["prompt_attention_mask"][0], [1, 1, 1, 1])
+        self.assertListEqual(tokenized_dataset["answer_input_ids"][0], [27261, 13])
+        self.assertListEqual(tokenized_dataset["answer_attention_mask"][0], [1, 1])

         fn_kwargs = {
             "prefix": "",
@@ -178,13 +178,15 @@ def test_tokenize_and_process_tokens(self):
         self.assertListEqual(processed_dataset["prompt"], train_dataset["prompt"])
         self.assertListEqual(processed_dataset["completion"], train_dataset["completion"])
         self.assertListEqual(processed_dataset["label"], train_dataset["label"])
-        self.assertListEqual(processed_dataset["prompt_input_ids"][0], [31137])
-        self.assertListEqual(processed_dataset["prompt_attention_mask"][0], [1])
+        self.assertListEqual(processed_dataset["prompt_input_ids"][0], [46518, 374, 2664, 1091])
+        self.assertListEqual(processed_dataset["prompt_attention_mask"][0], [1, 1, 1, 1])
         self.assertListEqual(
-            processed_dataset["completion_input_ids"][0], [31137, 374, 2664, 1091, 16965, 13, 151645]
+            processed_dataset["completion_input_ids"][0], [46518, 374, 2664, 1091, 27261, 13, 151645]
         )
         self.assertListEqual(processed_dataset["completion_attention_mask"][0], [1, 1, 1, 1, 1, 1, 1])
-        self.assertListEqual(processed_dataset["completion_labels"][0], [-100, 374, 2664, 1091, 16965, 13, 151645])
+        self.assertListEqual(
+            processed_dataset["completion_labels"][0], [-100, -100, -100, -100, 27261, 13, 151645]
+        )

     @require_sklearn
     def test_bco_trainer_without_providing_ref_model(self):
diff --git a/tests/test_kto_trainer.py b/tests/test_kto_trainer.py
index 66e9117e3a..b705d0bc5a 100644
--- a/tests/test_kto_trainer.py
+++ b/tests/test_kto_trainer.py
@@ -156,10 +156,10 @@ def test_tokenize_and_process_tokens(self):
         self.assertListEqual(tokenized_dataset["prompt"], train_dataset["prompt"])
         self.assertListEqual(tokenized_dataset["completion"], train_dataset["completion"])
         self.assertListEqual(tokenized_dataset["label"], train_dataset["label"])
-        self.assertListEqual(tokenized_dataset["prompt_input_ids"][0], [31137])
-        self.assertListEqual(tokenized_dataset["prompt_attention_mask"][0], [1])
-        self.assertListEqual(tokenized_dataset["answer_input_ids"][0], [374, 2664, 1091, 16965, 13])
-        self.assertListEqual(tokenized_dataset["answer_attention_mask"][0], [1, 1, 1, 1, 1])
+        self.assertListEqual(tokenized_dataset["prompt_input_ids"][0], [46518, 374, 2664, 1091])
+        self.assertListEqual(tokenized_dataset["prompt_attention_mask"][0], [1, 1, 1, 1])
+        self.assertListEqual(tokenized_dataset["answer_input_ids"][0], [27261, 13])
+        self.assertListEqual(tokenized_dataset["answer_attention_mask"][0], [1, 1])

         # Test corruption of (prompt, completion) pairs for KL dataset
         for batch_size in [2, 3]:
@@ -196,13 +196,15 @@ def test_tokenize_and_process_tokens(self):
         self.assertListEqual(processed_dataset["prompt"], train_dataset["prompt"])
         self.assertListEqual(processed_dataset["completion"], train_dataset["completion"])
         self.assertListEqual(processed_dataset["label"], train_dataset["label"])
-        self.assertListEqual(processed_dataset["prompt_input_ids"][0], [31137])
-        self.assertListEqual(processed_dataset["prompt_attention_mask"][0], [1])
+        self.assertListEqual(processed_dataset["prompt_input_ids"][0], [46518, 374, 2664, 1091])
+        self.assertListEqual(processed_dataset["prompt_attention_mask"][0], [1, 1, 1, 1])
         self.assertListEqual(
-            processed_dataset["completion_input_ids"][0], [31137, 374, 2664, 1091, 16965, 13, 151645]
+            processed_dataset["completion_input_ids"][0], [46518, 374, 2664, 1091, 27261, 13, 151645]
         )
         self.assertListEqual(processed_dataset["completion_attention_mask"][0], [1, 1, 1, 1, 1, 1, 1])
-        self.assertListEqual(processed_dataset["completion_labels"][0], [-100, 374, 2664, 1091, 16965, 13, 151645])
+        self.assertListEqual(
+            processed_dataset["completion_labels"][0], [-100, -100, -100, -100, 27261, 13, 151645]
+        )

     def test_kto_trainer_without_providing_ref_model(self):
         with tempfile.TemporaryDirectory() as tmp_dir:
diff --git a/tests/test_sft_trainer.py b/tests/test_sft_trainer.py
index f4e46803f1..d8a3706050 100644
--- a/tests/test_sft_trainer.py
+++ b/tests/test_sft_trainer.py
@@ -1172,7 +1172,7 @@ def test_sft_trainer_eval_packing(self):
         )

         self.assertEqual(len(trainer.train_dataset["input_ids"]), 46)  # w/ this dataset, we end up with 46 seqs
-        self.assertEqual(len(trainer.eval_dataset["input_ids"]), 5)  # w/ this dataset, we end up with 5 seqs
+        self.assertEqual(len(trainer.eval_dataset["input_ids"]), 6)  # w/ this dataset, we end up with 6 seqs

     def test_sft_trainer_no_packing(self):
         with tempfile.TemporaryDirectory() as tmp_dir:
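Note: a minimal sketch of the `datasets` behavior the repeated `shuffle=False` change relies on; the toy data and split size below are illustrative, not taken from the patch. With `shuffle=False`, `Dataset.train_test_split` slices rows in order -- the first rows form the train split and the last rows form the test split -- so regenerating and pushing the dataset yields the same splits every run:

from datasets import Dataset

# Ten toy rows; with test_size=0.2 and shuffle=False the last two rows
# always land in the test split, run after run.
dataset = Dataset.from_dict({"text": [f"example {i}" for i in range(10)]})
splits = dataset.train_test_split(test_size=0.2, shuffle=False)

print(splits["train"]["text"][:2])  # ['example 0', 'example 1']
print(splits["test"]["text"])       # ['example 8', 'example 9']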