diff --git a/reasoning_gym/games/tower_of_hanoi.py b/reasoning_gym/games/tower_of_hanoi.py
index df902300..a9e7f458 100644
--- a/reasoning_gym/games/tower_of_hanoi.py
+++ b/reasoning_gym/games/tower_of_hanoi.py
@@ -4,7 +4,7 @@
 import random
 import re
 from dataclasses import dataclass
-from typing import Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple
 
 from ..factory import ProceduralDataset, register_dataset
 
@@ -368,6 +368,73 @@ def _parse_move(self, move: str) -> Tuple[int, int, int]:
         to_peg = int(match.group(3))
         return disk, from_peg, to_peg
 
+    def score_answer(self, answer: Optional[str], metadata: Dict[str, Any]) -> float:
+        """
+        Score the user's solution for the Tower of Hanoi puzzle.
+
+        The answer is expected to be a newline-separated sequence of moves in the format:
+        "Move disk X from Peg Y to Peg Z"
+
+        Expected behavior:
+        - A correct solution no longer than the reference solution in the dataset item gives 1.0.
+        - A correct but longer solution gives a proportional reward of optimal_move_count / user_move_count.
+        - A badly formatted answer gives a minimal reward (0.01).
+        - An answer that is syntactically valid but does not solve the puzzle gives a partial reward (0.05).
+        - An empty string gives 0.01.
+        - None gives 0.0.
+        """
+        if answer is None:
+            return 0.0
+
+        if answer == "":
+            return 0.01
+
+        # If answer is a string, split it into lines; if it is already a list, use it directly.
+        if isinstance(answer, str):
+            moves = [line.strip() for line in answer.strip().splitlines() if line.strip()]
+        elif isinstance(answer, list):
+            moves = [line.strip() for line in answer if isinstance(line, str) and line.strip()]
+        else:
+            return 0.0
+
+        # Build the initial peg state from metadata.
+        num_disks = metadata["num_disks"]
+        num_pegs = metadata["num_pegs"]
+        start_peg = metadata["start_peg"]
+        target_peg = metadata["target_peg"]
+
+        peg_state = {peg: [] for peg in range(1, num_pegs + 1)}
+        for disk in range(num_disks, 0, -1):
+            peg_state[start_peg].append(disk)
+
+        # Process each move, rejecting the answer on the first malformed or illegal one.
+        for move in moves:
+            try:
+                disk, from_peg, to_peg = self._parse_move(move)
+            except Exception:
+                return 0.01  # Invalid move format
+
+            # Validate the move using the existing _validate_move method.
+            if not self._validate_move(peg_state, move):
+                return 0.01
+
+            # Execute the move.
+            peg_state[from_peg].pop()
+            peg_state[to_peg].append(disk)
+
+        # The puzzle is solved if all disks sit on the target peg in descending order.
+        expected_final = list(range(num_disks, 0, -1))
+        solved = peg_state[target_peg] == expected_final
+        if not solved:
+            return 0.05
+
+        optimal_moves = metadata.get("solution_length", len(moves))
+        user_moves = len(moves)
+        if user_moves <= optimal_moves:
+            return 1.0
+        else:
+            return optimal_moves / user_moves
+
 
 # Register the dataset
 register_dataset("tower_of_hanoi", HanoiDataset, HanoiConfig)
diff --git a/tests/test_tower_of_hanoi.py b/tests/test_tower_of_hanoi.py
index a4228bc3..9bf83dca 100644
--- a/tests/test_tower_of_hanoi.py
+++ b/tests/test_tower_of_hanoi.py
@@ -226,3 +226,46 @@ def is_valid_final_state(pegs_state: dict, target_peg: int, num_disks: int) -> b
     if len(target_stack) != num_disks:
         return False
     return target_stack == list(range(num_disks, 0, -1))
+
+
+def test_score_answer():
+    """
+    Test that the score_answer method returns the expected reward values.
+
+    Expected behavior:
+    - A correct solution no longer than the reference solution in the dataset item gives 1.0.
+    - A correct but longer solution gives a proportional reward of optimal_move_count / user_move_count.
+    - A badly formatted answer gives a minimal reward (0.01).
+    - An answer that is syntactically valid but does not solve the puzzle gives a partial reward (0.05).
+    - An empty string gives 0.01.
+    - None gives 0.0.
+    """
+    # Create a small, deterministic dataset instance for testing.
+    config = HanoiConfig(min_disks=3, max_disks=5, min_pegs=3, max_pegs=4, size=5, seed=42)
+    dataset = HanoiDataset(config)
+    # Pick one instance from the dataset for testing.
+    item = dataset[0]
+    metadata = item["metadata"]
+    correct_answer = item["answer"]
+
+    # 1. Correct answer should yield full reward.
+    score_correct = dataset.score_answer(answer=correct_answer, metadata=metadata)
+    assert score_correct == 1.0, f"Correct answer score {score_correct} is not 1.0."
+
+    # 2. A badly formatted answer should yield the minimal reward (0.01).
+    score_bad_format = dataset.score_answer(answer="a wrong solution", metadata=metadata)
+    assert score_bad_format == 0.01, f"Badly formatted answer score {score_bad_format} is not 0.01."
+
+    # 3. An answer that is validly formatted but does not solve the puzzle:
+    # drop the last move (the final line), leaving a well-formed but unfinished solution.
+    unfinished_answer = "\n".join(correct_answer.strip().splitlines()[:-1])
+    score_unsolved = dataset.score_answer(answer=unfinished_answer, metadata=metadata)
+    assert score_unsolved == 0.05, f"Unsolved answer score {score_unsolved} is not 0.05."
+
+    # 4. An empty answer should yield 0.01.
+    score_empty = dataset.score_answer(answer="", metadata=metadata)
+    assert score_empty == 0.01, f"Empty answer score {score_empty} is not 0.01."
+
+    # 5. A None answer should yield 0.0.
+    score_none = dataset.score_answer(answer=None, metadata=metadata)
+    assert score_none == 0.0, f"None answer score {score_none} is not 0.0."
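
A minimal usage sketch of the new scoring path, assuming the module path and move format shown in the diff and that item metadata carries a "solution_length" entry; the two-move detour below is a hypothetical construction, added only to exercise the proportional-penalty branch:

from reasoning_gym.games.tower_of_hanoi import HanoiConfig, HanoiDataset

dataset = HanoiDataset(HanoiConfig(min_disks=3, max_disks=3, min_pegs=3, max_pegs=3, size=1, seed=0))
item = dataset[0]
metadata = item["metadata"]

# The reference solution earns the full reward.
assert dataset.score_answer(answer=item["answer"], metadata=metadata) == 1.0

# Append a legal but pointless round trip of disk 1 after the puzzle is solved:
# the answer still reaches the goal state, so it scores optimal / (optimal + 2).
target = metadata["target_peg"]
spare = next(peg for peg in range(1, metadata["num_pegs"] + 1) if peg != target)
detour = [
    f"Move disk 1 from Peg {target} to Peg {spare}",
    f"Move disk 1 from Peg {spare} to Peg {target}",
]
suboptimal = "\n".join(item["answer"].strip().splitlines() + detour)
score = dataset.score_answer(answer=suboptimal, metadata=metadata)
assert 0.0 < score < 1.0  # 7 / 9 for a 3-disk, 3-peg instance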