open-thought · andreaskoepf · Feb 3, 2025 · Feb 3, 2025 · Feb 3, 2025
diff --git a/reasoning_gym/games/tower_of_hanoi.py b/reasoning_gym/games/tower_of_hanoi.py
@@ -4,7 +4,7 @@
 import random
 import re
 from dataclasses import dataclass
-from typing import Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple
 
 from ..factory import ProceduralDataset, register_dataset
 
@@ -368,6 +368,73 @@ def _parse_move(self, move: str) -> Tuple[int, int, int]:
         to_peg = int(match.group(3))
         return disk, from_peg, to_peg
 
+    def score_answer(self, answer: Optional[str], metadata: Dict[str, Any]) -> float:
+        """
+        Score a proposed Tower of Hanoi solution by replaying it move by move.
+
+        Args:
+            answer: Newline-separated moves of the form
+                "Move disk X from Peg Y to Peg Z". A list of such move strings
+                is also accepted; non-string list entries are ignored.
+            metadata: Puzzle description; "num_disks", "num_pegs", "start_peg"
+                and "target_peg" are required, "solution_length" is optional.
+
+        Returns:
+            - 0.0 if the answer is None or is neither a string nor a list.
+            - 0.01 if there are no moves (empty/whitespace-only string, empty list),
+              or if a move cannot be parsed or is rejected by _validate_move.
+            - 0.05 if every move is accepted but the puzzle is left unsolved.
+            - 1.0 if solved within "solution_length" moves, or if that key is absent.
+            - solution_length / number_of_moves if solved with more moves.
+        """
+        if answer is None:
+            return 0.0
+
+        # Normalise the answer into a list of non-blank, stripped move strings.
+        if isinstance(answer, str):
+            moves = [line.strip() for line in answer.splitlines() if line.strip()]
+        elif isinstance(answer, list):
+            moves = [line.strip() for line in answer if isinstance(line, str) and line.strip()]
+        else:
+            return 0.0
+
+        # An answer without any moves is scored like the empty string; otherwise
+        # whitespace-only input would fall through to the higher "unsolved" reward.
+        if not moves:
+            return 0.01
+
+        num_disks = metadata["num_disks"]
+        num_pegs = metadata["num_pegs"]
+        start_peg = metadata["start_peg"]
+        target_peg = metadata["target_peg"]
+
+        # Every disk starts on start_peg, largest at the bottom (list tail = top).
+        peg_state = {peg: [] for peg in range(1, num_pegs + 1)}
+        peg_state[start_peg].extend(range(num_disks, 0, -1))
+
+        for move in moves:
+            try:
+                disk, from_peg, to_peg = self._parse_move(move)
+            except Exception:
+                return 0.01  # Invalid move format
+
+            # Reject the whole answer on the first move _validate_move refuses.
+            if not self._validate_move(peg_state, move):
+                return 0.01
+
+            peg_state[from_peg].pop()
+            peg_state[to_peg].append(disk)
+
+        # Solved means the target peg holds all disks, largest at the bottom.
+        if peg_state[target_peg] != list(range(num_disks, 0, -1)):
+            return 0.05
+
+        user_moves = len(moves)
+        optimal_moves = metadata.get("solution_length", user_moves)
+        if user_moves <= optimal_moves:
+            return 1.0
+        return optimal_moves / user_moves
+
 
 # Register the dataset
 register_dataset("tower_of_hanoi", HanoiDataset, HanoiConfig)
diff --git a/tests/test_tower_of_hanoi.py b/tests/test_tower_of_hanoi.py
@@ -226,3 +226,46 @@ def is_valid_final_state(pegs_state: dict, target_peg: int, num_disks: int) -> b
     if len(target_stack) != num_disks:
         return False
     return target_stack == list(range(num_disks, 0, -1))
+
+
+def test_score_answer():
+    """
+    Check the reward tiers of HanoiDataset.score_answer: 1.0 for the reference
+    solution, optimal/actual for a longer valid one, 0.05 for a legal but
+    unfinished one, 0.01 for an unparsable or empty answer, and 0.0 for None.
+    """
+    config = HanoiConfig(min_disks=3, max_disks=5, min_pegs=3, max_pegs=4, size=5, seed=42)
+    dataset = HanoiDataset(config)
+    item = dataset[0]
+    metadata = item["metadata"]
+    correct_answer = item["answer"]
+    # The reference answer may be a newline-joined string or a list of moves.
+    moves = correct_answer.splitlines() if isinstance(correct_answer, str) else list(correct_answer)
+
+    # 1. Correct answer should yield full reward.
+    score_correct = dataset.score_answer(answer=correct_answer, metadata=metadata)
+    assert score_correct == 1.0, f"Correct answer score {score_correct} is not 1.0."
+
+    # 2. A badly formatted answer should yield minimal reward (0.01).
+    score_bad_format = dataset.score_answer(answer="a wrong solution", metadata=metadata)
+    assert score_bad_format == 0.01, f"Badly formatted answer score {score_bad_format} is not 0.01."
+
+    # 3. Dropping the last move keeps every move legal but leaves the puzzle unsolved.
+    unfinished_answer = "\n".join(moves[:-1])
+    score_unsolved = dataset.score_answer(answer=unfinished_answer, metadata=metadata)
+    assert score_unsolved == 0.05, f"Unsolved answer score {score_unsolved} is not 0.05."
+
+    # 4. An empty answer should yield 0.01.
+    score_empty = dataset.score_answer(answer="", metadata=metadata)
+    assert score_empty == 0.01, f"Empty answer score {score_empty} is not 0.01."
+
+    # 5. A None answer should yield 0.0.
+    score_none = dataset.score_answer(answer=None, metadata=metadata)
+    assert score_none == 0.0, f"None answer score {score_none} is not 0.0."
+
+    # 6. Undoing and redoing the last move still solves the puzzle, two moves late.
+    disk, src, dst = dataset._parse_move(moves[-1])
+    longer = moves + [f"Move disk {disk} from Peg {dst} to Peg {src}", moves[-1]]
+    expected = metadata.get("solution_length", len(longer)) / len(longer)
+    score_longer = dataset.score_answer(answer="\n".join(longer), metadata=metadata)
+    assert score_longer == expected, f"Suboptimal answer score {score_longer} is not {expected}."