Skip to content

Commit

Permalink
Merge pull request #50 from joenorton/toh-score-answer
Browse files Browse the repository at this point in the history
feat: toh scoring
  • Loading branch information
andreaskoepf authored Feb 3, 2025
2 parents 00961bc + 9841f64 commit 57c6050
Show file tree
Hide file tree
Showing 2 changed files with 111 additions and 1 deletion.
69 changes: 68 additions & 1 deletion reasoning_gym/games/tower_of_hanoi.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import random
import re
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
from typing import Any, Dict, List, Optional, Tuple

from ..factory import ProceduralDataset, register_dataset

Expand Down Expand Up @@ -368,6 +368,73 @@ def _parse_move(self, move: str) -> Tuple[int, int, int]:
to_peg = int(match.group(3))
return disk, from_peg, to_peg

def score_answer(self, answer: Optional[str], metadata: Dict[str, Any]) -> float:
    """
    Score the user's solution for the Tower of Hanoi puzzle.

    The answer is expected to be a newline-separated sequence of moves (a list
    of move strings is also accepted), each in the format:
        "Move disk X from Peg Y to Peg Z"

    The moves are replayed from the initial state described by ``metadata``
    (``num_disks``, ``num_pegs``, ``start_peg``, ``target_peg``).

    Returns:
        - 0.0 if the answer is None or is neither a string nor a list.
        - 0.01 if the answer contains no moves at all (empty string,
          whitespace-only string, empty list), if a move cannot be parsed,
          or if a move is rejected as illegal.
        - 0.05 if every move is accepted but the puzzle is not solved.
        - 1.0 if the puzzle is solved in at most ``metadata["solution_length"]``
          moves (when that key is absent, any solving answer gets 1.0).
        - optimal_move_count / user_move_count for a longer, correct solution.
    """
    if answer is None:
        return 0.0

    # Accept either a newline-separated string or a list of move strings.
    if isinstance(answer, str):
        raw_moves = answer.splitlines()
    elif isinstance(answer, list):
        raw_moves = [entry for entry in answer if isinstance(entry, str)]
    else:
        return 0.0

    moves = [line.strip() for line in raw_moves if line.strip()]

    # An answer without a single move is "empty", however it was spelled
    # ("", "   \n", []); it must not earn the higher "valid but unsolved" score.
    if not moves:
        return 0.01

    # Build the initial peg state from metadata.
    num_disks = metadata["num_disks"]
    num_pegs = metadata["num_pegs"]
    start_peg = metadata["start_peg"]
    target_peg = metadata["target_peg"]

    # Largest disk at the bottom (index 0), smallest on top (end of the list).
    solved_stack = list(range(num_disks, 0, -1))
    peg_state = {peg: [] for peg in range(1, num_pegs + 1)}
    peg_state[start_peg].extend(solved_stack)

    # Replay each move.
    for move in moves:
        try:
            disk, from_peg, to_peg = self._parse_move(move)
        except Exception:
            # _parse_move's exact exception type is not visible here, so any
            # parsing failure is treated as a badly formatted move.
            return 0.01

        # Validate the move using the existing _validate_move method.
        if not self._validate_move(peg_state, move):
            return 0.01

        # Defensive guard: never pop from a missing/empty peg or move a disk
        # that is not on top, even if _validate_move let the move through.
        source = peg_state.get(from_peg)
        destination = peg_state.get(to_peg)
        if not source or destination is None or source[-1] != disk:
            return 0.01

        # Execute the move.
        destination.append(source.pop())

    # Solved means: all disks on the target peg in descending order.
    if peg_state[target_peg] != solved_stack:
        return 0.05

    user_moves = len(moves)
    optimal_moves = metadata.get("solution_length", user_moves)
    if user_moves <= optimal_moves:
        return 1.0
    return optimal_moves / user_moves


# Register the dataset: pass HanoiDataset and its HanoiConfig to the factory's
# register_dataset under the name "tower_of_hanoi".
register_dataset("tower_of_hanoi", HanoiDataset, HanoiConfig)
43 changes: 43 additions & 0 deletions tests/test_tower_of_hanoi.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,3 +226,46 @@ def is_valid_final_state(pegs_state: dict, target_peg: int, num_disks: int) -> b
if len(target_stack) != num_disks:
return False
return target_stack == list(range(num_disks, 0, -1))


def test_score_answer():
    """
    Test that the score_answer method returns the expected reward values.

    Expected behavior:
    - Correct answer (i.e. equivalent in length, or better, than the one provided in the dataset item) gives 1.0.
    - A badly formatted answer gives a minimal reward (0.01).
    - An answer that is syntactically valid but does not solve the puzzle gives a partial reward (0.05).
    - An empty string gives 0.01.
    - None gives 0.0.
    """
    # Create a dataset instance with a fixed seed so the picked item is reproducible.
    config = HanoiConfig(min_disks=3, max_disks=5, min_pegs=3, max_pegs=4, size=5, seed=42)
    dataset = HanoiDataset(config)
    # Pick one instance from the dataset for testing.
    item = dataset[0]
    metadata = item["metadata"]
    correct_answer = item["answer"]

    # 1. Correct answer should yield full reward.
    score_correct = dataset.score_answer(answer=correct_answer, metadata=metadata)
    assert score_correct == 1.0, f"Correct answer score {score_correct} is not 1.0."

    # 2. A badly formatted answer should yield minimal reward (0.01).
    score_bad_format = dataset.score_answer(answer="a wrong solution", metadata=metadata)
    assert score_bad_format == 0.01, f"Badly formatted answer score {score_bad_format} is not 0.01."

    # 3. An answer that is validly formatted but unsolved: drop the last move.
    #    Slicing a string with [:-1] would only remove one character (leaving a
    #    truncated, unparseable move), so handle the newline-separated string
    #    form and the list-of-moves form separately.
    if isinstance(correct_answer, str):
        unfinished_answer = "\n".join(correct_answer.strip().splitlines()[:-1])
    else:
        unfinished_answer = correct_answer[:-1]
    score_unsolved = dataset.score_answer(answer=unfinished_answer, metadata=metadata)
    assert score_unsolved == 0.05, f"Unsolved answer score {score_unsolved} is not 0.05."

    # 4. An empty answer should yield 0.01.
    score_empty = dataset.score_answer(answer="", metadata=metadata)
    assert score_empty == 0.01, f"Empty answer score {score_empty} is not 0.01."

    # 5. A None answer should yield 0.0.
    score_none = dataset.score_answer(answer=None, metadata=metadata)
    assert score_none == 0.0, f"None answer score {score_none} is not 0.0."

0 comments on commit 57c6050

Please sign in to comment.