Minor question template & score_answer improvements (#261)

* math prompt improvements
* ignore brackets in complex_arithmetic results
* improve additional instruction in prompt of polynomial_equations
* more strict tests for score_answer in polynomial_equations
* simplify special reward handling
* fix test_intermediate_integration
* fix sokoban dataset
* add common dataset score_answer consistency test
open-thought · Mar 4, 2025 · 5d7fbac · 5d7fbac
1 parent 061282e
commit 5d7fbac
Show file tree

Hide file tree

Showing 106 changed files with 394 additions and 498 deletions.
diff --git a/reasoning_gym/algebra/complex_arithmetic.py b/reasoning_gym/algebra/complex_arithmetic.py
@@ -103,6 +103,10 @@ def parse_string_to_complex(answer: str) -> complex:
             # Normalize the answer string by removing spaces and converting to lowercase
             answer = answer.replace(" ", "").lower()
 
+            # remove brackets
+            while len(answer) > 1 and answer[0] == "(" and answer[-1] == ")":
+                answer = answer[1:-1]
+
             # Convert mathematical notation 'i' to Python's 'j' for complex numbers
             answer = answer.replace("i", "j")
 

diff --git a/reasoning_gym/algebra/intermediate_integration.py b/reasoning_gym/algebra/intermediate_integration.py
@@ -77,9 +77,10 @@ def __init__(self, config: IntermediateIntegrationConfig):
             "Evaluate the indefinite integral: ∫ {integrand} dx",
         ]
         self.added_instruction = """
-In addition, when doing calculation, use the following instructions together with your mathematical ingenuity to solve the integral problems
-## 1. Use ** instead ^ to represent powers. For example 7*X**2 instead of 7*X^2.
-## 2. Always use * when doing all sorts of multiplcation in your reasoning steps. For example Use [-3*X**3*sin(X) - 9*X**2*cos(X) + 18*X*sin(X) + 18*cos(X) + C] instead of [-3x3sin(x) - 9x2cos(x) + 18xsin(x) + 18cos(x) + C].
+When performing calculations, please follow these guidelines:
+1. Use ** instead of ^ to represent exponents. For example, write 7*X**2 instead of 7*X^2.
+2. Always include the * symbol for all multiplication operations in your reasoning steps. For example, write `-3*X**3*sin(X) - 9*X**2*cos(X) + 18*X*sin(X) + 18*cos(X) + C` instead of `-3x3sin(x) - 9x2cos(x) + 18xsin(x) + 18cos(x) + C`.
+3. Use `exp(x)` or `E**(x)` for the exponential function (i.e. use capital E for Euler's number).
 """
 
     def _get_outer_constant(self, rng: random.Random) -> int:
@@ -245,7 +246,7 @@ def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
         """Determine if the solution provided solves the problem"""
         reward = 0.0
         metadata = entry["metadata"]
-        if answer is not None:
+        if isinstance(answer, str):
             try:
                 var = metadata["variable"]
                 x = sympy.Symbol(var)
@@ -258,12 +259,8 @@ def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
                 # Check mathematical equivalence through simplification
                 if sympy.simplify(derivative - integrand) == 0:
                     reward = 1.0
-                elif answer.strip():
-                    reward = 0.05
-                else:
-                    reward = 0.01
             except:
-                reward = 0.01
+                reward = 0.0
         return reward
 
 

diff --git a/reasoning_gym/algebra/polynomial_equations.py b/reasoning_gym/algebra/polynomial_equations.py
@@ -27,8 +27,9 @@ class PolynomialEquationsConfig:
     seed: Optional[int] = None
     size: int = 500
     # reward function hyperparameters
-    penalty_missing_factor = 0.1
-    penalty_extra_factor = 0.05
+    penalty_missing_factor = 0.5
+    penalty_extra_factor = 0.5
+    exp_distance_factor = -10.0
 
     def validate(self) -> None:
         """Validate configuration parameters."""
@@ -62,12 +63,15 @@ def __init__(self, config: PolynomialEquationsConfig):
             "Solve the polynomial equation for real {variable}:\n{polynomial_expanded} = 0",
         ]
         self.added_instruction = """
-In solving the equations, please abide by the following instruction:
-## 1. All answers should be comma-separated. For example "-0.3773, 0.4005" etc.
-## 2. In cases where your answer is b = 2 + sqrt(4560) / 172 and b = 2 - sqrt(4560) / 172. Since b can be 2 numbers, resolve your answer like this instead, "-0.3773, 0.4005".
-## 3. If there are no real values of i that satisfy the equation, report your answer as empty string, "".
-## 4. If there are 2 answers, resolve the answers as comma-separated floats of 2 numbers, if 3 answers, make it comma-separated floats of 3 numbers.
-## 5. Resolve all numbers as floats in the string of comma-separated numbers. Round the floats higher than 4 decimal place(d.p) down to 4 d.p.
+In solving equations, please follow these instructions:
+1. Provide all answers as comma-separated decimal values. For example: "-0.3773, 0.4005"
+2. For solutions that can be expressed in exact form (like "u = 2 + sqrt(4560)/172" and "u = 2 - sqrt(4560)/172"), convert them to decimal form in your final answer.
+3. If there are no real values that satisfy the equation, report your answer as an empty string: ""
+4. Format your answer based on the number of solutions:
+   - For 1 solution: a single decimal number
+   - For 2 solutions: two comma-separated decimal numbers
+   - For 3 or more solutions: all values as comma-separated decimal numbers
+5. Round all decimal values to 4 decimal places (rounding down when the 5th decimal place is 5 or greater).
 """
         super().__init__(config=config, seed=config.seed, size=config.size)
 
@@ -238,7 +242,7 @@ def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
                 # Remove matched oracle solution
                 oracle_solutions.pop(matched_distance_index)
                 # Exponential decay reward
-                total_reward += math.exp(-matched_distance)
+                total_reward += math.exp(matched_distance * self.config.exp_distance_factor)
             else:
                 # Extra predicted solution
                 extra_solutions += 1

diff --git a/reasoning_gym/algebra/polynomial_multiplication.py b/reasoning_gym/algebra/polynomial_multiplication.py
@@ -69,9 +69,9 @@ def __init__(self, config: PolynomialMultiplicationConfig):
             "Calculate the following: {polynomial_expr}",
         ]
         self.added_instruction = """
-In addition, When doing calculation, Use the following instructions together with your mathematical ingenuity to solve the integral problems
-## 1. Use ** instead ^ to represent powers. For example 7*X**2 instead of 7*X^2.
-## 2. Always use * when doing all sorts of multiplcation in your reasoning steps and even in reporting answers.
+When performing calculations, please follow these guidelines:
+1. Use ** instead of ^ to represent exponents. For example, write 7*X**2 instead of 7*X^2.
+2. Always include the * symbol for all multiplication operations in your reasoning steps. For example, write `-3*X**3*sin(X) - 9*X**2*cos(X) + 18*X*sin(X) + 18*cos(X) + C` instead of `-3x3sin(x) - 9x2cos(x) + 18xsin(x) + 18cos(x) + C`.
 """
         super().__init__(config=config, seed=config.seed, size=config.size)
 
@@ -106,10 +106,9 @@ def __getitem__(self, idx: int) -> dict:
 
         return {
             "question": question,
-            "answer": product,
+            "answer": str(product),
             "metadata": {
                 "polynomial_expr": str(polynomial_expr),
-                "result": str(product),
                 "variables": list(product.free_symbols),
             },
         }
@@ -147,21 +146,16 @@ def _generate_polynomial(self, rng: random.Random, monomials: Optional[list]):
 
     def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
         reward = 0.0
-        metadata = entry["metadata"]
         if answer is not None:
             try:
                 predicted_poly = sp.parse_expr(answer)
-                target_poly = sp.parse_expr(metadata["result"])
+                target_poly = sp.parse_expr(entry["answer"])
 
                 # Check if the difference simplifies to zero (i.e. they are equivalent).
                 if predicted_poly == target_poly:
                     reward = 1.0
-                elif answer.strip():
-                    reward = 0.05
-                else:
-                    reward = 0.01
             except Exception:
-                reward = 0.01
+                reward = 0.0
         return reward
 
 

diff --git a/reasoning_gym/algebra/simple_integration.py b/reasoning_gym/algebra/simple_integration.py
@@ -42,9 +42,9 @@ def __init__(self, config: SimpleIntegrationConfig):
             "Evaluate the indefinite integral: ∫ {integrand} dx",
         ]
         self.added_instruction = """
-In addition, When doing calculation, Use the following instructions together with your mathematical ingenuity to solve the integral problems
-## 1. Use ** instead ^ to represent powers. For example 7*X**2 instead of 7*X^2.
-## 2. Always use * when doing all sorts of multiplcation in your reasoning steps. For example Use [-3*X**3*sin(X) - 9*X**2*cos(X) + 18*X*sin(X) + 18*cos(X) + C] instead of [-3x3sin(x) - 9x2cos(x) + 18xsin(x) + 18cos(x) + C].
+When performing calculations, please follow these guidelines:
+1. Use ** instead of ^ to represent exponents. For example, write 7*X**2 instead of 7*X^2.
+2. Always include the * symbol for all multiplication operations in your reasoning steps. For example, write `-3*X**3*sin(X) - 9*X**2*cos(X) + 18*X*sin(X) + 18*cos(X) + C` instead of `-3x3sin(x) - 9x2cos(x) + 18xsin(x) + 18cos(x) + C`.
 """
         super().__init__(config=config, seed=config.seed, size=config.size)
 
@@ -103,12 +103,8 @@ def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
                 # Check mathematical equivalence through simplification
                 if sympy.simplify(derivative - integrand) == 0:
                     reward = 1.0
-                elif answer.strip():
-                    reward = 0.05
-                else:
-                    reward = 0.01
             except:
-                reward = 0.01
+                reward = 0.0
         return reward
 
 

diff --git a/reasoning_gym/algorithmic/ab.py b/reasoning_gym/algorithmic/ab.py
@@ -130,12 +130,9 @@ def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
             float: The computed score between 0.0 and 1.0.
         """
 
-        if answer == None:
-            return 0.0
-        if answer != entry["answer"]:
-            return 0.01
-        else:
+        if answer == entry["answer"]:
             return 1.0  # Yay
+        return 0.0
 
 
 # Register the dataset

diff --git a/reasoning_gym/algorithmic/binary_matrix.py b/reasoning_gym/algorithmic/binary_matrix.py
@@ -108,9 +108,9 @@ def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
                     # check if answer is python list of lists
                     answer = self._matrix_to_str(eval(answer))
                     if answer == oracle_answer:
-                        return 0.5
-                except Exception as e:
-                    return 0.01
+                        return 0.1
+                except Exception:
+                    return 0.0
         return 0.0
 
     def __getitem__(self, idx: int) -> dict:

diff --git a/reasoning_gym/algorithmic/cryptarithm.py b/reasoning_gym/algorithmic/cryptarithm.py
@@ -200,7 +200,7 @@ def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
         Returns:
             float: The computed score between 0.0 and 1.0.
         """
-        if not answer:
+        if not isinstance(answer, str):
             return 0.0
 
         correct_mapping = {}

diff --git a/reasoning_gym/algorithmic/game_of_life.py b/reasoning_gym/algorithmic/game_of_life.py
@@ -106,7 +106,7 @@ def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
             ans_arr = json.loads(answer)
             correct_arr = json.loads(entry["answer"])
         except Exception:
-            return 0.01
+            return 0.0
 
         total_cells = 0
         correct_cells = 0

diff --git a/reasoning_gym/algorithmic/graph_color.py b/reasoning_gym/algorithmic/graph_color.py
@@ -228,12 +228,13 @@ def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
         try:
             danswer = json.loads(answer)
             solved, failure = verify_graph_coloring_solution(entry["metadata"]["puzzle"], danswer)
-            if not solved:
-                return 0.01  # json was parsable but solution incorrect
-            else:
+            if solved:
                 return 1.0  # Yay
+            else:
+                return 0.01  # json parsable
         except Exception:
-            return 0.0
+            pass
+        return 0.0
 
 
 register_dataset("graph_color", GraphColorDataset, GraphColorConfig)
diff --git a/reasoning_gym/algorithmic/group_anagrams.py b/reasoning_gym/algorithmic/group_anagrams.py
@@ -95,7 +95,7 @@ def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
                 if answer_str == oracle_str:
                     reward = 1.0
                 else:
-                    reward = 0.01
+                    reward = 0.01  # json parsable
             except Exception:
                 reward = 0.0
         return reward

diff --git a/reasoning_gym/algorithmic/jugs.py b/reasoning_gym/algorithmic/jugs.py
@@ -303,11 +303,11 @@ def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
             danswer = json.loads(answer)
             valid, _ = verify_solution(entry["metadata"]["puzzle"], danswer)
             if not valid:
-                return 0.01
+                return 0.01  # json parsable
             else:
                 return 1.0  # Yay
         except Exception as e:
-            return 0.01
+            return 0.0
 
 
 register_dataset("jugs", JugsDataset, JugsConfig)
diff --git a/reasoning_gym/algorithmic/letter_jumble.py b/reasoning_gym/algorithmic/letter_jumble.py
@@ -116,7 +116,7 @@ def partial(self, expected_answer, model_answer):
 
         # Each word in the expected answer is worth an equal fraction of 1.0
         total_words = len(expected_words)
-        score_per_word = 1.0 / total_words if total_words else 0
+        score_per_word = 1.0 / total_words if total_words > 0 else 0
 
         # Calculate scores word by word
         scores = []
@@ -142,18 +142,16 @@ def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
             float: The computed score between 0.0 and 1.0.
         """
 
-        if not answer:
+        if not isinstance(answer, str):
             return 0.0
 
         oracle_answer = entry["answer"].strip().lower()
-        if answer:
-            answer = answer.strip().lower()
-            if answer == oracle_answer:
-                return 1.0  # Perfect score!
-            else:
-                partial_score = self.partial(oracle_answer, answer)
-                return partial_score
-        return 0.01
+        answer = answer.strip().lower()
+        if answer == oracle_answer:
+            return 1.0  # Perfect score!
+        else:
+            partial_score = self.partial(oracle_answer, answer)
+            return partial_score
 
 
 register_dataset("letter_jumble", LetterJumbleDataset, LetterJumbleConfig)
diff --git a/reasoning_gym/algorithmic/manipulate_matrix.py b/reasoning_gym/algorithmic/manipulate_matrix.py
@@ -144,8 +144,6 @@ def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
 
             if oracle_answer in answer:
                 return len(oracle_answer) / len(answer)
-            else:
-                return 0.01
 
         return 0.0
 

diff --git a/reasoning_gym/algorithmic/palindrome_generation.py b/reasoning_gym/algorithmic/palindrome_generation.py
@@ -92,14 +92,14 @@ def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
         - Correct answer (palindrome with only correct letters in the correct quantities) gives 1.0
         - An answer that is a palindrome, but not with the same letters as provided, gives 0.05
         - An answer that is a string, but not a palindrome gives 0.02
-        - An empty string gives 0.01.
+        - An empty string gives 0.0
         - None gives 0.0.
         """
         if answer is None or not isinstance(answer, str):
             return 0.0  # No answer given
 
         if answer == "":
-            return 0.01
+            return 0.0
 
         metadata = entry["metadata"]
         answer = answer.strip().lower()

diff --git a/reasoning_gym/algorithmic/palindrome_partitioning.py b/reasoning_gym/algorithmic/palindrome_partitioning.py
@@ -95,9 +95,8 @@ def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
                 oracle = self.to_set_of_tuples(entry["metadata"]["solution"])
                 if answer == oracle:
                     return 1.0
-                return 0.01
             except Exception:
-                return 0.0
+                pass
         return 0.0
 
     def _generate_palindrome_letters(self, rng: Random, length: int) -> list[str]:

diff --git a/reasoning_gym/algorithmic/pool_matrix.py b/reasoning_gym/algorithmic/pool_matrix.py
@@ -80,7 +80,7 @@ def _average_pool(self, matrix: np.ndarray, pool_size: int) -> np.ndarray:
     def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
         """Score the answer based on the metadata"""
 
-        if not answer:
+        if not isinstance(answer, str):
             return 0.0
 
         reward = 0.0
@@ -91,8 +91,6 @@ def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
                 reward = 1.0
             elif oracle_answer.shape == answer.shape:
                 reward = 0.1
-            else:
-                reward = 0.01
         except Exception:
             pass
         return reward

diff --git a/reasoning_gym/algorithmic/ransom_note.py b/reasoning_gym/algorithmic/ransom_note.py
@@ -108,14 +108,12 @@ def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
             float: The computed score between 0.0 and 1.0.
         """
 
-        if answer == None:
-            return 0.0
+        if isinstance(answer, str):
+            s_answer = answer.strip()
+            if s_answer == str(entry["answer"]):
+                return 1.0
 
-        s_answer = answer.strip()
-        if not s_answer == str(entry["answer"]):
-            return 0.01
-        else:
-            return 1.0
+        return 0.0
 
 
 register_dataset("ransom_note", RansomNoteDataset, RansomNoteConfig)
diff --git a/reasoning_gym/algorithmic/sentence_reordering.py b/reasoning_gym/algorithmic/sentence_reordering.py
@@ -110,7 +110,7 @@ def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
                 else:
                     reward = 0.05
             except:
-                reward = 0.01
+                reward = 0.0
         return reward
 
 

diff --git a/reasoning_gym/algorithmic/spell_backward.py b/reasoning_gym/algorithmic/spell_backward.py
@@ -52,14 +52,14 @@ def __getitem__(self, idx: int) -> dict:
     def score_answer(self, answer: Optional[str], entry: dict[str, Any]) -> float:
         reward = 0.0
         expected_answer = entry["answer"]
-        if answer is not None:
+        if isinstance(answer, str):
             try:
                 if expected_answer.lower() == answer.lower():
                     reward = 1.0
                 else:
                     reward = 0.05
             except:
-                reward = 0.01
+                reward = 0.0
         return reward