Skip to content

Commit

Permalink
Fix evaluation
Browse files (browse the repository at this point in the history)
  • Loading branch information
tongyx361 committed Sep 19, 2024
1 parent 8f462ae commit a8fe7cc
Show file tree
Hide file tree
Showing 5 changed files with 322 additions and 298 deletions.
136 changes: 77 additions & 59 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ pip install "git+https://github.com/tongyx361/symeval.git"
``` python
from symeval import *

math_evaluator = EvaluatorMathBatch()
evaluator = EvaluatorMathBatch()
```

`symeval` provides elaborate answer extraction and correctness judgement
Expand All @@ -41,7 +41,7 @@ to evaluate in batch with **timeout** but still efficiently.

``` python
test_eq(
math_evaluator.batch_eq(ref_answers=["1/2", "1/2"], pred_answers=["0.5", "2/4"]),
evaluator.batch_eq(ref_answers=["1/2", "1/2"], pred_answers=["0.5", "2/4"]),
[True] * 2,
)
```
Expand Down Expand Up @@ -90,37 +90,50 @@ can:

``` python
# MATH-style boxed answer
math_evaluator.extract_ans("Therefore, $1+1=\\boxed{2}$.")
evaluator.extract_ans("Therefore, $1+1=\\boxed{2}$.")
```

``` python
# Answer stated near the word "answer" (the final answer is not boxed)
math_evaluator.extract_ans(
evaluator.extract_ans(
"Both $1$ and $11$ divide $11,$ so $\\boxed{11}=2$, and since $1,$ $2,$ $4,$ $5,$ $10,$ and $20$ divide $20,$ then $\\boxed{20}=6$. The inner expression, $\\boxed{11}\\times\\boxed{20}=2\\times6=12$. Finally, $\\boxed{12}=6$ because $1,$ $2,$ $3,$ $4,$ $6,$ and $12$ divide $12.$\n\nTherefore, $6$ is our answer. Please note that we have not boxed the correct answer as we normally do, as that would be especially confusing for this problem."
)
```

``` python
# Use the last number by default
math_evaluator.extract_ans(
evaluator.extract_ans(
'First, we need to count the total number of letters in the word "CIRCLE". There are 6 letters.\n\nNext, we need to count the number of distinct letters. There are 6 distinct letters in the word "CIRCLE": C, I, R, L, E, and G.\n\nNow, let\'s consider the arrangements of the distinct letters. The number of ways to arrange n distinct items is n factorial (n!). So, we have 6! = 6 × 5 × 4 × 3 × 2 × 1 = 720 ways to arrange the distinct letters.\n\nHowever, the word "CIRCLE" has one letter that repeats (the letter \'C\' repeats twice). We have over-counted the number of distinct arrangements by including arrangements that are just rotations of each other (for example, "CIRCLE" and "LCIRCE" are considered different arrangements here, but they are the same word when read).\n\nTo correct for this, we divide the total number of arrangements by the number of ways to arrange the repeated letters. The number of ways to arrange 2 identical items is 2! = 2 × 1 = 2. So, we divide the total number of arrangements by 2 to get the correct number of distinct arrangements.\n\nTherefore, the number of ways to arrange the letters of the word "CIRCLE" is 720 ÷ 2 = 360.'
)
# More cases ...
```

``` python
# Normalize fraction
math_evaluator.extract_ans("The answer is 1/2")
evaluator.extract_ans("The answer is 1/2")
```

``` python
# Normalize pmatrix
math_evaluator.extract_ans(
evaluator.extract_ans(
"The answer is \\begin{pmatrix} 3 \\\\ \\frac{\\pi}{2} \\end{pmatrix}"
)
# More cases ...
```

More test cases:

<details class="code-fold">
<summary>Code</summary>

``` python
test_eq(evaluator.norm_ans_str("864 \\mbox{ inches}^2"), "864")
test_eq(evaluator.norm_ans_str("\\frac{270}7\\text{ degrees}"), "\\frac{270}7")
test_eq(evaluator.norm_ans_str(".0000672"), "0.0000672")
```

</details>

#### Correctly Processing Various Mathematical Objects / Special Text

[`EvaluatorMath`](https://tongyx361.github.io/symeval/core.html#evaluatormath),
Expand All @@ -133,26 +146,26 @@ calculation, is able to correctly process
times.

``` python
math_evaluator.eq("x+y", "y+x") == True # Expression
evaluator.eq("x+y", "y+x") == True # Expression
```

``` python
math_evaluator.eq("\\frac{1}{2}", "0.5") == True # LaTeX
evaluator.eq("\\frac{1}{2}", "0.5") == True # LaTeX
```

``` python
math_evaluator.eq(
evaluator.eq(
"\\begin{array}1\\\\2\\end{array}",
"1,2",
) # Matrix (Vector)
```

``` python
math_evaluator.eq("{1,2}", "{2,1}", compare_sets=True) # Set
evaluator.eq("{1,2}", "{2,1}", compare_sets=True) # Set
```

``` python
math_evaluator.eq("no", "false") # Bool
evaluator.eq("no", "false") # Bool
# More mathematical objects and special texts ...
```

Expand All @@ -162,47 +175,52 @@ More test cases:
<summary>Code</summary>

``` python
test_eq(math_evaluator.eq("251,7\\\\ \\noindent", "0"), False)
test_eq(math_evaluator.eq("3.54*10^{-7}", "3.54e-07"), True)
test_eq(math_evaluator.eq(r"\frac{1}{2}", "0.5"), True)
test_eq(math_evaluator.eq("1", "100"), False)
test_eq(math_evaluator.eq("100", "1"), False)
test_eq(math_evaluator.eq("3.04", "0.0304", False), True)
test_eq(math_evaluator.eq(["0.0304", 0.0304], "3.04"), True)
test_eq(math_evaluator.eq("x<-1", "x>3"), False)
test_eq(evaluator.eq("251,7\\\\ \\noindent", "0"), False)
test_eq(evaluator.eq("3.54*10^{-7}", "3.54e-07"), True)
test_eq(evaluator.eq(r"\frac{1}{2}", "0.5"), True)
test_eq(evaluator.eq("1", "100"), False)
test_eq(evaluator.eq("100", "1"), False)
test_eq(evaluator.eq("3.04", "0.0304", False), True)
test_eq(evaluator.eq(["0.0304", 0.0304], "3.04"), True)
test_eq(evaluator.eq("x<-1", "x>3"), False)
test_eq(
math_evaluator.eq("(-\\infty,0)\\cup(0,\\infty)", "(-\\infty,0)\\cup(0,\\infty)"),
evaluator.eq("(-\\infty,0)\\cup(0,\\infty)", "(-\\infty,0)\\cup(0,\\infty)"),
True,
)
test_eq(math_evaluator.eq("1+2,2+1", "2+1,1+2"), True)
test_eq(math_evaluator.eq("5", "5"), True)
test_eq(math_evaluator.eq("0.1 + 0.2", "0.3"), True) # `0.1 + 0.2 == 0.3` is `False`
test_eq(math_evaluator.eq("x + y", "y + x"), True)
test_eq(math_evaluator.eq("C", "C"), True)
test_eq(math_evaluator.eq("1,234", "1234"), True)
test_eq(math_evaluator.eq("12,34", "(12,34)"), True)

test_eq(math_evaluator.eq("\\$ 5", "5"), True)
test_eq(math_evaluator.eq("3 * \\sqrt{13}", "3\\sqrt{13}"), True)
test_eq(math_evaluator.eq("\\pi/2", "\\frac{\\pi}{2}"), True)
test_eq(math_evaluator.eq("(3,\\pi/2)", "(3,\\frac{\\pi}{2})"), True)
test_eq(math_evaluator.eq("23000", "\\$23{,}000"), True)
test_eq(
math_evaluator.eq(r"\left(1,2\right)", r"\left(2,1\right)", compare_sets=True), True
)
test_eq(math_evaluator.eq("White", "white"), True)
test_eq(math_evaluator.eq("[0,3)", "[0,1]"), False)
test_eq(math_evaluator.eq("[0,1]", "[0,3)"), False)
test_eq(math_evaluator.eq("1001.5", "1001"), False)
test_eq(math_evaluator.eq("\\frac{2003}{2}", "1001"), False)
test_eq(evaluator.eq("1+2,2+1", "2+1,1+2"), True)
test_eq(evaluator.eq("5", "5"), True)
test_eq(evaluator.eq("0.1 + 0.2", "0.3"), True) # With plain Python floats, `0.1 + 0.2 == 0.3` is `False`
test_eq(evaluator.eq("x + y", "y + x"), True)
test_eq(evaluator.eq("C", "C"), True)
test_eq(evaluator.eq("1,234", "1234"), True)
test_eq(evaluator.eq("12,34", "(12,34)"), True)

test_eq(evaluator.eq("\\$ 5", "5"), True)
test_eq(evaluator.eq("3 * \\sqrt{13}", "3\\sqrt{13}"), True)
test_eq(evaluator.eq("\\pi/2", "\\frac{\\pi}{2}"), True)
test_eq(evaluator.eq("(3,\\pi/2)", "(3,\\frac{\\pi}{2})"), True)
test_eq(evaluator.eq("23000", "\\$23{,}000"), True)
test_eq(evaluator.eq(r"\left(1,2\right)", r"\left(2,1\right)", compare_sets=True), True)
test_eq(evaluator.eq("White", "white"), True)
test_eq(evaluator.eq("[0,3)", "[0,1]"), False)
test_eq(evaluator.eq("[0,1]", "[0,3)"), False)
test_eq(evaluator.eq("1001.5", "1001"), False)
test_eq(evaluator.eq("\\frac{2003}{2}", "1001"), False)
```

</details>

``` python
test_eq(evaluator.eq("-2,1", "1,-2", compare_sets=True), True)
```

#### Normalized Majority Voting

``` python
math_evaluator.get_maj_answers(["", "", "1", "2", "2", "3", "3", "3"])
maj_answers_list, norm_answers_list = evaluator.batch_get_maj_answers(
[["", "", "1", "2", "2", "3", "3", "3"]]
)
print(f"{maj_answers_list = } <- {norm_answers_list = }")
```

### Parsing LaTeX
Expand Down Expand Up @@ -230,51 +248,51 @@ latex2sympy_interval("(a+b,b]")
``` python
from symeval import EvaluatorMathBatch

math_evaluator = EvaluatorMathBatch()
evaluator = EvaluatorMathBatch()
```

``` python
math_evaluator.latex2matrix(r"\sqrt{400\cos^2(9\pi/44)},\frac{\pi}{4}")
evaluator.latex2matrix(r"\sqrt{400\cos^2(9\pi/44)},\frac{\pi}{4}")
```

``` python
math_evaluator.latex2matrix(
evaluator.latex2matrix(
r"\begin{pmatrix} \frac{1}{2} & 0 & -\frac{\sqrt{3}}{2} \\ 0 & 1 & 0 \\ \frac{\sqrt{3}}{2} & 0 & \frac{1}{2} \end{pmatrix}"
)
```

``` python
test_eq(
math_evaluator.latex2matrix("\\begin{pmatrix}-18\\\\-49\\\\96\\end{pmatrix}"),
evaluator.latex2matrix("\\begin{pmatrix}-18\\\\-49\\\\96\\end{pmatrix}"),
Matrix([[-18, -49, 96]]),
)
test_eq(
math_evaluator.latex2matrix("\\begin{pmatrix} 2 & 3 \\\\ 0 & -2 \\end{pmatrix}"),
evaluator.latex2matrix("\\begin{pmatrix} 2 & 3 \\\\ 0 & -2 \\end{pmatrix}"),
Matrix([[2, 3], [0, -2]]),
)
```

### Normalization

``` python
test_eq(math_evaluator.norm_math_str("251,7\\\\ \\noindent"), "251,7")
test_eq(evaluator.norm_math_str("251,7\\\\ \\noindent"), "251,7")
```

``` python
test_eq(fix_a_slash_b("(3/4)\\sqrt{3}"), "(\\frac{3}{4})\\sqrt{3}")
```

``` python
test_eq(math_evaluator.norm_pm("x\\pmy"), "x-y,x+y")
test_eq(math_evaluator.norm_pm("a\\mpb"), "a-b,a+b")
test_eq(math_evaluator.norm_pm("1\\pm\\sqrt{19}"), "1-\\sqrt{19},1+\\sqrt{19}")
test_eq(math_evaluator.norm_pm(r"\{1\pm\sqrt{5},-2\}"), "1-\\sqrt{5},1+\\sqrt{5},-2")
test_eq(evaluator.norm_pm("x\\pmy"), "x-y,x+y")
test_eq(evaluator.norm_pm("a\\mpb"), "a-b,a+b")
test_eq(evaluator.norm_pm("1\\pm\\sqrt{19}"), "1-\\sqrt{19},1+\\sqrt{19}")
test_eq(evaluator.norm_pm(r"\{1\pm\sqrt{5},-2\}"), "1-\\sqrt{5},1+\\sqrt{5},-2")
test_eq(
math_evaluator.norm_pm("\\(\\frac{1\\pm\\sqrt{17}}{4}\\)"),
evaluator.norm_pm("\\(\\frac{1\\pm\\sqrt{17}}{4}\\)"),
"\\frac{1-\\sqrt{17}}{4},\\frac{1+\\sqrt{17}}{4}",
)
test_eq(
math_evaluator.norm_pm(r"\frac{1\pm\sqrt{1-\frac{2}{\sqrt{3}}}}{1}"),
evaluator.norm_pm(r"\frac{1\pm\sqrt{1-\frac{2}{\sqrt{3}}}}{1}"),
"\\frac{1-\\sqrt{1-\\frac{2}{\\sqrt{3}}}}{1},\\frac{1+\\sqrt{1-\\frac{2}{\\sqrt{3}}}}{1}",
)
```
Expand All @@ -285,14 +303,14 @@ test_eq(norm_deg(r"\sin 20^\circ"), r"\sin {20*\frac{\pi}{180}}")
```

``` python
test_eq(math_evaluator.norm_basic_fn(r"sinx"), r"\sin^{1}x")
test_eq(math_evaluator.norm_basic_fn(r"\sin^2x"), r"\sin^{2}x")
test_eq(evaluator.norm_basic_fn(r"sinx"), r"\sin^{1}x")
test_eq(evaluator.norm_basic_fn(r"\sin^2x"), r"\sin^{2}x")
```

### Processing Sets

``` python
test_eq(math_evaluator.extract_set("{2,1}"), ["1", "2"])
test_eq(evaluator.extract_set("{2,1}"), ["1", "2"])
```

``` python
Expand All @@ -305,7 +323,7 @@ test_eq(is_set("(3/4)sqrt(3)"), False)
### Manipulating Strings

``` python
test_eq(math_evaluator.remove_first_paren_pair("{white}", "{"), "white")
test_eq(evaluator.remove_first_paren_pair("{white}", "{"), "white")
```

## Contribution Guidelines
Expand Down
9 changes: 0 additions & 9 deletions nbs/00_core.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -62,15 +62,6 @@
"show_doc(EvaluatorMath)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"show_doc(EvaluatorBatchBase)"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
Loading

0 comments on commit a8fe7cc

Please sign in to comment.