Commit
Merge branch 'main' into Feature/#544
bwook00 authored Jul 2, 2024
2 parents 7e56020 + 79ff4c7 commit 4854d23
Showing 3 changed files with 21 additions and 6 deletions.
23 changes: 18 additions & 5 deletions autorag/evaluation/metric/generation.py
@@ -6,7 +6,7 @@

import evaluate
import pandas as pd
-import sacrebleu
+from sacrebleu.metrics.bleu import BLEU
import torch
from llama_index.core.embeddings import BaseEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding
@@ -64,12 +64,25 @@ def compute_score(gt: List[str], pred: str) -> float:
return result


-@generation_metric
-def bleu(gt: List[str], pred: str, **kwargs) -> float:
+@convert_inputs_to_list
+def bleu(generation_gt: List[List[str]], generations: List[str],
+         tokenize: Optional[str] = None, smooth_method: str = 'exp',
+         smooth_value: Optional[float] = None, max_ngram_order: int = 4,
+         trg_lang: str = '', **kwargs) -> List[float]:
    """
-    Compute bleu score for generation.
+    Compute the BLEU score of each generation against its ground truths.
+
+    :param generation_gt: A list of ground truths.
+        Must be a 2-d list of strings, because each generation can have multiple ground truths.
+    :param generations: A list of generations that the LLM generated.
+    :param tokenize: The tokenizer to use. If None, language-specific tokenizers are selected,
+        with '13a' as the fallback default.
+        See https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/metrics/bleu.py
+    :param smooth_method: The smoothing method to use ('floor', 'add-k', 'exp' or 'none').
+    :param smooth_value: The smoothing value for the 'floor' and 'add-k' methods. None falls back to the default value.
+    :param max_ngram_order: If given, overrides the maximum n-gram order (default: 4) used when computing precisions.
+    :param trg_lang: An optional target language code used to raise potential tokenizer warnings.
+    :return: A list of BLEU scores, one per generation.
    """
-    return sacrebleu.sentence_bleu(pred, gt, **kwargs).score
+    bleu_scorer = BLEU(tokenize=tokenize, smooth_method=smooth_method, smooth_value=smooth_value,
+                       max_ngram_order=max_ngram_order, trg_lang=trg_lang, **kwargs)
+
+    result = list(map(lambda x: bleu_scorer.sentence_score(x[0], x[1]).score,
+                      zip(generations, generation_gt)))
+    return result


@convert_inputs_to_list
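For orientation, here is a minimal usage sketch of the refactored metric. It assumes the module path shown in this diff (autorag.evaluation.metric.generation) and that plain Python lists pass through the convert_inputs_to_list decorator unchanged; the example data is illustrative, not taken from the test suite.

# Sketch only: the data below is illustrative, not from the repository's tests.
from autorag.evaluation.metric.generation import bleu

# generation_gt is a 2-d list: each generation may have several acceptable references.
generation_gt = [
    ["The cat sat on the mat.", "A cat was sitting on the mat."],
    ["Paris is the capital of France."],
]
generations = [
    "The cat sat on the mat.",
    "Paris is France's capital.",
]

# One sentence-level BLEU score (0-100 scale) per generation.
scores = bleu(generation_gt=generation_gt, generations=generations)
print(scores)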
2 changes: 2 additions & 0 deletions requirements.txt
@@ -10,6 +10,8 @@ pyyaml # for yaml file
pyarrow # for pandas with parquet
fastparquet # for pandas with parquet
sacrebleu # for bleu score
+sacrebleu[ko] # for bleu score (Korean tokenizer)
+sacrebleu[jp] # for bleu score (Japanese tokenizer)
evaluate # for meteor and other scores
rouge_score # for rouge score
rich # for pretty logging
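The two new extras are meant to pull in MeCab-backed tokenizers so sacrebleu can segment Korean and Japanese text. A small sketch of how those tokenizers would be used directly, assuming the extras install the required MeCab packages on the target platform:

# Sketch only: requires the MeCab-backed sacrebleu extras added above.
from sacrebleu.metrics.bleu import BLEU

# 'ko-mecab' matches the tokenizer used by the new Korean test below;
# 'ja-mecab' is the corresponding Japanese MeCab tokenizer.
ko_bleu = BLEU(tokenize='ko-mecab', trg_lang='ko', max_ngram_order=2)
score = ko_bleu.sentence_score("고양이가 창가에 앉아 있다.",
                               ["고양이가 창가에 앉아 있다."])
print(score.score)  # sentence-level BLEU on a 0-100 scale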
2 changes: 1 addition & 1 deletion tests/autorag/evaluate/metric/test_generation_metric.py
@@ -47,9 +47,9 @@ def ko_base_test_generation_metrics(func, solution, **kwargs):
    assert all(list(map(lambda x: x[0] == pytest.approx(x[1], 0.001),
                        zip(scores, solution))))


def test_bleu():
    base_test_generation_metrics(bleu, [51.1507, 23.5783, 100.0], lowercase=True)
+    ko_base_test_generation_metrics(bleu, [100.0, 81.9178, 73.7534], lowercase=True,
+                                    tokenize='ko-mecab', max_ngram_order=2, trg_lang='ko')


def test_meteor():
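The helper named in the hunk header, ko_base_test_generation_metrics(func, solution, **kwargs), is only partially visible here. Based on its signature and the assertion shown above, it presumably runs the metric over Korean fixtures and compares the scores with pytest.approx; the sketch below reconstructs that flow with invented fixture data (the repository's actual Korean test data is not part of this diff).

# Hypothetical reconstruction; fixture contents are invented for illustration.
import pytest

from autorag.evaluation.metric.generation import bleu

ko_generation_gt = [["고양이가 창가에 앉아 있다."], ["파리는 프랑스의 수도이다."]]
ko_generations = ["고양이가 창가에 앉아 있다.", "파리는 프랑스의 수도다."]

def ko_base_test_generation_metrics(func, solution, **kwargs):
    # kwargs such as tokenize='ko-mecab', max_ngram_order=2, trg_lang='ko'
    # are forwarded into the metric and from there into sacrebleu's BLEU.
    scores = func(generation_gt=ko_generation_gt, generations=ko_generations, **kwargs)
    assert all(list(map(lambda x: x[0] == pytest.approx(x[1], 0.001),
                        zip(scores, solution))))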
