Commit
Merge branch 'main' into Feature/#544
bwook00 authored Jul 2, 2024
2 parents 7e56020 + 79ff4c7 commit 4854d23
Showing 3 changed files with 21 additions and 6 deletions.
23 changes: 18 additions & 5 deletions autorag/evaluation/metric/generation.py
@@ -6,7 +6,7 @@

import evaluate
import pandas as pd
-import sacrebleu
+from sacrebleu.metrics.bleu import BLEU
import torch
from llama_index.core.embeddings import BaseEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding
@@ -64,12 +64,25 @@ def compute_score(gt: List[str], pred: str) -> float:
return result


-@generation_metric
-def bleu(gt: List[str], pred: str, **kwargs) -> float:
+@convert_inputs_to_list
+def bleu(generation_gt: List[List[str]], generations: List[str],
+         tokenize: Optional[str] = None, smooth_method: str = 'exp',
+         smooth_value: Optional[float] = None, max_ngram_order: int = 4,
+         trg_lang: str = '', **kwargs) -> List[float]:
    """
-    Compute bleu score for generation.
+    Compute the BLEU score of each generation against its ground truths.
+
+    :param generation_gt: A list of ground truths.
+        Must be a 2-d list of strings, because each generation can have multiple ground truths.
+    :param generations: A list of generations that the LLM generated.
+    :param tokenize: The tokenizer to use. If None, language-specific tokenizers are selected,
+        with '13a' as the fallback default.
+        See https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/metrics/bleu.py
+    :param smooth_method: The smoothing method to use ('floor', 'add-k', 'exp' or 'none').
+    :param smooth_value: The smoothing value for the 'floor' and 'add-k' methods. None falls back to the default value.
+    :param max_ngram_order: If given, overrides the maximum n-gram order (default: 4) used when computing precisions.
+    :param trg_lang: An optional target language code used to raise potential tokenizer warnings.
+    :return: A list of BLEU scores, one per generation.
    """
-    return sacrebleu.sentence_bleu(pred, gt, **kwargs).score
+    bleu_scorer = BLEU(tokenize=tokenize, smooth_method=smooth_method, smooth_value=smooth_value,
+                       max_ngram_order=max_ngram_order, trg_lang=trg_lang, **kwargs)
+
+    result = list(map(lambda x: bleu_scorer.sentence_score(x[0], x[1]).score,
+                      zip(generations, generation_gt)))
+    return result


@convert_inputs_to_list
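For orientation, here is a minimal usage sketch of the refactored metric. It assumes the module path shown in this diff (autorag.evaluation.metric.generation) and that plain Python lists pass through the convert_inputs_to_list decorator unchanged; the example data is illustrative, not taken from the test suite.

# Sketch only: the data below is illustrative, not from the repository's tests.
from autorag.evaluation.metric.generation import bleu

# generation_gt is a 2-d list: each generation may have several acceptable references.
generation_gt = [
    ["The cat sat on the mat.", "A cat was sitting on the mat."],
    ["Paris is the capital of France."],
]
generations = [
    "The cat sat on the mat.",
    "Paris is France's capital.",
]

# One sentence-level BLEU score (0-100 scale) per generation.
scores = bleu(generation_gt=generation_gt, generations=generations)
print(scores)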
2 changes: 2 additions & 0 deletions requirements.txt
@@ -10,6 +10,8 @@ pyyaml # for yaml file
pyarrow # for pandas with parquet
fastparquet # for pandas with parquet
sacrebleu # for bleu score
+sacrebleu[ko] # for bleu score (Korean tokenizer)
+sacrebleu[jp] # for bleu score (Japanese tokenizer)
evaluate # for meteor and other scores
rouge_score # for rouge score
rich # for pretty logging
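The two new extras are meant to pull in MeCab-backed tokenizers so sacrebleu can segment Korean and Japanese text. A small sketch of how those tokenizers would be used directly, assuming the extras install the required MeCab packages on the target platform:

# Sketch only: requires the MeCab-backed sacrebleu extras added above.
from sacrebleu.metrics.bleu import BLEU

# 'ko-mecab' matches the tokenizer used by the new Korean test below;
# 'ja-mecab' is the corresponding Japanese MeCab tokenizer.
ko_bleu = BLEU(tokenize='ko-mecab', trg_lang='ko', max_ngram_order=2)
score = ko_bleu.sentence_score("고양이가 창가에 앉아 있다.",
                               ["고양이가 창가에 앉아 있다."])
print(score.score)  # sentence-level BLEU on a 0-100 scale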
2 changes: 1 addition & 1 deletion tests/autorag/evaluate/metric/test_generation_metric.py
@@ -47,9 +47,9 @@ def ko_base_test_generation_metrics(func, solution, **kwargs):
    assert all(list(map(lambda x: x[0] == pytest.approx(x[1], 0.001),
                        zip(scores, solution))))


def test_bleu():
    base_test_generation_metrics(bleu, [51.1507, 23.5783, 100.0], lowercase=True)
+    ko_base_test_generation_metrics(bleu, [100.0, 81.9178, 73.7534], lowercase=True,
+                                    tokenize='ko-mecab', max_ngram_order=2, trg_lang='ko')


def test_meteor():
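The helper named in the hunk header, ko_base_test_generation_metrics(func, solution, **kwargs), is only partially visible here. Based on its signature and the assertion shown above, it presumably runs the metric over Korean fixtures and compares the scores with pytest.approx; the sketch below reconstructs that flow with invented fixture data (the repository's actual Korean test data is not part of this diff).

# Hypothetical reconstruction; fixture contents are invented for illustration.
import pytest

from autorag.evaluation.metric.generation import bleu

ko_generation_gt = [["고양이가 창가에 앉아 있다."], ["파리는 프랑스의 수도이다."]]
ko_generations = ["고양이가 창가에 앉아 있다.", "파리는 프랑스의 수도다."]

def ko_base_test_generation_metrics(func, solution, **kwargs):
    # kwargs such as tokenize='ko-mecab', max_ngram_order=2, trg_lang='ko'
    # are forwarded into the metric and from there into sacrebleu's BLEU.
    scores = func(generation_gt=ko_generation_gt, generations=ko_generations, **kwargs)
    assert all(list(map(lambda x: x[0] == pytest.approx(x[1], 0.001),
                        zip(scores, solution))))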
