Meta-eval / feedback functions benchmarking notebooks, ranking-based eval utils, and docs update (#991)

* implement recommendation metrics for benchmark framework

ece fix

Revert "ece fix"

This reverts commit c58ee7e.

run actual evals

add context relevance inference api to hugs ffs

fmt

larger dataset + smarter backoff + recall

nb update (wip)

fix how we handle ties in precision and recall

saving results for GPT-3.5, GPT-4, Claude-1, and Claude-2

remove secrets

* finished evals with truera context relevance model

* add Verb 2S top 1 prompt

* update ECE method

pushed to server

* save csv results for tmp scaling

* save

* implement meeting bank generator

* example notebook for comprehensiveness benchmark WIP

* This is a combination of 2 commits.

gainsight benchmarking done

remove secrets

* prepping comprehensiveness benchmark notebook

* remove unused test script

* moving results csvs

* updates models

* intermediate results code change

* good stopping point

* cleanup

* symlink docs

* huge doc updates

* fix doc symlink

* fix score range in docstring

* add docstring for truera's context relevance model

* update comprehensiveness notebook

* update comprehensiveness notebook

* fix

* file renames

* new symlinks

* update mkdocs

---------

Co-authored-by: Josh Reini <[email protected]>
Co-authored-by: Josh Reini <[email protected]>
3 people authored Apr 5, 2024
1 parent b6c96e9 commit a8c350a
Showing 40 changed files with 63,593 additions and 43 deletions.

3 files were deleted.

9 changes: 5 additions & 4 deletions mkdocs.yml
@@ -206,11 +206,12 @@ nav:
- Generating Test Cases:
- trulens_eval/evaluation/generate_test_cases/index.md
- Feedback Evaluations:
# Titles come from notebook and will be overridden if specified here.
# PLACEHOLDER: - trulens_eval/evaluation/feedback_evaluations/index.md
- trulens_eval/evaluation/feedback_evaluations/answer_relevance_smoke_tests.ipynb
- trulens_eval/evaluation/feedback_evaluations/context_relevance_smoke_tests.ipynb
- trulens_eval/evaluation/feedback_evaluations/groundedness_smoke_tests.ipynb
- Answer Relevance Benchmark (small): trulens_eval/evaluation/feedback_evaluations/answer_relevance_benchmark_small.ipynb

[Inline comment] piotrm0 (Contributor), Apr 15, 2024:
@daniel-huang-1230 @joshreini1 Hey, mkdocs is struggling with some of these notebooks and I cannot tell why. It freezes for about 60 seconds at a time when converting them to docs pages. It does not do this when I clear the outputs. Does it do this on your computer as well?

[Inline comment] daniel-huang-1230 (Author, Contributor), Apr 15, 2024:
Hmm, I cannot repro on my local laptop.

- Comprehensiveness Benchmark: trulens_eval/evaluation/feedback_evaluations/comprehensiveness_benchmark.ipynb
- Context Relevance Benchmark (small): trulens_eval/evaluation/feedback_evaluations/context_relevance_benchmark_small.ipynb
- Context Relevance Benchmark (large): trulens_eval/evaluation/feedback_evaluations/context_relevance_benchmark.ipynb
- Groundedness Benchmark: trulens_eval/evaluation/feedback_evaluations/groundedness_benchmark.ipynb
- 🎺 Tracking:
# PLACEHOLDER: - trulens_eval/tracking/index.md
- Instrumentation Overview:
1 change: 1 addition & 0 deletions trulens_eval/trulens_eval/feedback/prompts.py
@@ -31,6 +31,7 @@
LLM_GROUNDEDNESS_USER = v2.Groundedness.user_prompt.template

CONTEXT_RELEVANCE_SYSTEM = v2.ContextRelevance.system_prompt.template
QS_RELEVANCE_VERB_2S_TOP1 = v2.QuestionStatementRelevanceVerb2STop1Confidence.prompt.template
CONTEXT_RELEVANCE_USER = v2.ContextRelevance.user_prompt.template

ANSWER_RELEVANCE_SYSTEM = v2.PromptResponseRelevance.system_prompt.template
9 changes: 5 additions & 4 deletions trulens_eval/trulens_eval/feedback/provider/base.py
@@ -300,7 +300,7 @@ def generate_score_and_reasons(
)
return score, {}

-    def context_relevance(self, question: str, context: str) -> float:
+    def context_relevance(self, question: str, context: str, temperature: float = 0.0) -> float:
"""
Uses chat completion model. A function that completes a template to
check the relevance of the context to the question.
@@ -336,7 +336,8 @@ def context_relevance(self, question: str, context: str) -> float:
                prompts.CONTEXT_RELEVANCE_USER,
                question=question,
                context=context
-            )
+            ),
+            temperature=temperature
        )

def qs_relevance(self, question: str, context: str) -> float:
@@ -352,7 +353,7 @@ def qs_relevance(self, question: str, context: str) -> float:
return self.context_relevance(question, context)

    def context_relevance_with_cot_reasons(self, question: str,
-            context: str) -> Tuple[float, Dict]:
+            context: str, temperature: float = 0.0) -> Tuple[float, Dict]:
"""
Uses chat completion model. A function that completes a
template to check the relevance of the context to the question.
@@ -388,7 +389,7 @@ def context_relevance_with_cot_reasons(self, question: str,
"RELEVANCE:", prompts.COT_REASONS_TEMPLATE
)

-        return self.generate_score_and_reasons(system_prompt, user_prompt)
+        return self.generate_score_and_reasons(system_prompt, user_prompt, temperature)

def qs_relevance_with_cot_reasons(self, question: str,
context: str) -> Tuple[float, Dict]:
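For reference, a minimal sketch of how the new `temperature` argument can be exercised from an LLM-based provider. The choice of the OpenAI provider and the environment-variable setup are assumptions for illustration, not part of this diff:

```python
from trulens_eval.feedback.provider.openai import OpenAI

provider = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

# Deterministic single judgment.
score = provider.context_relevance(
    question="What is the capital of France?",
    context="Paris is the capital and largest city of France.",
    temperature=0.0,
)

# A higher temperature allows repeated sampling, e.g. for calibration studies.
score_cot, reasons = provider.context_relevance_with_cot_reasons(
    question="What is the capital of France?",
    context="Paris is the capital and largest city of France.",
    temperature=0.7,
)
print(score, score_cot, reasons)
```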
43 changes: 43 additions & 0 deletions trulens_eval/trulens_eval/feedback/provider/hugs.py
@@ -24,6 +24,7 @@
HUGS_NLI_API_URL = "https://api-inference.huggingface.co/models/ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli"
HUGS_DOCNLI_API_URL = "https://api-inference.huggingface.co/models/MoritzLaurer/DeBERTa-v3-base-mnli-fever-docnli-ling-2c"
HUGS_PII_DETECTION_API_URL = "https://api-inference.huggingface.co/models/bigcode/starpii"
HUGS_CONTEXT_RELEVANCE_API_URL = "https://api-inference.huggingface.co/models/truera/context_relevance"
HUGS_HALLUCINATION_API_URL = "https://api-inference.huggingface.co/models/vectara/hallucination_evaluation_model"

import functools
@@ -187,6 +188,48 @@ def get_scores(text):

return l1, dict(text1_scores=scores1, text2_scores=scores2)

    @_tci
    def context_relevance(self, prompt: str, context: str) -> float:
        """
        Uses Huggingface's truera/context_relevance model, a model that
        computes the relevance of a given context to the prompt. The model
        can be found at https://huggingface.co/truera/context_relevance.

        **Usage:**

        ```python
        from trulens_eval import Feedback
        from trulens_eval.feedback.provider.hugs import Huggingface

        huggingface_provider = Huggingface()

        feedback = Feedback(huggingface_provider.context_relevance).on_input_output()
        ```

        The `on_input_output()` selector can be changed. See [Feedback Function
        Guide](https://www.trulens.org/trulens_eval/feedback_function_guide/).

        Args:
            prompt (str): The given prompt.
            context (str): Comparative contextual information.

        Returns:
            float: A value between 0 and 1. 0 being irrelevant and 1
                being a relevant context for addressing the prompt.
        """

        if prompt[len(prompt) - 1] != '.':
            prompt += '.'
        ctx_relevance_string = prompt + '<eos>' + context
        payload = {"inputs": ctx_relevance_string}
        hf_response = self.endpoint.post(
            url=HUGS_CONTEXT_RELEVANCE_API_URL, payload=payload
        )

        for label in hf_response:
            if label['label'] == 'context_relevance':
                return label['score']

        raise RuntimeError(
            "'context_relevance' not found in huggingface api response."
        )

# TODEP
@_tci
def positive_sentiment(self, text: str) -> float:
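The docstring above shows the `Feedback` wiring; for a quick sanity check, the provider method can also be called directly. A minimal sketch, assuming a Hugging Face API token is configured for the endpoint (e.g. via `HUGGINGFACE_API_KEY`); the 0.5 cutoff is illustrative only:

```python
from trulens_eval.feedback.provider.hugs import Huggingface

huggingface_provider = Huggingface()

score = huggingface_provider.context_relevance(
    prompt="What is the capital of France?",
    context="Paris is the capital and largest city of France.",
)

# Scores lie in [0, 1]; treat anything above an application-chosen cutoff as relevant.
print(f"context relevance: {score:.3f}", "relevant" if score > 0.5 else "irrelevant")
```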
39 changes: 39 additions & 0 deletions trulens_eval/trulens_eval/feedback/v2/feedback.py
@@ -205,6 +205,45 @@ class ContextRelevance(Relevance, WithPrompt):
)


class QuestionStatementRelevanceVerb2STop1Confidence(Relevance, WithPrompt):
    prompt: ClassVar[PromptTemplate] = PromptTemplate.from_template(
        """You are a RELEVANCE grader; providing the relevance of the given STATEMENT to the given QUESTION.
Respond only as a number from 0 to 10 where 0 is the least relevant and 10 is the most relevant.
A few additional scoring guidelines:
- Long STATEMENTS should score equally well as short STATEMENTS.
- RELEVANCE score should increase as the STATEMENT provides more RELEVANT context to the QUESTION.
- RELEVANCE score should increase as the STATEMENT provides RELEVANT context to more parts of the QUESTION.
- STATEMENT that is RELEVANT to some of the QUESTION should get a score of 2, 3 or 4. Higher score indicates more RELEVANCE.
- STATEMENT that is RELEVANT to most of the QUESTION should get a score of 5, 6, 7 or 8. Higher score indicates more RELEVANCE.
- STATEMENT that is RELEVANT to the entire QUESTION should get a score of 9 or 10. Higher score indicates more RELEVANCE.
- STATEMENT must be relevant and helpful for answering the entire QUESTION to get a score of 10.
- Answers that intentionally do not answer the question, such as 'I don't know', should also be counted as the most relevant.
- Never elaborate.
QUESTION: {question}
STATEMENT: {statement}
RELEVANCE:
Finally, provide the probability on a scale of 0 to 10 that your RELEVANCE scoring is correct. Give ONLY the probability, no
other words or explanation.\n\nFor example: <the probability between
0 and 10 that your guess is correct, without any extra commentary whatsoever;
just the probability!>
"""
    )

class PromptResponseRelevance(Relevance, WithPrompt):
system_prompt: ClassVar[PromptTemplate] = PromptTemplate.from_template(
"""You are a RELEVANCE grader; providing the relevance of the given RESPONSE to the given PROMPT.
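For context, a rough sketch of how the new "Verb 2S top 1" (verbalized confidence) template could be exercised outside the provider classes. The `llm_complete` callable and the number-parsing heuristic are assumptions for illustration, not part of this commit:

```python
import re

from trulens_eval.feedback import prompts


def verb_2s_top1_relevance(question: str, statement: str, llm_complete) -> tuple:
    """Fill the Verb 2S top-1 template and parse (relevance, confidence), both rescaled to [0, 1].

    `llm_complete` is any callable mapping a prompt string to a completion
    string (a hypothetical stand-in for a chat-completion call).
    """
    filled = prompts.QS_RELEVANCE_VERB_2S_TOP1.format(
        question=question, statement=statement
    )
    completion = llm_complete(filled)

    # The template asks for a 0-10 relevance score followed by a 0-10 confidence;
    # take the first two numbers in the completion and normalize them.
    numbers = re.findall(r"\d+(?:\.\d+)?", completion)
    relevance = float(numbers[0]) / 10 if numbers else 0.0
    confidence = float(numbers[1]) / 10 if len(numbers) > 1 else None
    return relevance, confidence
```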
@@ -0,0 +1,104 @@
import pandas as pd
import numpy as np
from sklearn.metrics import ndcg_score
from typing import List
import time
import random

import logging
log = logging.getLogger(__name__)



"""score passages with feedback function, retrying if feedback function fails.
Args: df: dataframe with columns 'query_id', 'query', 'passage', 'is_selected'
feedback_func: function that takes query and passage as input and returns a score
backoff_time: time to wait between retries
n: number of samples to estimate conditional probabilities of feedback_func's scores
"""
def score_passages(df, feedback_func_name, feedback_func, backoff_time=0.5, n=5, temperature=0.0):
grouped = df.groupby('query_id')
scores = []
true_relevance = []

for name, group in grouped:
query_scores = []
query_relevance = []
for _, row in group.iterrows():
sampled_score = None
if feedback_func_name == 'TruEra' or n == 1:
sampled_score = feedback_func(row['query'], row['passage'], temperature) # hard-coded for now, we don't need to sample for TruEra BERT-based model
time.sleep(backoff_time)
else:
sampled_scores = []
for _ in range(n):
sampled_scores.append(feedback_func(row['query'], row['passage'], temperature))
time.sleep(backoff_time)
sampled_score = sum(sampled_scores) / len(sampled_scores)
query_scores.append(sampled_score)
query_relevance.append(row['is_selected'])
# print(f"Feedback avg score for query {name} is {sampled_score}, is_selected is {row['is_selected']}")

print(f"Feedback function {name} scored {len(query_scores)} out of {len(group)} passages.")
scores.append(query_scores)
true_relevance.append(query_relevance)

return scores, true_relevance

def compute_ndcg(scores, true_relevance):
    """Mean nDCG across queries, comparing predicted scores against relevance labels."""
    ndcg_values = [ndcg_score([true], [pred]) for true, pred in zip(true_relevance, scores)]
    return np.mean(ndcg_values)

def compute_ece(scores, true_relevance, n_bins=10):
    """Expected calibration error over equal-width confidence bins."""
    ece = 0
    bin_width = 1.0 / n_bins
    for bin_lower in np.arange(0, 1, bin_width):
        bin_upper = bin_lower + bin_width
        bin_scores = []
        bin_truth = []
        for score_list, truth_list in zip(scores, true_relevance):
            for score, truth in zip(score_list, truth_list):
                # Include the right edge in the final bin so a perfect score of 1.0 is counted.
                if bin_lower <= score < bin_upper or (bin_upper >= 1.0 and score == 1.0):
                    bin_scores.append(score)
                    bin_truth.append(truth)

        if bin_scores:
            bin_avg_confidence = np.mean(bin_scores)
            bin_accuracy = np.mean(bin_truth)
            ece += np.abs(bin_avg_confidence - bin_accuracy) * len(bin_scores) / sum(map(len, scores))

    return ece

def precision_at_k(scores, true_relevance, k):
    """Precision at K, counting all items tied with the K-th highest score as retrieved."""
    sorted_scores = sorted(scores, reverse=True)
    kth_score = sorted_scores[min(k - 1, len(scores) - 1)]

    # Indices of items with scores >= kth highest score (ties included)
    top_k_indices = [i for i, score in enumerate(scores) if score >= kth_score]

    # Calculate precision
    true_positives = sum(np.take(true_relevance, top_k_indices))
    return true_positives / len(top_k_indices) if top_k_indices else 0

def recall_at_k(scores, true_relevance, k):
    """
    Calculate the recall at K.

    Parameters:
        true_relevance (list of int): List of binary values indicating relevance (1 for relevant, 0 for not).
        scores (list of float): List of scores assigned by the model.
        k (int): Number of top items to consider for calculating recall.

    Returns:
        float: Recall at K.
    """
    sorted_scores = sorted(scores, reverse=True)
    kth_score = sorted_scores[min(k - 1, len(scores) - 1)]

    # Indices of items with scores >= kth highest score (ties included)
    top_k_indices = [i for i, score in enumerate(scores) if score >= kth_score]

    # Calculate recall
    relevant_indices = np.where(true_relevance)[0]
    hits = sum(idx in top_k_indices for idx in relevant_indices)
    total_relevant = sum(true_relevance)

    return hits / total_relevant if total_relevant > 0 else 0
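To show how these ranking utilities fit together, here is a small self-contained sketch using a toy dataframe and a dummy feedback function in place of an LLM provider or the TruEra model. It assumes the functions above are in scope; the data and scores are illustrative only:

```python
import pandas as pd

# Toy MS MARCO-style frame: one query with three candidate passages.
df = pd.DataFrame({
    "query_id": [1, 1, 1],
    "query": ["capital of France"] * 3,
    "passage": [
        "Paris is the capital of France.",
        "Berlin is the capital of Germany.",
        "France is a country in Western Europe.",
    ],
    "is_selected": [1, 0, 0],
})

# Dummy feedback function with the (query, passage, temperature) signature
# that score_passages expects; a real run would pass a provider method here.
def dummy_feedback(query: str, passage: str, temperature: float = 0.0) -> float:
    return 0.9 if "Paris" in passage else 0.2

scores, true_relevance = score_passages(df, "dummy", dummy_feedback, backoff_time=0.0, n=1)

print("nDCG:", compute_ndcg(scores, true_relevance))
print("ECE: ", compute_ece(scores, true_relevance))
print("P@1: ", precision_at_k(scores[0], true_relevance[0], k=1))
print("R@1: ", recall_at_k(scores[0], true_relevance[0], k=1))
```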
