Commit 2048ee1
fix groundedness display (note: highlight doesn't work in patch)
Josh Reini committed Oct 4, 2024
1 parent d407abc commit 2048ee1
Showing 3 changed files with 5 additions and 64 deletions.
1 change: 1 addition & 0 deletions src/dashboard/trulens/dashboard/display.py
@@ -150,6 +150,7 @@ def expand_groundedness_df(df: pd.DataFrame) -> pd.DataFrame:
     reasons_df = pd.DataFrame(data)

     reasons_df.rename(columns={"Groundedness Score": "score"}, inplace=True)
+    print(reasons_df)

     # Return only the expanded reasons DataFrame
     return reasons_df.reset_index(drop=True)
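For context, `expand_groundedness_df` centralizes the inline parsing that this commit deletes from Evaluations.py (see the large hunk below). A minimal sketch of that parsing, reconstructed from the removed lines: the `STATEMENT` / `Criteria:` / `Supporting Evidence:` / `Score:` markers come straight from the diff, while the function name `parse_groundedness_reasons` is made up for illustration.

import re

import pandas as pd


def parse_groundedness_reasons(reasons: str) -> pd.DataFrame:
    """Split a raw chain-of-thought 'reasons' string into one row per statement."""
    data = []
    for statement in reasons.split("STATEMENT")[1:]:
        try:
            criteria = statement.split("Criteria: ")[1].split(
                "Supporting Evidence: "
            )[0]
            supporting_evidence = statement.split("Supporting Evidence: ")[
                1
            ].split("Score: ")[0]
            # Capture the last run of digits before any trailing non-digits,
            # e.g. the "8" in "Score: 8".
            score_match = re.search(
                r"([0-9]+)(?=\D*$)", statement.split("Score: ")[1]
            )
            if score_match:
                data.append({
                    "Statement": criteria,
                    "Supporting Evidence from Source": supporting_evidence,
                    "score": float(score_match.group(1)) / 10,
                })
        except IndexError:
            continue  # skip statements missing one of the expected markers
    return pd.DataFrame(data).reset_index(drop=True)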
66 changes: 4 additions & 62 deletions src/dashboard/trulens/dashboard/pages/Evaluations.py
@@ -1,7 +1,6 @@
 import asyncio
 import json
 import pprint as pp
-import re

 import matplotlib.pyplot as plt
 import numpy as np
@@ -24,6 +23,7 @@
 from trulens.core.utils.json import jsonify_for_ui
 from trulens.core.utils.serial import Lens
 from trulens.dashboard.components.record_viewer import record_viewer
+from trulens.dashboard.display import expand_groundedness_df
 from trulens.dashboard.streamlit_utils import init_from_args
 from trulens.dashboard.ux.components import draw_agent_info
 from trulens.dashboard.ux.components import draw_llm_info
@@ -481,67 +481,9 @@ def highlight_groundedness(s):

 # note: improve conditional to not rely on the feedback name
 if "groundedness" in feedback_name.lower():
-    try:
-        # Split the reasons value into separate rows and columns
-        reasons = df["reasons"].iloc[0]
-        # Split the reasons into separate statements
-        statements = reasons.split("STATEMENT")
-        data = []
-        # Each reason has three components: statement, supporting evidence, and score
-        # Parse each reason into these components and add them to the data list
-        for statement in statements[1:]:
-            try:
-                criteria = statement.split(
-                    "Criteria: "
-                )[1].split("Supporting Evidence: ")[0]
-                supporting_evidence = statement.split(
-                    "Supporting Evidence: "
-                )[1].split("Score: ")[0]
-                score_pattern = re.compile(
-                    r"([0-9]+)(?=\D*$)"
-                )
-                score_split = statement.split(
-                    "Score: "
-                )[1]
-                score_match = score_pattern.search(
-                    score_split
-                )
-                if score_match:
-                    score = (
-                        float(score_match.group(1)) / 10
-                    )
-            except Exception:
-                pass
-            data.append({
-                "Statement": criteria,
-                "Supporting Evidence from Source": supporting_evidence,
-                "Score": score,
-            })
-        reasons_df = pd.DataFrame(data)
-        # Combine the original feedback data with the expanded reasons
-        df_expanded = pd.concat(
-            [
-                df.reset_index(drop=True),
-                reasons_df.reset_index(drop=True),
-            ],
-            axis=1,
-        )
-        st.dataframe(
-            df_expanded.style.apply(
-                highlight_groundedness, axis=1
-            ).format("{:.2f}", subset=["Score"]),
-            hide_index=True,
-            column_order=[
-                "Statement",
-                "Supporting Evidence from Source",
-                "Score",
-            ],
-        )
-    except Exception:
-        st.dataframe(
-            df.style.apply(highlight, axis=1),
-            hide_index=True,
-        )
+    # note: improve conditional to not rely on the feedback name
+    df = expand_groundedness_df(df)
+    st.dataframe(df, hide_index=True)
 else:
     st.dataframe(
         df.style.apply(highlight, axis=1),
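The score regex in the removed block is easy to misread: `([0-9]+)(?=\D*$)` matches the final run of digits that is followed only by non-digits up to the end of the string, so trailing punctuation or words after the number are ignored. A quick check with hypothetical input strings:

import re

score_pattern = re.compile(r"([0-9]+)(?=\D*$)")

# The lookahead (?=\D*$) anchors the match to the last digit run.
assert score_pattern.search("Score: 8").group(1) == "8"
assert score_pattern.search("Score: 10 (fully grounded).").group(1) == "10"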
2 changes: 0 additions & 2 deletions src/feedback/trulens/feedback/llm_provider.py
@@ -1660,8 +1660,6 @@ def evaluate_hypothesis(index, hypothesis):
             np.mean(list(groundedness_scores.values()))
         )

-        print(reasons_str)
-
         return average_groundedness_score, {"reasons": reasons_str}

     @deprecation_utils.method_renamed("relevance")
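For reference, the aggregation visible in the context lines above averages the per-statement scores into a single 0-1 result. A toy check with made-up scores:

import numpy as np

# Hypothetical per-statement groundedness scores on a 0-1 scale.
groundedness_scores = {"statement_0": 1.0, "statement_1": 0.8, "statement_2": 0.3}
average_groundedness_score = float(np.mean(list(groundedness_scores.values())))
print(average_groundedness_score)  # 0.7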
