add genai metrics endpoint in UI for model overview metrics (#2517) #2520

Merged 1 commit on Jan 30, 2024
14 changes: 14 additions & 0 deletions apps/widget/src/app/ModelAssessment.tsx
@@ -71,6 +71,20 @@ export class ModelAssessment extends React.Component<IModelAssessmentProps> {
abortSignal
);
};
callBack.requestGenerativeTextMetrics = async (
selectionIndexes: number[][],
generativeTextCache: Map<string, Map<string, number>>,
abortSignal: AbortSignal
): Promise<any[]> => {
const parameters = [selectionIndexes, generativeTextCache];
return connectToFlaskServiceWithBackupCall(
this.props.config,
parameters,
"handle_generative_text_json",
"/get_generative_text_metrics",
abortSignal
);
};
callBack.requestMatrix = async (
data: any[]
): Promise<IErrorAnalysisMatrix> => {
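For orientation, here is a minimal usage sketch (not part of this diff) of the new callback; the cohort indexes, cache variable, and AbortController wiring are illustrative assumptions, while the parameter and return types come from the hunk above.

```typescript
// Hypothetical sketch: request generative text metrics for two cohorts,
// passing an initially empty cache and an abort signal.
const controller = new AbortController();
const generativeTextCache = new Map<string, Map<string, number>>();

const metricsPerCohort = await callBack.requestGenerativeTextMetrics?.(
  [
    [0, 1, 2], // cohort 1 row indexes
    [3, 4, 5]  // cohort 2 row indexes
  ],
  generativeTextCache,
  controller.signal
);
// Resolves with whatever /get_generative_text_metrics returns per cohort.
```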
1 change: 1 addition & 0 deletions apps/widget/src/app/ModelAssessmentUtils.tsx
@@ -16,6 +16,7 @@ export interface IModelAssessmentProps {
export type CallbackType = Pick<
IModelAssessmentDashboardProps,
| "requestExp"
| "requestGenerativeTextMetrics"
| "requestObjectDetectionMetrics"
| "requestPredictions"
| "requestQuestionAnsweringMetrics"
1 change: 1 addition & 0 deletions libs/core-ui/src/index.ts
@@ -56,6 +56,7 @@ export * from "./lib/util/getFilterBoundsArgs";
export * from "./lib/util/calculateBoxData";
export * from "./lib/util/calculateConfusionMatrixData";
export * from "./lib/util/calculateLineData";
export * from "./lib/util/GenerativeTextStatisticsUtils";
export * from "./lib/util/MultilabelStatisticsUtils";
export * from "./lib/util/ObjectDetectionStatisticsUtils";
export * from "./lib/util/QuestionAnsweringStatisticsUtils";
7 changes: 7 additions & 0 deletions libs/core-ui/src/lib/Context/ModelAssessmentContext.tsx
@@ -140,6 +140,13 @@ export interface IModelAssessmentContext {
requestExp?:
| ((index: number | number[], abortSignal: AbortSignal) => Promise<any[]>)
| undefined;
requestGenerativeTextMetrics?:
| ((
selectionIndexes: number[][],
generativeTextCache: Map<string, Map<string, number>>,
abortSignal: AbortSignal
) => Promise<any[]>)
| undefined;
requestObjectDetectionMetrics?:
| ((
selectionIndexes: number[][],
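As a rough consumer-side sketch (again, not in the diff), code holding the model assessment context could guard on the optional member before calling it; the function and variable names here are hypothetical.

```typescript
// Hypothetical sketch: only call the endpoint when the host app provided it.
async function fetchGenerativeTextMetrics(
  context: IModelAssessmentContext,
  selectionIndexes: number[][],
  cache: Map<string, Map<string, number>>,
  signal: AbortSignal
): Promise<any[] | undefined> {
  if (!context.requestGenerativeTextMetrics) {
    return undefined; // callback not configured, e.g. non-generative model
  }
  return context.requestGenerativeTextMetrics(selectionIndexes, cache, signal);
}
```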
1 change: 1 addition & 0 deletions libs/core-ui/src/lib/Interfaces/IExplanationContext.ts
@@ -8,6 +8,7 @@ import { JointDataset } from "../util/JointDataset";
export enum ModelTypes {
Regression = "regression",
Binary = "binary",
GenerativeText = "generativetext",
Multiclass = "multiclass",
ImageBinary = "imagebinary",
ImageMulticlass = "imagemulticlass",
88 changes: 88 additions & 0 deletions libs/core-ui/src/lib/util/GenerativeTextStatisticsUtils.ts
@@ -0,0 +1,88 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

import { localization } from "@responsible-ai/localization";

import {
ILabeledStatistic,
TotalCohortSamples
} from "../Interfaces/IStatistic";

import { QuestionAnsweringMetrics } from "./QuestionAnsweringStatisticsUtils";

export enum GenerativeTextMetrics {
Coherence = "coherence",
Fluency = "fluency",
Equivalence = "equivalence",
Groundedness = "groundedness",
Relevance = "relevance"
}

export const generateGenerativeTextStats: (
selectionIndexes: number[][],
generativeTextCache: Map<string, Map<string, number>>
) => ILabeledStatistic[][] = (
selectionIndexes: number[][],
generativeTextCache: Map<string, Map<string, number>>
): ILabeledStatistic[][] => {
return selectionIndexes.map((selectionArray) => {
const count = selectionArray.length;

const value = generativeTextCache.get(selectionArray.toString());
const stat: Map<string, number> = value ? value : new Map<string, number>();

const stats = [
{
key: TotalCohortSamples,
label: localization.Interpret.Statistics.samples,
stat: count
}
];
for (const [key, value] of stat.entries()) {
let label = "";
switch (key) {
case GenerativeTextMetrics.Coherence:
label = localization.Interpret.Statistics.coherence;
break;
case GenerativeTextMetrics.Fluency:
label = localization.Interpret.Statistics.fluency;
break;
case GenerativeTextMetrics.Equivalence:
label = localization.Interpret.Statistics.equivalence;
break;
case GenerativeTextMetrics.Groundedness:
label = localization.Interpret.Statistics.groundedness;
break;
case GenerativeTextMetrics.Relevance:
label = localization.Interpret.Statistics.relevance;
break;
case QuestionAnsweringMetrics.ExactMatchRatio:
label = localization.Interpret.Statistics.exactMatchRatio;
break;
case QuestionAnsweringMetrics.F1Score:
label = localization.Interpret.Statistics.f1Score;
break;
case QuestionAnsweringMetrics.MeteorScore:
label = localization.Interpret.Statistics.meteorScore;
break;
case QuestionAnsweringMetrics.BleuScore:
label = localization.Interpret.Statistics.bleuScore;
break;
case QuestionAnsweringMetrics.BertScore:
label = localization.Interpret.Statistics.bertScore;
break;
case QuestionAnsweringMetrics.RougeScore:
label = localization.Interpret.Statistics.rougeScore;
break;
default:
break;
}
stats.push({
key,
label,
stat: value
});
}
return stats;
});
};
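A small usage sketch for the helper above (not part of the change): the cache is keyed by each cohort's index array rendered with toString(), and each entry maps metric keys to precomputed scores. The numbers below are invented.

```typescript
// Hypothetical inputs: one cohort of three rows with two cached metrics.
const selectionIndexes = [[0, 1, 2]];
const generativeTextCache = new Map<string, Map<string, number>>([
  [
    [0, 1, 2].toString(), // "0,1,2" — must match selectionArray.toString()
    new Map<string, number>([
      ["coherence", 4.2], // GenerativeTextMetrics.Coherence
      ["fluency", 3.9] // GenerativeTextMetrics.Fluency
    ])
  ]
]);

const stats = generateGenerativeTextStats(selectionIndexes, generativeTextCache);
// stats[0] is roughly:
// [ { key: TotalCohortSamples, label: "...samples", stat: 3 },
//   { key: "coherence", label: "...coherence", stat: 4.2 },
//   { key: "fluency", label: "...fluency", stat: 3.9 } ]
```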
7 changes: 6 additions & 1 deletion libs/core-ui/src/lib/util/StatisticsUtils.ts
@@ -10,6 +10,7 @@ import {
} from "../Interfaces/IStatistic";
import { IsBinary } from "../util/ExplanationUtils";

import { generateGenerativeTextStats } from "./GenerativeTextStatisticsUtils";
import { JointDataset } from "./JointDataset";
import { ClassificationEnum } from "./JointDatasetUtils";
import { generateMulticlassStats } from "./MulticlassStatisticsUtils";
@@ -156,7 +157,8 @@ export const generateMetrics: (
modelType: ModelTypes,
objectDetectionCache?: Map<string, [number, number, number]>,
objectDetectionInputs?: [string, string, number],
questionAnsweringCache?: QuestionAnsweringCacheType
questionAnsweringCache?: QuestionAnsweringCacheType,
generativeTextCache?: Map<string, Map<string, number>>
): ILabeledStatistic[][] => {
if (
modelType === ModelTypes.ImageMultilabel ||
@@ -192,6 +194,9 @@ objectDetectionInputs
objectDetectionInputs
);
}
if (modelType === ModelTypes.GenerativeText && generativeTextCache) {
return generateGenerativeTextStats(selectionIndexes, generativeTextCache);
}
const outcomes = jointDataset.unwrap(JointDataset.ClassificationError);
if (IsBinary(modelType)) {
return selectionIndexes.map((selectionArray) => {
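To show how the new branch is reached, a hedged call sketch: the leading jointDataset and selectionIndexes parameters are not visible in this hunk but are used in the body, and the cache variable is a placeholder.

```typescript
// Hypothetical sketch: for a generative text model the caches for other
// task types are irrelevant, so only the final argument is supplied.
const labeledStats = generateMetrics(
  jointDataset,              // existing JointDataset for the loaded data
  selectionIndexes,          // number[][] of cohort row indexes
  ModelTypes.GenerativeText,
  undefined,                 // objectDetectionCache
  undefined,                 // objectDetectionInputs
  undefined,                 // questionAnsweringCache
  generativeTextCache        // Map<string, Map<string, number>>
);
```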
@@ -58,9 +58,12 @@ export class MetricSelector extends React.Component<IMetricSelectorProps> {
options.push(this.addDropdownOption(Metrics.AccuracyScore));
} else if (
IsMultilabel(modelType) ||
modelType === ModelTypes.ObjectDetection
modelType === ModelTypes.ObjectDetection ||
modelType === ModelTypes.QuestionAnswering
) {
options.push(this.addDropdownOption(Metrics.ErrorRate));
} else if (modelType === ModelTypes.GenerativeText) {
options.push(this.addDropdownOption(Metrics.MeanSquaredError));
}
return (
<Dropdown
25 changes: 25 additions & 0 deletions libs/localization/src/lib/en.json
@@ -1221,19 +1221,24 @@
"_rSquared.comment": "the coefficient of determination, see https://en.wikipedia.org/wiki/Coefficient_of_determination",
"_recall.comment": "computed recall of model, see https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers",
"accuracy": "Accuracy: {0}",
"coherence": "Coherence: {0}",
"bleuScore": "Bleu score: {0}",
"bertScore": "Bert score: {0}",
"exactMatchRatio": "Exact match ratio: {0}",
"equivalence": "Equivalence: {0}",
"rougeScore": "Rouge Score: {0}",
"fluency": "Fluency: {0}",
"fnr": "False negative rate: {0}",
"fpr": "False positive rate: {0}",
"groundedness": "Groundedness: {0}",
"hammingScore": "Hamming score: {0}",
"meanPrediction": "Mean prediction {0}",
"meteorScore": "Meteor Score: {0}",
"mse": "Mean squared error: {0}",
"precision": "Precision: {0}",
"rSquared": "R²: {0}",
"recall": "Recall: {0}",
"relevance": "Relevance: {0}",
"selectionRate": "Selection rate: {0}",
"mae": "Mean absolute error: {0}",
"f1Score": "F1 score: {0}",
@@ -1766,10 +1771,26 @@
"name": "Accuracy score",
"description": "The fraction of data points classified correctly."
},
"coherence": {
"name": "Coherence",
"description": "Coherence of an answer is measured by how well all the sentences fit together and sound naturally as a whole."
},
"fluency": {
"name": "Fluency",
"description": "Fluency measures the quality of individual sentences in the answer, and whether they are well-written and grammatically correct."
},
"equivalence": {
"name": "Equivalence",
"description": "Equivalence, as a metric, measures the similarity between the predicted answer and the correct answer."
},
"exactMatchRatio": {
"name": "Exact match ratio",
"description": "The ratio of instances classified correctly for every label."
},
"groundedness": {
"name": "Groundedness",
"description": "Groundedness measures whether the answer follows logically from the information in the context."
},
"meteorScore": {
"name": "Meteor Score",
"description": "METEOR Score is calculated based on the harmonic mean of precision and recall, with recall weighted more than precision in question answering task."
@@ -1782,6 +1803,10 @@
"name": "Bert Score",
"description": "BERTScore focuses on computing semantic similarity between tokens of reference and machine generated text in question answering task."
},
"relevance": {
"name": "Relevance",
"description": "Relevance measures how well the answer addresses the main aspects of the question, based on the context"
},
"rougeScore": {
"name": "Rouge Score",
"description": "Rouge Score measures the ratio of words (and/or n-grams) in the reference text that appeared in the machine generated text in question answering task."