add genai metrics endpoint in UI for model overview metrics (#2517) #2520

Merged 1 commit on Jan 30, 2024
14 changes: 14 additions & 0 deletions apps/widget/src/app/ModelAssessment.tsx
@@ -71,6 +71,20 @@ export class ModelAssessment extends React.Component<IModelAssessmentProps> {
abortSignal
);
};
callBack.requestGenerativeTextMetrics = async (
selectionIndexes: number[][],
generativeTextCache: Map<string, Map<string, number>>,
abortSignal: AbortSignal
): Promise<any[]> => {
const parameters = [selectionIndexes, generativeTextCache];
return connectToFlaskServiceWithBackupCall(
this.props.config,
parameters,
"handle_generative_text_json",
"/get_generative_text_metrics",
abortSignal
);
};
callBack.requestMatrix = async (
data: any[]
): Promise<IErrorAnalysisMatrix> => {
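For orientation, here is a minimal usage sketch (not part of this diff) of the new callback; the cohort indexes, cache variable, and AbortController wiring are illustrative assumptions, while the parameter and return types come from the hunk above.

```typescript
// Hypothetical sketch: request generative text metrics for two cohorts,
// passing an initially empty cache and an abort signal.
const controller = new AbortController();
const generativeTextCache = new Map<string, Map<string, number>>();

const metricsPerCohort = await callBack.requestGenerativeTextMetrics?.(
  [
    [0, 1, 2], // cohort 1 row indexes
    [3, 4, 5]  // cohort 2 row indexes
  ],
  generativeTextCache,
  controller.signal
);
// Resolves with whatever /get_generative_text_metrics returns per cohort.
```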
1 change: 1 addition & 0 deletions apps/widget/src/app/ModelAssessmentUtils.tsx
@@ -16,6 +16,7 @@ export interface IModelAssessmentProps {
export type CallbackType = Pick<
IModelAssessmentDashboardProps,
| "requestExp"
| "requestGenerativeTextMetrics"
| "requestObjectDetectionMetrics"
| "requestPredictions"
| "requestQuestionAnsweringMetrics"
1 change: 1 addition & 0 deletions libs/core-ui/src/index.ts
@@ -56,6 +56,7 @@ export * from "./lib/util/getFilterBoundsArgs";
export * from "./lib/util/calculateBoxData";
export * from "./lib/util/calculateConfusionMatrixData";
export * from "./lib/util/calculateLineData";
export * from "./lib/util/GenerativeTextStatisticsUtils";
export * from "./lib/util/MultilabelStatisticsUtils";
export * from "./lib/util/ObjectDetectionStatisticsUtils";
export * from "./lib/util/QuestionAnsweringStatisticsUtils";
7 changes: 7 additions & 0 deletions libs/core-ui/src/lib/Context/ModelAssessmentContext.tsx
@@ -140,6 +140,13 @@ export interface IModelAssessmentContext {
requestExp?:
| ((index: number | number[], abortSignal: AbortSignal) => Promise<any[]>)
| undefined;
requestGenerativeTextMetrics?:
| ((
selectionIndexes: number[][],
generativeTextCache: Map<string, Map<string, number>>,
abortSignal: AbortSignal
) => Promise<any[]>)
| undefined;
requestObjectDetectionMetrics?:
| ((
selectionIndexes: number[][],
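As a rough consumer-side sketch (again, not in the diff), code holding the model assessment context could guard on the optional member before calling it; the function and variable names here are hypothetical.

```typescript
// Hypothetical sketch: only call the endpoint when the host app provided it.
async function fetchGenerativeTextMetrics(
  context: IModelAssessmentContext,
  selectionIndexes: number[][],
  cache: Map<string, Map<string, number>>,
  signal: AbortSignal
): Promise<any[] | undefined> {
  if (!context.requestGenerativeTextMetrics) {
    return undefined; // callback not configured, e.g. non-generative model
  }
  return context.requestGenerativeTextMetrics(selectionIndexes, cache, signal);
}
```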
1 change: 1 addition & 0 deletions libs/core-ui/src/lib/Interfaces/IExplanationContext.ts
@@ -8,6 +8,7 @@ import { JointDataset } from "../util/JointDataset";
export enum ModelTypes {
Regression = "regression",
Binary = "binary",
GenerativeText = "generativetext",
Multiclass = "multiclass",
ImageBinary = "imagebinary",
ImageMulticlass = "imagemulticlass",
88 changes: 88 additions & 0 deletions libs/core-ui/src/lib/util/GenerativeTextStatisticsUtils.ts
@@ -0,0 +1,88 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

import { localization } from "@responsible-ai/localization";

import {
ILabeledStatistic,
TotalCohortSamples
} from "../Interfaces/IStatistic";

import { QuestionAnsweringMetrics } from "./QuestionAnsweringStatisticsUtils";

export enum GenerativeTextMetrics {
Coherence = "coherence",
Fluency = "fluency",
Equivalence = "equivalence",
Groundedness = "groundedness",
Relevance = "relevance"
}

export const generateGenerativeTextStats: (
selectionIndexes: number[][],
generativeTextCache: Map<string, Map<string, number>>
) => ILabeledStatistic[][] = (
selectionIndexes: number[][],
generativeTextCache: Map<string, Map<string, number>>
): ILabeledStatistic[][] => {
return selectionIndexes.map((selectionArray) => {
const count = selectionArray.length;

const value = generativeTextCache.get(selectionArray.toString());
const stat: Map<string, number> = value ? value : new Map<string, number>();

const stats = [
{
key: TotalCohortSamples,
label: localization.Interpret.Statistics.samples,
stat: count
}
];
for (const [key, value] of stat.entries()) {
let label = "";
switch (key) {
case GenerativeTextMetrics.Coherence:
label = localization.Interpret.Statistics.coherence;
break;
case GenerativeTextMetrics.Fluency:
label = localization.Interpret.Statistics.fluency;
break;
case GenerativeTextMetrics.Equivalence:
label = localization.Interpret.Statistics.equivalence;
break;
case GenerativeTextMetrics.Groundedness:
label = localization.Interpret.Statistics.groundedness;
break;
case GenerativeTextMetrics.Relevance:
label = localization.Interpret.Statistics.relevance;
break;
case QuestionAnsweringMetrics.ExactMatchRatio:
label = localization.Interpret.Statistics.exactMatchRatio;
break;
case QuestionAnsweringMetrics.F1Score:
label = localization.Interpret.Statistics.f1Score;
break;
case QuestionAnsweringMetrics.MeteorScore:
label = localization.Interpret.Statistics.meteorScore;
break;
case QuestionAnsweringMetrics.BleuScore:
label = localization.Interpret.Statistics.bleuScore;
break;
case QuestionAnsweringMetrics.BertScore:
label = localization.Interpret.Statistics.bertScore;
break;
case QuestionAnsweringMetrics.RougeScore:
label = localization.Interpret.Statistics.rougeScore;
break;
default:
break;
}
stats.push({
key,
label,
stat: value
});
}
return stats;
});
};
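A small usage sketch for the helper above (not part of the change): the cache is keyed by each cohort's index array rendered with toString(), and each entry maps metric keys to precomputed scores. The numbers below are invented.

```typescript
// Hypothetical inputs: one cohort of three rows with two cached metrics.
const selectionIndexes = [[0, 1, 2]];
const generativeTextCache = new Map<string, Map<string, number>>([
  [
    [0, 1, 2].toString(), // "0,1,2" — must match selectionArray.toString()
    new Map<string, number>([
      ["coherence", 4.2], // GenerativeTextMetrics.Coherence
      ["fluency", 3.9] // GenerativeTextMetrics.Fluency
    ])
  ]
]);

const stats = generateGenerativeTextStats(selectionIndexes, generativeTextCache);
// stats[0] is roughly:
// [ { key: TotalCohortSamples, label: "...samples", stat: 3 },
//   { key: "coherence", label: "...coherence", stat: 4.2 },
//   { key: "fluency", label: "...fluency", stat: 3.9 } ]
```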
7 changes: 6 additions & 1 deletion libs/core-ui/src/lib/util/StatisticsUtils.ts
@@ -10,6 +10,7 @@ import {
} from "../Interfaces/IStatistic";
import { IsBinary } from "../util/ExplanationUtils";

import { generateGenerativeTextStats } from "./GenerativeTextStatisticsUtils";
import { JointDataset } from "./JointDataset";
import { ClassificationEnum } from "./JointDatasetUtils";
import { generateMulticlassStats } from "./MulticlassStatisticsUtils";
@@ -156,7 +157,8 @@ export const generateMetrics: (
modelType: ModelTypes,
objectDetectionCache?: Map<string, [number, number, number]>,
objectDetectionInputs?: [string, string, number],
questionAnsweringCache?: QuestionAnsweringCacheType
questionAnsweringCache?: QuestionAnsweringCacheType,
generativeTextCache?: Map<string, Map<string, number>>
): ILabeledStatistic[][] => {
if (
modelType === ModelTypes.ImageMultilabel ||
@@ -192,6 +194,9 @@ objectDetectionInputs
objectDetectionInputs
);
}
if (modelType === ModelTypes.GenerativeText && generativeTextCache) {
return generateGenerativeTextStats(selectionIndexes, generativeTextCache);
}
const outcomes = jointDataset.unwrap(JointDataset.ClassificationError);
if (IsBinary(modelType)) {
return selectionIndexes.map((selectionArray) => {
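To show how the new branch is reached, a hedged call sketch: the leading jointDataset and selectionIndexes parameters are not visible in this hunk but are used in the body, and the cache variable is a placeholder.

```typescript
// Hypothetical sketch: for a generative text model the caches for other
// task types are irrelevant, so only the final argument is supplied.
const labeledStats = generateMetrics(
  jointDataset,              // existing JointDataset for the loaded data
  selectionIndexes,          // number[][] of cohort row indexes
  ModelTypes.GenerativeText,
  undefined,                 // objectDetectionCache
  undefined,                 // objectDetectionInputs
  undefined,                 // questionAnsweringCache
  generativeTextCache        // Map<string, Map<string, number>>
);
```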
@@ -58,9 +58,12 @@ export class MetricSelector extends React.Component<IMetricSelectorProps> {
options.push(this.addDropdownOption(Metrics.AccuracyScore));
} else if (
IsMultilabel(modelType) ||
modelType === ModelTypes.ObjectDetection
modelType === ModelTypes.ObjectDetection ||
modelType === ModelTypes.QuestionAnswering
) {
options.push(this.addDropdownOption(Metrics.ErrorRate));
} else if (modelType === ModelTypes.GenerativeText) {
options.push(this.addDropdownOption(Metrics.MeanSquaredError));
}
return (
<Dropdown
25 changes: 25 additions & 0 deletions libs/localization/src/lib/en.json
@@ -1221,19 +1221,24 @@
"_rSquared.comment": "the coefficient of determination, see https://en.wikipedia.org/wiki/Coefficient_of_determination",
"_recall.comment": "computed recall of model, see https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers",
"accuracy": "Accuracy: {0}",
"coherence": "Coherence: {0}",
"bleuScore": "Bleu score: {0}",
"bertScore": "Bert score: {0}",
"exactMatchRatio": "Exact match ratio: {0}",
"equivalence": "Equivalence: {0}",
"rougeScore": "Rouge Score: {0}",
"fluency": "Fluency: {0}",
"fnr": "False negative rate: {0}",
"fpr": "False positive rate: {0}",
"groundedness": "Groundedness: {0}",
"hammingScore": "Hamming score: {0}",
"meanPrediction": "Mean prediction {0}",
"meteorScore": "Meteor Score: {0}",
"mse": "Mean squared error: {0}",
"precision": "Precision: {0}",
"rSquared": "R²: {0}",
"recall": "Recall: {0}",
"relevance": "Relevance: {0}",
"selectionRate": "Selection rate: {0}",
"mae": "Mean absolute error: {0}",
"f1Score": "F1 score: {0}",
@@ -1766,10 +1771,26 @@
"name": "Accuracy score",
"description": "The fraction of data points classified correctly."
},
"coherence": {
"name": "Coherence",
"description": "Coherence of an answer is measured by how well all the sentences fit together and sound naturally as a whole."
},
"fluency": {
"name": "Fluency",
"description": "Fluency measures the quality of individual sentences in the answer, and whether they are well-written and grammatically correct."
},
"equivalence": {
"name": "Equivalence",
"description": "Equivalence, as a metric, measures the similarity between the predicted answer and the correct answer."
},
"exactMatchRatio": {
"name": "Exact match ratio",
"description": "The ratio of instances classified correctly for every label."
},
"groundedness": {
"name": "Groundedness",
"description": "Groundedness measures whether the answer follows logically from the information in the context."
},
"meteorScore": {
"name": "Meteor Score",
"description": "METEOR Score is calculated based on the harmonic mean of precision and recall, with recall weighted more than precision in question answering task."
@@ -1782,6 +1803,10 @@
"name": "Bert Score",
"description": "BERTScore focuses on computing semantic similarity between tokens of reference and machine generated text in question answering task."
},
"relevance": {
"name": "Relevance",
"description": "Relevance measures how well the answer addresses the main aspects of the question, based on the context"
},
"rougeScore": {
"name": "Rouge Score",
"description": "Rouge Score measures the ratio of words (and/or n-grams) in the reference text that appeared in the machine generated text in question answering task."