From 4cbbebf9f145bf19bbbf223a09681f4557105f94 Mon Sep 17 00:00:00 2001
From: BenoitDherin
Date: Wed, 17 Apr 2024 00:41:21 +0000
Subject: [PATCH 1/4] precommit

---
 .../solutions/vertex_llm_evaluation.ipynb     | 1042 +++++++++++++++++
 1 file changed, 1042 insertions(+)
 create mode 100644 notebooks/vertex_genai/solutions/vertex_llm_evaluation.ipynb

diff --git a/notebooks/vertex_genai/solutions/vertex_llm_evaluation.ipynb b/notebooks/vertex_genai/solutions/vertex_llm_evaluation.ipynb
new file mode 100644
index 00000000..38a0bc68
--- /dev/null
+++ b/notebooks/vertex_genai/solutions/vertex_llm_evaluation.ipynb
@@ -0,0 +1,1042 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "JAPoU8Sm5E6e"
+   },
+   "source": [
+    "# Evaluate LLMs with Vertex AutoSxS Model Evaluation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "d975e698c9a4"
+   },
+   "source": [
+    "## Learning Objectives\n",
+    "\n",
+    "1) Learn how to create evaluation data\n",
+    "1) Learn how to set up an AutoSxS model evaluation pipeline\n",
+    "1) Learn how to run the evaluation pipeline job\n",
+    "1) Learn how to check the autorater judgments\n",
+    "1) Learn how to evaluate how closely AutoSxS is aligned with human judgment\n",
+    "\n",
+    "\n",
+    "In this notebook, we will use Vertex AI Model Evaluation AutoSxS (pronounced Auto Side-by-Side) to compare the predictions of two LLMs on a summarization task, in order to understand which model did the better job. Given additional human judgments of which model is better for part of the dataset, we will also demonstrate how to evaluate the alignment of AutoSxS with human judgment. 
(Note that Vertex AI Model Evaluation AutoSxS allows you to compare the performance of Google first-party and third-party LLMs, provided the model responses are stored in a JSONL evaluation file.)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "WReHDGG5g0XY"
+   },
+   "source": [
+    "## Setup\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "id": "PyQmSRbKA8r-",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import pprint\n",
+    "import random\n",
+    "import string\n",
+    "\n",
+    "import pandas as pd\n",
+    "from google.cloud import aiplatform\n",
+    "from google.protobuf.json_format import MessageToDict\n",
+    "from IPython.display import HTML, display"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "metadata": {
+    "id": "oM1iC_MfAts1",
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Your project ID is set to dherin-dev\n"
+     ]
+    }
+   ],
+   "source": [
+    "project_id_list = !gcloud config get-value project 2> /dev/null\n",
+    "PROJECT_ID = project_id_list[0]\n",
+    "BUCKET = PROJECT_ID\n",
+    "REGION = \"us-central1\"\n",
+    "\n",
+    "# Evaluation data containing the competing models' responses\n",
+    "EVALUATION_FILE_URI = \"gs://cloud-training/specialized-training/llm_eval/sum_eval_gemini_dataset_001.jsonl\"\n",
+    "HUMAN_EVALUATION_FILE_URI = \"gs://cloud-training/specialized-training/llm_eval/sum_human_eval_gemini_dataset_001.jsonl\"\n",
+    "\n",
+    "# AutoSxS Vertex Pipeline template\n",
+    "TEMPLATE_URI = (\n",
+    "    \"https://us-kfp.pkg.dev/ml-pipeline/llm-rlhf/autosxs-template/default\"\n",
+    ")\n",
+    "\n",
+    "print(f\"Your project ID is set to {PROJECT_ID}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "timestamp"
+   },
+   "source": [
+    "We will now create a GCS path where AutoSxS will export its judgment data:\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 53,
+   "metadata": {
+    "id": "84Vdv7R-QEH6",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "def generate_uuid(length: int = 8) -> str:\n",
+    "    \"\"\"Generate a UUID of a specified length (default=8).\"\"\"\n",
+    "    return \"\".join(\n",
+    "        random.choices(string.ascii_lowercase + string.digits, k=length)\n",
+    "    )\n",
+    "\n",
+    "\n",
+    "UUID = generate_uuid()\n",
+    "BUCKET_URI = f\"gs://{BUCKET}/autosxs-{UUID}\"\n",
+    "PIPELINE_ROOT = f\"{BUCKET_URI}/pipeline\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "-EcIXiGsCePi"
+   },
+   "source": [
+    "Let us make sure the bucket where AutoSxS will export the data exists, and if not, let us create it:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "metadata": {
+    "id": "NIq7R4HZCfIc",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "! gsutil ls gs://{BUCKET} > /dev/null || gsutil mb -l {REGION} -p {PROJECT_ID} gs://{BUCKET}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "init_aip:mbsdk,all",
+    "jp-MarkdownHeadingCollapsed": true,
+    "tags": []
+   },
+   "source": [
+    "Finally, let us initialize the `aiplatform` client in the cell below:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 55,
+   "metadata": {
+    "id": "j4KEcQEWROby",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "NRkfTNeaHbZd",
+    "tags": []
+   },
+   "source": [
+    "### Define helper functions"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The functions below will allow us to display AutoSxS judgments in a more readable way within the notebook.\n",
+    "They are here only for cosmetic reasons."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "metadata": {
+    "id": "ivbHUDiEHd2Q",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "def print_autosxs_judgments(df, n=3):\n",
+    "    \"\"\"Print AutoSxS judgments in the notebook.\"\"\"\n",
+    "\n",
+    "    style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n",
+    "    df = df.sample(n=n)\n",
+    "\n",
+    "    for _, row in df.iterrows():\n",
+    "        if row[\"confidence\"] >= 0.5:\n",
+    "            display(\n",
+    "                HTML(\n",
+    "                    f\"<h2>Document:</h2> <div style='{style}'>{row['document']}</div>\"\n",
+    "                )\n",
+    "            )\n",
+    "            display(\n",
+    "                HTML(\n",
+    "                    f\"<h2>Response A:</h2> <div style='{style}'>{row['response_a']}</div>\"\n",
+    "                )\n",
+    "            )\n",
+    "            display(\n",
+    "                HTML(\n",
+    "                    f\"<h2>Response B:</h2> <div style='{style}'>{row['response_b']}</div>\"\n",
+    "                )\n",
+    "            )\n",
+    "            display(\n",
+    "                HTML(\n",
+    "                    f\"<h2>Explanation:</h2> <div style='{style}'>{row['explanation']}</div>\"\n",
+    "                )\n",
+    "            )\n",
+    "            display(\n",
+    "                HTML(\n",
+    "                    f\"<h2>Confidence score:</h2> <div style='{style}'>{row['confidence']}</div>\"\n",
+    "                )\n",
+    "            )\n",
+    "            display(HTML(\"<hr>\"))\n",
+    "\n",
+    "\n",
+    "def print_aggregated_metrics(scores):\n",
+    "    \"\"\"Print AutoSxS aggregated metrics.\"\"\"\n",
+    "\n",
+    "    score_b = round(scores[\"autosxs_model_b_win_rate\"] * 100)\n",
+    "    display(\n",
+    "        HTML(\n",
+    "            f\"<h3>AutoSxS Autorater prefers {score_b}% of time Model B over Model A</h3>\"\n",
+    "        )\n",
+    "    )\n",
+    "\n",
+    "\n",
+    "def print_human_preference_metrics(metrics):\n",
+    "    \"\"\"Print AutoSxS human-preference alignment metrics.\"\"\"\n",
+    "    display(\n",
+    "        HTML(\n",
+    "            f\"<h3>Human-preference alignment metrics: {metrics}</h3>\"\n",
+    "        )\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "qoiqIyiMvc3n"
+   },
+   "source": [
+    "## Evaluate LLMs using Vertex AI Model Evaluation AutoSxS\n",
+    "\n",
+    "Suppose you have obtained LLM-generated predictions for a summarization task. To evaluate an LLM on Vertex AI, such as Gemini Pro, against another LLM using [AutoSxS](https://cloud.google.com/vertex-ai/generative-ai/docs/models/side-by-side-eval), you need to follow these steps:\n",
+    "\n",
+    "1. **Prepare the Evaluation Dataset:** Gather the prompts, contexts, generated responses, and human preferences required for the evaluation.\n",
+    "\n",
+    "2. **Convert the Evaluation Dataset:** Convert the dataset into the JSONL format and store it in a Cloud Storage bucket. (Alternatively, you can save the dataset to a BigQuery table.)\n",
+    "\n",
+    "3. **Run a Model Evaluation Job:** Use Vertex AI to run a model evaluation job to assess the performance of the LLMs.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "08d289fa873f"
+   },
+   "source": [
+    "### Dataset\n",
+    "\n",
+    "The dataset is a modified sample of the [XSum](https://huggingface.co/datasets/EdinburghNLP/xsum) dataset for evaluation of abstractive single-document summarization systems."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "ZEIlO0eHbsQh"
+   },
+   "source": [
+    "### Read the evaluation data\n",
+    "\n",
+    "In this summarization use case, you use `sum_eval_gemini_dataset_001`, a JSONL-formatted evaluation dataset that contains content-response pairs without human preferences.\n",
+    "\n",
+    "In the dataset, each row represents a single example. The dataset includes ID fields, such as \"id\" and \"document,\" which are used to identify each unique example. 
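To make the expected file format concrete, such a JSONL evaluation file can be assembled with a few lines of Python. The sketch below is illustrative only — the local file name and the truncated sample values are invented for the sketch, and in this notebook the file already exists at `EVALUATION_FILE_URI`:

```python
import json
import tempfile
from pathlib import Path

import pandas as pd

# Hypothetical rows following the schema described above: ID fields plus
# one pre-generated summary from each of the two competing models.
rows = [
    {
        "id": "40159674",
        "document": "The 33-year-old, capped 81 times by the Republic of Ireland...",
        "response_a": "Whelan joined Aston Villa on a free transfer...",
        "response_b": "- Glenn Whelan, 33, has joined Aston Villa...",
    },
    {
        "id": "36925620",
        "document": "Paul Frew, MLA for North Antrim, appeared in court...",
        "response_a": "- A 15-year-old girl is suing DUP MLA Paul Frew...",
        "response_b": "- Paul Frew, MLA for North Antrim, appeared in...",
    },
]

# AutoSxS expects one JSON object per line (JSONL); in practice the file
# would be uploaded to Cloud Storage or loaded into a BigQuery table.
path = Path(tempfile.gettempdir()) / "sum_eval_sketch.jsonl"
path.write_text("\n".join(json.dumps(r) for r in rows) + "\n")

# Read it back the same way the notebook reads EVALUATION_FILE_URI.
df = pd.read_json(path, lines=True)
print(df.shape)  # (2, 4)
```

Each line of the file is one self-contained evaluation example, which is what lets `pd.read_json(..., lines=True)` and the AutoSxS pipeline stream the dataset row by row.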
The \"document\" field contains the newspaper articles to be summarized.\n",
+    "\n",
+    "While the dataset does not have [data fields](https://cloud.google.com/vertex-ai/docs/generative-ai/models/side-by-side-eval#prep-eval-dataset) for prompts and contexts, it does include pre-generated predictions. These contain the responses each LLM generated for the task, with \"response_a\" and \"response_b\" holding the two competing article summaries.\n",
+    "\n",
+    "**Note: For experimentation, you can provide only a few examples. The documentation recommends at least 400 examples to ensure high-quality aggregate metrics.**\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "metadata": {
+    "id": "R-_ettKRxfxT",
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>document</th>\n",
+       "      <th>response_a</th>\n",
+       "      <th>response_b</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>40159674</td>\n",
+       "      <td>The 33-year-old, capped 81 times by the Republ...</td>\n",
+       "      <td>Whelan joined Aston Villa on a free transfer f...</td>\n",
+       "      <td>- Glenn Whelan, 33, has joined Aston Villa fro...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>36925620</td>\n",
+       "      <td>Paul Frew, MLA for North Antrim, appeared in c...</td>\n",
+       "      <td>- A 15-year-old girl is suing DUP MLA Paul Fre...</td>\n",
+       "      <td>- Paul Frew, MLA for North Antrim, appeared in...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>39729595</td>\n",
+       "      <td>Party leader Jeremy Corbyn said half of the ho...</td>\n",
+       "      <td>Labour says it will build 100,000 new homes a ...</td>\n",
+       "      <td>- Labour leader Jeremy Corbyn announced a hous...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>16788700</td>\n",
+       "      <td>No clear winner emerged between incumbent Jean...</td>\n",
+       "      <td>The African Union summit in Addis Ababa ended ...</td>\n",
+       "      <td>The African Union (AU) summit in Addis Ababa, ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>19389625</td>\n",
+       "      <td>The ban has been called by opposition coalitio...</td>\n",
+       "      <td>Opposition coalition in Togo has called for a ...</td>\n",
+       "      <td>Togo's opposition coalition, Let's Save Togo, ...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         id                                           document  \\\n",
+       "0  40159674  The 33-year-old, capped 81 times by the Republ...   \n",
+       "1  36925620  Paul Frew, MLA for North Antrim, appeared in c...   \n",
+       "2  39729595  Party leader Jeremy Corbyn said half of the ho...   \n",
+       "3  16788700  No clear winner emerged between incumbent Jean...   \n",
+       "4  19389625  The ban has been called by opposition coalitio...   \n",
+       "\n",
+       "                                          response_a  \\\n",
+       "0  Whelan joined Aston Villa on a free transfer f...   \n",
+       "1  - A 15-year-old girl is suing DUP MLA Paul Fre...   \n",
+       "2  Labour says it will build 100,000 new homes a ...   \n",
+       "3  The African Union summit in Addis Ababa ended ...   \n",
+       "4  Opposition coalition in Togo has called for a ...   \n",
+       "\n",
+       "                                          response_b  \n",
+       "0  - Glenn Whelan, 33, has joined Aston Villa fro...  \n",
+       "1  - Paul Frew, MLA for North Antrim, appeared in...  \n",
+       "2  - Labour leader Jeremy Corbyn announced a hous...  \n",
+       "3  The African Union (AU) summit in Addis Ababa, ...  \n",
+       "4  Togo's opposition coalition, Let's Save Togo, ...  "
+      ]
+     },
+     "execution_count": 57,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "evaluation_gemini_df = pd.read_json(EVALUATION_FILE_URI, lines=True)\n",
+    "evaluation_gemini_df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "1lZHraNFkDz8"
+   },
+   "source": [
+    "### Run a model evaluation job\n",
+    "\n",
+    "AutoSxS relies on Vertex AI Pipelines to run model evaluation. Here are some of the required pipeline parameters:\n",
+    "\n",
+    "* `evaluation_dataset` to indicate the location of the evaluation dataset. In this case, it is the JSONL Cloud Storage URI.\n",
+    "\n",
+    "* `id_columns` to distinguish unique evaluation examples. Here, you have the `id` and `document` fields.\n",
+    "\n",
+    "* `task` to indicate the task type you want to evaluate, in `{task}@{version}` form. It can be `summarization` or `question_answer`. In this case, you have `summarization`.\n",
+    "\n",
+    "* `autorater_prompt_parameters` to configure the autorater task behavior. You can specify inference instructions to guide task completion, as well as set the inference context to refer to during task execution.\n",
+    "\n",
+    "Lastly, you have to provide `response_column_a` and `response_column_b` with the names of the columns containing the predefined predictions, in order to calculate the evaluation metrics. In this case, they are `response_a` and `response_b` respectively.\n",
+    "\n",
+    "To learn more about all supported parameters, see the [official documentation](https://cloud.google.com/vertex-ai/docs/generative-ai/models/side-by-side-eval#perform-eval).\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "metadata": {
+    "id": "Cp7e-hOmNMhA",
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "display_name = f\"autosxs-eval-{generate_uuid()}\"\n",
+    "parameters = {\n",
+    "    \"evaluation_dataset\": EVALUATION_FILE_URI,\n",
+    "    \"id_columns\": [\"id\", \"document\"],\n",
+    "    \"task\": \"summarization\",\n",
+    "    \"autorater_prompt_parameters\": {\n",
+    "        \"inference_context\": {\"column\": \"document\"},\n",
+    "        \"inference_instruction\": {\"template\": \"Summarize the following text: \"},\n",
+    "    },\n",
+    "    \"response_column_a\": \"response_a\",\n",
+    "    \"response_column_b\": \"response_b\",\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Bp0YIvSv-zhB"
+   },
+   "source": [
+    "After you define the model evaluation parameters, you can run a model evaluation pipeline job using the predefined pipeline template with the Vertex AI Python SDK."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "AjFHT5ze9m4L",
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Creating PipelineJob\n",
+      "PipelineJob created. 
Resource name: projects/115851500182/locations/us-central1/pipelineJobs/autosxs-eval-zf4d8btr1296\n",
+      "To use this PipelineJob in another session:\n",
+      "pipeline_job = aiplatform.PipelineJob.get('projects/115851500182/locations/us-central1/pipelineJobs/autosxs-eval-zf4d8btr1296')\n",
+      "View Pipeline Job:\n",
+      "https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/autosxs-eval-zf4d8btr1296?project=115851500182\n",
+      "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-eval-zf4d8btr1296 current state:\n",
+      "PipelineState.PIPELINE_STATE_RUNNING\n",
+      "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-eval-zf4d8btr1296 current state:\n",
+      "PipelineState.PIPELINE_STATE_RUNNING\n",
+      "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-eval-zf4d8btr1296 current state:\n",
+      "PipelineState.PIPELINE_STATE_RUNNING\n",
+      "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-eval-zf4d8btr1296 current state:\n",
+      "PipelineState.PIPELINE_STATE_RUNNING\n",
+      "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-eval-zf4d8btr1296 current state:\n",
+      "PipelineState.PIPELINE_STATE_RUNNING\n",
+      "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-eval-zf4d8btr1296 current state:\n",
+      "PipelineState.PIPELINE_STATE_RUNNING\n"
+     ]
+    }
+   ],
+   "source": [
+    "job = aiplatform.PipelineJob(\n",
+    "    job_id=(display_name + str(random.randint(1, 2**12))),\n",
+    "    display_name=display_name,\n",
+    "    pipeline_root=os.path.join(BUCKET_URI, display_name),\n",
+    "    template_path=TEMPLATE_URI,\n",
+    "    parameter_values=parameters,\n",
+    "    enable_caching=False,\n",
+    ")\n",
+    "job.run(sync=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "PunFdLfqGh0e"
+   },
+   "source": [
+    "### Evaluate the results\n",
+    "\n",
+    "After the evaluation pipeline runs successfully, you can review the evaluation results both by looking at the artifacts generated by the pipeline in the Vertex AI Pipelines UI and in the notebook environment using the Vertex AI Python SDK.\n",
+    "\n",
+    "AutoSxS produces three types of evaluation results: a judgments table, aggregated metrics, and alignment metrics (if human preferences are provided).\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "J4ShokOI9FDI"
+   },
+   "source": [
+    "### AutoSxS Judgments\n",
+    "\n",
+    "The judgments table contains metrics that offer insights into LLM performance for each example.\n",
+    "\n",
+    "For each response pair, the judgments table includes a `choice` column indicating the better response based on the evaluation criteria used by the autorater.\n",
+    "\n",
+    "Each choice has a `confidence score` column between 0 and 1, representing the autorater's level of confidence in the evaluation.\n",
+    "\n",
+    "Last but not least, AutoSxS provides an explanation for why the autorater preferred one response over the other.\n",
+    "\n",
+    "Below is an example of the AutoSxS judgments output."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 106,
+   "metadata": {
+    "id": "MakdmpYCmehF"
+   },
+   "outputs": [],
+   "source": [
+    "for details in job.task_details:\n",
+    "    if details.task_name == \"online-evaluation-pairwise\":\n",
+    "        break\n",
+    "\n",
+    "judgments_uri = MessageToDict(details.outputs[\"judgments\"]._pb)[\"artifacts\"][0][\n",
+    "    \"uri\"\n",
+    "]\n",
+    "judgments_df = pd.read_json(judgments_uri, lines=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 107,
+   "metadata": {
+    "id": "_gyM2-i3HHnP"
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "
<h2>Document:</h2> <div style='white-space: pre-wrap; width: 800px; overflow-x: auto;'>The money was for \"development\", he said on Sunday, but gave no details.\n",
+       "The announcement comes three months after Mr Maduro travelled to China - a major investor in the region.\n",
+       "Venezuela is suffering from an acute economic crisis, as the price of its main export, oil, has almost halved over a year. The opposition accuses the government of mismanagement.\n",
+       "Mr Maduro visited Beijing in January and said at the time that China would invest more than $20bn in Venezuela.\n",
+       "He did not make clear in Sunday's announcement if these latest $5bn were part of that larger sum.\n",
+       "Loans by China's state-owned banks to Latin American countries rose by 71% to $22bn (£14bn) in 2014, according to estimates published by the China-Latin America Finance Database.\n",
+       "The Chinese loans exceed the combined worth of those by the World Bank and the Inter-American Development Bank, according to the database.\n",
+       "The $5bn will be a boost to Venezuela, which has been hit hard by falling oil price. According to reports, 96% of its export revenues come from oil.\n",
+       "Figures from Venezuela's oil ministry suggest the price of Venezuelan oil has dropped from $97 in April 2014 to $50 this month.\n",
+       "Inflation in 2014 stood at more than 60% and there are widespread shortages of basic staples such a flour, cooking oil and milk.</div>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<h2>Response A:</h2> <div style='white-space: pre-wrap; width: 800px; overflow-x: auto;'>Venezuelan President Nicolas Maduro has announced that China will provide $5bn in loans to help the country's struggling economy.\n",
+       "The announcement comes as the price of oil, Venezuela's main export, has fallen sharply in recent months.\n",
+       "China is a major investor in Venezuela and has provided billions of dollars in loans in the past.\n",
+       "The new loans are expected to help Venezuela weather the economic crisis and finance infrastructure projects.</div>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<h2>Response B:</h2> <div style='white-space: pre-wrap; width: 800px; overflow-x: auto;'>Venezuelan President Maduro announced that China will provide $5 billion for \"development\" in Venezuela. This announcement comes after Maduro's visit to China in January, where he secured a $20 billion investment from China. Venezuela is currently facing an economic crisis due to the falling price of oil, its main export. The $5 billion loan will help boost Venezuela's economy, which has been hit hard by the falling oil price.</div>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<h2>Explanation:</h2> <div style='white-space: pre-wrap; width: 800px; overflow-x: auto;'>Response (B) is better than Response (A) because it provides more details and is more informative. For example, Response (B) mentions that the $5 billion loan is part of a larger $20 billion investment from China, while Response (A) does not. Additionally, Response (B) provides more context about Venezuela's economic crisis, such as the falling price of oil and the widespread shortages of basic staples.</div>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<h2>Confidence score:</h2> <div style='white-space: pre-wrap; width: 800px; overflow-x: auto;'>0.5</div>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<hr>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "print_autosxs_judgments(judgments_df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "tJ5PJ9x69KrC"
+   },
+   "source": [
+    "### AutoSxS Aggregate metrics\n",
+    "\n",
+    "AutoSxS also provides aggregated metrics as an additional evaluation result. These win-rate metrics are calculated from the judgments table by determining the percentage of times the autorater preferred each model's response.\n",
+    "\n",
+    "These metrics are useful for quickly finding out which model performs best on the evaluated task.\n",
+    "\n",
+    "Below is an example of the AutoSxS aggregate metrics."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 110,
+   "metadata": {
+    "id": "w2RISjQSJk9R"
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "
<h3>AutoSxS Autorater prefers 87% of time Model B over Model A</h3>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "for details in job.task_details:\n",
+    "    if details.task_name == \"model-evaluation-text-generation-pairwise\":\n",
+    "        break\n",
+    "\n",
+    "win_rate_metrics = MessageToDict(details.outputs[\"autosxs_metrics\"]._pb)[\n",
+    "    \"artifacts\"\n",
+    "][0][\"metadata\"]\n",
+    "print_aggregated_metrics(win_rate_metrics)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "_5mYmHj6poXz"
+   },
+   "source": [
+    "### Human-preference alignment metrics\n",
+    "\n",
+    "After reviewing the results of your initial AutoSxS evaluation, you may wonder how reliably the autorater's assessments align with the views of human raters.\n",
+    "\n",
+    "AutoSxS supports human-preference data for validating the autorater's evaluations.\n",
+    "\n",
+    "To check alignment with a human-preference dataset, you need to add the ground truths as a column to the `evaluation_dataset` and pass the column name to `human_preference_column`."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "8vSedvz39-iu"
+   },
+   "source": [
+    "#### Read the evaluation data\n",
+    "\n",
+    "In this case, the evaluation dataset `sum_human_eval_gemini_dataset_001` also includes human preferences.\n",
+    "\n",
+    "Below is a sample of the dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 102,
+   "metadata": {
+    "id": "mbfsO2uw9-i5"
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "
<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>document</th>\n",
+       "      <th>response_a</th>\n",
+       "      <th>response_b</th>\n",
+       "      <th>actual</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>12511455</td>\n",
+       "      <td>Since then the country has seen the creation o...</td>\n",
+       "      <td>I am not able to generate a summary</td>\n",
+       "      <td>Nepal is a landlocked country in South Asia, b...</td>\n",
+       "      <td>B</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>13130633</td>\n",
+       "      <td>Nine Madryn Street, Toxteth, where the drummer...</td>\n",
+       "      <td>The redevelopment of the Welsh Streets in Live...</td>\n",
+       "      <td>- 271 homes in Toxteth, including the childhoo...</td>\n",
+       "      <td>A</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>14703910</td>\n",
+       "      <td>A country of fertile plains, high mountains an...</td>\n",
+       "      <td>Syria is a country in Western Asia. It is bord...</td>\n",
+       "      <td>Syria is a country with diverse ethnic and rel...</td>\n",
+       "      <td>B</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>15520382</td>\n",
+       "      <td>Evidence of Phytophthora ramorum was discovere...</td>\n",
+       "      <td>Phytophthora ramorum, a fungus-like pathogen, ...</td>\n",
+       "      <td>- Phytophthora ramorum, a fungus-like pathogen...</td>\n",
+       "      <td>A</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>16788700</td>\n",
+       "      <td>No clear winner emerged between incumbent Jean...</td>\n",
+       "      <td>The African Union summit in Addis Ababa ended ...</td>\n",
+       "      <td>The African Union (AU) summit in Addis Ababa, ...</td>\n",
+       "      <td>A</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         id                                           document  \\\n",
+       "0  12511455  Since then the country has seen the creation o...   \n",
+       "1  13130633  Nine Madryn Street, Toxteth, where the drummer...   \n",
+       "2  14703910  A country of fertile plains, high mountains an...   \n",
+       "3  15520382  Evidence of Phytophthora ramorum was discovere...   \n",
+       "4  16788700  No clear winner emerged between incumbent Jean...   \n",
+       "\n",
+       "                                          response_a  \\\n",
+       "0                I am not able to generate a summary   \n",
+       "1  The redevelopment of the Welsh Streets in Live...   \n",
+       "2  Syria is a country in Western Asia. It is bord...   \n",
+       "3  Phytophthora ramorum, a fungus-like pathogen, ...   \n",
+       "4  The African Union summit in Addis Ababa ended ...   \n",
+       "\n",
+       "                                          response_b actual  \n",
+       "0  Nepal is a landlocked country in South Asia, b...      B  \n",
+       "1  - 271 homes in Toxteth, including the childhoo...      A  \n",
+       "2  Syria is a country with diverse ethnic and rel...      B  \n",
+       "3  - Phytophthora ramorum, a fungus-like pathogen...      A  \n",
+       "4  The African Union (AU) summit in Addis Ababa, ...      A  "
+      ]
+     },
+     "execution_count": 102,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "human_evaluation_gemini_df = pd.read_json(HUMAN_EVALUATION_FILE_URI, lines=True)\n",
+    "human_evaluation_gemini_df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "XpmAr6UX-Imb"
+   },
+   "source": [
+    "#### Run a model evaluation job\n",
+    "\n",
+    "For the AutoSxS pipeline, you must specify the human-preference column in the pipeline parameters.\n",
+    "\n",
+    "Then, you can run the evaluation pipeline job using the Vertex AI Python SDK as shown below.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 103,
+   "metadata": {
+    "id": "bFmvFt2a3MtN"
+   },
+   "outputs": [],
+   "source": [
+    "display_name = f\"autosxs-human-eval-{generate_uuid()}\"\n",
+    "parameters = {\n",
+    "    \"evaluation_dataset\": HUMAN_EVALUATION_FILE_URI,\n",
+    "    \"id_columns\": [\"id\", \"document\"],\n",
+    "    \"task\": \"summarization\",\n",
+    "    \"autorater_prompt_parameters\": {\n",
+    "        \"inference_context\": {\"column\": \"document\"},\n",
+    "        \"inference_instruction\": {\"template\": \"Summarize the following text: \"},\n",
+    "    },\n",
+    "    \"response_column_a\": \"response_a\",\n",
+    "    \"response_column_b\": \"response_b\",\n",
+    "    \"human_preference_column\": \"actual\",\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 104,
+   "metadata": {
+    "id": "KbhIPY-_3SSB"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Creating PipelineJob\n",
+      "PipelineJob created. 
Resource name: projects/115851500182/locations/us-central1/pipelineJobs/autosxs-human-eval-6lec31qg\n", + "To use this PipelineJob in another session:\n", + "pipeline_job = aiplatform.PipelineJob.get('projects/115851500182/locations/us-central1/pipelineJobs/autosxs-human-eval-6lec31qg')\n", + "View Pipeline Job:\n", + "https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/autosxs-human-eval-6lec31qg?project=115851500182\n", + "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-human-eval-6lec31qg current state:\n", + "PipelineState.PIPELINE_STATE_RUNNING\n", + "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-human-eval-6lec31qg current state:\n", + "PipelineState.PIPELINE_STATE_RUNNING\n", + "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-human-eval-6lec31qg current state:\n", + "PipelineState.PIPELINE_STATE_RUNNING\n", + "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-human-eval-6lec31qg current state:\n", + "PipelineState.PIPELINE_STATE_RUNNING\n", + "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-human-eval-6lec31qg current state:\n", + "PipelineState.PIPELINE_STATE_RUNNING\n", + "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-human-eval-6lec31qg current state:\n", + "PipelineState.PIPELINE_STATE_RUNNING\n", + "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-human-eval-6lec31qg current state:\n", + "PipelineState.PIPELINE_STATE_RUNNING\n", + "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-human-eval-6lec31qg current state:\n", + "PipelineState.PIPELINE_STATE_RUNNING\n", + "PipelineJob run completed. 
Resource name: projects/115851500182/locations/us-central1/pipelineJobs/autosxs-human-eval-6lec31qg\n"
+     ]
+    }
+   ],
+   "source": [
+    "job = aiplatform.PipelineJob(\n",
+    "    job_id=(display_name + str(random.randint(1, 2**12))),\n",
+    "    display_name=display_name,\n",
+    "    pipeline_root=os.path.join(BUCKET_URI, display_name),\n",
+    "    template_path=TEMPLATE_URI,\n",
+    "    parameter_values=parameters,\n",
+    "    enable_caching=False,\n",
+    ")\n",
+    "job.run(sync=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "QTzJ8BaWEusN"
+   },
+   "source": [
+    "### Get human-aligned aggregated metrics\n",
+    "\n",
+    "Compared with the aggregated metrics you got before, the pipeline now returns additional measurements that utilize the human-preference data you provided.\n",
+    "\n",
+    "Below is a view of the resulting human-aligned aggregated metrics, comparing model win rates based on both human preferences and autorater inferences.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 115,
+   "metadata": {
+    "id": "JLUOJFjA38ja"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'autosxs_model_a_win_rate': 0.13,\n",
+      " 'autosxs_model_b_win_rate': 0.87,\n",
+      " 'human_preference_model_a_win_rate': 0.26,\n",
+      " 'human_preference_model_b_win_rate': 0.74}\n"
+     ]
+    }
+   ],
+   "source": [
+    "for details in job.task_details:\n",
+    "    if details.task_name == \"model-evaluation-text-generation-pairwise\":\n",
+    "        break\n",
+    "\n",
+    "human_aligned_metrics = {\n",
+    "    k: round(v, 3)\n",
+    "    for k, v in MessageToDict(details.outputs[\"autosxs_metrics\"]._pb)[\n",
+    "        \"artifacts\"\n",
+    "    ][0][\"metadata\"].items()\n",
+    "    if \"win_rate\" in k\n",
+    "}\n",
+    "pprint.pprint(human_aligned_metrics)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "TpV-iwP9qw9c"
+   },
+   "source": [
+    "## Cleaning up\n",
+    "\n",
+    "To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
"project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial. Clicking the __End Lab__ button in your lab instructions will take care of this for you.\n", + "\n", + "Otherwise, you can delete the individual resources you created in this tutorial." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright 2024 Google LLC\n", + "\n", + "Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "you may not use this file except in compliance with the License.\n", + "You may obtain a copy of the License at\n", + "\n", + " https://www.apache.org/licenses/LICENSE-2.0\n", + "\n", + "Unless required by applicable law or agreed to in writing, software\n", + "distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "See the License for the specific language governing permissions and\n", + "limitations under the License." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "name": "evaluate_gemini_with_autosxs.ipynb", + "toc_visible": true + }, + "environment": { + "kernel": "python3", + "name": "tf2-gpu.2-12.m119", + "type": "gcloud", + "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/tf2-gpu.2-12:m119" + }, + "kernelspec": { + "display_name": "Python 3 (Local)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From e637a7aae3e6948416dc1eff13eef66d2432678e Mon Sep 17 00:00:00 2001 From: BenoitDherin Date: Wed, 17 Apr 2024 00:51:12 +0000 Subject: [PATCH 2/4] precommit --- .../solutions/vertex_llm_evaluation.ipynb | 197 ++++++++++-------- 1 file changed, 105 insertions(+), 92 deletions(-) diff --git a/notebooks/vertex_genai/solutions/vertex_llm_evaluation.ipynb b/notebooks/vertex_genai/solutions/vertex_llm_evaluation.ipynb index 38a0bc68..37a257db 100644 --- a/notebooks/vertex_genai/solutions/vertex_llm_evaluation.ipynb +++ b/notebooks/vertex_genai/solutions/vertex_llm_evaluation.ipynb @@ -165,89 +165,6 @@ "aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)" ] }, - { - "cell_type": "markdown", - "metadata": { - "id": "NRkfTNeaHbZd", - "tags": [] - }, - "source": [ - "### Define helper functions" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The functions below will allow us to display AutoSxS judgments in a more readable way within the notebook.\n", - "They are here only for cosmetic reasons. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": { - "id": "ivbHUDiEHd2Q", - "tags": [] - }, - "outputs": [], - "source": [ - "def print_autosxs_judgments(df, n=3):\n", - " \"\"\"Print AutoSxS judgments in the notebook\"\"\"\n", - "\n", - " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", - " df = df.sample(n=n)\n", - "\n", - " for index, row in df.iterrows():\n", - " if row[\"confidence\"] >= 0.5:\n", - " display(\n", - " HTML(\n", - " f\"
<h2>Document:</h2> <div style='{style}'>{row['document']}</div>\"\n", - " )\n", - " )\n", - " display(\n", - " HTML(\n", - " f\"
<h2>Response A:</h2> <div style='{style}'>{row['response_a']}</div>\"\n", - " )\n", - " )\n", - " display(\n", - " HTML(\n", - " f\"
<h2>Response B:</h2> <div style='{style}'>{row['response_b']}</div>\"\n", - " )\n", - " )\n", - " display(\n", - " HTML(\n", - " f\"
<h2>Explanation:</h2> <div style='{style}'>{row['explanation']}</div>\"\n", - " )\n", - " )\n", - " display(\n", - " HTML(\n", - " f\"
<h2>Confidence score:</h2> <div style='{style}'>{row['confidence']}</div>\"\n", - " )\n", - " )\n", - " display(HTML(\"
<hr>\"))\n", - "\n", - "\n", - "def print_aggregated_metrics(scores):\n", - " \"\"\"Print AutoSxS aggregated metrics\"\"\"\n", - "\n", - " score_b = round(win_rate_metrics[\"autosxs_model_b_win_rate\"] * 100)\n", - " display(\n", - " HTML(\n", - " f\"
<h3>AutoSxS Autorater prefers {score_b}% of time Model B over Model A</h3>\"\n", - " )\n", - " )\n", - "\n", - "\n", - "def print_human_preference_metrics(metrics):\n", - " \"\"\"Print AutoSxS Human-preference alignment metrics\"\"\"\n", - " display(\n", - " HTML(\n", - " f\"
<h3>AutoSxS Autorater prefers {score_b}% of time Model B over Model A</h3>
\"\n", - " )\n", - " )" - ] - }, { "cell_type": "markdown", "metadata": { @@ -553,6 +470,13 @@ "judgments_df = pd.read_json(judgments_uri, lines=True)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next cell contains a helper function to print a sample from AutoSxS judgments nicely in the notebook." + ] + }, { "cell_type": "code", "execution_count": 107, @@ -646,6 +570,42 @@ } ], "source": [ + "def print_autosxs_judgments(df, n=3):\n", + " \"\"\"Print AutoSxS judgments in the notebook\"\"\"\n", + "\n", + " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", + " df = df.sample(n=n)\n", + "\n", + " for index, row in df.iterrows():\n", + " if row[\"confidence\"] >= 0.5:\n", + " display(\n", + " HTML(\n", + " f\"
<h2>Document:</h2> <div style='{style}'>{row['document']}</div>\"\n", + " )\n", + " )\n", + " display(\n", + " HTML(\n", + " f\"
<h2>Response A:</h2> <div style='{style}'>{row['response_a']}</div>\"\n", + " )\n", + " )\n", + " display(\n", + " HTML(\n", + " f\"
<h2>Response B:</h2> <div style='{style}'>{row['response_b']}</div>\"\n", + " )\n", + " )\n", + " display(\n", + " HTML(\n", + " f\"
<h2>Explanation:</h2> <div style='{style}'>{row['explanation']}</div>\"\n", + " )\n", + " )\n", + " display(\n", + " HTML(\n", + " f\"
<h2>Confidence score:</h2> <div style='{style}'>{row['confidence']}</div>\"\n", + " )\n", + " )\n", + " display(HTML(\"
<hr>\"))\n", + "\n", + "\n", + "print_autosxs_judgments(judgments_df)" ] }, @@ -691,7 +651,33 @@ "\n", "win_rate_metrics = MessageToDict(details.outputs[\"autosxs_metrics\"]._pb)[\n", " \"artifacts\"\n", - "][0][\"metadata\"]\n", + "][0][\"metadata\"]" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next cell contains a helper function to print AutoSxS aggregated metrics nicely in the notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def print_aggregated_metrics(scores):\n", + " \"\"\"Print AutoSxS aggregated metrics\"\"\"\n", + "\n", + " score_b = round(scores[\"autosxs_model_b_win_rate\"] * 100)\n", + " display(\n", + " HTML(\n", + " f\"
<h3>AutoSxS Autorater prefers {score_b}% of time Model B over Model A</h3>
\"\n", + " )\n", + " )\n", + "\n", + "\n", "print_aggregated_metrics(win_rate_metrics)" ] }, @@ -963,7 +949,31 @@ " \"artifacts\"\n", " ][0][\"metadata\"].items()\n", " if \"win_rate\" in k\n", - "}\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next cell contains a helper function to print AutoSxS alignment metrics nicely in the notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def print_human_preference_metrics(metrics):\n", + " \"\"\"Print AutoSxS Human-preference alignment metrics\"\"\"\n", + " display(\n", + " HTML(\n", + " f\"
<h3>AutoSxS Autorater prefers Model B {round(metrics['autosxs_model_b_win_rate'] * 100)}% of the time, vs. {round(metrics['human_preference_model_b_win_rate'] * 100)}% for the human raters</h3>
\"\n", + " )\n", + " )\n", + "\n", + "\n", "pprint.pprint(human_aligned_metrics)" ] }, @@ -981,6 +991,16 @@ "Otherwise, you can delete the individual resources you created in this tutorial." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Acknowledgement \n", + "\n", + "This notebook is adapted from a [tutorial](https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_gemini_with_autosxs.ipynb)\n", + "written by Ivan Nardini." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -999,13 +1019,6 @@ "See the License for the specific language governing permissions and\n", "limitations under the License." ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From a570cdf144a487cb5fb687a6213f6de703eb9e84 Mon Sep 17 00:00:00 2001 From: BenoitDherin Date: Wed, 17 Apr 2024 20:10:26 +0000 Subject: [PATCH 3/4] incorporate Takumi comments --- .../solutions/vertex_llm_evaluation.ipynb | 410 ++++-------------- 1 file changed, 79 insertions(+), 331 deletions(-) diff --git a/notebooks/vertex_genai/solutions/vertex_llm_evaluation.ipynb b/notebooks/vertex_genai/solutions/vertex_llm_evaluation.ipynb index 37a257db..5bf65703 100644 --- a/notebooks/vertex_genai/solutions/vertex_llm_evaluation.ipynb +++ b/notebooks/vertex_genai/solutions/vertex_llm_evaluation.ipynb @@ -3,7 +3,8 @@ { "cell_type": "markdown", "metadata": { - "id": "JAPoU8Sm5E6e" + "id": "JAPoU8Sm5E6e", + "tags": [] }, "source": [ "# Evaluate LLMs with Vertex AutoSxS Model Evaluation" @@ -46,10 +47,9 @@ }, "outputs": [], "source": [ + "import datetime\n", "import os\n", "import pprint\n", - "import random\n", - "import string\n", "\n", "import pandas as pd\n", "from google.cloud import aiplatform\n", @@ -59,7 +59,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 2, "metadata": { "id": "oM1iC_MfAts1", "tags": [] @@ -91,36 +91,6 @@ "print(f\"Your project ID is 
set to {PROJECT_ID}\")" ] }, - { - "cell_type": "markdown", - "metadata": { - "id": "timestamp" - }, - "source": [ - "We will now create a GCS path where AutoSxS will export its judgment data:\n" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": { - "id": "84Vdv7R-QEH6", - "tags": [] - }, - "outputs": [], - "source": [ - "def generate_uuid(length: int = 8) -> str:\n", - " \"\"\"Generate a uuid of a specifed length (default=8).\"\"\"\n", - " return \"\".join(\n", - " random.choices(string.ascii_lowercase + string.digits, k=length)\n", - " )\n", - "\n", - "\n", - "UUID = generate_uuid()\n", - "BUCKET_URI = f\"gs://{BUCKET}/autosxs-{UUID}\"\n", - "PIPELINE_ROOT = f\"{BUCKET_URI}/pipeline\"" - ] - }, { "cell_type": "markdown", "metadata": { @@ -132,7 +102,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 3, "metadata": { "id": "NIq7R4HZCfIc", "tags": [] @@ -155,14 +125,14 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 4, "metadata": { "id": "j4KEcQEWROby", "tags": [] }, "outputs": [], "source": [ - "aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)" + "aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET)" ] }, { @@ -205,14 +175,14 @@ "\n", "In the dataset, each row represents a single example. The dataset includes ID fields, such as \"id\" and \"document,\" which are used to identify each unique example. The \"document\" field contains the newspaper articles to be summarized.\n", "\n", - "While the dataset does not have [data fields](https://cloud.google.com/vertex-ai/docs/generative-ai/models/side-by-side-eval#prep-eval-dataset) for prompts and contexts, it does include pre-generated predictions. 
These predictions contain the generated response according to the LLMs task, with \"response_a\" and \"response_b\" representing different article summaries.\n", + "While the dataset does not have [data fields](https://cloud.google.com/vertex-ai/docs/generative-ai/models/side-by-side-eval#prep-eval-dataset) for prompts and contexts, it does include pre-generated predictions. These predictions contain the generated response according to the LLMs task, with \"response_a\" and \"response_b\" representing different article summaries generated by two different LLM models.\n", "\n", "**Note: For experimentation, you can provide only a few examples. The documentation recommends at least 400 examples to ensure high-quality aggregate metrics.**\n" ] }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 5, "metadata": { "id": "R-_ettKRxfxT", "tags": [] @@ -308,7 +278,7 @@ "4 Togo's opposition coalition, Let's Save Togo, ... " ] }, - "execution_count": 57, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -332,28 +302,36 @@ "\n", "* `id_colums` to distinguish evaluation examples that are unique. Here, as you can imagine, your have `id` and `document` fields.\n", "\n", - "* `task` to indicate the task type you want to evaluate in `{task}@{version}` form. It can be `summarization` or `question_answer`. In this case you have `summarization`.\n", + "* `task` to indicate the task type you want to evaluate. It can be `summarization` or `question_answer`. In this case you have `summarization`.\n", "\n", - "* `autorater_prompt_parameters` to configure the autorater task behaviour. And you can specify inference instructions to guide task completion, as well as setting the inference context to refer during the task execution.\n", - "\n", - "Lastly, you have to provide `response_column_a` and `response_column_b` with the names of columns containing predefined predictions in order to calculate the evaluation metrics. 
In this case, `response_a` and `response_b` respectively.\n", + "* `autorater_prompt_parameters` to configure the autorater task behavior. You can specify inference instructions to guide task completion, as well as setting the inference context to refer during the task execution. For example, for the summarization task below we have that `autorater_prompt_parameters` is specified by a dictionary containing the name of the field containing the summarization context (i.e. the document to summarize) as well as the summarization instruction itself:\n", + "```python\n", + " {\n", + " \"inference_context\": {\"column\": \"document\"},\n", + " \"inference_instruction\": {\"template\": \"Summarize the following text: \"},\n", + " },\n", + "```\n", "\n", - "To learn more about all supported parameters, see the [official documentation](https://cloud.google.com/vertex-ai/docs/generative-ai/models/side-by-side-eval#perform-eval).\n" + "Lastly, you have to provide `response_column_a` and `response_column_b` with the names of columns containing predefined predictions in order to calculate the evaluation metrics. In this case, `response_a` and `response_b` respectively. Note that we can simply specify the actual models through the `model_a` and `model_b` fields (as long as these models are stored in Vertex model registry) instead of providing the pre-generated responses (through the `response_column_a` and `response_column_b` fields). 
To learn more about all supported parameters and their usage, see the [official documentation](https://cloud.google.com/vertex-ai/docs/generative-ai/models/side-by-side-eval#perform-eval).\n", + "\n" ] }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 7, "metadata": { "id": "Cp7e-hOmNMhA", "tags": [] }, "outputs": [], "source": [ - "display_name = f\"autosxs-eval-{generate_uuid()}\"\n", + "timestamp = str(datetime.datetime.now().timestamp()).replace(\".\", \"\")\n", + "display_name = f\"autosxs-{timestamp}\"\n", + "pipeline_root = os.path.join(\"gs://\", BUCKET, display_name)\n", + "\n", "parameters = {\n", " \"evaluation_dataset\": EVALUATION_FILE_URI,\n", - " \"id_columns\": [\"id\", \"document\"],\n", + " \"id_columns\": [\"id\"],\n", " \"task\": \"summarization\",\n", " \"autorater_prompt_parameters\": {\n", " \"inference_context\": {\"column\": \"document\"},\n", @@ -375,7 +353,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": { "id": "AjFHT5ze9m4L", "tags": [] @@ -386,33 +364,31 @@ "output_type": "stream", "text": [ "Creating PipelineJob\n", - "PipelineJob created. Resource name: projects/115851500182/locations/us-central1/pipelineJobs/autosxs-eval-zf4d8btr1296\n", + "PipelineJob created. 
Resource name: projects/115851500182/locations/us-central1/pipelineJobs/autosxs-1713383655691384\n", "To use this PipelineJob in another session:\n", - "pipeline_job = aiplatform.PipelineJob.get('projects/115851500182/locations/us-central1/pipelineJobs/autosxs-eval-zf4d8btr1296')\n", + "pipeline_job = aiplatform.PipelineJob.get('projects/115851500182/locations/us-central1/pipelineJobs/autosxs-1713383655691384')\n", "View Pipeline Job:\n", - "https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/autosxs-eval-zf4d8btr1296?project=115851500182\n", - "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-eval-zf4d8btr1296 current state:\n", + "https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/autosxs-1713383655691384?project=115851500182\n", + "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-1713383655691384 current state:\n", "PipelineState.PIPELINE_STATE_RUNNING\n", - "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-eval-zf4d8btr1296 current state:\n", + "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-1713383655691384 current state:\n", "PipelineState.PIPELINE_STATE_RUNNING\n", - "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-eval-zf4d8btr1296 current state:\n", + "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-1713383655691384 current state:\n", "PipelineState.PIPELINE_STATE_RUNNING\n", - "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-eval-zf4d8btr1296 current state:\n", + "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-1713383655691384 current state:\n", "PipelineState.PIPELINE_STATE_RUNNING\n", - "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-eval-zf4d8btr1296 current state:\n", + "PipelineJob 
projects/115851500182/locations/us-central1/pipelineJobs/autosxs-1713383655691384 current state:\n", "PipelineState.PIPELINE_STATE_RUNNING\n", - "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-eval-zf4d8btr1296 current state:\n", + "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-1713383655691384 current state:\n", "PipelineState.PIPELINE_STATE_RUNNING\n" ] } ], "source": [ - "import random\n", - "\n", "job = aiplatform.PipelineJob(\n", - " job_id=(display_name + str(random.randint(1, 2**12))),\n", + " job_id=display_name,\n", " display_name=display_name,\n", - " pipeline_root=os.path.join(BUCKET_URI, display_name),\n", + " pipeline_root=pipeline_root,\n", " template_path=TEMPLATE_URI,\n", " parameter_values=parameters,\n", " enable_caching=False,\n", @@ -454,19 +430,24 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": null, "metadata": { - "id": "MakdmpYCmehF" + "id": "MakdmpYCmehF", + "tags": [] }, "outputs": [], "source": [ - "for details in job.task_details:\n", - " if details.task_name == \"online-evaluation-pairwise\":\n", - " break\n", + "online_eval_task = [\n", + " task\n", + " for task in job.task_details\n", + " if task.task_name == \"online-evaluation-pairwise\"\n", + "][0]\n", + "\n", + "\n", + "judgments_uri = MessageToDict(online_eval_task.outputs[\"judgments\"]._pb)[\n", + " \"artifacts\"\n", + "][0][\"uri\"]\n", "\n", - "judgments_uri = MessageToDict(details.outputs[\"judgments\"]._pb)[\"artifacts\"][0][\n", - " \"uri\"\n", - "]\n", "judgments_df = pd.read_json(judgments_uri, lines=True)" ] }, @@ -479,96 +460,12 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": null, "metadata": { - "id": "_gyM2-i3HHnP" + "id": "_gyM2-i3HHnP", + "tags": [] }, - "outputs": [ - { - "data": { - "text/html": [ - "
<h2>Document:</h2> <div style='white-space: pre-wrap; width: 800px; overflow-x: auto;'>The money was for \"development\", he said on Sunday, but gave no details.\n", - "The announcement comes three months after Mr Maduro travelled to China - a major investor in the region.\n", - "Venezuela is suffering from an acute economic crisis, as the price of its main export, oil, has almost halved over a year. The opposition accuses the government of mismanagement.\n", - "Mr Maduro visited Beijing in January and said at the time that China would invest more than $20bn in Venezuela.\n", - "He did not make clear in Sunday's announcement if these latest $5bn were part of that larger sum.\n", - "Loans by China's state-owned banks to Latin American countries rose by 71% to $22bn (£14bn) in 2014, according to estimates published by the China-Latin America Finance Database.\n", - "The Chinese loans exceed the combined worth of those by the World Bank and the Inter-American Development Bank, according to the database.\n", - "The $5bn will be a boost to Venezuela, which has been hit hard by falling oil price. According to reports, 96% of its export revenues come from oil.\n", - "Figures from Venezuela's oil ministry suggest the price of Venezuelan oil has dropped from $97 in April 2014 to $50 this month.\n", - "Inflation in 2014 stood at more than 60% and there are widespread shortages of basic staples such a flour, cooking oil and milk.</div>
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
<h2>Response A:</h2> <div style='white-space: pre-wrap; width: 800px; overflow-x: auto;'>Venezuelan President Nicolas Maduro has announced that China will provide $5bn in loans to help the country's struggling economy.\n", - "The announcement comes as the price of oil, Venezuela's main export, has fallen sharply in recent months.\n", - "China is a major investor in Venezuela and has provided billions of dollars in loans in the past.\n", - "The new loans are expected to help Venezuela weather the economic crisis and finance infrastructure projects.</div>
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
<h2>Response B:</h2> <div style='white-space: pre-wrap; width: 800px; overflow-x: auto;'>Venezuelan President Maduro announced that China will provide $5 billion for \"development\" in Venezuela. This announcement comes after Maduro's visit to China in January, where he secured a $20 billion investment from China. Venezuela is currently facing an economic crisis due to the falling price of oil, its main export. The $5 billion loan will help boost Venezuela's economy, which has been hit hard by the falling oil price.</div>
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
<h2>Explanation:</h2> <div style='white-space: pre-wrap; width: 800px; overflow-x: auto;'>Response (B) is better than Response (A) because it provides more details and is more informative. For example, Response (B) mentions that the $5 billion loan is part of a larger $20 billion investment from China, while Response (A) does not. Additionally, Response (B) provides more context about Venezuela's economic crisis, such as the falling price of oil and the widespread shortages of basic staples.</div>
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
<h2>Confidence score:</h2> <div style='white-space: pre-wrap; width: 800px; overflow-x: auto;'>0.5</div>
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "def print_autosxs_judgments(df, n=3):\n", " \"\"\"Print AutoSxS judgments in the notebook\"\"\"\n", @@ -626,24 +523,11 @@ }, { "cell_type": "code", - "execution_count": 110, + "execution_count": null, "metadata": { "id": "w2RISjQSJk9R" }, - "outputs": [ - { - "data": { - "text/html": [ - "
<h3>AutoSxS Autorater prefers 87% of time Model B over Model A</h3>
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "for details in job.task_details:\n", " if details.task_name == \"model-evaluation-text-generation-pairwise\":\n", @@ -711,112 +595,11 @@ }, { "cell_type": "code", - "execution_count": 102, + "execution_count": null, "metadata": { "id": "mbfsO2uw9-i5" }, - "outputs": [ - { - "data": { - "text/html": [ - "
<div>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- "  <thead>\n",
- "    <tr style=\"text-align: right;\">\n",
- "      <th></th>\n",
- "      <th>id</th>\n",
- "      <th>document</th>\n",
- "      <th>response_a</th>\n",
- "      <th>response_b</th>\n",
- "      <th>actual</th>\n",
- "    </tr>\n",
- "  </thead>\n",
- "  <tbody>\n",
- "    <tr>\n",
- "      <th>0</th>\n",
- "      <td>12511455</td>\n",
- "      <td>Since then the country has seen the creation o...</td>\n",
- "      <td>I am not able to generate a summary</td>\n",
- "      <td>Nepal is a landlocked country in South Asia, b...</td>\n",
- "      <td>B</td>\n",
- "    </tr>\n",
- "    <tr>\n",
- "      <th>1</th>\n",
- "      <td>13130633</td>\n",
- "      <td>Nine Madryn Street, Toxteth, where the drummer...</td>\n",
- "      <td>The redevelopment of the Welsh Streets in Live...</td>\n",
- "      <td>- 271 homes in Toxteth, including the childhoo...</td>\n",
- "      <td>A</td>\n",
- "    </tr>\n",
- "    <tr>\n",
- "      <th>2</th>\n",
- "      <td>14703910</td>\n",
- "      <td>A country of fertile plains, high mountains an...</td>\n",
- "      <td>Syria is a country in Western Asia. It is bord...</td>\n",
- "      <td>Syria is a country with diverse ethnic and rel...</td>\n",
- "      <td>B</td>\n",
- "    </tr>\n",
- "    <tr>\n",
- "      <th>3</th>\n",
- "      <td>15520382</td>\n",
- "      <td>Evidence of Phytophthora ramorum was discovere...</td>\n",
- "      <td>Phytophthora ramorum, a fungus-like pathogen, ...</td>\n",
- "      <td>- Phytophthora ramorum, a fungus-like pathogen...</td>\n",
- "      <td>A</td>\n",
- "    </tr>\n",
- "    <tr>\n",
- "      <th>4</th>\n",
- "      <td>16788700</td>\n",
- "      <td>No clear winner emerged between incumbent Jean...</td>\n",
- "      <td>The African Union summit in Addis Ababa ended ...</td>\n",
- "      <td>The African Union (AU) summit in Addis Ababa, ...</td>\n",
- "      <td>A</td>\n",
- "    </tr>\n",
- "  </tbody>\n",
- "</table>\n",
- "</div>
" - ], - "text/plain": [ - " id document \\\n", - "0 12511455 Since then the country has seen the creation o... \n", - "1 13130633 Nine Madryn Street, Toxteth, where the drummer... \n", - "2 14703910 A country of fertile plains, high mountains an... \n", - "3 15520382 Evidence of Phytophthora ramorum was discovere... \n", - "4 16788700 No clear winner emerged between incumbent Jean... \n", - "\n", - " response_a \\\n", - "0 I am not able to generate a summary \n", - "1 The redevelopment of the Welsh Streets in Live... \n", - "2 Syria is a country in Western Asia. It is bord... \n", - "3 Phytophthora ramorum, a fungus-like pathogen, ... \n", - "4 The African Union summit in Addis Ababa ended ... \n", - "\n", - " response_b actual \n", - "0 Nepal is a landlocked country in South Asia, b... B \n", - "1 - 271 homes in Toxteth, including the childhoo... A \n", - "2 Syria is a country with diverse ethnic and rel... B \n", - "3 - Phytophthora ramorum, a fungus-like pathogen... A \n", - "4 The African Union (AU) summit in Addis Ababa, ... 
A " - ] - }, - "execution_count": 102, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "human_evaluation_gemini_df = pd.read_json(HUMAN_EVALUATION_FILE_URI, lines=True)\n", "human_evaluation_gemini_df.head()" @@ -837,16 +620,20 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": null, "metadata": { - "id": "bFmvFt2a3MtN" + "id": "bFmvFt2a3MtN", + "tags": [] }, "outputs": [], "source": [ - "display_name = f\"autosxs-human-eval-{generate_uuid()}\"\n", + "timestamp = str(datetime.datetime.now().timestamp()).replace(\".\", \"\")\n", + "display_name = f\"autosxs-human-eval-{timestamp}\"\n", + "pipeline_root = os.path.join(\"gs://\", BUCKET, display_name)\n", + "\n", "parameters = {\n", " \"evaluation_dataset\": HUMAN_EVALUATION_FILE_URI,\n", - " \"id_columns\": [\"id\", \"document\"],\n", + " \"id_columns\": [\"id\"],\n", " \"task\": \"summarization\",\n", " \"autorater_prompt_parameters\": {\n", " \"inference_context\": {\"column\": \"document\"},\n", @@ -860,46 +647,16 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": null, "metadata": { "id": "KbhIPY-_3SSB" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Creating PipelineJob\n", - "PipelineJob created. 
Resource name: projects/115851500182/locations/us-central1/pipelineJobs/autosxs-human-eval-6lec31qg\n", - "To use this PipelineJob in another session:\n", - "pipeline_job = aiplatform.PipelineJob.get('projects/115851500182/locations/us-central1/pipelineJobs/autosxs-human-eval-6lec31qg')\n", - "View Pipeline Job:\n", - "https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/autosxs-human-eval-6lec31qg?project=115851500182\n", - "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-human-eval-6lec31qg current state:\n", - "PipelineState.PIPELINE_STATE_RUNNING\n", - "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-human-eval-6lec31qg current state:\n", - "PipelineState.PIPELINE_STATE_RUNNING\n", - "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-human-eval-6lec31qg current state:\n", - "PipelineState.PIPELINE_STATE_RUNNING\n", - "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-human-eval-6lec31qg current state:\n", - "PipelineState.PIPELINE_STATE_RUNNING\n", - "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-human-eval-6lec31qg current state:\n", - "PipelineState.PIPELINE_STATE_RUNNING\n", - "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-human-eval-6lec31qg current state:\n", - "PipelineState.PIPELINE_STATE_RUNNING\n", - "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-human-eval-6lec31qg current state:\n", - "PipelineState.PIPELINE_STATE_RUNNING\n", - "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-human-eval-6lec31qg current state:\n", - "PipelineState.PIPELINE_STATE_RUNNING\n", - "PipelineJob run completed. 
Resource name: projects/115851500182/locations/us-central1/pipelineJobs/autosxs-human-eval-6lec31qg\n" - ] - } - ], + "outputs": [], "source": [ "job = aiplatform.PipelineJob(\n", " job_id=display_name,\n", - " display_name=(display_name + str(random.randint(1, 2**12))),\n", - " pipeline_root=os.path.join(BUCKET_URI, display_name),\n", + " display_name=display_name,\n", + " pipeline_root=pipeline_root,\n", " template_path=TEMPLATE_URI,\n", " parameter_values=parameters,\n", " enable_caching=False,\n", @@ -922,30 +679,21 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": null, "metadata": { "id": "JLUOJFjA38ja" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'autosxs_model_a_win_rate': 0.13,\n", - " 'autosxs_model_b_win_rate': 0.87,\n", - " 'human_preference_model_a_win_rate': 0.26,\n", - " 'human_preference_model_b_win_rate': 0.74}\n" - ] - } - ], + "outputs": [], "source": [ - "for details in job.task_details:\n", - " if details.task_name == \"model-evaluation-text-generation-pairwise\":\n", - " break\n", + "human_eval_task = [\n", + " task\n", + " for task in job.task_details\n", + " if task.task_name == \"model-evaluation-text-generation-pairwise\"\n", + "][0]\n", "\n", "human_aligned_metrics = {\n", " k: round(v, 3)\n", - " for k, v in MessageToDict(details.outputs[\"autosxs_metrics\"]._pb)[\n", + " for k, v in MessageToDict(human_eval_task.outputs[\"autosxs_metrics\"]._pb)[\n", " \"artifacts\"\n", " ][0][\"metadata\"].items()\n", " if \"win_rate\" in k\n", From 0a5670b5eeb65ec18eda8d95159b22ad6d187565 Mon Sep 17 00:00:00 2001 From: BenoitDherin Date: Wed, 17 Apr 2024 20:50:38 +0000 Subject: [PATCH 4/4] final cleanup --- .../solutions/vertex_llm_evaluation.ipynb | 188 +++--------------- 1 file changed, 27 insertions(+), 161 deletions(-) diff --git a/notebooks/vertex_genai/solutions/vertex_llm_evaluation.ipynb b/notebooks/vertex_genai/solutions/vertex_llm_evaluation.ipynb index 5bf65703..7c1ae5aa 
100644 --- a/notebooks/vertex_genai/solutions/vertex_llm_evaluation.ipynb +++ b/notebooks/vertex_genai/solutions/vertex_llm_evaluation.ipynb @@ -40,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "id": "PyQmSRbKA8r-", "tags": [] @@ -59,20 +59,12 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "id": "oM1iC_MfAts1", "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Your project ID is set to dherin-dev\n" - ] - } - ], + "outputs": [], "source": [ "project_id_list = !gcloud config get-value project 2> /dev/null\n", "PROJECT_ID = project_id_list[0]\n", @@ -102,7 +94,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "id": "NIq7R4HZCfIc", "tags": [] @@ -125,7 +117,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { "id": "j4KEcQEWROby", "tags": [] @@ -182,107 +174,12 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { "id": "R-_ettKRxfxT", "tags": [] }, - "outputs": [ - { - "data": { - "text/html": [ - "
<div>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- "  <thead>\n",
- "    <tr style=\"text-align: right;\">\n",
- "      <th></th>\n",
- "      <th>id</th>\n",
- "      <th>document</th>\n",
- "      <th>response_a</th>\n",
- "      <th>response_b</th>\n",
- "    </tr>\n",
- "  </thead>\n",
- "  <tbody>\n",
- "    <tr>\n",
- "      <th>0</th>\n",
- "      <td>40159674</td>\n",
- "      <td>The 33-year-old, capped 81 times by the Republ...</td>\n",
- "      <td>Whelan joined Aston Villa on a free transfer f...</td>\n",
- "      <td>- Glenn Whelan, 33, has joined Aston Villa fro...</td>\n",
- "    </tr>\n",
- "    <tr>\n",
- "      <th>1</th>\n",
- "      <td>36925620</td>\n",
- "      <td>Paul Frew, MLA for North Antrim, appeared in c...</td>\n",
- "      <td>- A 15-year-old girl is suing DUP MLA Paul Fre...</td>\n",
- "      <td>- Paul Frew, MLA for North Antrim, appeared in...</td>\n",
- "    </tr>\n",
- "    <tr>\n",
- "      <th>2</th>\n",
- "      <td>39729595</td>\n",
- "      <td>Party leader Jeremy Corbyn said half of the ho...</td>\n",
- "      <td>Labour says it will build 100,000 new homes a ...</td>\n",
- "      <td>- Labour leader Jeremy Corbyn announced a hous...</td>\n",
- "    </tr>\n",
- "    <tr>\n",
- "      <th>3</th>\n",
- "      <td>16788700</td>\n",
- "      <td>No clear winner emerged between incumbent Jean...</td>\n",
- "      <td>The African Union summit in Addis Ababa ended ...</td>\n",
- "      <td>The African Union (AU) summit in Addis Ababa, ...</td>\n",
- "    </tr>\n",
- "    <tr>\n",
- "      <th>4</th>\n",
- "      <td>19389625</td>\n",
- "      <td>The ban has been called by opposition coalitio...</td>\n",
- "      <td>Opposition coalition in Togo has called for a ...</td>\n",
- "      <td>Togo's opposition coalition, Let's Save Togo, ...</td>\n",
- "    </tr>\n",
- "  </tbody>\n",
- "</table>\n",
- "</div>
" - ], - "text/plain": [ - " id document \\\n", - "0 40159674 The 33-year-old, capped 81 times by the Republ... \n", - "1 36925620 Paul Frew, MLA for North Antrim, appeared in c... \n", - "2 39729595 Party leader Jeremy Corbyn said half of the ho... \n", - "3 16788700 No clear winner emerged between incumbent Jean... \n", - "4 19389625 The ban has been called by opposition coalitio... \n", - "\n", - " response_a \\\n", - "0 Whelan joined Aston Villa on a free transfer f... \n", - "1 - A 15-year-old girl is suing DUP MLA Paul Fre... \n", - "2 Labour says it will build 100,000 new homes a ... \n", - "3 The African Union summit in Addis Ababa ended ... \n", - "4 Opposition coalition in Togo has called for a ... \n", - "\n", - " response_b \n", - "0 - Glenn Whelan, 33, has joined Aston Villa fro... \n", - "1 - Paul Frew, MLA for North Antrim, appeared in... \n", - "2 - Labour leader Jeremy Corbyn announced a hous... \n", - "3 The African Union (AU) summit in Addis Ababa, ... \n", - "4 Togo's opposition coalition, Let's Save Togo, ... " - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "evaluation_gemini_df = pd.read_json(EVALUATION_FILE_URI, lines=True)\n", "evaluation_gemini_df.head()" @@ -300,7 +197,7 @@ "\n", "* `evaluation_dataset` to indicate where the evaluation dataset location. In this case, it is the JSONL Cloud bucket URI.\n", "\n", - "* `id_colums` to distinguish evaluation examples that are unique. Here, as you can imagine, your have `id` and `document` fields.\n", + "* `id_colums` to distinguish evaluation examples that are unique. Here, as you can imagine, your have `id` and `document` fields. These fields will be added in the judgment table generated by AutoSxS.\n", "\n", "* `task` to indicate the task type you want to evaluate. It can be `summarization` or `question_answer`. 
In this case you have `summarization`.\n", "\n", @@ -318,7 +215,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "id": "Cp7e-hOmNMhA", "tags": [] @@ -331,7 +228,7 @@ "\n", "parameters = {\n", " \"evaluation_dataset\": EVALUATION_FILE_URI,\n", - " \"id_columns\": [\"id\"],\n", + " \"id_columns\": [\"id\", \"document\"],\n", " \"task\": \"summarization\",\n", " \"autorater_prompt_parameters\": {\n", " \"inference_context\": {\"column\": \"document\"},\n", @@ -353,37 +250,12 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": { "id": "AjFHT5ze9m4L", "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Creating PipelineJob\n", - "PipelineJob created. Resource name: projects/115851500182/locations/us-central1/pipelineJobs/autosxs-1713383655691384\n", - "To use this PipelineJob in another session:\n", - "pipeline_job = aiplatform.PipelineJob.get('projects/115851500182/locations/us-central1/pipelineJobs/autosxs-1713383655691384')\n", - "View Pipeline Job:\n", - "https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/autosxs-1713383655691384?project=115851500182\n", - "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-1713383655691384 current state:\n", - "PipelineState.PIPELINE_STATE_RUNNING\n", - "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-1713383655691384 current state:\n", - "PipelineState.PIPELINE_STATE_RUNNING\n", - "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-1713383655691384 current state:\n", - "PipelineState.PIPELINE_STATE_RUNNING\n", - "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-1713383655691384 current state:\n", - "PipelineState.PIPELINE_STATE_RUNNING\n", - "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-1713383655691384 current state:\n", - 
"PipelineState.PIPELINE_STATE_RUNNING\n", - "PipelineJob projects/115851500182/locations/us-central1/pipelineJobs/autosxs-1713383655691384 current state:\n", - "PipelineState.PIPELINE_STATE_RUNNING\n" - ] - } - ], + "outputs": [], "source": [ "job = aiplatform.PipelineJob(\n", " job_id=display_name,\n", @@ -525,7 +397,8 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "w2RISjQSJk9R" + "id": "w2RISjQSJk9R", + "tags": [] }, "outputs": [], "source": [ @@ -548,7 +421,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "def print_aggregated_metrics(scores):\n", @@ -597,7 +472,8 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "mbfsO2uw9-i5" + "id": "mbfsO2uw9-i5", + "tags": [] }, "outputs": [], "source": [ @@ -633,7 +509,7 @@ "\n", "parameters = {\n", " \"evaluation_dataset\": HUMAN_EVALUATION_FILE_URI,\n", - " \"id_columns\": [\"id\"],\n", + " \"id_columns\": [\"id\", \"document\"],\n", " \"task\": \"summarization\",\n", " \"autorater_prompt_parameters\": {\n", " \"inference_context\": {\"column\": \"document\"},\n", @@ -649,7 +525,8 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "KbhIPY-_3SSB" + "id": "KbhIPY-_3SSB", + "tags": [] }, "outputs": [], "source": [ @@ -681,7 +558,8 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "JLUOJFjA38ja" + "id": "JLUOJFjA38ja", + "tags": [] }, "outputs": [], "source": [ @@ -710,7 +588,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "def print_human_preference_metrics(metrics):\n", @@ -725,20 +605,6 @@ "pprint.pprint(human_aligned_metrics)" ] }, - { - "cell_type": "markdown", - "metadata": { - "id": "TpV-iwP9qw9c" - }, - "source": [ - "## Cleaning up\n", - "\n", - "To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud\n", - 
"project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial. Clicking the __End Lab__ button in your lab instructions will take care of this for you.\n", - "\n", - "Otherwise, you can delete the individual resources you created in this tutorial." - ] - }, { "cell_type": "markdown", "metadata": {},