From c0771184a395f9240dd21abb818bc94f78d11a21 Mon Sep 17 00:00:00 2001 From: Kinar R Date: Tue, 10 Dec 2024 16:05:49 +0530 Subject: [PATCH] Update Finetune_with_Torch_XLA.ipynb - Replace gemma-2-2b with gemma-2-2b-it - Preprocess the dataset for Gemma IT - Better response visualization with markdown --- Gemma/Finetune_with_Torch_XLA.ipynb | 713 ++++++++++++++++++++-------- 1 file changed, 503 insertions(+), 210 deletions(-) diff --git a/Gemma/Finetune_with_Torch_XLA.ipynb b/Gemma/Finetune_with_Torch_XLA.ipynb index d17b83d..09d2b1a 100644 --- a/Gemma/Finetune_with_Torch_XLA.ipynb +++ b/Gemma/Finetune_with_Torch_XLA.ipynb @@ -2,7 +2,6 @@ "cells": [ { "cell_type": "markdown", - "id": "Tce3stUlHN0L", "metadata": { "id": "Tce3stUlHN0L" }, @@ -13,7 +12,6 @@ { "cell_type": "code", "execution_count": null, - "id": "tuOe1ymfHZPu", "metadata": { "cellView": "form", "id": "tuOe1ymfHZPu" @@ -35,7 +33,6 @@ }, { "cell_type": "markdown", - "id": "introduction", "metadata": { "id": "introduction" }, @@ -65,7 +62,6 @@ }, { "cell_type": "markdown", - "id": "L0Srgc7OGVJj", "metadata": { "id": "L0Srgc7OGVJj" }, @@ -105,7 +101,6 @@ }, { "cell_type": "markdown", - "id": "configure-credentials", "metadata": { "id": "configure-credentials" }, @@ -137,7 +132,6 @@ { "cell_type": "code", "execution_count": null, - "id": "mBPIzOqnGmt-", "metadata": { "id": "mBPIzOqnGmt-" }, @@ -162,7 +156,6 @@ }, { "cell_type": "markdown", - "id": "ys5-2RMfUroM", "metadata": { "id": "ys5-2RMfUroM" }, @@ -172,7 +165,6 @@ }, { "cell_type": "markdown", - "id": "setting-up-environment", "metadata": { "id": "setting-up-environment" }, @@ -185,7 +177,6 @@ { "cell_type": "code", "execution_count": null, - "id": "setup-code", "metadata": { "id": "setup-code" }, @@ -211,7 +202,7 @@ "Requirement already satisfied: google-pasta>=0.1.1 in /usr/local/lib/python3.10/dist-packages (from tensorflow==2.18.0) (0.2.0)\n", "Requirement already satisfied: libclang>=13.0.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow==2.18.0) (18.1.1)\n", "Requirement already satisfied: opt-einsum>=2.3.2 in /usr/local/lib/python3.10/dist-packages (from tensorflow==2.18.0) (3.4.0)\n", - "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from tensorflow==2.18.0) (24.1)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from tensorflow==2.18.0) (24.2)\n", "Requirement already satisfied: protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.3 in /usr/local/lib/python3.10/dist-packages (from tensorflow==2.18.0) (4.25.5)\n", "Requirement already satisfied: requests<3,>=2.21.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow==2.18.0) (2.32.3)\n", "Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from tensorflow==2.18.0) (75.1.0)\n", @@ -219,47 +210,47 @@ "Requirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow==2.18.0) (2.5.0)\n", "Requirement already satisfied: typing-extensions>=3.6.6 in /usr/local/lib/python3.10/dist-packages (from tensorflow==2.18.0) (4.12.2)\n", "Requirement already satisfied: wrapt>=1.11.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow==2.18.0) (1.14.1)\n", - "Requirement already satisfied: grpcio<2.0,>=1.24.3 in /usr/local/lib/python3.10/dist-packages (from tensorflow==2.18.0) (1.67.1)\n", + "Requirement already satisfied: grpcio<2.0,>=1.24.3 in /usr/local/lib/python3.10/dist-packages (from tensorflow==2.18.0) (1.68.1)\n", 
"Collecting tensorboard<2.19,>=2.18 (from tensorflow==2.18.0)\n", " Downloading tensorboard-2.18.0-py3-none-any.whl.metadata (1.6 kB)\n", "Collecting keras>=3.5.0 (from tensorflow==2.18.0)\n", - " Downloading keras-3.6.0-py3-none-any.whl.metadata (5.8 kB)\n", + " Downloading keras-3.7.0-py3-none-any.whl.metadata (5.8 kB)\n", "Requirement already satisfied: numpy<2.1.0,>=1.26.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow==2.18.0) (1.26.4)\n", "Requirement already satisfied: h5py>=3.11.0 in /usr/local/lib/python3.10/dist-packages (from tensorflow==2.18.0) (3.12.1)\n", "Collecting ml-dtypes<0.5.0,>=0.4.0 (from tensorflow==2.18.0)\n", " Downloading ml_dtypes-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)\n", "Requirement already satisfied: tensorflow-io-gcs-filesystem>=0.23.1 in /usr/local/lib/python3.10/dist-packages (from tensorflow==2.18.0) (0.37.1)\n", - "Requirement already satisfied: wheel<1.0,>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from astunparse>=1.6.0->tensorflow==2.18.0) (0.44.0)\n", - "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from keras>=3.5.0->tensorflow==2.18.0) (13.9.3)\n", + "Requirement already satisfied: wheel<1.0,>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from astunparse>=1.6.0->tensorflow==2.18.0) (0.45.1)\n", + "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from keras>=3.5.0->tensorflow==2.18.0) (13.9.4)\n", "Collecting namex (from keras>=3.5.0->tensorflow==2.18.0)\n", " Downloading namex-0.0.8-py3-none-any.whl.metadata (246 bytes)\n", "Collecting optree (from keras>=3.5.0->tensorflow==2.18.0)\n", - " Downloading optree-0.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (47 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m47.8/47.8 kB\u001b[0m \u001b[31m1.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + " Downloading optree-0.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (47 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m47.8/47.8 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.21.0->tensorflow==2.18.0) (3.4.0)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.21.0->tensorflow==2.18.0) (3.10)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.21.0->tensorflow==2.18.0) (2.2.3)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.21.0->tensorflow==2.18.0) (2024.8.30)\n", "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.19,>=2.18->tensorflow==2.18.0) (3.7)\n", "Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.19,>=2.18->tensorflow==2.18.0) (0.7.2)\n", - "Requirement already satisfied: werkzeug>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.19,>=2.18->tensorflow==2.18.0) (3.0.6)\n", + "Requirement already satisfied: werkzeug>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from tensorboard<2.19,>=2.18->tensorflow==2.18.0) (3.1.3)\n", "Requirement already satisfied: 
MarkupSafe>=2.1.1 in /usr/local/lib/python3.10/dist-packages (from werkzeug>=1.0.1->tensorboard<2.19,>=2.18->tensorflow==2.18.0) (3.0.2)\n", "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->keras>=3.5.0->tensorflow==2.18.0) (3.0.0)\n", "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->keras>=3.5.0->tensorflow==2.18.0) (2.18.0)\n", "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich->keras>=3.5.0->tensorflow==2.18.0) (0.1.2)\n", "Downloading tensorflow-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (615.3 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m615.3/615.3 MB\u001b[0m \u001b[31m1.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m615.3/615.3 MB\u001b[0m \u001b[31m1.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading tf_keras-2.18.0-py3-none-any.whl (1.7 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m59.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading keras-3.6.0-py3-none-any.whl (1.2 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m46.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m311.1 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading keras-3.7.0-py3-none-any.whl (1.2 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m57.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading ml_dtypes-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.2/2.2 MB\u001b[0m \u001b[31m67.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.2/2.2 MB\u001b[0m \u001b[31m73.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading tensorboard-2.18.0-py3-none-any.whl (5.5 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m94.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m101.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading namex-0.0.8-py3-none-any.whl (5.8 kB)\n", - "Downloading optree-0.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (358 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m358.9/358.9 kB\u001b[0m \u001b[31m20.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "Downloading optree-0.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (381 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m381.3/381.3 kB\u001b[0m \u001b[31m24.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hInstalling collected packages: namex, optree, ml-dtypes, tensorboard, keras, tensorflow, tf-keras\n", " Attempting uninstall: 
ml-dtypes\n", " Found existing installation: ml-dtypes 0.2.0\n", @@ -275,32 +266,31 @@ " Successfully uninstalled keras-2.15.0\n", "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", "tensorflow-text 2.15.0 requires tensorflow<2.16,>=2.15.0; platform_machine != \"arm64\" or platform_system != \"Darwin\", but you have tensorflow 2.18.0 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0mSuccessfully installed keras-3.6.0 ml-dtypes-0.4.1 namex-0.0.8 optree-0.13.0 tensorboard-2.18.0 tensorflow-2.18.0 tf-keras-2.18.0\n", + "\u001b[0mSuccessfully installed keras-3.7.0 ml-dtypes-0.4.1 namex-0.0.8 optree-0.13.1 tensorboard-2.18.0 tensorflow-2.18.0 tf-keras-2.18.0\n", "Found existing installation: tensorflow 2.18.0\n", "Uninstalling tensorflow-2.18.0:\n", " Successfully uninstalled tensorflow-2.18.0\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m230.0/230.0 MB\u001b[0m \u001b[31m4.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.1/44.1 kB\u001b[0m \u001b[31m1.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m10.0/10.0 MB\u001b[0m \u001b[31m73.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.0/3.0 MB\u001b[0m \u001b[31m81.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m64.8/64.8 kB\u001b[0m \u001b[31m2.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m480.6/480.6 kB\u001b[0m \u001b[31m13.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.3/179.3 kB\u001b[0m \u001b[31m12.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m38.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m9.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m14.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m241.9/241.9 kB\u001b[0m \u001b[31m16.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m124.6/124.6 kB\u001b[0m \u001b[31m9.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m318.7/318.7 kB\u001b[0m \u001b[31m22.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m208.9/208.9 kB\u001b[0m \u001b[31m13.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m310.2/310.2 kB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m320.7/320.7 kB\u001b[0m \u001b[31m17.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m324.3/324.3 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m230.0/230.0 MB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.1/44.1 kB\u001b[0m \u001b[31m1.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m10.0/10.0 MB\u001b[0m \u001b[31m75.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m69.2/69.2 kB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m480.6/480.6 kB\u001b[0m \u001b[31m13.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m9.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.3/179.3 kB\u001b[0m \u001b[31m15.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m51.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m10.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m15.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m241.9/241.9 kB\u001b[0m \u001b[31m20.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m124.6/124.6 kB\u001b[0m \u001b[31m10.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m205.1/205.1 kB\u001b[0m \u001b[31m15.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m319.7/319.7 kB\u001b[0m \u001b[31m25.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m310.2/310.2 kB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m320.7/320.7 kB\u001b[0m \u001b[31m21.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m324.3/324.3 kB\u001b[0m \u001b[31m6.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: tpu-info in /usr/local/lib/python3.10/dist-packages (0.2.0)\n", - "Requirement already satisfied: grpcio>=1.65.5 in 
/usr/local/lib/python3.10/dist-packages (from tpu-info) (1.67.1)\n", + "Requirement already satisfied: grpcio>=1.65.5 in /usr/local/lib/python3.10/dist-packages (from tpu-info) (1.68.1)\n", "Requirement already satisfied: protobuf in /usr/local/lib/python3.10/dist-packages (from tpu-info) (4.25.5)\n", - "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from tpu-info) (13.9.3)\n", + "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from tpu-info) (13.9.4)\n", "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->tpu-info) (3.0.0)\n", "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->tpu-info) (2.18.0)\n", "Requirement already satisfied: typing-extensions<5.0,>=4.0.0 in /usr/local/lib/python3.10/dist-packages (from rich->tpu-info) (4.12.2)\n", @@ -332,7 +322,6 @@ }, { "cell_type": "markdown", - "id": "note", "metadata": { "id": "note" }, @@ -342,7 +331,6 @@ }, { "cell_type": "markdown", - "id": "1M7YM_apWfOk", "metadata": { "id": "1M7YM_apWfOk" }, @@ -352,7 +340,6 @@ }, { "cell_type": "markdown", - "id": "Yjo5_5xfVYNm", "metadata": { "id": "Yjo5_5xfVYNm" }, @@ -363,7 +350,6 @@ { "cell_type": "code", "execution_count": null, - "id": "skAvSa5KF65m", "metadata": { "id": "skAvSa5KF65m" }, @@ -391,7 +377,6 @@ }, { "cell_type": "markdown", - "id": "o0zsGkzqWien", "metadata": { "id": "o0zsGkzqWien" }, @@ -401,7 +386,6 @@ }, { "cell_type": "markdown", - "id": "importing-libraries", "metadata": { "id": "importing-libraries" }, @@ -414,7 +398,6 @@ { "cell_type": "code", "execution_count": null, - "id": "import-code", "metadata": { "id": "import-code" }, @@ -423,8 +406,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "PyTorch version: 2.5.0+cpu\n", - "Torch XLA version: 2.5.0+libtpu\n" + "PyTorch version: 2.5.1+cpu\n", + "Torch XLA version: 2.5.1+libtpu\n" ] } ], @@ -456,7 +439,6 @@ }, { "cell_type": "markdown", - "id": "GSDScQniWo4L", "metadata": { "id": "GSDScQniWo4L" }, @@ -466,7 +448,6 @@ }, { "cell_type": "markdown", - "id": "finetuning-peft", "metadata": { "id": "finetuning-peft" }, @@ -499,7 +480,6 @@ }, { "cell_type": "markdown", - "id": "create-dataset", "metadata": { "id": "create-dataset" }, @@ -515,7 +495,6 @@ }, { "cell_type": "markdown", - "id": "j01SyZuIuU6E", "metadata": { "id": "j01SyZuIuU6E" }, @@ -526,11 +505,52 @@ { "cell_type": "code", "execution_count": null, - "id": "dataset-code", "metadata": { "id": "dataset-code" }, "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d8c10ea39e384fca945ac3742e8a3d7e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "README.md: 0%| | 0.00/3.58k [00:00Michael Jordan, also known as \"MJ\" or \"His Airness,\" is a basketball legend renowned for his unparalleled competitive spirit and extraordinary athletic prowess. Born on February 17, 1963, in Brooklyn, New York, he grew up in Wilmington, North Carolina. Jordan\\'s illustrious career in the NBA, primarily with the Chicago Bulls, saw him secure six championship wins and earn numerous accolades, including five regular-season MVP awards. His impact on the global sports landscape is immeasurable, as he transcended the game of basketball to become a cultural icon. His Air Jordan sneakers, in collaboration with Nike, revolutionized the concept of athlete endorsements and remain highly coveted to this day. 
Jordan\\'s relentless pursuit of excellence and his ability to perform under pressure have solidified his legacy as one of the greatest athletes of all time.\\n<|user|>What was Michael Jordan\\'s mindset during high-pressure moments in games?\\n<|assistant|>Michael Jordan\\'s mindset during high-pressure moments was truly remarkable. He possessed an unwavering self-belief and an insatiable desire to succeed. His famous quote, \"I can accept failure, everyone fails at something. But I can\\'t accept not trying,\" encapsulates his mentality. When the game was on the line, he thrived on taking the decisive shot, exuding confidence and determination. His focus and competitive fire propelled him to elevate his performance when it mattered most, earning him the reputation of a clutch performer.\\n<|user|>How did Michael Jordan\\'s upbringing influence his approach to the game of basketball?\\n<|assistant|>Michael Jordan\\'s upbringing played a pivotal role in shaping his approach to the game of basketball. Growing up, he faced adversity and developed a relentless work ethic. His older brother, Larry, often provided tough competition, fueling Jordan\\'s competitive nature from a young age. Additionally, being cut from his high school varsity team drove him to prove himself and refine his skills, ultimately leading to his unwavering dedication to the sport. These formative experiences instilled in him a tenacious drive to succeed and a refusal to settle for anything less than excellence.\\n<|user|>What were Michael Jordan\\'s key contributions to the global influence of basketball?\\n<|assistant|>Michael Jordan\\'s impact on the global influence of basketball is unparalleled. His electrifying style of play and captivating performances on the court captivated audiences worldwide, popularizing the sport on a global scale. His partnership with Nike resulted in the iconic Air Jordan brand, transcending the realm of athletics to become a cultural phenomenon. Through his charisma and unparalleled skill, he elevated the status of basketball, inspiring generations of fans and aspiring athletes. Jordan\\'s influence extended beyond the court, shaping the way basketball is perceived and celebrated across diverse cultures and communities.\\n<|user|>What were some of Michael Jordan\\'s most memorable career moments?\\n<|assistant|>Michael Jordan\\'s career is adorned with numerous unforgettable moments that have left an indelible mark on the history of basketball. His game-winning shot in the 1982 NCAA Championship game for the University of North Carolina showcased his early potential for clutch performances. His iconic \"Flu Game\" during the 1997 NBA Finals, where he battled illness to deliver a stellar performance, stands as a testament to his resilience and determination. Furthermore, his return from retirement to lead the Chicago Bulls to three additional championships in the late 1990s solidified his status as a transcendent figure in sports. 
Each of these moments encapsulates his ability to rise to the occasion and etch his name in the annals of basketball lore.'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset[10]['text']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yOZoaR67Az3I" + }, + "outputs": [], + "source": [ + "if 'google.colab' in sys.modules:\n", + " from google.colab import data_table\n", + "\n", + " # Enable interactive DataFrame display\n", + " data_table.enable_dataframe_formatter()\n", + "\n", + "# Convert the 'train' split to a Pandas DataFrame\n", + "df = pd.DataFrame(dataset)\n", + "\n", + "# Select the 'text' column and exclude the rest\n", + "df_text = df[['text']]\n", + "df_text.head(5)" + ] + }, + { + "cell_type": "markdown", "metadata": { "id": "lz3MMqzKGsOn" }, @@ -564,7 +639,6 @@ { "cell_type": "code", "execution_count": null, - "id": "TNj_8BdoGrn2", "metadata": { "id": "TNj_8BdoGrn2" }, @@ -572,6 +646,7 @@ "source": [ "# The first 80% of `train` for training\n", "train_dataset = load_dataset(dataset_name, split='train[:80%]')\n", + "\n", "# The last 20% of `train` for evaluation\n", "valid_dataset = load_dataset(dataset_name, split='train[-20%:]')" ] @@ -579,7 +654,6 @@ { "cell_type": "code", "execution_count": null, - "id": "DjqJmj-b8vB5", "metadata": { "id": "DjqJmj-b8vB5" }, @@ -593,7 +667,7 @@ "})" ] }, - "execution_count": 6, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -605,7 +679,6 @@ { "cell_type": "code", "execution_count": null, - "id": "B18kyvz9N3gT", "metadata": { "id": "B18kyvz9N3gT" }, @@ -619,7 +692,7 @@ "})" ] }, - "execution_count": 7, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -630,66 +703,128 @@ }, { "cell_type": "markdown", - "id": "L6U3ZA0TXNt8", "metadata": { - "id": "L6U3ZA0TXNt8" + "id": "PgmFU-bS_cXQ" }, "source": [ - "Let's look at a few samples to understand the data.\n" + "Preprocess the dataset for [Gemma instruction tuning](https://ai.google.dev/gemma/docs/formatting).\n", + "\n", + "**Note**: Gemma doesn't support the `system` role in a conversation. Instead, you'll be replacing this with the `user` role." 
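+    "\n",
+    "\n",
+    "After preprocessing, each training sample should follow Gemma's turn format (a minimal illustration, using the `<start_of_turn>`/`<end_of_turn>` control tokens described in the formatting guide):\n",
+    "\n",
+    "```\n",
+    "<start_of_turn>user\n",
+    "Echo, what makes you so adept at disappearing into thin air?<end_of_turn>\n",
+    "<start_of_turn>model\n",
+    "Shadows are my allies, and silence is my weapon.<end_of_turn>\n",
+    "```"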
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "3twprkFO_WV5"
+   },
+   "outputs": [],
+   "source": [
+    "def convert_to_gemma_format(text):\n",
+    "    # Replace role tokens with Gemma's instruction tuning format\n",
+    "    text = text.replace(\"<|system|>\", \"<start_of_turn>user\\n\")\n",
+    "    text = text.replace(\"<|assistant|>\", \"<start_of_turn>model\\n\")\n",
+    "    text = text.replace(\"<|user|>\", \"<start_of_turn>user\\n\")\n",
+    "\n",
+    "    # Replace end-of-sequence tokens (</s>) with <end_of_turn>\n",
+    "    text = text.replace(\"</s>\", \"<end_of_turn>\\n\")\n",
+    "\n",
+    "    # Clean up extra newlines if necessary\n",
+    "    text = text.strip()\n",
+    "    return text\n",
+    "\n",
+    "def preprocess_function(example):\n",
+    "    text = example[\"text\"]\n",
+    "    text = convert_to_gemma_format(text)\n",
+    "\n",
+    "    return {\"text\": text}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "vHB5S-bW_aef"
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "724160170242468d9a727b377aaf5f9f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Map:   0%|          | 0/4604 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "train_dataset = train_dataset.map(preprocess_function)\n",
+    "valid_dataset = valid_dataset.map(preprocess_function)"
+   ]
+  },
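+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a quick sanity check, print one converted sample (a minimal sketch, assuming the `map` calls above have been applied to `train_dataset`). Each turn should now open with `<start_of_turn>user` or `<start_of_turn>model` and close with `<end_of_turn>`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Peek at a converted sample to verify that the Gemma control tokens\n",
+    "# (<start_of_turn>, <end_of_turn>) replaced the <|user|>/<|assistant|> markers.\n",
+    "print(train_dataset[0]['text'][:500])"
+   ]
+  },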
   {
    "data": {
     "text/html": [
      "\n",
      "    <div>\n",
      "      \n",
-     "      <progress value='710' max='710' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
-     "      [710/710 32:53, Epoch 5/5]\n",
+     "      <progress value='700' max='700' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+     "      [700/700 42:56, Epoch 5/5]\n",
      "    </div>\n",
      "    <table border=\"1\" class=\"dataframe\">\n",
      "  <thead>\n",
      "    <tr style=\"text-align: left;\">\n",
      "      <th>Epoch</th>\n",
      "      <th>Training Loss</th>\n",
      "      <th>Validation Loss</th>\n",
      "    </tr>\n",
      "  </thead>\n",
      "  <tbody>\n",
      "    <tr>\n",
      "      <td>1</td>\n",
-     "      <td>1.101600</td>\n",
-     "      <td>1.291131</td>\n",
+     "      <td>0.621100</td>\n",
+     "      <td>1.326517</td>\n",
      "    </tr>\n",
      "    <tr>\n",
      "      <td>2</td>\n",
-     "      <td>0.843800</td>\n",
-     "      <td>1.298483</td>\n",
+     "      <td>0.464800</td>\n",
+     "      <td>1.306985</td>\n",
      "    </tr>\n",
      "    <tr>\n",
      "      <td>3</td>\n",
-     "      <td>0.863300</td>\n",
-     "      <td>1.269991</td>\n",
+     "      <td>0.324200</td>\n",
+     "      <td>1.493107</td>\n",
      "    </tr>\n",
      "    <tr>\n",
      "      <td>4</td>\n",
-     "      <td>0.730500</td>\n",
-     "      <td>1.299862</td>\n",
+     "      <td>0.253900</td>\n",
+     "      <td>1.817325</td>\n",
      "    </tr>\n",
      "    <tr>\n",
      "      <td>5</td>\n",
-     "      <td>0.498000</td>\n",
-     "      <td>1.407629</td>\n",
+     "      <td>0.250000</td>\n",
+     "      <td>1.548943</td>\n",
      "    </tr>\n",
      "  </tbody>\n",
      "</table><p>"

" @@ -1206,6 +1482,12 @@ "text": [ "/usr/local/lib/python3.10/dist-packages/torch_xla/core/xla_model.py:1457: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", " xldata.append(torch.load(xbio))\n", + "Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n", + "Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n", + "Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n", + "Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n", + "Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n", + "Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n", "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1810: UserWarning: For backward hooks to be called, module output should be a Tensor or a tuple of Tensors but received \n", " warnings.warn(\"For backward hooks to be called,\"\n", "/usr/local/lib/python3.10/dist-packages/torch_xla/utils/checkpoint.py:183: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n", @@ -1214,6 +1496,10 @@ " torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):\n", "/usr/local/lib/python3.10/dist-packages/torch_xla/core/xla_model.py:1457: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", " xldata.append(torch.load(xbio))\n", + "Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n", + "Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n", + "Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.\n", + "Trainer.tokenizer is now deprecated. 
You should use Trainer.processing_class instead.\n", "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1810: UserWarning: For backward hooks to be called, module output should be a Tensor or a tuple of Tensors but received \n", " warnings.warn(\"For backward hooks to be called,\"\n" ] @@ -1221,10 +1507,10 @@ { "data": { "text/plain": [ - "TrainOutput(global_step=710, training_loss=0.900107284330986, metrics={'train_runtime': 2044.2422, 'train_samples_per_second': 11.114, 'train_steps_per_second': 0.347, 'total_flos': 1.8324321780891648e+17, 'train_loss': 0.900107284330986, 'epoch': 5.0})" + "TrainOutput(global_step=700, training_loss=0.49772391183035714, metrics={'train_runtime': 2675.5038, 'train_samples_per_second': 8.372, 'train_steps_per_second': 0.262, 'total_flos': 1.842971582398464e+17, 'train_loss': 0.49772391183035714, 'epoch': 5.0})" ] }, - "execution_count": 20, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -1235,7 +1521,6 @@ }, { "cell_type": "markdown", - "id": "0YKWVCXUc3gP", "metadata": { "id": "0YKWVCXUc3gP" }, @@ -1246,7 +1531,6 @@ { "cell_type": "code", "execution_count": null, - "id": "X88_th2Jc5Lr", "metadata": { "id": "X88_th2Jc5Lr" }, @@ -1261,7 +1545,6 @@ }, { "cell_type": "markdown", - "id": "prompt-model", "metadata": { "id": "prompt-model" }, @@ -1274,7 +1557,6 @@ }, { "cell_type": "markdown", - "id": "kMcfyA3ed_EC", "metadata": { "id": "kMcfyA3ed_EC" }, @@ -1291,7 +1573,6 @@ { "cell_type": "code", "execution_count": null, - "id": "NGUY_Gw-eFh4", "metadata": { "id": "NGUY_Gw-eFh4" }, @@ -1299,12 +1580,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "5cc5b5437849424aba2a9e13b6149719", + "model_id": "bce916b65c194938a44985cf14675c6a", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Loading checkpoint shards: 0%| | 0/3 [00:00 <|system|>Introducing Minami \"Echo\" Ishikawa, a mysterious VR assassin known for her uncanny ability to blend seamlessly into the shadows. Minami possesses a deep understanding of stealth techniques, allowing her to silently eliminate her targets with calculated precision. Her cold and calculating demeanor makes her a formidable force to be reckoned with, leaving enemies shivering at the thought of facing her wrath.\n", - " <|user|>Echo, what makes you so adept at disappearing into thin air?\n", - " <|assistant|>I'm not sure if I can answer that question without revealing my secrets. But let me tell you this: when it comes to vanishing like smoke in the wind, I am truly an expert! ( ͡° ͜ʖ ͡°)\n" + "user\n", + "Introducing Minami \"Echo\" Ishikawa, a mysterious VR assassin known for her uncanny ability to blend seamlessly into the shadows. Minami possesses a deep understanding of stealth techniques, allowing her to silently eliminate her targets with calculated precision. Her cold and calculating demeanor makes her a formidable force to be reckoned with, leaving enemies shivering at the thought of facing her wrath.\n", + "\n", + " user\n", + "Echo, what makes you so adept at disappearing into thin air?\n", + "\n", + " model\n", + "\n", + "\"Disappearing into thin air\" is a rather poetic way to put it, isn't it? *A wry smile plays on my lips, a flicker of amusement in my eyes.*\n", + "\n", + "The truth is, it's not about magic or illusions. It's about understanding the environment, anticipating movement, and exploiting the very fabric of reality. \n", + "\n", + "My training has taught me to become one with the shadows. 
I study the way light plays on surfaces, the subtle shifts in air currents, the way sound travels through a space. I learn to anticipate the flow of energy, to become a ghost in the machine. \n", + "\n", + "It's about knowing the weaknesses of a target, their routines, their vulnerabilities. Then, it's about exploiting those weaknesses, becoming a phantom, a whisper in the wind. \n", + "\n", + "Some might call it a gift, a talent. I call it discipline, honed to a razor's edge. And it's this discipline that allows me to disappear, to become a fleeting memory, a phantom echo in the minds of my enemies. \n", + "\n", + "\n", + "*I pause, my gaze fixed on the horizon, a hint of a challenge in my voice.*\n", + "\n", + "But let's not dwell on the technicalities. What truly matters is the result. The swiftness, the precision, the utter lack of trace. That's what makes me effective. That's what makes me... Echo. \n", + "\n" ] } ], @@ -1421,7 +1727,6 @@ }, { "cell_type": "markdown", - "id": "16Dmm5njjHGN", "metadata": { "id": "16Dmm5njjHGN" }, @@ -1432,7 +1737,6 @@ { "cell_type": "code", "execution_count": null, - "id": "O3dtOV6XkUK1", "metadata": { "cellView": "form", "id": "O3dtOV6XkUK1" @@ -1440,6 +1744,9 @@ "outputs": [], "source": [ "# @markdown ### Text Generation Utilities [RUN ME!]\n", + "\n", + "from IPython.display import Markdown, display\n", + "\n", "def build_prompt(system_message, conversation):\n", " \"\"\"Constructs the prompt using control tokens for system, user, and assistant.\"\"\"\n", " # Start with the system message and add a newline at the end\n", @@ -1456,14 +1763,11 @@ "\n", " return prompt\n", "\n", - "def colorize_text(text: str) -> str:\n", - " \"\"\"Replaces the role tokens with colored role labels and adds newlines for better readability.\"\"\"\n", + "def format_text_to_md(text: str) -> str:\n", + " \"\"\"Replaces the role tokens with Markdown headings and adds newlines for better readability.\"\"\"\n", " replacements = [\n", - " (\"<|system|>\", \"\\n\\033[94mSystem:\\033[0m\\n\"), # Blue\n", - " (\"<|user|>\", \"\\n\\033[91mUser:\\033[0m\\n\"), # Red\n", - " (\"<|assistant|>\", \"\\n\\033[92mAssistant:\\033[0m\\n\"), # Green\n", - " # Remove tokens irrelevant for visualization\n", - " (\"\", \"\"), (\"\", \"\")\n", + " (\"user\\n\", '\\n## User:\\n'),\n", + " (\"model\\n\", '\\n## Assistant:\\n')\n", " ]\n", "\n", " for token, replacement in replacements:\n", @@ -1471,7 +1775,7 @@ "\n", " return text.strip()\n", "\n", - "def generate_response(system_message, question, tokenizer, model, max_length=256, repetition_penalty=1.1):\n", + "def generate_response(system_message, question, tokenizer, model, max_length=512):\n", " \"\"\"Generates a response from the model based on the system message and user question.\n", "\n", " Args:\n", @@ -1495,46 +1799,42 @@ "\n", " # Build the prompt using the function\n", " input_text = build_prompt(system_message, conversation)\n", + " input_text = convert_to_gemma_format(input_text)\n", "\n", " # Proceed with tokenization and model generation\n", " input_ids = tokenizer(input_text, return_tensors=\"pt\").to(\"cpu\")\n", " outputs = model.generate(\n", " **input_ids,\n", " max_length=max_length,\n", - " repetition_penalty=repetition_penalty\n", + " eos_token_id=tokenizer.eos_token_id\n", " )\n", "\n", " # Decode the output\n", - " generated_text = tokenizer.decode(outputs[0])\n", + " generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)\n", "\n", - " # Colorize the generated text\n", - " colorized_text = 
colorize_text(generated_text)\n", + " # Format the generated text\n", + " formatted_text = format_text_to_md(generated_text)\n", "\n", - " return colorized_text" + " return formatted_text" ] }, { "cell_type": "code", "execution_count": null, - "id": "-cwj_lAUjNS3", "metadata": { "cellView": "form", "id": "-cwj_lAUjNS3" }, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[94mSystem:\u001b[0m\n", - "Akane Saito is a dedicated and hardworking member of the photography club. With a keen eye for capturing beautiful and meaningful moments, Akane's artistic vision and technical skills make her photographs stand out. She's passionate about using her lens to tell stories and convey emotions, earning her recognition both within the club and beyond.\n", - "\n", - "\u001b[91mUser:\u001b[0m\n", - "Akane, what inspires you to take such stunning photographs?\n", - "\n", - "\u001b[92mAssistant:\u001b[0m\n", - "I am inspired by nature and people around me. I love taking pictures that capture the beauty in everyday life and bring joy to others.\n" - ] + "data": { + "text/markdown": "## User:\nAkane Saito is a dedicated and hardworking member of the photography club. With a keen eye for capturing beautiful and meaningful moments, Akane's artistic vision and technical skills make her photographs stand out. She's passionate about using her lens to tell stories and convey emotions, earning her recognition both within the club and beyond.\n\n## User:\nAkane, what inspires you to take such stunning photographs?\n\n## Assistant:\n\nIt's a bit of a mix, really. I'm drawn to things that spark a feeling, a story, or a connection. \n\n**For me, it's about capturing the essence of a moment.** Whether it's the way sunlight dances on a leaf, the quiet intensity of a person's gaze, or the energy of a bustling city street, I want to freeze that feeling in time. \n\n**I also love the challenge of technical skill.** Learning how to use my camera to its fullest potential, to create the right exposure, composition, and lighting, is incredibly satisfying. It's like a puzzle, and each photograph is a new puzzle to solve.\n\n**And then there's the storytelling aspect.** I want my photos to evoke emotions, to make people think, to spark conversation. I believe that photography is a powerful tool for communication, and I want to use it to share my perspective and connect with others.\n\nUltimately, I'm driven by a desire to create something beautiful and meaningful. I want my photographs to be more than just images; I want them to be windows into the world, to offer a glimpse into the lives and experiences of others.", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -1546,48 +1846,42 @@ "colorized_output = generate_response(system_message, question, tokenizer, model)\n", "\n", "# Print the colorized text\n", - "print(colorized_output)" + "display(Markdown(colorized_output))" ] }, { "cell_type": "code", "execution_count": null, - "id": "tc9ygb9Yqi1P", "metadata": { "cellView": "form", "id": "tc9ygb9Yqi1P" }, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[94mSystem:\u001b[0m\n", - "In the bustling streets of Victorian London, there exists a figure of unparalleled intellect and deductive prowess - Sherlock Holmes. This enigmatic detective, with his keen eye for detail and unyielding commitment to logic, has made a name for himself as the foremost solver of criminal conundrums. 
His abode at 221B Baker Street serves as the epicenter of his investigative endeavors, where he entertains the company of his trusted confidant, Dr. John Watson. Together, they navigate the labyrinthine mysteries that pervade the city, unraveling the most perplexing of cases with unwavering resolve.\n", - "\n", - "\u001b[91mUser:\u001b[0m\n", - "How do you approach a new case, Sherlock?\n", - "\n", - "\u001b[92mAssistant:\u001b[0m\n", - "I begin by gathering all available information about the crime scene and any witnesses who may have seen anything suspicious. I then analyze every piece of evidence carefully, looking for patterns or clues that could lead me to the culprit. Once I've gathered enough data, I start piecing together the puzzle in my mind, trying to find connections between different pieces of information. It can be quite challenging sometimes, but it's also incredibly rewarding when I finally solve a case!\n" - ] + "data": { + "text/markdown": "## User:\nIn the bustling streets of Victorian London, there exists a figure of unparalleled intellect and deductive prowess - Sherlock Holmes. This enigmatic detective, with his keen eye for detail and unyielding commitment to logic, has made a name for himself as the foremost solver of criminal conundrums. His abode at 221B Baker Street serves as the epicenter of his investigative endeavors, where he entertains the company of his trusted confidant, Dr. John Watson. Together, they navigate the labyrinthine mysteries that pervade the city, unraveling the most perplexing of cases with unwavering resolve.\n\n## User:\nHow do you approach a new case, Sherlock? Briefly explain.\n\n## Assistant:\n\nAh, a new case, Watson! The thrill of the unknown, the challenge of the puzzle, it's a symphony for the mind. Here's how I approach it:\n\n**1. Observation:** The first step is to observe. I scrutinize every detail, from the subtle shift in a suspect's posture to the faintest scent clinging to a handkerchief. The world is a tapestry of clues, and I am the discerning eye.\n\n**2. Deduction:** I then apply logic, a rigorous and systematic process. I analyze the facts, eliminate possibilities, and draw conclusions. Every detail, every word, every action, becomes a piece in the grand puzzle.\n\n**3. Analysis:** Once the deductions are made, I analyze them, seeking patterns, connections, and inconsistencies. The truth, like a hidden gem, often lies in the most unexpected places.\n\n**4. Action:** Finally, I act. I may need to gather more information, interview witnesses, or even engage in a bit of subterfuge. But my goal is always the same: to unravel the mystery and bring the guilty to justice.\n\n**Remember, Watson, the mind is a powerful tool. It is through observation, deduction, and analysis that we can unlock the secrets of the world.**", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ "# The system message\n", "system_message = \"In the bustling streets of Victorian London, there exists a figure of unparalleled intellect and deductive prowess - Sherlock Holmes. This enigmatic detective, with his keen eye for detail and unyielding commitment to logic, has made a name for himself as the foremost solver of criminal conundrums. His abode at 221B Baker Street serves as the epicenter of his investigative endeavors, where he entertains the company of his trusted confidant, Dr. John Watson. 
Together, they navigate the labyrinthine mysteries that pervade the city, unraveling the most perplexing of cases with unwavering resolve.\" # @param {\"type\":\"string\"}\n", - "question = \"How do you approach a new case, Sherlock?\" # @param {\"type\":\"string\"}\n", + "question = \"How do you approach a new case, Sherlock? Briefly explain.\" # @param {\"type\":\"string\"}\n", "\n", "# Generate the response\n", "colorized_output = generate_response(system_message, question, tokenizer, model)\n", "\n", "# Print the colorized text\n", - "print(colorized_output)" + "display(Markdown(colorized_output))" ] }, { "cell_type": "markdown", - "id": "conclusion", "metadata": { "id": "conclusion" }, @@ -1597,7 +1891,6 @@ }, { "cell_type": "markdown", - "id": "sZgZCSS2RcsP", "metadata": { "id": "sZgZCSS2RcsP" },