adding python sdk for NIM on AzureML #39

Merged · 1 commit · Jul 12, 2024
21 changes: 21 additions & 0 deletions cloud-service-providers/azure/azureml/python_sdk/README.md
@@ -0,0 +1,21 @@
# Instructions for deploying NIM models on AzureML using Python SDK

In this example, we will deploy the Llama 3 8B Instruct model on AzureML using the Python SDK.

**Prerequisites:**
- [NGC API Key](https://catalog.ngc.nvidia.com/)
- [AzureML workspace](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-manage-workspace?view=azureml-api-2&tabs=python)

1. Provision the compute instance using the Jupyter notebook `provision aml-compute.ipynb`. This sets up a 1xA100 GPU compute instance on AzureML (a minimal SDK sketch follows this list). You can run this Jupyter notebook from your local machine.

2. Once this notebook completes successfully, you will get the URL of the Jupyter server running on the AzureML compute instance, as shown below (_note: your URL will differ_). You can then paste the URL into your local machine's browser.
```bash

{'display_name': 'Jupyter Lab', 'endpoint_uri': 'https://mayani-gpu-ci.swedencentral.instances.azureml.ms/lab'}].....

```

3. Run the notebook `nim-azureml-compute.ipynb` from this repository on the Jupyter server running on the AzureML compute node, as shown in the image below.
![image](imgs/browser.png)
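
For reference, here is a minimal sketch of what the provisioning step does, using the `azure-ai-ml` (Python SDK v2) package. The subscription/workspace identifiers, the instance name, and the VM size are placeholders and assumptions; `provision aml-compute.ipynb` remains the authoritative version.

```python
# Minimal provisioning sketch with the AzureML Python SDK v2.
# All <...> values and the instance name are placeholders.
from azure.ai.ml import MLClient
from azure.ai.ml.entities import ComputeInstance
from azure.identity import DefaultAzureCredential

ml_client = MLClient(
    DefaultAzureCredential(),
    subscription_id="<SUBSCRIPTION_ID>",
    resource_group_name="<RESOURCE_GROUP>",
    workspace_name="<WORKSPACE_NAME>",
)

# Standard_NC24ads_A100_v4 provides a single A100 80GB GPU.
instance = ComputeInstance(name="nim-gpu-ci", size="Standard_NC24ads_A100_v4")
ml_client.compute.begin_create_or_update(instance).result()

# The compute's services list includes the Jupyter Lab endpoint URI
# shown in step 2.
print(ml_client.compute.get("nim-gpu-ci").services)
```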


Binary file added: cloud-service-providers/azure/azureml/python_sdk/imgs/browser.png (not displayed)
309 changes: 309 additions & 0 deletions cloud-service-providers/azure/azureml/python_sdk/nim-azureml-compute.ipynb
@@ -0,0 +1,309 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "be9cbb03-afe2-4c0a-96c2-bcfa2dfb7e65",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Thu Jul 11 21:57:03 2024 \n",
"+---------------------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 535.171.04 Driver Version: 535.171.04 CUDA Version: 12.2 |\n",
"|-----------------------------------------+----------------------+----------------------+\n",
"| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
"| | | MIG M. |\n",
"|=========================================+======================+======================|\n",
"| 0 NVIDIA A100 80GB PCIe On | 00000001:00:00.0 Off | 0 |\n",
"| N/A 32C P0 41W / 300W | 0MiB / 81920MiB | 0% Default |\n",
"| | | Disabled |\n",
"+-----------------------------------------+----------------------+----------------------+\n",
" \n",
"+---------------------------------------------------------------------------------------+\n",
"| Processes: |\n",
"| GPU GI CI PID Type Process name GPU Memory |\n",
"| ID ID Usage |\n",
"|=======================================================================================|\n",
"| No running processes found |\n",
"+---------------------------------------------------------------------------------------+\n"
]
}
],
"source": [
"!nvidia-smi"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "247b5c45-c8f3-4cb4-8eea-823a04d3c3ea",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"NGC API Key: ········\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Login Succeeded\n",
"\n",
"WARNING! Your password will be stored unencrypted in /home/azureuser/.docker/config.json.\n",
"Configure a credential helper to remove this warning. See\n",
"https://docs.docker.com/engine/reference/commandline/login/#credentials-store\n",
"\n",
"\n"
]
}
],
"source": [
"import getpass\n",
"import subprocess\n",
"import os\n",
"\n",
"# Prompt for NGC API key\n",
"ngc_api_key = getpass.getpass(\"NGC API Key: \")\n",
"\n",
"# Log in to the Docker registry\n",
"login_command = f\"echo {ngc_api_key} | docker login nvcr.io -u '$oauthtoken' --password-stdin\"\n",
"login_result = subprocess.run(login_command, shell=True, capture_output=True, text=True)\n",
"print(login_result.stdout)\n",
"print(login_result.stderr)\n",
"\n",
"# Check if login was successful\n",
"if login_result.returncode != 0:\n",
" raise Exception(\"Docker login failed\")\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "3990ed25-a430-40b1-bd3b-d0e6c4532709",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Set environment variables\n",
"os.environ[\"CONTAINER_NAME\"] = \"llama3-8b-instruct\"\n",
"os.environ[\"IMG_NAME\"] = f\"nvcr.io/nim/meta/{os.environ['CONTAINER_NAME']}:1.0.0\"\n",
"os.environ[\"NGC_API_KEY\"]=\"<Your NGC API key>\"\n",
"os.environ[\"LOCAL_NIM_CACHE\"] = \"/mnt/batch/tasks/shared/LS_root/mounts/clusters/mayani-gpu-ci/code/.cache/nim\"\n",
"\n",
"# Create the cache directory\n",
"os.makedirs(os.environ[\"LOCAL_NIM_CACHE\"], exist_ok=True)\n",
"\n",
"# Define the docker run command without -it and with -d\n",
"docker_command = [\n",
" \"docker\", \"run\", \"-d\", \"--rm\",\n",
" f\"--name={os.environ['CONTAINER_NAME']}\",\n",
" \"--gpus\", \"all\",\n",
" \"-e\", f\"{os.environ['NGC_API_KEY']}\",\n",
" \"-v\", f\"{os.environ['LOCAL_NIM_CACHE']}:/opt/nim/.cache\",\n",
" \"-u\", str(os.getuid()),\n",
" \"-p\", \"8000:8000\",\n",
" os.environ[\"IMG_NAME\"]\n",
"]\n",
"\n",
"# Execute the docker run command\n",
"result = subprocess.run(docker_command, capture_output=True, text=True)\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "da55dd96-0d73-4039-a561-aa40f10fcaef",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"stdout: 9a7d3f7cc0fceeb3a6dc5a80dad6e859c8bef94f9963f2e251f374102601d436\n",
"\n",
"stderr: \n",
"Container started successfully with ID: 9a7d3f7cc0fceeb3a6dc5a80dad6e859c8bef94f9963f2e251f374102601d436\n",
"Container logs:\n",
"\n",
"===========================================\n",
"== NVIDIA Inference Microservice LLM NIM ==\n",
"===========================================\n",
"\n",
"NVIDIA Inference Microservice LLM NIM Version 1.0.0\n",
"Model: nim/meta/llama3-8b-instruct\n",
"\n",
"Container image Copyright (c) 2016-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n",
"\n",
"This NIM container is governed by the NVIDIA AI Product Agreement here:\n",
"https://www.nvidia.com/en-us/data-center/products/nvidia-ai-enterprise/eula/.\n",
"A copy of this license can be found under /opt/nim/LICENSE.\n",
"\n",
"The use of this model is governed by the AI Foundation Models Community License\n",
"here: https://docs.nvidia.com/ai-foundation-models-community-license.pdf.\n",
"\n",
"ADDITIONAL INFORMATION: Meta Llama 3 Community License, Built with Meta Llama 3. \n",
"A copy of the Llama 3 license can be found under /opt/nim/MODEL_LICENSE.\n",
"\n",
"\n"
]
}
],
"source": [
"import subprocess\n",
"import os\n",
"import getpass\n",
"\n",
"# Prompt for NGC API key if not set\n",
"if \"NGC_API_KEY\" not in os.environ:\n",
" os.environ[\"NGC_API_KEY\"] = getpass.getpass(\"NGC API Key: \")\n",
"\n",
"# Set environment variables\n",
"container_name = \"llama3-8b-instruct\"\n",
"img_name = f\"nvcr.io/nim/meta/{container_name}:1.0.0\"\n",
"local_nim_cache = \"/mnt/batch/tasks/shared/LS_root/mounts/clusters/mayani-gpu-ci/code/.cache/nim\" # this should be the path where you want to store the cache\n",
"\n",
"# Create the cache directory\n",
"os.makedirs(local_nim_cache, exist_ok=True)\n",
"\n",
"# Define the docker run command without -it and with -d\n",
"docker_command = [\n",
" \"docker\", \"run\", \"-d\", \"--rm\",\n",
" f\"--name={container_name}\",\n",
" \"--gpus\", \"all\",\n",
" \"-e\", f\"NGC_API_KEY={os.environ['NGC_API_KEY']}\",\n",
" \"-v\", f\"{local_nim_cache}:/opt/nim/.cache\",\n",
" \"-u\", str(os.getuid()),\n",
" \"-p\", \"8000:8000\",\n",
" img_name\n",
"]\n",
"\n",
"# Execute the docker run command\n",
"result = subprocess.run(docker_command, capture_output=True, text=True)\n",
"print(\"stdout:\", result.stdout)\n",
"print(\"stderr:\", result.stderr)\n",
"\n",
"# Check if the container started successfully\n",
"if result.returncode == 0:\n",
" container_id = result.stdout.strip()\n",
" print(f\"Container started successfully with ID: {container_id}\")\n",
"\n",
" # Optionally, check the logs of the container\n",
" logs_command = [\"docker\", \"logs\", container_id]\n",
" logs_result = subprocess.run(logs_command, capture_output=True, text=True)\n",
" print(\"Container logs:\")\n",
" print(logs_result.stdout)\n",
" print(logs_result.stderr)\n",
"else:\n",
" print(\"Failed to start the container\")"
]
},
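{
"cell_type": "markdown",
"id": "nim-health-check-md",
"metadata": {},
"source": [
"Added sketch (not part of the original run): poll the standard NIM readiness endpoint `/v1/health/ready` so requests are only sent once the model has finished downloading and loading. The 10-minute polling budget is an assumption."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "nim-health-check",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: wait until the NIM server reports ready before querying it.\n",
"# Assumes the container above is publishing port 8000 on this host.\n",
"import time\n",
"import requests\n",
"\n",
"for _ in range(60):  # poll for up to ~10 minutes\n",
"    try:\n",
"        r = requests.get(\"http://localhost:8000/v1/health/ready\", timeout=5)\n",
"        if r.status_code == 200:\n",
"            print(\"NIM is ready\")\n",
"            break\n",
"    except requests.exceptions.ConnectionError:\n",
"        pass  # server not accepting connections yet\n",
"    time.sleep(10)\n",
"else:\n",
"    print(\"NIM did not become ready in time; check docker logs\")"
]
},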
{
"cell_type": "code",
"execution_count": 24,
"id": "ed95bad4-d477-4c58-a0b8-fbf5732575eb",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES\n",
"32fdd3fe65a4 localhost/c3:latest \"/usr/local/bin/ice\" About an hour ago Up About an hour c3-progenitor\n"
]
}
],
"source": [
"!docker container ps -a"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "ee094db9-c986-4e84-a023-56782d6f8837",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Status Code: 200\n",
"Response Body: {'id': 'cmpl-9f5372bd74444ce09a68b6335c6f9905', 'object': 'text_completion', 'created': 1720736060, 'model': 'meta/llama3-8b-instruct', 'choices': [{'index': 0, 'text': ', there was a lovely little girl name Sophie. She was eight years old and lived in a small town in the countryside. Sophie had a big heart and always tried to do the right thing, even if it meant going against the crowd.\\nOne day, Sophie discovered that her cat, Mr. Whiskers, was', 'logprobs': None, 'finish_reason': 'length', 'stop_reason': None}], 'usage': {'prompt_tokens': 5, 'total_tokens': 69, 'completion_tokens': 64}}\n"
]
}
],
"source": [
"import requests\n",
"import json\n",
"\n",
"# Define the URL and headers\n",
"url = 'http://0.0.0.0:8000/v1/completions'\n",
"headers = {\n",
" 'accept': 'application/json',\n",
" 'Content-Type': 'application/json'\n",
"}\n",
"\n",
"# Define the payload\n",
"payload = {\n",
" \"model\": \"meta/llama3-8b-instruct\",\n",
" \"prompt\": \"Once upon a time\",\n",
" \"max_tokens\": 64\n",
"}\n",
"\n",
"# Make the POST request\n",
"response = requests.post(url, headers=headers, data=json.dumps(payload))\n",
"\n",
"# Print the response\n",
"print(\"Status Code:\", response.status_code)\n",
"print(\"Response Body:\", response.json())"
]
},
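{
"cell_type": "markdown",
"id": "nim-chat-completions-md",
"metadata": {},
"source": [
"Added sketch: NIM also serves the OpenAI-compatible `/v1/chat/completions` route, which is typically the better fit for an instruct-tuned model like this one. The prompt below is illustrative."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "nim-chat-completions",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: query the OpenAI-compatible chat completions endpoint.\n",
"import requests\n",
"\n",
"payload = {\n",
"    \"model\": \"meta/llama3-8b-instruct\",\n",
"    \"messages\": [{\"role\": \"user\", \"content\": \"Tell me a one-sentence story.\"}],\n",
"    \"max_tokens\": 64\n",
"}\n",
"\n",
"response = requests.post(\n",
"    \"http://localhost:8000/v1/chat/completions\",\n",
"    headers={\"accept\": \"application/json\", \"Content-Type\": \"application/json\"},\n",
"    json=payload,\n",
")\n",
"print(\"Status Code:\", response.status_code)\n",
"print(\"Response Body:\", response.json())"
]
},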
{
"cell_type": "code",
"execution_count": null,
"id": "b3437882-6834-4ffa-a959-8b1a4cbc4786",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8 - AzureML",
"language": "python",
"name": "python38-azureml"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
}
},
"nbformat": 4,
"nbformat_minor": 5
}