From 0abf541cb69b6470147205cbd5a6af6ec3f991d4 Mon Sep 17 00:00:00 2001 From: Yusra AlSayyad Date: Sun, 23 Jun 2024 11:04:21 -0700 Subject: [PATCH] Add notebook to compare resources for 1-3 collections --- .../resourceUsagePerCollection.ipynb | 206 ++++++++++++++++++ .../resourceUsagePerCollection.yaml | 32 +++ 2 files changed, 238 insertions(+) create mode 100644 batch-processing/resourceUsagePerCollection.ipynb create mode 100644 batch-processing/resourceUsagePerCollection.yaml diff --git a/batch-processing/resourceUsagePerCollection.ipynb b/batch-processing/resourceUsagePerCollection.ipynb new file mode 100644 index 0000000..06a2d0b --- /dev/null +++ b/batch-processing/resourceUsagePerCollection.ipynb @@ -0,0 +1,206 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "eb2db9a1-0d72-4679-9b53-715e63852f0f", + "metadata": {}, + "outputs": [], + "source": [ + "# Time Square Parameters\n", + "\n", + "repo1 = \"embargo\"\n", + "collection1 = \"LSSTComCam/runs/DRP/20241101_20241211/w_2024_50/DM-48128\"\n", + "\n", + "repo2 = \"embargo\"\n", + "collection2 = \"LSSTComCam/runs/DRP/20241101_20241204/w_2024_49/DM-47988\"\n", + "\n", + "repo3 = \"embargo\"\n", + "collection3 = \"LSSTComCam/runs/DRP/20241101_20241127/w_2024_48/DM-47841\"" + ] + }, + { + "cell_type": "markdown", + "id": "364d0429-ae78-4658-846d-4a3dcab1c9ce", + "metadata": {}, + "source": [ + "# Resource Usage Comparison Report\n", + "### Compare Runtime and Memory Usage for tasks in up to three collections \n", + "\n", + "* {{params.repo1}}:{{params.collection1}}\n", + "* {{params.repo2}}:{{params.collection2}}\n", + "* {{params.repo3}}:{{params.collection3}}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20833b7d-d675-42b5-807d-862762df4350", + "metadata": {}, + "outputs": [], + "source": [ + "from lsst.daf.butler import Butler, DatasetNotFoundError\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import pandas as pd\n", + "from matplotlib.lines import Line2D" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b2a4d1a0-a675-445f-ac95-924cc946bd3e", + "metadata": {}, + "outputs": [], + "source": [ + "repos = [repo1, repo2, repo3]\n", + "collections = [collection1, collection2, collection3]\n", + "\n", + "dfs = []\n", + "\n", + "for repo, collection in zip(repos, collections):\n", + " if repo and collection:\n", + " butler = Butler(repo, collections=[collection])\n", + " try:\n", + " df = butler.get(\"ResourceUsageSummary\")\n", + " except DatasetNotFoundError as e:\n", + " print(e)\n", + " print(f\"Skipping {repo}:{collection}. \"\n", + " \" Check that build-gather-resource-usage-qg was run on this collection\")\n", + " continue\n", + " df['collection'] = collection\n", + " dfs.append(df)\n", + " \n", + "both = pd.concat(dfs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5cdb0d65-d65b-4082-afe4-e80df34320c2", + "metadata": {}, + "outputs": [], + "source": [ + "def custom_boxplot(data, y, hue, value_prefix, xlabel, sortby, logscale=False):\n", + " versions = sorted(data[hue].drop_duplicates())\n", + "\n", + " tasks = data.groupby(\"task\").sum()[sortby].sort_values(ascending=True).index\n", + " \n", + " palette = sns.color_palette(n_colors=len(versions))\n", + "\n", + " fig, ax = plt.subplots(figsize=(8, 8))\n", + "\n", + " for i, task in enumerate(tasks):\n", + " for j, version in enumerate(versions):\n", + " sub_group = data[(data[y] == task) & (data[hue] == version)]\n", + " if not sub_group.empty:\n", + " min_rt = sub_group[f'{value_prefix}p000'].values[0]\n", + " q1 = sub_group[f'{value_prefix}p032'].values[0]\n", + " median = sub_group[f'{value_prefix}p050'].values[0]\n", + " q3 = sub_group[f'{value_prefix}p068'].values[0]\n", + " max_rt = sub_group[f'{value_prefix}p100'].values[0]\n", + "\n", + " # Calculate the position for the box with offset for each version\n", + " # task index + version index (but start lower depending on how many) \n", + " position = i + (j * 0.25) - (0.125 * (len(versions) - 1)) \n", + "\n", + " ax.plot([min_rt, q1], [position, position], color=palette[j], linestyle='-', lw=1.5)\n", + " ax.plot([q1, q3], [position, position], color=palette[j], lw=6, solid_capstyle='butt')\n", + " ax.plot([q3, max_rt], [position, position], color=palette[j], linestyle='-', lw=1.5)\n", + " ax.plot([median], [position], marker='|', color='k')\n", + " ax.set_yticks(range(len(tasks)))\n", + " ax.set_yticklabels(tasks)\n", + "\n", + " handles = [Line2D([0], [0], color=palette[i], lw=4) for i in range(len(versions))]\n", + " ax.legend(handles, versions, title=hue, loc='lower left', bbox_to_anchor=(0, 1.02), borderaxespad=0, ncol=1)\n", + " \n", + " ax.set_xlabel(xlabel)\n", + " if logscale:\n", + " ax.set_xscale('log')\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "f388943a-48d2-49bd-8b98-484199d2e2ef", + "metadata": {}, + "source": [ + "## Runtime for top 20 longest Tasks, sorted by integrated runtime. \n", + "\n", + "x-axis has linear scaling in the first and logarithmic scaling in the second. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce9a4c74-d93a-42e9-a668-d1fd9c061727", + "metadata": {}, + "outputs": [], + "source": [ + "tasks = both.groupby(\"task\").sum()[\"integrated_runtime_hrs\"].sort_values(ascending=False).head(20).index\n", + "pivoted_df = both[(both.task.isin(tasks))].sort_values('integrated_runtime_hrs', ascending=True) \n", + "custom_boxplot(pivoted_df, y='task', hue='collection', value_prefix=\"runtime_s_\", xlabel=\"runtime (s)\", sortby=\"integrated_runtime_hrs\", logscale=False)\n", + "custom_boxplot(pivoted_df, y='task', hue='collection', value_prefix=\"runtime_s_\", xlabel=\"runtime (s)\", sortby=\"integrated_runtime_hrs\", logscale=True)" + ] + }, + { + "cell_type": "markdown", + "id": "ae943812-ffeb-446f-b849-55383616166e", + "metadata": {}, + "source": [ + "## Tasks with memory usage over 2GB, sorted by max memory usage.\n", + "\n", + "If the processing was run with clustering, the memory usage is reported per cluster not per task. Use caution when interpreting. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bfca017c-28e4-4c09-8489-b64d81276f6b", + "metadata": {}, + "outputs": [], + "source": [ + "pivoted_df = both[(both.task.isin(both[(both.mem_GB_p050 > 2)][\"task\"].unique()))].sort_values('mem_GB_p100', ascending=True) # pd.DataFrame(data)\n", + "custom_boxplot(pivoted_df, y='task', hue='collection', value_prefix=\"mem_GB_\", xlabel=\"Memory (GB)\", sortby='mem_GB_p100', logscale=False)\n", + "custom_boxplot(pivoted_df, y='task', hue='collection', value_prefix=\"mem_GB_\", xlabel=\"Memory (GB)\", sortby='mem_GB_p100', logscale=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf164b14-a69e-406a-8fc1-7a468fd8265a", + "metadata": {}, + "outputs": [], + "source": [ + "# Get top 20 tasks with longest integrated runtime:\n", + "tasks = both.groupby(\"task\").sum()[\"integrated_runtime_hrs\"].sort_values(ascending=False).head(20).index\n", + "df = both[both.task.isin(tasks)].sort_values('integrated_runtime_hrs', ascending=False)\n", + "hue_order = sorted(df[\"collection\"].drop_duplicates())\n", + "\n", + "ax = sns.catplot(data=df, y=\"task\", x=\"integrated_runtime_hrs\", hue=\"collection\", height=8, hue_order=hue_order, kind=\"bar\")\n", + "ax = sns.catplot(data=df, y=\"task\", x=\"quanta\", hue=\"collection\", height=8, hue_order=hue_order, kind=\"bar\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "LSST", + "language": "python", + "name": "lsst" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/batch-processing/resourceUsagePerCollection.yaml b/batch-processing/resourceUsagePerCollection.yaml new file mode 100644 index 0000000..a85af90 --- /dev/null +++ b/batch-processing/resourceUsagePerCollection.yaml @@ -0,0 +1,32 @@ +title: Resource Usage Plots for up to three Collections +description: Generate plots comparing runtime and memory usage stats for up to three collections +authors: + - name: Yusra AlSayyad + slack: yusra +tags: + - batch-processing +parameters: + repo1: + type: string + description: Butler repo of first collection to display + default: "embargo" + collection1: + type: string + description: Collection name containing a resource usage summary to display + default: "LSSTComCam/runs/DRP/20241101_20241211/w_2024_50/DM-48128" + repo2: + type: string + description: Optional Butler repo of second collection to compare + default: "embargo" + collection2: + type: string + description: Optional second collection of resource usage statistics to compare + default: "LSSTComCam/runs/DRP/20241101_20241204/w_2024_49/DM-47988" + repo3: + type: string + description: Optional Butler repo of third collection to compare + default: "" + collection3: + type: string + description: Optional third collection of resource usage statistics to compare + default: "" \ No newline at end of file