Merge pull request #35 from lsst-sqre/tickets/DM-48190

DM-48190: Add notebook to compare resources for 1-3 collections
lsst-sqre · Dec 18, 2024 · cb61b07 · cb61b07
2 parents d0adeae + 0abf541
commit cb61b07
Show file tree

Hide file tree

Showing 2 changed files with 238 additions and 0 deletions.
diff --git a/batch-processing/resourceUsagePerCollection.ipynb b/batch-processing/resourceUsagePerCollection.ipynb
@@ -0,0 +1,206 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eb2db9a1-0d72-4679-9b53-715e63852f0f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Time Square Parameters\n",
+    "\n",
+    "repo1 = \"embargo\"\n",
+    "collection1 = \"LSSTComCam/runs/DRP/20241101_20241211/w_2024_50/DM-48128\"\n",
+    "\n",
+    "repo2 = \"embargo\"\n",
+    "collection2 = \"LSSTComCam/runs/DRP/20241101_20241204/w_2024_49/DM-47988\"\n",
+    "\n",
+    "repo3 = \"embargo\"\n",
+    "collection3 = \"LSSTComCam/runs/DRP/20241101_20241127/w_2024_48/DM-47841\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "364d0429-ae78-4658-846d-4a3dcab1c9ce",
+   "metadata": {},
+   "source": [
+    "# Resource Usage Comparison Report\n",
+    "### Compare Runtime and Memory Usage for tasks in up to three collections \n",
+    "\n",
+    "* {{params.repo1}}:{{params.collection1}}\n",
+    "* {{params.repo2}}:{{params.collection2}}\n",
+    "* {{params.repo3}}:{{params.collection3}}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "20833b7d-d675-42b5-807d-862762df4350",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from lsst.daf.butler import Butler, DatasetNotFoundError\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "import pandas as pd\n",
+    "from matplotlib.lines import Line2D"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b2a4d1a0-a675-445f-ac95-924cc946bd3e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "repos = [repo1, repo2, repo3]\n",
+    "collections = [collection1, collection2, collection3]\n",
+    "\n",
+    "dfs = []\n",
+    "\n",
+    "for repo, collection in zip(repos, collections):\n",
+    "    if repo and collection:\n",
+    "        butler = Butler(repo, collections=[collection])\n",
+    "        try:\n",
+    "            df = butler.get(\"ResourceUsageSummary\")\n",
+    "        except DatasetNotFoundError as e:\n",
+    "            print(e)\n",
+    "            print(f\"Skipping {repo}:{collection}. \"\n",
+    "                  \" Check that build-gather-resource-usage-qg was run on this collection\")\n",
+    "            continue\n",
+    "        df['collection'] = collection\n",
+    "        dfs.append(df)\n",
+    "        \n",
+    "both = pd.concat(dfs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5cdb0d65-d65b-4082-afe4-e80df34320c2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def custom_boxplot(data, y, hue, value_prefix, xlabel, sortby, logscale=False):\n",
+    "    versions = sorted(data[hue].drop_duplicates())\n",
+    "\n",
+    "    tasks = data.groupby(\"task\").sum()[sortby].sort_values(ascending=True).index\n",
+    "    \n",
+    "    palette = sns.color_palette(n_colors=len(versions))\n",
+    "\n",
+    "    fig, ax = plt.subplots(figsize=(8, 8))\n",
+    "\n",
+    "    for i, task in enumerate(tasks):\n",
+    "        for j, version in enumerate(versions):\n",
+    "            sub_group = data[(data[y] == task) & (data[hue] == version)]\n",
+    "            if not sub_group.empty:\n",
+    "                min_rt = sub_group[f'{value_prefix}p000'].values[0]\n",
+    "                q1 = sub_group[f'{value_prefix}p032'].values[0]\n",
+    "                median = sub_group[f'{value_prefix}p050'].values[0]\n",
+    "                q3 = sub_group[f'{value_prefix}p068'].values[0]\n",
+    "                max_rt = sub_group[f'{value_prefix}p100'].values[0]\n",
+    "\n",
+    "                # Calculate the position for the box with offset for each version\n",
+    "                # task index + version index (but start lower depending on how many) \n",
+    "                position = i + (j * 0.25) - (0.125 * (len(versions) - 1)) \n",
+    "\n",
+    "                ax.plot([min_rt, q1], [position, position], color=palette[j], linestyle='-', lw=1.5)\n",
+    "                ax.plot([q1, q3], [position, position], color=palette[j], lw=6, solid_capstyle='butt')\n",
+    "                ax.plot([q3, max_rt], [position, position], color=palette[j], linestyle='-', lw=1.5)\n",
+    "                ax.plot([median], [position], marker='|', color='k')\n",
+    "    ax.set_yticks(range(len(tasks)))\n",
+    "    ax.set_yticklabels(tasks)\n",
+    "\n",
+    "    handles = [Line2D([0], [0], color=palette[i], lw=4) for i in range(len(versions))]\n",
+    "    ax.legend(handles, versions, title=hue, loc='lower left', bbox_to_anchor=(0, 1.02), borderaxespad=0, ncol=1)\n",
+    "    \n",
+    "    ax.set_xlabel(xlabel)\n",
+    "    if logscale:\n",
+    "        ax.set_xscale('log')\n",
+    "    plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f388943a-48d2-49bd-8b98-484199d2e2ef",
+   "metadata": {},
+   "source": [
+    "## Runtime for top 20 longest Tasks, sorted by integrated runtime. \n",
+    "\n",
+    "x-axis has linear scaling in the first and logarithmic scaling in the second. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ce9a4c74-d93a-42e9-a668-d1fd9c061727",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tasks = both.groupby(\"task\").sum()[\"integrated_runtime_hrs\"].sort_values(ascending=False).head(20).index\n",
+    "pivoted_df = both[(both.task.isin(tasks))].sort_values('integrated_runtime_hrs', ascending=True) \n",
+    "custom_boxplot(pivoted_df, y='task', hue='collection', value_prefix=\"runtime_s_\", xlabel=\"runtime (s)\", sortby=\"integrated_runtime_hrs\", logscale=False)\n",
+    "custom_boxplot(pivoted_df, y='task', hue='collection', value_prefix=\"runtime_s_\", xlabel=\"runtime (s)\", sortby=\"integrated_runtime_hrs\", logscale=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ae943812-ffeb-446f-b849-55383616166e",
+   "metadata": {},
+   "source": [
+    "## Tasks with memory usage over 2GB, sorted by max memory usage.\n",
+    "\n",
+    "If the processing was run with clustering, the memory usage is reported per cluster not per task. Use caution when interpreting. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bfca017c-28e4-4c09-8489-b64d81276f6b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pivoted_df = both[(both.task.isin(both[(both.mem_GB_p050 > 2)][\"task\"].unique()))].sort_values('mem_GB_p100', ascending=True)  #  pd.DataFrame(data)\n",
+    "custom_boxplot(pivoted_df, y='task', hue='collection', value_prefix=\"mem_GB_\", xlabel=\"Memory (GB)\", sortby='mem_GB_p100', logscale=False)\n",
+    "custom_boxplot(pivoted_df, y='task', hue='collection', value_prefix=\"mem_GB_\", xlabel=\"Memory (GB)\", sortby='mem_GB_p100', logscale=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bf164b14-a69e-406a-8fc1-7a468fd8265a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get top 20 tasks with longest integrated runtime:\n",
+    "tasks = both.groupby(\"task\").sum()[\"integrated_runtime_hrs\"].sort_values(ascending=False).head(20).index\n",
+    "df = both[both.task.isin(tasks)].sort_values('integrated_runtime_hrs', ascending=False)\n",
+    "hue_order = sorted(df[\"collection\"].drop_duplicates())\n",
+    "\n",
+    "ax = sns.catplot(data=df, y=\"task\", x=\"integrated_runtime_hrs\", hue=\"collection\", height=8, hue_order=hue_order, kind=\"bar\")\n",
+    "ax = sns.catplot(data=df, y=\"task\", x=\"quanta\", hue=\"collection\", height=8, hue_order=hue_order, kind=\"bar\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "LSST",
+   "language": "python",
+   "name": "lsst"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/batch-processing/resourceUsagePerCollection.yaml b/batch-processing/resourceUsagePerCollection.yaml
@@ -0,0 +1,32 @@
+title: Resource Usage Plots for up to three Collections
+description: Generate plots comparing runtime and memory usage stats for up to three collections
+authors:
+  - name: Yusra AlSayyad
+    slack: yusra
+tags:
+  - batch-processing
+parameters:
+  repo1:
+    type: string
+    description: Butler repo of first collection to display
+    default: "embargo"
+  collection1:
+    type: string
+    description: Collection name containing a resource usage summary to display
+    default: "LSSTComCam/runs/DRP/20241101_20241211/w_2024_50/DM-48128"
+  repo2:
+    type: string
+    description: Optional Butler repo of second collection to compare
+    default: "embargo"
+  collection2:
+    type: string
+    description: Optional second collection of resource usage statistics to compare
+    default: "LSSTComCam/runs/DRP/20241101_20241204/w_2024_49/DM-47988"
+  repo3:
+    type: string
+    description: Optional Butler repo of third collection to compare
+    default: ""
+  collection3:
+    type: string
+    description: Optional third collection of resource usage statistics to compare
+    default: ""