Add query benchmark module and performance docs page #336

Open · wants to merge 2 commits into main · Changes from 1 commit
Add query benchmark module and performance docs page
- Simulates XYZ tile queries for collections with different item layouts
- Adds a new page to docs outlining tradeoffs between number of items
  and speed of queries
- Adds mkdocs-jupyter to the dependencies for the mkdocs site
hrodmn committed Jan 7, 2025
commit 627eb2b59532538294072b2a2a14ec93c6203692
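The new docs page hinges on how quickly item counts grow as footprints shrink. As a quick illustration (a sketch, not part of this PR; it simply mirrors the `generate_items` grid logic used in the notebook and tests below):

```python
# Item count vs. item tile width over the 50 x 50 degree AOI used in the benchmark.
import math

AOI_WIDTH = AOI_HEIGHT = 50  # degrees, matching the notebook's constants

for width in [0.5, 1, 2, 4, 6, 8, 10]:
    n_items = math.ceil(AOI_WIDTH / width) * math.ceil(AOI_HEIGHT / width)
    print(f"{width:>4} degrees -> {n_items:>6} items")
```

Halving the tile width quadruples the item count (10 degree tiles give 25 items; 0.5 degree tiles give 10,000), which is exactly the tradeoff the performance page explores.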
11 changes: 5 additions & 6 deletions .github/workflows/deploy_mkdocs.yml
@@ -16,18 +16,17 @@ jobs:
     name: Deploy docs
     runs-on: ubuntu-latest
     steps:
-      - name: Checkout master
-        uses: actions/checkout@v2
+      - uses: actions/checkout@v4

-      - name: Set up Python 3.8
-        uses: actions/setup-python@v2
+      - name: Set up Python 3.12
+        uses: actions/setup-python@v5
         with:
-          python-version: 3.8
+          python-version: 3.12

       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          python -m pip install mkdocs mkdocs-material
+          python -m pip install mkdocs mkdocs-material mkdocs-jupyter pandas seaborn folium

       - name: Deploy docs
         run: mkdocs gh-deploy --force -f docs/mkdocs.yml
7 changes: 7 additions & 0 deletions docs/mkdocs.yml
@@ -20,11 +20,18 @@ nav:
- Home: "index.md"
- PgSTAC: "pgstac.md"
- pyPgSTAC: "pypgstac.md"
- Performance:
- item_size_analysis.ipynb
- Development - Contributing: "contributing.md"
- Release Notes: "release-notes.md"

plugins:
- search
- mkdocs-jupyter:
include_source: True
include_requirejs: True
execute: True
show_input: False

theme:
name: material
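For context: with these settings, mkdocs-jupyter executes the notebook at docs build time (`execute: True`) and hides the code cells in the rendered page (`show_input: False`), so the published page shows only the narrative, the item-count table, the folium map, and the heatmap.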
4,301 changes: 4,301 additions & 0 deletions docs/src/benchmark.json

Large diffs are not rendered by default.

242 changes: 242 additions & 0 deletions docs/src/item_size_analysis.ipynb
@@ -0,0 +1,242 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "f7b04830-7b6a-4e37-a2a3-9961970e06df",
"metadata": {},
"source": [
"# impacts of STAC item footprint size on dynamic tiling query performance\n",
"\n",
"**TL;DR:** If you have any control over the geographic footprint of the assets that you are cataloging with `pgstac` and you want to serve visualizations with a dynamic tiling application, try to maximize the size of the assets!\n",
"\n",
"Dynamic tiling applications like [`titiler-pgstac`](https://github.com/stac-utils/titiler-pgstac) send many queries to a `pgstac` database and clients are very sensitive to performance so it is worth considering a few basic ideas when building collections and items that may be used in this way.\n",
"\n",
"`pgstac`'s query functions perform relatively expensive spatial intersection operations so the fewer items there are in a collection x datetime partition, the faster the query will be. This is not a `pgstac`-specific problem (any application that needs to perform spatial intersections will take longer as the number of calculations increases), but it is worth demonstrating the influence of these factors in the dynamic tiling context."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d34feaea-5288-4124-bca1-6bd4090fd27d",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import math\n",
"import uuid\n",
"from datetime import datetime, timezone\n",
"from typing import Any, Dict, Generator, Tuple\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"from folium import Map, GeoJson, LayerControl\n",
"from matplotlib.colors import LogNorm\n",
"\n",
"\n",
"XMIN, YMIN = 0, 0\n",
"AOI_WIDTH = 50\n",
"AOI_HEIGHT = 50\n",
"ITEM_WIDTHS = [0.5, 1, 2, 4, 6, 8, 10]\n",
"\n",
"def generate_items(\n",
" item_size: Tuple[float, float],\n",
" collection_id: str,\n",
") -> Generator[Dict[str, Any], None, None]:\n",
" item_width, item_height = item_size\n",
"\n",
" cols = math.ceil(AOI_WIDTH / item_width)\n",
" rows = math.ceil(AOI_HEIGHT / item_height)\n",
"\n",
" # Generate items for each grid cell\n",
" for row in range(rows):\n",
" for col in range(cols):\n",
" left = XMIN + (col * item_width)\n",
" bottom = YMIN + (row * item_height)\n",
" right = left + item_width\n",
" top = bottom + item_height\n",
"\n",
" yield {\n",
" \"type\": \"Feature\",\n",
" \"stac_version\": \"1.0.0\",\n",
" \"id\": str(uuid.uuid4()),\n",
" \"collection\": collection_id,\n",
" \"geometry\": {\n",
" \"type\": \"Polygon\",\n",
" \"coordinates\": [\n",
" [\n",
" [left, bottom],\n",
" [right, bottom],\n",
" [right, top],\n",
" [left, top],\n",
" [left, bottom],\n",
" ],\n",
" ],\n",
" },\n",
" \"bbox\": [left, bottom, right, top],\n",
" \"properties\": {\n",
" \"datetime\": datetime.now(timezone.utc).isoformat(),\n",
" },\n",
" }\n",
"\n",
"\n",
"\n",
"def load_benchmark_results() -> pd.DataFrame:\n",
" \"\"\"Load benchmark results from JSON file into a pandas DataFrame.\"\"\"\n",
" with open(\"./benchmark.json\") as f:\n",
" data = json.load(f)\n",
"\n",
" # Extract the benchmarks into a list of records\n",
" records = []\n",
" for benchmark in data[\"benchmarks\"]:\n",
" record = {\n",
" \"item_width\": benchmark[\"params\"][\"item_width\"],\n",
" \"zoom\": benchmark[\"params\"][\"zoom\"],\n",
" \"mean\": benchmark[\"stats\"][\"mean\"],\n",
" \"stddev\": benchmark[\"stats\"][\"stddev\"],\n",
" \"median\": benchmark[\"stats\"][\"median\"],\n",
" }\n",
"\n",
" records.append(record)\n",
"\n",
" return pd.DataFrame(records).sort_values(by=[\"item_width\", \"zoom\"])\n",
"\n",
"\n",
"stac_items = {\n",
" item_width: list(\n",
" generate_items(\n",
" (item_width, item_width),\n",
" f\"{item_width} degrees\"\n",
" )\n",
" )\n",
" for item_width in ITEM_WIDTHS\n",
"}\n",
"\n",
"df = load_benchmark_results()"
]
},
{
"cell_type": "markdown",
"id": "ceb365dc-67c1-4cbd-8b5e-7022a5773140",
"metadata": {},
"source": [
"## Scenario\n",
"Imagine you have a continental-scale dataset of gridded data that will be stored as cloud-optimized geotiffs (COGs) and you get to decide how the individual files will be spatially arranged and cataloged in a `pgstac` database. You could make items as small as 0.5 degree squares or as large as 10 degree squares. In this case the assets will be non-overlapping rectangular grids.\n",
"\n",
"The assets will be publicly accessible, so smaller file sizes might be useful for some applications/users, but since the data will be stored as COGs and we also want to be able to serve raster tile visualizations in a web map with `titiler-pgstac`, smaller file sizes are not very important. However, the processing pipleline that generates the assets might have some resource constraints that push you to choose a smaller tile size.\n",
"\n",
"Consider the following options for tile sizes:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7ae1604f-df80-485b-a02a-4237f9ab0081",
"metadata": {},
"outputs": [],
"source": [
"pd.DataFrame(\n",
" {\"tile width (degrees)\": item_width, \"# items\": len(items)}\n",
" for item_width, items in stac_items.items()\n",
")"
]
},
{
"cell_type": "markdown",
"id": "fce001cb-ceb8-458e-9bda-e207d20362e7",
"metadata": {},
"source": [
"The number of items is inversely proportional to the square of the tile width which means that small changes in tile size can have a large impact on the eventual number of items in your catalog!\n",
"\n",
"This map shows the spatial arrangement of the items for a range of tile sizes:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c19eafb-125e-46ad-8b6f-e0053084287f",
"metadata": {},
"outputs": [],
"source": [
"m = Map([25, 25], zoom_start=3)\n",
"for item_width in ITEM_WIDTHS:\n",
" layer_name = f\"{item_width} degrees\"\n",
" geojson = GeoJson(\n",
" {\n",
" \"type\": \"FeatureCollection\",\n",
" \"features\": stac_items[item_width],\n",
" },\n",
" name=layer_name,\n",
" overlay=True,\n",
" show=False,\n",
" )\n",
" geojson.add_to(m)\n",
" \n",
"LayerControl(collapsed=False, position=\"topright\").add_to(m)\n",
"\n",
"m"
]
},
{
"cell_type": "markdown",
"id": "1c9714e7-c865-4b73-8305-83851864e486",
"metadata": {},
"source": [
"## Performance comparison\n",
"To simulate the performance of queries made by a dynamic tiling application we have prepared a benchmarking procedure that uses the `pgstac` function `xyzsearch` to run an item query for an XYZ tile. By iterating over many combinations of tile sizes and zoom levels we can examine the response time with respect to item footprint size and tile zoom level. \n",
"\n",
"This figure shows average response time for `xyzsearch` to return a complete set of results for each zoom level for the range of item tile widths:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c61e8c44-df19-404d-8878-1efb5fddeb36",
"metadata": {},
"outputs": [],
"source": [
"ax = sns.heatmap(\n",
" df.pivot(index=\"item_width\", columns=\"zoom\", values=\"median\"),\n",
" norm=LogNorm(vmin=1e-2, vmax=1e1),\n",
" cbar_kws={\n",
" \"ticks\": np.logspace(-2, 0, num=3),\n",
" \"format\": \"%.1e\",\n",
" }\n",
")\n",
"ax.set(xlabel=\"zoom level\", ylabel=\"item tile width\")\n",
"ax.xaxis.tick_top()\n",
"ax.xaxis.set_label_position(\"top\")\n",
"display(ax)"
]
},
{
"cell_type": "markdown",
"id": "cdb98e2e-e9da-4516-85f7-5576035b5915",
"metadata": {},
"source": [
"Without details about the resource configuration for a specific `pgstac` deployment it is hard to say which zoom level becomes inoperable for a given tile size, but queries that take >0.5 seconds in this test would probably yield poor results in a deployed context."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
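The `load_benchmark_results` helper in the notebook reads `docs/src/benchmark.json`, which is pytest-benchmark output. A minimal sketch of the shape the helper depends on (values are illustrative; real output carries many more keys, e.g. machine info and the full stats):

```python
# Trimmed to the fields the notebook actually reads.
sample = {
    "benchmarks": [
        {
            "params": {"item_width": 0.5, "zoom": 3},
            "stats": {"mean": 0.042, "stddev": 0.003, "median": 0.041},
        },
    ],
}
```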
12 changes: 12 additions & 0 deletions src/pypgstac/pyproject.toml
@@ -35,9 +35,11 @@ dependencies = [
 [project.optional-dependencies]
 test = [
     "pytest",
+    "pytest-benchmark",
     "pytest-cov",
     "pystac[validation]==1.*",
     "types-cachetools",
+    "morecantile",
 ]
 dev = [
     "flake8==7.1.1",
@@ -55,6 +57,13 @@ migrations = [
     "psycopg2-binary",
     "migra"
 ]
+docs = [
+    "jupyter",
+    "pandas",
+    "seaborn",
+    "mkdocs-jupyter",
+    "folium"
+]


 [project.urls]
@@ -123,3 +132,6 @@ namespace_packages = true
 [tool.pydocstyle]
 select = "D1"
 match = "(?!test).*.py"
+
+[tool.pytest.ini_options]
+addopts = "-vv --benchmark-skip"
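Since `addopts` includes `--benchmark-skip`, the benchmarks stay out of normal test runs. A hedged sketch of one way to run them and regenerate the JSON the docs page consumes (standard pytest / pytest-benchmark flags; adjust paths to where you invoke it from):

```python
# Run only the benchmark tests and write pytest-benchmark JSON
# where the docs notebook expects to find it.
import pytest

pytest.main([
    "-o", "addopts=",  # clear the ini-level "-vv --benchmark-skip"
    "--benchmark-only",
    "--benchmark-json=docs/src/benchmark.json",
    "src/pypgstac/tests/test_benchmark.py",
])
```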
150 changes: 150 additions & 0 deletions src/pypgstac/tests/test_benchmark.py
@@ -0,0 +1,150 @@
import json
import uuid
from datetime import datetime, timezone
from math import ceil
from typing import Any, Dict, Generator, Tuple

import morecantile
import psycopg
import pytest

from pypgstac.load import Loader, Methods

XMIN, YMIN = 0, 0
AOI_WIDTH = 50
AOI_HEIGHT = 50


ITEM_WIDTHS = [0.5, 0.75, 1, 1.5, 2, 3, 4, 5, 6, 8, 10]
TMS = morecantile.tms.get("WebMercatorQuad")


def generate_items(
item_size: Tuple[float, float],
collection_id: str,
) -> Generator[Dict[str, Any], None, None]:
item_width, item_height = item_size

cols = ceil(AOI_WIDTH / item_width)
rows = ceil(AOI_HEIGHT / item_height)

# generate an item for each grid cell
for row in range(rows):
for col in range(cols):
left = XMIN + (col * item_width)
bottom = YMIN + (row * item_height)
right = left + item_width
top = bottom + item_height

yield {
"type": "Feature",
"stac_version": "1.0.0",
"id": str(uuid.uuid4()),
"collection": collection_id,
"geometry": {
"type": "Polygon",
"coordinates": [
[
[left, bottom],
[right, bottom],
[right, top],
[left, top],
[left, bottom],
],
],
},
"bbox": [left, bottom, right, top],
"properties": {
"datetime": datetime.now(timezone.utc).isoformat(),
},
}


@pytest.fixture(scope="function")
def search_hashes(loader: Loader) -> Dict[float, str]:
search_hashes = {}
for item_width in ITEM_WIDTHS:
collection_id = f"collection-{str(item_width)}"
collection = {
"type": "Collection",
"id": collection_id,
"stac_version": "1.0.0",
"description": f"Minimal test collection {collection_id}",
"license": "proprietary",
"extent": {
"spatial": {
"bbox": [XMIN, YMIN, XMIN + AOI_WIDTH, YMIN + AOI_HEIGHT],
},
"temporal": {
"interval": [[datetime.now(timezone.utc).isoformat(), None]],
},
},
}

loader.load_collections(
[collection],
insert_mode=Methods.insert,
)
loader.load_items(
generate_items((item_width, item_width), collection_id),
insert_mode=Methods.insert,
)

with psycopg.connect(autocommit=True) as conn:
with conn.cursor() as cursor:
cursor.execute(
"SELECT * FROM search_query(%s);",
(json.dumps({"collections": [collection_id]}),),
)
res = cursor.fetchone()
assert res
search_hashes[item_width] = res[0]

return search_hashes


@pytest.mark.benchmark(
group="xyzsearch",
min_rounds=3,
warmup=True,
warmup_iterations=2,
)
@pytest.mark.parametrize("item_width", ITEM_WIDTHS)
@pytest.mark.parametrize("zoom", range(3, 8 + 1))
def test_xyzsearch(
benchmark,
search_hashes: Dict[float, str],
item_width: float,
zoom: int,
) -> None:
# get a tile from the center of the full AOI
xmid = XMIN + AOI_WIDTH / 2
ymid = YMIN + AOI_HEIGHT / 2
tiles = TMS.tiles(xmid, ymid, xmid + 1, ymid + 1, [zoom])
tile = next(tiles)

def xyzsearch_test():
with psycopg.connect(autocommit=True) as conn:
with conn.cursor() as cursor:
cursor.execute(
"SELECT * FROM xyzsearch(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);",
(
tile.x,
tile.y,
tile.z,
search_hashes[item_width],
json.dumps(
{
"include": ["assets", "id", "bbox", "collection"],
},
), # fields
100000, # scan limit
100000, # items limit
"5 seconds", # time limit
True, # exitwhenfull
True, # skipcovered
),
)
_ = cursor.fetchone()[0]

_ = benchmark(xyzsearch_test)
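Once a run has produced `benchmark.json`, a quick sanity check before rebuilding the docs is to list the slowest parameter combinations (a sketch against the pytest-benchmark JSON layout shown earlier):

```python
# Print the five slowest (item_width, zoom) combinations by median runtime.
import json

with open("benchmark.json") as f:
    data = json.load(f)

rows = sorted(
    (
        (b["params"]["item_width"], b["params"]["zoom"], b["stats"]["median"])
        for b in data["benchmarks"]
    ),
    key=lambda r: r[2],
    reverse=True,
)
for item_width, zoom, median in rows[:5]:
    print(f"item_width={item_width:>4} zoom={zoom} median={median:.3f}s")
```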