Add query benchmark module and performance docs page #336

Open · wants to merge 2 commits into main · Changes from 1 commit
Add query benchmark module and performance docs page
- Simulates XYZ tile queries for collections with different item layouts
- Adds a new page to docs outlining tradeoffs between number of items
  and speed of queries
- Adds mkdocs-jupyter to the dependencies for the mkdocs site
hrodmn committed Jan 7, 2025
commit 627eb2b59532538294072b2a2a14ec93c6203692
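The new docs page hinges on how quickly item counts grow as footprints shrink. As a quick illustration (a sketch, not part of this PR; it simply mirrors the `generate_items` grid logic used in the notebook and tests below):

```python
# Item count vs. item tile width over the 50 x 50 degree AOI used in the benchmark.
import math

AOI_WIDTH = AOI_HEIGHT = 50  # degrees, matching the notebook's constants

for width in [0.5, 1, 2, 4, 6, 8, 10]:
    n_items = math.ceil(AOI_WIDTH / width) * math.ceil(AOI_HEIGHT / width)
    print(f"{width:>4} degrees -> {n_items:>6} items")
```

Halving the tile width quadruples the item count (10 degree tiles give 25 items; 0.5 degree tiles give 10,000), which is exactly the tradeoff the performance page explores.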
11 changes: 5 additions & 6 deletions .github/workflows/deploy_mkdocs.yml
@@ -16,18 +16,17 @@ jobs:
     name: Deploy docs
     runs-on: ubuntu-latest
     steps:
-      - name: Checkout master
-        uses: actions/checkout@v2
+      - uses: actions/checkout@v4

-      - name: Set up Python 3.8
-        uses: actions/setup-python@v2
+      - name: Set up Python 3.12
+        uses: actions/setup-python@v5
         with:
-          python-version: 3.8
+          python-version: 3.12

       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          python -m pip install mkdocs mkdocs-material
+          python -m pip install mkdocs mkdocs-material mkdocs-jupyter pandas seaborn folium

       - name: Deploy docs
         run: mkdocs gh-deploy --force -f docs/mkdocs.yml
7 changes: 7 additions & 0 deletions docs/mkdocs.yml
@@ -20,11 +20,18 @@ nav:
- Home: "index.md"
- PgSTAC: "pgstac.md"
- pyPgSTAC: "pypgstac.md"
- Performance:
- item_size_analysis.ipynb
- Development - Contributing: "contributing.md"
- Release Notes: "release-notes.md"

plugins:
- search
- mkdocs-jupyter:
include_source: True
include_requirejs: True
execute: True
show_input: False

theme:
name: material
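For context: with these settings, mkdocs-jupyter executes the notebook at docs build time (`execute: True`) and hides the code cells in the rendered page (`show_input: False`), so the published page shows only the narrative, the item-count table, the folium map, and the heatmap.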
4,301 changes: 4,301 additions & 0 deletions docs/src/benchmark.json

Large diffs are not rendered by default.

242 changes: 242 additions & 0 deletions docs/src/item_size_analysis.ipynb
@@ -0,0 +1,242 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "f7b04830-7b6a-4e37-a2a3-9961970e06df",
"metadata": {},
"source": [
"# impacts of STAC item footprint size on dynamic tiling query performance\n",
"\n",
"**TL;DR:** If you have any control over the geographic footprint of the assets that you are cataloging with `pgstac` and you want to serve visualizations with a dynamic tiling application, try to maximize the size of the assets!\n",
"\n",
"Dynamic tiling applications like [`titiler-pgstac`](https://github.com/stac-utils/titiler-pgstac) send many queries to a `pgstac` database and clients are very sensitive to performance so it is worth considering a few basic ideas when building collections and items that may be used in this way.\n",
"\n",
"`pgstac`'s query functions perform relatively expensive spatial intersection operations so the fewer items there are in a collection x datetime partition, the faster the query will be. This is not a `pgstac`-specific problem (any application that needs to perform spatial intersections will take longer as the number of calculations increases), but it is worth demonstrating the influence of these factors in the dynamic tiling context."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d34feaea-5288-4124-bca1-6bd4090fd27d",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import math\n",
"import uuid\n",
"from datetime import datetime, timezone\n",
"from typing import Any, Dict, Generator, Tuple\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"from folium import Map, GeoJson, LayerControl\n",
"from matplotlib.colors import LogNorm\n",
"\n",
"\n",
"XMIN, YMIN = 0, 0\n",
"AOI_WIDTH = 50\n",
"AOI_HEIGHT = 50\n",
"ITEM_WIDTHS = [0.5, 1, 2, 4, 6, 8, 10]\n",
"\n",
"def generate_items(\n",
" item_size: Tuple[float, float],\n",
" collection_id: str,\n",
") -> Generator[Dict[str, Any], None, None]:\n",
" item_width, item_height = item_size\n",
"\n",
" cols = math.ceil(AOI_WIDTH / item_width)\n",
" rows = math.ceil(AOI_HEIGHT / item_height)\n",
"\n",
" # Generate items for each grid cell\n",
" for row in range(rows):\n",
" for col in range(cols):\n",
" left = XMIN + (col * item_width)\n",
" bottom = YMIN + (row * item_height)\n",
" right = left + item_width\n",
" top = bottom + item_height\n",
"\n",
" yield {\n",
" \"type\": \"Feature\",\n",
" \"stac_version\": \"1.0.0\",\n",
" \"id\": str(uuid.uuid4()),\n",
" \"collection\": collection_id,\n",
" \"geometry\": {\n",
" \"type\": \"Polygon\",\n",
" \"coordinates\": [\n",
" [\n",
" [left, bottom],\n",
" [right, bottom],\n",
" [right, top],\n",
" [left, top],\n",
" [left, bottom],\n",
" ],\n",
" ],\n",
" },\n",
" \"bbox\": [left, bottom, right, top],\n",
" \"properties\": {\n",
" \"datetime\": datetime.now(timezone.utc).isoformat(),\n",
" },\n",
" }\n",
"\n",
"\n",
"\n",
"def load_benchmark_results() -> pd.DataFrame:\n",
" \"\"\"Load benchmark results from JSON file into a pandas DataFrame.\"\"\"\n",
" with open(\"./benchmark.json\") as f:\n",
" data = json.load(f)\n",
"\n",
" # Extract the benchmarks into a list of records\n",
" records = []\n",
" for benchmark in data[\"benchmarks\"]:\n",
" record = {\n",
" \"item_width\": benchmark[\"params\"][\"item_width\"],\n",
" \"zoom\": benchmark[\"params\"][\"zoom\"],\n",
" \"mean\": benchmark[\"stats\"][\"mean\"],\n",
" \"stddev\": benchmark[\"stats\"][\"stddev\"],\n",
" \"median\": benchmark[\"stats\"][\"median\"],\n",
" }\n",
"\n",
" records.append(record)\n",
"\n",
" return pd.DataFrame(records).sort_values(by=[\"item_width\", \"zoom\"])\n",
"\n",
"\n",
"stac_items = {\n",
" item_width: list(\n",
" generate_items(\n",
" (item_width, item_width),\n",
" f\"{item_width} degrees\"\n",
" )\n",
" )\n",
" for item_width in ITEM_WIDTHS\n",
"}\n",
"\n",
"df = load_benchmark_results()"
]
},
{
"cell_type": "markdown",
"id": "ceb365dc-67c1-4cbd-8b5e-7022a5773140",
"metadata": {},
"source": [
"## Scenario\n",
"Imagine you have a continental-scale dataset of gridded data that will be stored as cloud-optimized geotiffs (COGs) and you get to decide how the individual files will be spatially arranged and cataloged in a `pgstac` database. You could make items as small as 0.5 degree squares or as large as 10 degree squares. In this case the assets will be non-overlapping rectangular grids.\n",
"\n",
"The assets will be publicly accessible, so smaller file sizes might be useful for some applications/users, but since the data will be stored as COGs and we also want to be able to serve raster tile visualizations in a web map with `titiler-pgstac`, smaller file sizes are not very important. However, the processing pipleline that generates the assets might have some resource constraints that push you to choose a smaller tile size.\n",
"\n",
"Consider the following options for tile sizes:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7ae1604f-df80-485b-a02a-4237f9ab0081",
"metadata": {},
"outputs": [],
"source": [
"pd.DataFrame(\n",
" {\"tile width (degrees)\": item_width, \"# items\": len(items)}\n",
" for item_width, items in stac_items.items()\n",
")"
]
},
{
"cell_type": "markdown",
"id": "fce001cb-ceb8-458e-9bda-e207d20362e7",
"metadata": {},
"source": [
"The number of items is inversely proportional to the square of the tile width which means that small changes in tile size can have a large impact on the eventual number of items in your catalog!\n",
"\n",
"This map shows the spatial arrangement of the items for a range of tile sizes:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c19eafb-125e-46ad-8b6f-e0053084287f",
"metadata": {},
"outputs": [],
"source": [
"m = Map([25, 25], zoom_start=3)\n",
"for item_width in ITEM_WIDTHS:\n",
" layer_name = f\"{item_width} degrees\"\n",
" geojson = GeoJson(\n",
" {\n",
" \"type\": \"FeatureCollection\",\n",
" \"features\": stac_items[item_width],\n",
" },\n",
" name=layer_name,\n",
" overlay=True,\n",
" show=False,\n",
" )\n",
" geojson.add_to(m)\n",
" \n",
"LayerControl(collapsed=False, position=\"topright\").add_to(m)\n",
"\n",
"m"
]
},
{
"cell_type": "markdown",
"id": "1c9714e7-c865-4b73-8305-83851864e486",
"metadata": {},
"source": [
"## Performance comparison\n",
"To simulate the performance of queries made by a dynamic tiling application we have prepared a benchmarking procedure that uses the `pgstac` function `xyzsearch` to run an item query for an XYZ tile. By iterating over many combinations of tile sizes and zoom levels we can examine the response time with respect to item footprint size and tile zoom level. \n",
"\n",
"This figure shows average response time for `xyzsearch` to return a complete set of results for each zoom level for the range of item tile widths:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c61e8c44-df19-404d-8878-1efb5fddeb36",
"metadata": {},
"outputs": [],
"source": [
"ax = sns.heatmap(\n",
" df.pivot(index=\"item_width\", columns=\"zoom\", values=\"median\"),\n",
" norm=LogNorm(vmin=1e-2, vmax=1e1),\n",
" cbar_kws={\n",
" \"ticks\": np.logspace(-2, 0, num=3),\n",
" \"format\": \"%.1e\",\n",
" }\n",
")\n",
"ax.set(xlabel=\"zoom level\", ylabel=\"item tile width\")\n",
"ax.xaxis.tick_top()\n",
"ax.xaxis.set_label_position(\"top\")\n",
"display(ax)"
]
},
{
"cell_type": "markdown",
"id": "cdb98e2e-e9da-4516-85f7-5576035b5915",
"metadata": {},
"source": [
"Without details about the resource configuration for a specific `pgstac` deployment it is hard to say which zoom level becomes inoperable for a given tile size, but queries that take >0.5 seconds in this test would probably yield poor results in a deployed context."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
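The `load_benchmark_results` helper in the notebook reads `docs/src/benchmark.json`, which is pytest-benchmark output. A minimal sketch of the shape the helper depends on (values are illustrative; real output carries many more keys, e.g. machine info and the full stats):

```python
# Trimmed to the fields the notebook actually reads.
sample = {
    "benchmarks": [
        {
            "params": {"item_width": 0.5, "zoom": 3},
            "stats": {"mean": 0.042, "stddev": 0.003, "median": 0.041},
        },
    ],
}
```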
12 changes: 12 additions & 0 deletions src/pypgstac/pyproject.toml
@@ -35,9 +35,11 @@ dependencies = [
 [project.optional-dependencies]
 test = [
     "pytest",
+    "pytest-benchmark",
     "pytest-cov",
     "pystac[validation]==1.*",
     "types-cachetools",
+    "morecantile",
 ]
 dev = [
     "flake8==7.1.1",
@@ -55,6 +57,13 @@ migrations = [
     "psycopg2-binary",
     "migra"
 ]
+docs = [
+    "jupyter",
+    "pandas",
+    "seaborn",
+    "mkdocs-jupyter",
+    "folium"
+]


 [project.urls]
@@ -123,3 +132,6 @@ namespace_packages = true
 [tool.pydocstyle]
 select = "D1"
 match = "(?!test).*.py"
+
+[tool.pytest.ini_options]
+addopts = "-vv --benchmark-skip"
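Since `addopts` includes `--benchmark-skip`, the benchmarks stay out of normal test runs. A hedged sketch of one way to run them and regenerate the JSON the docs page consumes (standard pytest / pytest-benchmark flags; adjust paths to where you invoke it from):

```python
# Run only the benchmark tests and write pytest-benchmark JSON
# where the docs notebook expects to find it.
import pytest

pytest.main([
    "-o", "addopts=",  # clear the ini-level "-vv --benchmark-skip"
    "--benchmark-only",
    "--benchmark-json=docs/src/benchmark.json",
    "src/pypgstac/tests/test_benchmark.py",
])
```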
150 changes: 150 additions & 0 deletions src/pypgstac/tests/test_benchmark.py
@@ -0,0 +1,150 @@
import json
import uuid
from datetime import datetime, timezone
from math import ceil
from typing import Any, Dict, Generator, Tuple

import morecantile
import psycopg
import pytest

from pypgstac.load import Loader, Methods

XMIN, YMIN = 0, 0
AOI_WIDTH = 50
AOI_HEIGHT = 50


ITEM_WIDTHS = [0.5, 0.75, 1, 1.5, 2, 3, 4, 5, 6, 8, 10]
TMS = morecantile.tms.get("WebMercatorQuad")


def generate_items(
item_size: Tuple[float, float],
collection_id: str,
) -> Generator[Dict[str, Any], None, None]:
item_width, item_height = item_size

cols = ceil(AOI_WIDTH / item_width)
rows = ceil(AOI_HEIGHT / item_height)

# generate an item for each grid cell
for row in range(rows):
for col in range(cols):
left = XMIN + (col * item_width)
bottom = YMIN + (row * item_height)
right = left + item_width
top = bottom + item_height

yield {
"type": "Feature",
"stac_version": "1.0.0",
"id": str(uuid.uuid4()),
"collection": collection_id,
"geometry": {
"type": "Polygon",
"coordinates": [
[
[left, bottom],
[right, bottom],
[right, top],
[left, top],
[left, bottom],
],
],
},
"bbox": [left, bottom, right, top],
"properties": {
"datetime": datetime.now(timezone.utc).isoformat(),
},
}


@pytest.fixture(scope="function")
def search_hashes(loader: Loader) -> Dict[float, str]:
search_hashes = {}
for item_width in ITEM_WIDTHS:
collection_id = f"collection-{str(item_width)}"
collection = {
"type": "Collection",
"id": collection_id,
"stac_version": "1.0.0",
"description": f"Minimal test collection {collection_id}",
"license": "proprietary",
"extent": {
"spatial": {
"bbox": [XMIN, YMIN, XMIN + AOI_WIDTH, YMIN + AOI_HEIGHT],
},
"temporal": {
"interval": [[datetime.now(timezone.utc).isoformat(), None]],
},
},
}

loader.load_collections(
[collection],
insert_mode=Methods.insert,
)
loader.load_items(
generate_items((item_width, item_width), collection_id),
insert_mode=Methods.insert,
)

with psycopg.connect(autocommit=True) as conn:
with conn.cursor() as cursor:
cursor.execute(
"SELECT * FROM search_query(%s);",
(json.dumps({"collections": [collection_id]}),),
)
res = cursor.fetchone()
assert res
search_hashes[item_width] = res[0]

return search_hashes


@pytest.mark.benchmark(
group="xyzsearch",
min_rounds=3,
warmup=True,
warmup_iterations=2,
)
@pytest.mark.parametrize("item_width", ITEM_WIDTHS)
@pytest.mark.parametrize("zoom", range(3, 8 + 1))
def test_xyzsearch(
benchmark,
search_hashes: Dict[float, str],
item_width: float,
zoom: int,
) -> None:
# get a tile from the center of the full AOI
xmid = XMIN + AOI_WIDTH / 2
ymid = YMIN + AOI_HEIGHT / 2
tiles = TMS.tiles(xmid, ymid, xmid + 1, ymid + 1, [zoom])
tile = next(tiles)

def xyzsearch_test():
with psycopg.connect(autocommit=True) as conn:
with conn.cursor() as cursor:
cursor.execute(
"SELECT * FROM xyzsearch(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);",
(
tile.x,
tile.y,
tile.z,
search_hashes[item_width],
json.dumps(
{
"include": ["assets", "id", "bbox", "collection"],
},
), # fields
100000, # scan limit
100000, # items limit
"5 seconds", # time limit
True, # exitwhenfull
True, # skipcovered
),
)
_ = cursor.fetchone()[0]

_ = benchmark(xyzsearch_test)
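Once a run has produced `benchmark.json`, a quick sanity check before rebuilding the docs is to list the slowest parameter combinations (a sketch against the pytest-benchmark JSON layout shown earlier):

```python
# Print the five slowest (item_width, zoom) combinations by median runtime.
import json

with open("benchmark.json") as f:
    data = json.load(f)

rows = sorted(
    (
        (b["params"]["item_width"], b["params"]["zoom"], b["stats"]["median"])
        for b in data["benchmarks"]
    ),
    key=lambda r: r[2],
    reverse=True,
)
for item_width, zoom, median in rows[:5]:
    print(f"item_width={item_width:>4} zoom={zoom} median={median:.3f}s")
```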