From 335dcede43ae01efd231f9e9918b13aca4d45a2a Mon Sep 17 00:00:00 2001
From: Behnam Robatmili <brobatmili@chanzuckerberg.com>
Date: Fri, 19 Jan 2024 11:02:03 -0800
Subject: [PATCH] Early version of profiler harness

Include a basic benchmark as the starting point and needed scripts
---
 .github/workflows/profiler.yml            | 32 ++++++++++++
 tools/perf_checker/perf_checker.py        | 59 +++++++++++++++++++++++
 tools/perf_checker/perf_checker.sh        | 45 +++++++++++++++++
 tools/perf_checker/test_anndata_export.py | 19 ++++++++
 4 files changed, 155 insertions(+)
 create mode 100644 .github/workflows/profiler.yml
 create mode 100644 tools/perf_checker/perf_checker.py
 create mode 100755 tools/perf_checker/perf_checker.sh
 create mode 100644 tools/perf_checker/test_anndata_export.py

diff --git a/.github/workflows/profiler.yml b/.github/workflows/profiler.yml
new file mode 100644
index 000000000..082a29a99
--- /dev/null
+++ b/.github/workflows/profiler.yml
@@ -0,0 +1,32 @@
+name: Profiler # runs the perf_checker benchmark harness
+
+on:
+  pull_request: # no branch or path filters are given
+
+jobs:
+  run_profiler:
+    name: Run Profiler
+    strategy:
+      matrix:
+        os: [single-cell-8c64g-runner]
+        python-version: ["3.11"] # NOTE(review): no step below reads matrix.python-version — confirm it is needed
+    runs-on: ${{matrix.os}}
+    permissions: # these permissions must be set for AWS auth to work!
+      id-token: write
+      contents: read
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 1 # shallow checkout
+
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v1 # NOTE(review): v1 is an old major version — check for a newer supported release
+        with:
+          aws-region: us-west-2
+          role-to-assume: ${{ secrets.AWS_ROLE_TO_ASSUME }}
+          role-session-name: PushDockerImage # NOTE(review): name suggests an image-push workflow — confirm it is intended here
+
+      - name: Run all tests # the script installs its dependencies and then runs and checks every benchmark
+        run: |
+          ./tools/perf_checker/perf_checker.sh
\ No newline at end of file
diff --git a/tools/perf_checker/perf_checker.py b/tools/perf_checker/perf_checker.py
new file mode 100644
index 000000000..96e753442
--- /dev/null
+++ b/tools/perf_checker/perf_checker.py
@@ -0,0 +1,59 @@
+import argparse
+import json
+import logging
+import re
+from typing import Any
+
+import attr
+import profiler
+
+
+def format_string(text: str) -> Any:
+    return " ".join(re.split("\n", text))  # put multi-line text on one line: each newline becomes a space
+
+
+# The script takes a command and a database path and looks for
+# performance anomalies in the performance history of that
+# command across the profiled runs.
+
+logger = logging.getLogger()
+logger.setLevel(logging.DEBUG)  # root logger: lets the logging.info() calls below through
+
+parser = argparse.ArgumentParser()
+parser.add_argument("command", type=str)
+parser.add_argument("db_path", type=str)
+
+args = parser.parse_args()
+
+# The threshold (ratio) of allowable performance degradation between profiling runs
+threshold = 1.10
+
+# Load the profiling runs previously recorded for this command
+db = profiler.data.FileBasedProfileDB(args.db_path)
+command_profiles = db.find(f"{args.command}")
+
+# Compare the last returned profile with the first; with fewer than two there is nothing to compare
+if len(command_profiles) >= 2:
+    first_profile = command_profiles[0]
+    curr_profile = command_profiles[-1]
+    first_time = first_profile.elapsed_time_sec
+    curr_time = curr_profile.elapsed_time_sec
+
+    formatted_first_profile = json.dumps(format_string(str(attr.asdict(first_profile))))
+    formatted_curr_profile = json.dumps(format_string(str(attr.asdict(curr_profile))))
+
+    logging.info("****************************")
+    logging.info(f"Current time {curr_time} vs first time {first_time}")
+    logging.info("****************************")
+    logging.info(f"First profile: {formatted_first_profile}")
+    logging.info("****************************")
+    logging.info(f"Current profile: {formatted_curr_profile}")
+    logging.info("****************************")
+    logging.info(
+        f"TileDB version ver = first: {first_profile.tiledbsoma_version} curr: {curr_profile.tiledbsoma_version}"
+    )
+    if float(curr_time) > threshold * float(first_time):
+        raise SystemExit(f"Major performance degradation detected on {args.command}")
+
+    if threshold * float(curr_time) < float(first_time):
+        logging.info(f"Major performance increase detected on {args.command}")
diff --git a/tools/perf_checker/perf_checker.sh b/tools/perf_checker/perf_checker.sh
new file mode 100755
index 000000000..f6d939759
--- /dev/null
+++ b/tools/perf_checker/perf_checker.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# Profiler harness: installs the tooling, mounts the S3 results bucket, then runs and checks every benchmark
+set -euox pipefail
+
+# Refresh the package index, then download the right version of python
+sudo apt-get update -y
+sudo apt install python3.11 python3.11-venv -y
+# Download gnu time tool
+sudo apt-get install -y time
+
+# Installing mount-s3
+sudo wget https://s3.amazonaws.com/mountpoint-s3-release/latest/x86_64/mount-s3.deb
+sudo apt install -y ./mount-s3.deb
+
+# Setting up mount-s3. We use an S3 file system because the data of the
+# profiling runs performed below must be persisted
+mkdir ./census-profiler-tests
+mkdir ./s3_cache
+mount-s3 census-profiler-tests ./census-profiler-tests --cache ./s3_cache  --metadata-ttl 300
+dbpath="$(pwd)/census-profiler-tests"
+
+pip install psutil
+pip install gitpython
+pip install somacore
+pip install tiledbsoma
+pip install cellxgene_census
+
+# Download the repo including the profiler
+cd ../
+git clone https://github.com/single-cell-data/TileDB-SOMA.git
+cd TileDB-SOMA/profiler
+# Use the profiler branch of TileDB-SOMA (remove the next line once the branch is merged)
+git checkout census_profiler
+pip install .
+cd ../../cellxgene-census/
+
+# New benchmarks must be added to this list
+declare -a benchmarks=("./tools/perf_checker/test_anndata_export.py")
+
+# Running all benchmarks and checking performance changes
+for benchmark in "${benchmarks[@]}"
+do
+  python -m profiler "python ${benchmark}" "$dbpath"
+  python ./tools/perf_checker/perf_checker.py "python ${benchmark}" "$dbpath"
+done
\ No newline at end of file
diff --git a/tools/perf_checker/test_anndata_export.py b/tools/perf_checker/test_anndata_export.py
new file mode 100644
index 000000000..4f80f5627
--- /dev/null
+++ b/tools/perf_checker/test_anndata_export.py
@@ -0,0 +1,19 @@
+from sys import stderr
+
+import cellxgene_census
+import tiledbsoma as soma
+
+print("Starting bm 1", file=stderr)  # start marker, written to stderr
+census_S3_latest = {"census_version": "2024-01-01"}  # keyword arguments passed to open_soma() in main()
+
+
+def main() -> None:
+    """Export X "raw" of the human RNA cells with tissue_general == 'eye' to AnnData; the result is discarded."""
+    with cellxgene_census.open_soma(**census_S3_latest) as census:
+        human = census["census_data"]["homo_sapiens"]
+        eye_cells = soma.AxisQuery(value_filter="""tissue_general == 'eye'""")
+        with human.axis_query(measurement_name="RNA", obs_query=eye_cells) as query:
+            query.to_anndata(X_name="raw")
+
+
+main()