Support benchmark history summary v2.1 #3312

Merged · 15 commits · Apr 17, 2024
39 changes: 35 additions & 4 deletions .github/workflows/perf_benchmark.yaml
@@ -12,7 +12,7 @@ on:
- accuracy
- default # speed, balance, accuracy models only
- all # default + other models
default: default
default: all
data-group:
type: choice
description: Data group to run benchmark
@@ -97,7 +97,7 @@ on:
permissions: read-all

jobs:
Perf-Benchmark:
Perf-Benchmark-Run:
strategy:
fail-fast: false
matrix:
@@ -114,6 +114,8 @@ jobs:
task: "semantic_segmentation"
- task-short: "vsp"
task: "visual_prompting"
- task-short: "act"
task: "action"
name: Perf-Benchmark-${{ matrix.task-short }}
runs-on: [self-hosted, linux, x64, dmount-v2]
timeout-minutes: 8640
@@ -139,14 +141,43 @@ jobs:
--num-repeat ${{ inputs.num-repeat }}
--num-epoch ${{ inputs.num-epoch }}
--eval-upto ${{ inputs.eval-upto }}
--summary-csv .tox/perf-benchmark-summary.csv
--summary-file .tox/perf-benchmark-summary.xlsx
--mlflow-tracking-uri ${{ vars.MLFLOW_TRACKING_SERVER_URI }}
--user-name ${{ github.triggering_actor }}
--otx-ref ${{ inputs.otx-ref }}
- name: Upload test results
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1
with:
name: perf-benchmark-${{ matrix.task-short }}
path: .tox/perf-*.csv
path: .tox/perf-benchmark-*.*
# Use always() to always run this step to publish test results when there are test failures
if: ${{ always() }}

Perf-Benchmark-Summary:
if: ${{ always() }}
needs: Perf-Benchmark-Run
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
- name: Install Python
uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
with:
python-version: "3.10"
- name: Install dependencies
run: python -m pip install --upgrade pip pandas matplotlib nbconvert ipython ipykernel openpyxl
- name: Download benchmark results
uses: actions/download-artifact@v4
with:
path: tests/perf/history/latest
- name: Summarize benchamrk results
run: |
python tests/perf/history/summary.py tests/perf/history ./perf-benchmark-summary --pattern "*raw*.csv" --normalize
jupyter nbconvert --execute --to html --no-input tests/perf/history/summary.ipynb --output-dir ./perf-benchmark-summary --output perf-benchmark-summary
- name: Upload benchmark summary
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1
with:
name: perf-benchmark-summary
path: perf-benchmark-summary
# Use always() to always run this step to publish test results when there are test failures
if: ${{ always() }}
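
The new Perf-Benchmark-Summary job downloads every per-task artifact and aggregates it with `tests/perf/history/summary.py` before rendering `summary.ipynb` to HTML. That script's internals are not part of this diff, so the following is only a hypothetical sketch of its command-line interface, inferred from the two flags visible above (`--pattern`, `--normalize`); every other name and behavior here is an assumption, not the actual implementation.

```python
# Hypothetical sketch of the summary.py CLI used in the workflow above.
# Only --pattern and --normalize appear in the diff; everything else is assumed.
import argparse
from pathlib import Path

import pandas as pd


def main() -> None:
    parser = argparse.ArgumentParser(description="Aggregate raw perf-benchmark CSVs.")
    parser.add_argument("input_root", type=Path)   # e.g. tests/perf/history
    parser.add_argument("output_dir", type=Path)   # e.g. ./perf-benchmark-summary
    parser.add_argument("--pattern", default="*raw*.csv")
    parser.add_argument("--normalize", action="store_true")
    args = parser.parse_args()

    frames = [pd.read_csv(p) for p in args.input_root.rglob(args.pattern)]
    if not frames:
        raise SystemExit(f"No files matching {args.pattern} under {args.input_root}")
    data = pd.concat(frames, ignore_index=True)

    if args.normalize:
        # Assumed behavior: harmonize column naming across history snapshots
        # so that results from different OTX versions can be compared in one table.
        data.columns = [c.strip().lower().replace(" ", "_") for c in data.columns]

    args.output_dir.mkdir(parents=True, exist_ok=True)
    data.to_csv(args.output_dir / "perf-benchmark-all.csv", index=False)


if __name__ == "__main__":
    main()
```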
5 changes: 3 additions & 2 deletions pyproject.toml
@@ -50,8 +50,9 @@ dev = [
"pytest-mock",
"pytest-csv",
"pytest-cov",
"mlflow==2.11.1", # For regression test
"py-cpuinfo==9.0.0", # For regression test
"mlflow==2.11.1", # For perf benchmark
"py-cpuinfo==9.0.0", # For perf benchmark
"openpyxl", # For perf benchmark
]
docs = [
"furo",
4 changes: 2 additions & 2 deletions tests/conftest.py
@@ -72,9 +72,9 @@ def pytest_addoption(parser: pytest.Parser):
help="Output root directory. Defaults to temp directory.",
)
parser.addoption(
"--summary-csv",
"--summary-file",
action="store",
help="Path to output summary cvs file. Defaults to {output-root}/benchmark-summary.csv",
help="Path to output summary file. Defaults to {output-root}/benchmark-summary.csv",
)
parser.addoption(
"--dry-run",
2 changes: 1 addition & 1 deletion tests/perf/__init__.py
@@ -1,4 +1,4 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

"""OTX perfomance benchamrk tests."""
"""OTX perfomance benchmark tests."""
143 changes: 0 additions & 143 deletions tests/perf/benchmark-reference.csv

This file was deleted.

87 changes: 44 additions & 43 deletions tests/perf/benchmark.py
@@ -7,7 +7,6 @@

import gc
import logging
import os
import subprocess
from dataclasses import dataclass
from pathlib import Path
@@ -17,6 +16,8 @@
import numpy as np
import pandas as pd

from .history import summary

log = logging.getLogger(__name__)


@@ -71,14 +72,25 @@ class Criterion:
def __call__(self, result_entry: pd.Series, target_entry: pd.Series) -> None:
"""Check result against given target."""
if self.name not in result_entry or result_entry[self.name] is None or np.isnan(result_entry[self.name]):
print(f"[Check] {self.name} not in result")
return
if self.name not in target_entry or target_entry[self.name] is None or np.isnan(target_entry[self.name]):
print(f"[Check] {self.name} not in target")
return
if self.compare == "==":
print(
f"[Check] abs({self.name}:{result_entry[self.name]} - {self.name}:{target_entry[self.name]}) < {self.name}:{target_entry[self.name]} * {self.margin}",
)
assert abs(result_entry[self.name] - target_entry[self.name]) < target_entry[self.name] * self.margin
elif self.compare == "<":
print(
f"[Check] {self.name}:{result_entry[self.name]} < {self.name}:{target_entry[self.name]} * (1.0 + {self.margin})",
)
assert result_entry[self.name] < target_entry[self.name] * (1.0 + self.margin)
elif self.compare == ">":
print(
f"[Check] {self.name}:{result_entry[self.name]} > {self.name}:{target_entry[self.name]} * (1.0 - {self.margin})",
)
assert result_entry[self.name] > target_entry[self.name] * (1.0 - self.margin)

def __init__(
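
Each comparison above keeps the existing margin arithmetic and only adds a `[Check]` print before the assertion: for `">"` the result may not fall more than `margin` below the reference, for `"<"` it may not exceed the reference by more than `margin`, and for `"=="` it must stay within `reference * margin` of the reference. A minimal worked example of that arithmetic, using plain pandas Series and illustrative metric names rather than the real Criterion class (whose constructor is truncated in this diff):

```python
# Worked example of the margin checks above. Metric names are illustrative only.
import pandas as pd

result = pd.Series({"test/accuracy": 0.80, "train/e2e_time": 105.0})
target = pd.Series({"test/accuracy": 0.82, "train/e2e_time": 100.0})
margin = 0.1

# compare == ">" : result may not drop more than 10% below the reference
assert result["test/accuracy"] > target["test/accuracy"] * (1.0 - margin)    # 0.80 > 0.738

# compare == "<" : result may not exceed the reference by more than 10%
assert result["train/e2e_time"] < target["train/e2e_time"] * (1.0 + margin)  # 105.0 < 110.0
```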
@@ -156,7 +168,7 @@ def run(
"--engine.device",
self.accelerator,
]
for key, value in dataset.extra_overrides.items():
for key, value in dataset.extra_overrides.get("train", {}).items():
command.append(f"--{key}")
command.append(str(value))
command.extend(["--seed", str(seed)])
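
With this change, `extra_overrides` is no longer a single flat dict applied to training only; it is read per subcommand (`"train"`, `"test"`, `"export"`, `"optimize"`), and each inner dict is expanded into `--key value` CLI arguments. A sketch of the implied structure follows; the specific keys and values are illustrative assumptions, not taken from an actual dataset definition.

```python
# Implied shape of dataset.extra_overrides after this change: one dict of CLI
# overrides per subcommand. Keys and values below are illustrative only.
extra_overrides = {
    "train": {"data.train_subset.batch_size": 16, "max_epochs": 10},
    "test": {"data.test_subset.batch_size": 32},
    "export": {},
    "optimize": {},
}

# Expansion into CLI arguments, mirroring the loop in run():
command = []
for key, value in extra_overrides.get("train", {}).items():
    command.append(f"--{key}")
    command.append(str(value))
# -> ["--data.train_subset.batch_size", "16", "--max_epochs", "10"]
```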
@@ -183,6 +195,9 @@ def run(
"--work_dir",
str(sub_work_dir),
]
for key, value in dataset.extra_overrides.get("test", {}).items():
command.append(f"--{key}")
command.append(str(value))
self._run_command(command)
self._rename_raw_data(
work_dir=sub_work_dir / ".latest" / "test",
@@ -198,6 +213,9 @@ def run(
"--work_dir",
str(sub_work_dir),
]
for key, value in dataset.extra_overrides.get("export", {}).items():
command.append(f"--{key}")
command.append(str(value))
self._run_command(command)

exported_model_path = sub_work_dir / ".latest" / "export" / "exported_model.xml"
@@ -214,6 +232,9 @@ def run(
"--work_dir",
str(sub_work_dir),
]
for key, value in dataset.extra_overrides.get("test", {}).items():
command.append(f"--{key}")
command.append(str(value))
self._run_command(command)

self._rename_raw_data(
@@ -235,6 +256,9 @@ def run(
"--work_dir",
str(sub_work_dir),
]
for key, value in dataset.extra_overrides.get("optimize", {}).items():
command.append(f"--{key}")
command.append(str(value))
self._run_command(command)

optimized_model_path = sub_work_dir / ".latest" / "optimize" / "optimized_model.xml"
@@ -252,6 +276,9 @@ def run(
"--work_dir",
str(sub_work_dir),
]
for key, value in dataset.extra_overrides.get("test", {}).items():
command.append(f"--{key}")
command.append(str(value))
self._run_command(command)

self._rename_raw_data(
@@ -264,12 +291,14 @@ def run(
gc.collect()

result = self.load_result(work_dir)
return self.average_result(result, keys=["task", "model", "data_group", "data"])
if result is None:
return None
result = summary.average(result, keys=["task", "model", "data_group", "data"]) # Average out seeds
return result.set_index(["task", "model", "data_group", "data"])

def _run_command(self, command: list[str]) -> None:
if self.dry_run:
print(" ".join(command))
else:
print(" ".join(command))
if not self.dry_run:
subprocess.run(command, check=True) # noqa: S603
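
Here the in-class `average_result` helper (removed further down) is replaced by `summary.average` from `tests/perf/history`, and `run()` now averages over seeds before re-indexing by task/model/data_group/data. The new function itself is not shown in this excerpt; judging from the deleted helper below, it presumably does something along these lines (a simplified sketch, not the actual implementation):

```python
# Sketch of what summary.average is expected to do, modeled on the removed
# Benchmark.average_result helper; the real function may handle more cases.
import pandas as pd


def average(data: pd.DataFrame, keys: list[str]) -> pd.DataFrame:
    """Average numeric metrics over rows sharing the same key columns (e.g. across seeds)."""
    grouped = data.groupby(keys)
    aggregated = grouped.mean(numeric_only=True)
    return aggregated.reset_index()
```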

def _log_metrics(
Expand Down Expand Up @@ -356,40 +385,7 @@ def load_result(result_path: Path) -> pd.DataFrame | None:
if len(results) == 0:
return None

return pd.concat(results, ignore_index=True).set_index(["task", "model", "data_group", "data"])

@staticmethod
def average_result(data: pd.DataFrame, keys: list[str]) -> pd.DataFrame | None:
"""Average result w.r.t. given keys

Args:
result (pd.DataFrame): Result data frame
keys (list[str]): Keys to summarize whole data

Retruns:
pd.DataFrame: Averaged result table
"""
if data is None:
return None

# Flatten index
index_names = data.index.names
column_names = data.columns
data = data.reset_index()
# Average by keys
grouped = data.groupby(keys)
aggregated = grouped.mean(numeric_only=True)
# Merge index columns
idx_columns = set(index_names) - set(keys)
for col in idx_columns:
aggregated[col] = "all"
# Merge tag columns (non-numeric & non-index)
tag_columns = set(column_names) - set(aggregated.columns) - set(keys)
for col in tag_columns:
# Take common string prefix such as: ["data/1", "data/2", "data/3"] -> "data/"
aggregated[col] = grouped[col].agg(lambda x: os.path.commonprefix(x.tolist()))
# Recover index
return aggregated.reset_index().set_index(index_names)
return pd.concat(results, ignore_index=True)

def check(self, result: pd.DataFrame, criteria: list[Criterion]):
"""Check result w.r.t. reference data.
@@ -399,19 +395,24 @@ def check(self, result: pd.DataFrame, criteria: list[Criterion]):
criteria (list[Criterion]): Criteria to check results
"""
if result is None:
print("[Check] No results loaded. Skipping result checking.")
return

if self.reference_results is None:
print("No benchmark references loaded. Skipping result checking.")
print("[Check] No benchmark references loaded. Skipping result checking.")
return

for key, result_entry in result.iterrows():
if key not in self.reference_results.index:
print(f"No benchmark reference for {key} loaded. Skipping result checking.")
print(f"[Check] No benchmark reference for {key} loaded. Skipping result checking.")
continue
target_entry = self.reference_results.loc[key]
if isinstance(target_entry, pd.DataFrame):
target_entry = target_entry.iloc[0] # 1-row pd.DataFrame to pd.Series
# Match num_repeat of result and target
result_seed_average = result_entry["seed"]
result_num_repeat = 2 * result_seed_average + 1 # (0+1+2+3+4)/5 = 2.0 -> 2*2.0+1 = 5
target_entry = target_entry.query(f"seed < {result_num_repeat}")
target_entry = target_entry.mean(numeric_only=True) # N-row pd.DataFrame to pd.Series

for criterion in criteria:
criterion(result_entry, target_entry)
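
The new seed-matching step relies on seeds running 0..N-1: the mean seed of a result averaged over N repeats is (N-1)/2, so N is recovered as 2 * mean + 1, and the reference is filtered to the same number of seeds before being averaged into a single Series. A small worked example with illustrative numbers:

```python
# Worked example of the num_repeat recovery used in check(). Seeds are assumed
# to be 0..N-1, so their mean is (N-1)/2 and N = 2 * mean + 1.
import pandas as pd

reference = pd.DataFrame(
    {"seed": [0, 1, 2, 3, 4], "test/accuracy": [0.80, 0.81, 0.79, 0.82, 0.78]}
)

result_seed_average = 1.0                          # result was averaged over seeds 0, 1, 2
result_num_repeat = 2 * result_seed_average + 1    # -> 3.0

matched = reference.query(f"seed < {result_num_repeat}")   # keeps seeds 0, 1, 2
target_entry = matched.mean(numeric_only=True)             # Series with mean accuracy 0.80
```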