122 performance benchmarking (#59)
* Basic performance benchmarking

* Remove accidental file add

* Initial benchmarks and Jenkinsfile

* Some prototyping of sbatch

* Improve sbatch

* sbatch file passes successfully

* Remove unnecessary slashes and add more files

* Better doc

* Add structure factor benchmark

* Initial benchmark testing

* Fix overwrite of data

* Corrected bug where args were the wrong way around

* Write more easily readable json

* Give 10% on time upper bound

* Account for flaky testing and better generation script output

* Add turbo disable, data generating sbatch and control SCARF turbo

* prep SCARF jenkinsfile

* Send email on failure

* Better message

* Better email message

* Ready for running on scarf

* Write reports to xml

* Added newline

* Make lines less than 79 chars and correct numpy returns doc strings

* Move to pytest-benchmark

* Remove unused code

* Try running from different label

* steps

* Correct cron timing

* remove histogram and qpts limit

* Create json to location

* Remove qpts limit

* Add speedup calculation

* Add speedups call to sbatch

* Speed up plots

* Working performance visualisation

* Tidying visualisation

* Further tidying

* Refactor and document visualisation

* Remove qpoint limit

* Remove unused sbatch file, correct doc

* Specify directory on command line

* Documenting, commenting and making nicer to read

* Take into account seednames in speedups calculations

* Take into account seedname in visualisation

* Move into different files and refactor to use different line styles

* Add visualisation of speedups over the amount of CPUs for a specific file and refactor how directories and files are specified

* Correct docstring indents

* Place legend to the right

* Add docstring

* Only change linestyle every 5 lines

* Refactor to use subplots

* Update scripts for new API

* Plot each material with a different linestyle

* Move threads into utils.py

Co-authored-by: James King <[email protected]>
Co-authored-by: Rebecca Fair <[email protected]>
3 people authored May 18, 2020
1 parent 6ca7dd6 commit 0d441a1
Showing 18 changed files with 865 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -7,6 +7,10 @@ euphonic.egg-info/
.tox
tests_and_analysis/test/reports/
tests_and_analysis/static_code_analysis/reports/
tests_and_analysis/performance_benchmarking/reports/
.coverage
*.pyd
*.log
*.err
*.so
venv
30 changes: 30 additions & 0 deletions tests_and_analysis/performance_benchmarking/Jenkinsfile
@@ -0,0 +1,30 @@
#!groovy

pipeline {

agent { label "SCARF" }

triggers {
cron("0 0 * * 0")
}

stages {

stage("Benchmark"){
steps {
checkout scm
sh """
cd tests_and_analysis/performance_benchmarking &&
sbatch run_benchmark_tests.sbatch
"""
}
}

}

post {
cleanup {
deleteDir()
}
}
}
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
5 changes: 5 additions & 0 deletions tests_and_analysis/performance_benchmarking/requirements.txt
@@ -0,0 +1,5 @@
pytest==5.4.1
pytest-benchmark[histogram]==3.2.3
numpy
pandas==1.0.3
matplotlib
16 changes: 16 additions & 0 deletions tests_and_analysis/performance_benchmarking/run_benchmark_tests.py
@@ -0,0 +1,16 @@
import os
import pytest

if __name__ == "__main__":

test_dir = os.path.dirname(os.path.abspath(__file__))
reports_dir = os.path.join(test_dir, "reports")
if not os.path.exists(reports_dir):
os.mkdir(reports_dir)

os.chdir(reports_dir)
test_exit_code = pytest.main([
test_dir,
"--benchmark-json=performance_benchmarks.json"
])
os.chdir("..")
131 changes: 131 additions & 0 deletions tests_and_analysis/performance_benchmarking/speedups.py
@@ -0,0 +1,131 @@
import argparse
import json
from typing import Dict
import os


def get_file_or_dir() -> str:
"""
Get the file or directory to calculate speedups for,
as specified on the command line.
Returns
-------
str
The path of the file or directory to calculate speedups for.
"""
parser = argparse.ArgumentParser()
dir_file_group = parser.add_mutually_exclusive_group()
dir_file_group.add_argument("-f", action="store", dest="filename",
help="The file to calculate speedups for")
dir_file_group.add_argument("-d", action="store", dest="dirname",
help="The directory containing files"
" to calculate speedups for",
default="reports")
args_parsed = parser.parse_args()
if args_parsed.filename:
return args_parsed.filename
else:
return args_parsed.dirname


def median_value(benchmark: Dict) -> float:
"""
Extract the median value from the benchmark dictionary.
Parameters
----------
benchmark : Dict
A dictionary of benchmark data containing a median value
Returns
-------
float
The median time taken value from the benchmark data
"""
return benchmark["stats"]["median"]


def calculate_speedups(filename: str) -> Dict[str, Dict[str, Dict[int, float]]]:
"""
Calculate speedups for the tests that are parameterised to
use a number of different threads.
Parameters
----------
filename : str
The file to calculate speedups for
Returns
-------
Dict[str, Dict[str, Dict[int, float]]]
The keys of the top level dictionary are the name of the test.
The keys of the next level of the dictionary are the seednames
used in the tests.
The keys of the next level dictionary are the number of threads used.
The values are the speedups for the given test and number of threads.
"""
data = json.load(open(filename))
data["benchmarks"].sort(key=median_value)
# Extract the time taken for all the tests at the various numbers of threads
# and format the data to easily calculate speedups
speed_at_threads = {}
for benchmark in data["benchmarks"]:
# Filter out the tests that haven't used different numbers of threads
if "use_c" in benchmark["params"] and \
benchmark["params"]["use_c"] is True:
# Initialise performance data structure
test = benchmark["name"].split("[")[0]
if test not in speed_at_threads:
speed_at_threads[test] = {}
seedname = benchmark["params"]["seedname"]
if seedname not in speed_at_threads[test]:
speed_at_threads[test][seedname] = {}
# At the given test and number of threads extract the
# median time taken
speed_at_threads[test][seedname][benchmark["params"]["n_threads"]] \
= benchmark["stats"]["median"]
# Calculate the speedups from the formatted data
speedups = {}
for test in speed_at_threads:
speedups[test] = {}
for seedname in speed_at_threads[test]:
speedups[test][seedname] = {}
sequential_speed = speed_at_threads[test][seedname][1]
for n_threads in speed_at_threads[test][seedname]:
speedups[test][seedname][n_threads] = \
sequential_speed / speed_at_threads[test][seedname][n_threads]
return speedups


def write_speedups(filename: str, speedups: Dict[str, Dict[str, Dict[int, float]]]):
"""
Write the calculated speedups to the given json file in
the "speedups" entry.
Parameters
----------
filename : str
The file to write the speedups to
speedups : Dict[str, Dict[str, Dict[int, float]]]
The calculated speedups to write to file.
"""
# Load in the data and update with the speedups
data = json.load(open(filename))
data["speedups"] = speedups
# Format the data nicely when overwriting to the file
json.dump(data, open(filename, "w+"), indent=4, sort_keys=True)


if __name__ == "__main__":
path: str = get_file_or_dir()
if os.path.isdir(path):
for filename in os.listdir(path):
filepath = os.path.join(path, filename)
speedups: Dict[str, Dict[str, Dict[int, float]]] = \
calculate_speedups(filepath)
write_speedups(filepath, speedups)
elif os.path.isfile(path):
speedups: Dict[str, Dict[str, Dict[int, float]]] = \
calculate_speedups(path)
write_speedups(path, speedups)
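The speedup at n threads is the 1-thread median time divided by the n-thread median time. A toy example of calculate_speedups applied to a hand-written report that mimics the pytest-benchmark entries it consumes (the file name toy_report.json is made up for illustration, and the import assumes this is run from the performance_benchmarking directory):

import json
from speedups import calculate_speedups

toy_report = {"benchmarks": [
    {"name": "test_calculate_qpoint_phonon_modes[1-True-quartz]",
     "params": {"use_c": True, "seedname": "quartz", "n_threads": 1},
     "stats": {"median": 12.0}},
    {"name": "test_calculate_qpoint_phonon_modes[4-True-quartz]",
     "params": {"use_c": True, "seedname": "quartz", "n_threads": 4},
     "stats": {"median": 4.0}},
]}
with open("toy_report.json", "w") as toy_file:
    json.dump(toy_report, toy_file)

print(calculate_speedups("toy_report.json"))
# -> test_calculate_qpoint_phonon_modes / quartz: speedup 1.0 at 1 thread,
#    3.0 at 4 threads (12.0 s / 4.0 s)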
53 changes: 53 additions & 0 deletions tests_and_analysis/performance_benchmarking/test_benchmark_fc.py
@@ -0,0 +1,53 @@
import os
import pytest
from utils import get_data_path, get_seednames,\
get_qpts, get_threads

from euphonic import ureg, ForceConstants


@pytest.mark.parametrize("seedname", get_seednames())
@pytest.mark.parametrize("use_c", [True, False])
@pytest.mark.parametrize("n_threads", get_threads())
def test_calculate_qpoint_phonon_modes(seedname, use_c, n_threads, benchmark):
# Set up
fc = ForceConstants.from_castep(
os.path.join(get_data_path(), f'{seedname}.castep_bin'))
qpts = get_qpts()
# Benchmark
if use_c:
benchmark(
fc.calculate_qpoint_phonon_modes,
qpts, use_c=True,
fall_back_on_python=False,
n_threads=n_threads,
asr='reciprocal', eta_scale=0.75
)
elif n_threads == 1:
benchmark(
fc.calculate_qpoint_phonon_modes,
qpts, use_c=False,
asr='reciprocal', eta_scale=0.75
)


@pytest.mark.parametrize("seedname", get_seednames())
def test_calculate_structure_factor(seedname, benchmark):
# Set up
qpts = get_qpts()
fc = ForceConstants.from_castep(
os.path.join(get_data_path(), f'{seedname}.castep_bin'))
phonons = fc.calculate_qpoint_phonon_modes(
qpts, use_c=True, fall_back_on_python=False, n_threads=5
)
fm = ureg('fm')
scattering_lengths = {
'La': 8.24*fm, 'Zr': 7.16*fm, 'O': 5.803*fm, 'C': 6.646*fm,
'Si': 4.1491*fm, 'H': -3.7390*fm, 'N': 9.36*fm, 'S': 2.847*fm,
'Nb': 7.054*fm
}
# Benchmark
benchmark(
phonons.calculate_structure_factor,
scattering_lengths=scattering_lengths
)
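The benchmark fixture used above comes from pytest-benchmark: benchmark(callable, *args, **kwargs) runs the callable repeatedly, records timing statistics (min, max, mean, median, ...) for the JSON report, and returns the callable's result. A minimal self-contained sketch of the same pattern, using a trivial placeholder function rather than the Euphonic calls:

import numpy as np
import pytest


@pytest.mark.parametrize("n", [10_000, 100_000])
def test_summed_squares(n, benchmark):
    def summed_squares():
        # Trivial stand-in for the real ForceConstants calculations
        return np.sum(np.arange(n, dtype=np.float64) ** 2)

    result = benchmark(summed_squares)
    assert result > 0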
44 changes: 44 additions & 0 deletions tests_and_analysis/performance_benchmarking/utils.py
@@ -0,0 +1,44 @@
import numpy as np
import os
from typing import List


def get_data_path() -> str:
"""
Returns
-------
str
The path to the data files for use in performance benchmarking
"""
return os.path.join(os.path.dirname(__file__), "data")


def get_seednames() -> List[str]:
"""
Returns
-------
List[str]
A list of the seednames to test with
"""
return ["Nb-242424-s0.25", "quartz", "La2Zr2O7"]


def get_threads() -> List[int]:
"""
Returns
-------
List[int]
A list of the number of threads to test with
"""
return [1, 2, 4, 8, 12, 16, 24]


def get_qpts() -> np.ndarray:
"""
Returns
-------
np.ndarray
A numpy array of 10,000 q-points
"""
qpts_npy_file = os.path.join(get_data_path(), "qpts_10000.npy")
return np.load(qpts_npy_file)
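qpts_10000.npy is one of the binary data files added by this commit; the script that generates it is not shown in this diff. A hypothetical sketch of how such a file could be produced, assuming 10,000 random fractional q-points are used for benchmarking:

import numpy as np

# Hypothetical generation of the benchmark q-point set; the commit's actual
# data-generating script is not part of this diff.
rng = np.random.default_rng(seed=0)
qpts = rng.random((10_000, 3)) - 0.5  # fractional coordinates in [-0.5, 0.5)
np.save("qpts_10000.npy", qpts)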
47 changes: 47 additions & 0 deletions tests_and_analysis/performance_benchmarking/visualise.py
@@ -0,0 +1,47 @@
import argparse
import matplotlib.pyplot as plt
from visualise.performance_over_time import plot_median_values
from visualise.speedups_over_time import plot_speedups_over_time
from visualise.speedups import plot_speedups_for_file


def get_parser() -> argparse.ArgumentParser:
"""
Build the argument parser for the visualisation command line options.
Returns
-------
argparse.ArgumentParser
The parser for the visualisation command line arguments
"""
parser = argparse.ArgumentParser()
parser.add_argument("-st", "--speedup-over-time", action="store",
dest="speedup_over_time_dir",
help="Plot and show how the speedups data has changed"
" over time for the files in the directory you"
" have specified as part of this argument")
parser.add_argument("-p", "--performance", action="store",
dest="performance_dir",
help="Plot and show how performance data has changed"
" over time for the files in the directory you"
" have specified as part of this argument")
parser.add_argument("-sf", "--speedup-file", action="store",
dest="speedup_file",
help="Plot and show how using more threads affects the"
" performance of functions across multiple"
" different materials for the specified file")
return parser


if __name__ == "__main__":
parser = get_parser()
args_parsed = parser.parse_args()
if args_parsed.speedup_over_time_dir:
figure_index = plot_speedups_over_time(
args_parsed.speedup_over_time_dir
)
if args_parsed.performance_dir:
figure_index = plot_median_values(args_parsed.performance_dir)
if args_parsed.speedup_file:
plot_speedups_for_file(args_parsed.speedup_file)
plt.show()
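The three plotting helpers imported at the top of visualise.py live in a visualise/ package whose diff is not expanded here. As a hypothetical sketch (not the repository's actual implementation) of what a plot_speedups_for_file-style helper could look like, based on the behaviour described in the commit messages (speedup against number of threads, one line per test and material, varying linestyles, legend placed to the right) and on the "speedups" entry written by speedups.py:

import itertools
import json

import matplotlib.pyplot as plt


def plot_speedups_for_file(filename: str):
    # "speedups" maps test name -> seedname -> n_threads -> speedup,
    # as written by speedups.py (JSON keys round-trip as strings)
    with open(filename) as speedups_file:
        speedups = json.load(speedups_file)["speedups"]
    fig, ax = plt.subplots()
    linestyles = itertools.cycle(["-", "--", "-.", ":"])
    for test, by_seedname in speedups.items():
        for seedname, by_threads in by_seedname.items():
            pairs = sorted((int(n), s) for n, s in by_threads.items())
            ax.plot([n for n, _ in pairs], [s for _, s in pairs],
                    linestyle=next(linestyles),
                    label=f"{test} ({seedname})")
    ax.set_xlabel("Number of threads")
    ax.set_ylabel("Speedup relative to 1 thread")
    ax.legend(loc="center left", bbox_to_anchor=(1.0, 0.5))
    fig.tight_layout()
    return fig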
Empty file.