From ac8e293b6a9c242f00a840e79f099f4200c5bd4c Mon Sep 17 00:00:00 2001 From: Joana Niermann <53186085+niermann999@users.noreply.github.com> Date: Wed, 8 Jan 2025 08:07:20 +0000 Subject: [PATCH] ref: Generalize propagation benchmark functionality (#404) Generalize the benchmark functionality to different detectors and actor setups. Also splits some common benchmark functionality into a detray benchmark library, e.g. the generation of track samples. The CPU benchmarks have also been switched to use dynamic scheduling for load balancing. Also adds the charge conjugation operation to the pdg particle, so that the charge hypothesis can be updated when test tracks are generated with randomized charge (this was triggering an assertion in the benchmarks otherwise). --- .../detray/builders/material_map_builder.hpp | 2 +- .../detray/definitions/pdg_particle.hpp | 23 +- .../detray/navigation/intersection_kernel.hpp | 2 +- .../include/detray/propagator/actor_chain.hpp | 20 +- tests/benchmarks/CMakeLists.txt | 35 +++ tests/benchmarks/cpu/CMakeLists.txt | 19 +- tests/benchmarks/cpu/benchmark_propagator.cpp | 182 -------------- tests/benchmarks/cpu/propagation.cpp | 158 +++++++++++++ tests/benchmarks/cuda/CMakeLists.txt | 14 +- .../cuda/benchmark_propagator_cuda.cpp | 127 ---------- .../cuda/benchmark_propagator_cuda_kernel.cu | 71 ------ .../cuda/benchmark_propagator_cuda_kernel.hpp | 64 ----- tests/benchmarks/cuda/propagation.cpp | 164 +++++++++++++ .../detray/benchmarks/benchmark_base.hpp | 76 ++++++ .../detray/benchmarks/cpu/CMakeLists.txt | 15 ++ .../benchmarks/cpu/propagation_benchmark.hpp | 131 +++++++++++ .../detray/benchmarks/device/CMakeLists.txt | 9 + .../benchmarks/device/cuda/CMakeLists.txt | 29 +++ .../device/cuda/propagation_benchmark.cu | 148 ++++++++++++ .../device/cuda/propagation_benchmark.hpp | 184 +++++++++++++++ .../propagation_benchmark_config.hpp | 57 +++++ .../propagation_benchmark_utils.hpp | 222 ++++++++++++++++++ 
.../validation/material_validation_utils.hpp | 2 +- .../cpu/propagator/propagator.cpp | 6 +- 24 files changed, 1289 insertions(+), 471 deletions(-) delete mode 100644 tests/benchmarks/cpu/benchmark_propagator.cpp create mode 100644 tests/benchmarks/cpu/propagation.cpp delete mode 100644 tests/benchmarks/cuda/benchmark_propagator_cuda.cpp delete mode 100644 tests/benchmarks/cuda/benchmark_propagator_cuda_kernel.cu delete mode 100644 tests/benchmarks/cuda/benchmark_propagator_cuda_kernel.hpp create mode 100644 tests/benchmarks/cuda/propagation.cpp create mode 100644 tests/benchmarks/include/detray/benchmarks/benchmark_base.hpp create mode 100644 tests/benchmarks/include/detray/benchmarks/cpu/CMakeLists.txt create mode 100644 tests/benchmarks/include/detray/benchmarks/cpu/propagation_benchmark.hpp create mode 100644 tests/benchmarks/include/detray/benchmarks/device/CMakeLists.txt create mode 100644 tests/benchmarks/include/detray/benchmarks/device/cuda/CMakeLists.txt create mode 100644 tests/benchmarks/include/detray/benchmarks/device/cuda/propagation_benchmark.cu create mode 100644 tests/benchmarks/include/detray/benchmarks/device/cuda/propagation_benchmark.hpp create mode 100644 tests/benchmarks/include/detray/benchmarks/propagation_benchmark_config.hpp create mode 100644 tests/benchmarks/include/detray/benchmarks/propagation_benchmark_utils.hpp diff --git a/core/include/detray/builders/material_map_builder.hpp b/core/include/detray/builders/material_map_builder.hpp index 4c8c19229..b994c7e71 100644 --- a/core/include/detray/builders/material_map_builder.hpp +++ b/core/include/detray/builders/material_map_builder.hpp @@ -220,7 +220,7 @@ struct add_sf_material_map { template + concepts::scalar scalar_t> DETRAY_HOST inline std::pair operator()( [[maybe_unused]] const coll_t& coll, [[maybe_unused]] const index_t& index, diff --git a/core/include/detray/definitions/pdg_particle.hpp b/core/include/detray/definitions/pdg_particle.hpp index f623ffdc6..73c749eae 100644 --- 
a/core/include/detray/definitions/pdg_particle.hpp +++ b/core/include/detray/definitions/pdg_particle.hpp @@ -29,13 +29,13 @@ struct pdg_particle { m_charge(static_cast(charge)) {} DETRAY_HOST_DEVICE - std::int32_t pdg_num() const { return m_pdg_num; } + constexpr std::int32_t pdg_num() const { return m_pdg_num; } DETRAY_HOST_DEVICE - scalar_type mass() const { return m_mass; } + constexpr scalar_type mass() const { return m_mass; } DETRAY_HOST_DEVICE - scalar_type charge() const { return m_charge; } + constexpr scalar_type charge() const { return m_charge; } private: std::int32_t m_pdg_num; @@ -43,6 +43,23 @@ struct pdg_particle { scalar_type m_charge; }; +/// Apply the charge conjugation operator to a particle hypothesis @param ptc +template +DETRAY_HOST_DEVICE constexpr pdg_particle charge_conjugation( + const pdg_particle& ptc) { + return (ptc.charge() != 0) + ? detray::pdg_particle{-ptc.pdg_num(), ptc.mass(), + -ptc.charge()} + : ptc; +} + +/// @returns an updated particle hypothesis according to the track qop +template +DETRAY_HOST_DEVICE constexpr pdg_particle update_particle_hypothesis( + const pdg_particle& ptc, const track_t& params) { + return (ptc.charge() * params.qop() > 0.f) ? 
ptc : charge_conjugation(ptc); +} + // Macro for declaring the particle #define DETRAY_DECLARE_PARTICLE(PARTICLE_NAME, PDG_NUM, MASS, CHARGE) \ template \ diff --git a/core/include/detray/navigation/intersection_kernel.hpp b/core/include/detray/navigation/intersection_kernel.hpp index 82da15a92..a7fd7c626 100644 --- a/core/include/detray/navigation/intersection_kernel.hpp +++ b/core/include/detray/navigation/intersection_kernel.hpp @@ -134,7 +134,7 @@ struct intersection_update { /// @return the intersection template + concepts::scalar scalar_t> DETRAY_HOST_DEVICE inline bool operator()( const mask_group_t &mask_group, const mask_range_t &mask_range, const traj_t &traj, intersection_t &sfi, diff --git a/core/include/detray/propagator/actor_chain.hpp b/core/include/detray/propagator/actor_chain.hpp index 9eec2a509..1650d6822 100644 --- a/core/include/detray/propagator/actor_chain.hpp +++ b/core/include/detray/propagator/actor_chain.hpp @@ -33,6 +33,8 @@ class actor_chain { public: /// Types of the actors that are registered in the chain using actor_list_type = dtuple; + // Tuple of actor states + using state_tuple = dtuple; // Type of states tuple that is used in the propagator using state = dtuple; @@ -52,8 +54,7 @@ class actor_chain { return m_actors; } - /// @returns a tuple of default constructible actor states and a - /// corresponding tuple of references + /// @returns a tuple of default constructible actor states DETRAY_HOST_DEVICE static constexpr auto make_actor_states() { // Only possible if each state is default initializable @@ -66,10 +67,10 @@ class actor_chain { } /// @returns a tuple of reference for every state in the tuple @param t - DETRAY_HOST_DEVICE static constexpr state make_ref_tuple( + DETRAY_HOST_DEVICE static constexpr state setup_actor_states( dtuple &t) { - return make_ref_tuple(t, - std::make_index_sequence{}); + return setup_actor_states( + t, std::make_index_sequence{}); } private: @@ -110,7 +111,7 @@ class actor_chain { /// @returns a 
tuple of reference for every state in the tuple @param t template - DETRAY_HOST_DEVICE static constexpr state make_ref_tuple( + DETRAY_HOST_DEVICE static constexpr state setup_actor_states( dtuple &t, std::index_sequence /*ids*/) { return detray::tie(detail::get(t)...); @@ -125,6 +126,7 @@ template <> class actor_chain<> { public: + using state_tuple = dtuple<>; /// Empty states replaces a real actor states container struct state {}; @@ -137,6 +139,12 @@ class actor_chain<> { propagator_state_t & /*p_state*/) const { /*Do nothing*/ } + + /// @returns an empty state + DETRAY_HOST_DEVICE static constexpr state setup_actor_states( + const state_tuple &) { + return {}; + } }; } // namespace detray diff --git a/tests/benchmarks/CMakeLists.txt b/tests/benchmarks/CMakeLists.txt index dd4c3fd9c..12940e393 100644 --- a/tests/benchmarks/CMakeLists.txt +++ b/tests/benchmarks/CMakeLists.txt @@ -4,12 +4,47 @@ # # Mozilla Public License Version 2.0 +# Set the common C++ flags. +include(detray-compiler-options-cpp) +include_directories( + SYSTEM + $ +) +include_directories( + SYSTEM + $ +) + +# Set up a common benchmark library. +file( + GLOB _detray_benchmarks_headers + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "include/detray/benchmarks/*.hpp" +) + +add_library(detray_benchmarks INTERFACE "${_detray_benchmarks_headers}") +add_library(detray::benchmarks ALIAS detray_benchmarks) + +target_include_directories( + detray_benchmarks + INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}/include" +) + +target_link_libraries( + detray_benchmarks + INTERFACE benchmark::benchmark vecmem::core detray::core detray::test_utils +) + +unset(_detray_benchmarks_headers) + # Set up the host/cpu benchmarks. if(DETRAY_BUILD_HOST) add_subdirectory(cpu) + add_subdirectory(include/detray/benchmarks/cpu) endif() # Set up all of the "device" benchmarks. 
if(DETRAY_BUILD_CUDA) add_subdirectory(cuda) + add_subdirectory(include/detray/benchmarks/device) endif() diff --git a/tests/benchmarks/cpu/CMakeLists.txt b/tests/benchmarks/cpu/CMakeLists.txt index b3d8b271e..c72bb0724 100644 --- a/tests/benchmarks/cpu/CMakeLists.txt +++ b/tests/benchmarks/cpu/CMakeLists.txt @@ -10,21 +10,20 @@ message(STATUS "Building detray host benchmarks") option(DETRAY_BENCHMARK_MULTITHREAD "Enable multithreaded benchmarks" OFF) option(DETRAY_BENCHMARK_PRINTOUTS "Enable printouts in the benchmarks" OFF) -# Look for openMP, which is used for the CPU benchmark +# Look for openMP, which is used for the CPU propagation benchmark find_package(OpenMP) # Macro setting up the CPU benchmarks for a specific algebra plugin. macro(detray_add_cpu_benchmark algebra) # Build the benchmark executable. detray_add_executable(benchmark_cpu_${algebra} - "benchmark_propagator.cpp" "find_volume.cpp" "grid.cpp" "grid2.cpp" "intersect_all.cpp" "intersect_surfaces.cpp" "masks.cpp" - LINK_LIBRARIES benchmark::benchmark benchmark::benchmark_main vecmem::core + LINK_LIBRARIES benchmark::benchmark benchmark::benchmark_main vecmem::core detray::benchmarks detray::core_${algebra} detray::test_utils ) @@ -48,9 +47,21 @@ macro(detray_add_cpu_benchmark algebra) ) endif() + # Build the benchmark executable for the propagation + detray_add_executable( benchmark_cpu_propagation_${algebra} + "propagation.cpp" + LINK_LIBRARIES detray::benchmark_cpu benchmark::benchmark_main + vecmem::core detray::core_${algebra} detray::test_utils + ) + + target_compile_options( + detray_benchmark_cpu_propagation_${algebra} + PRIVATE "-march=native" "-ftree-vectorize" + ) + if(OpenMP_CXX_FOUND) target_link_libraries( - detray_benchmark_cpu_${algebra} + detray_benchmark_cpu_propagation_${algebra} PRIVATE OpenMP::OpenMP_CXX ) endif() diff --git a/tests/benchmarks/cpu/benchmark_propagator.cpp b/tests/benchmarks/cpu/benchmark_propagator.cpp deleted file mode 100644 index 036ee2860..000000000 --- 
a/tests/benchmarks/cpu/benchmark_propagator.cpp +++ /dev/null @@ -1,182 +0,0 @@ -/** Detray library, part of the ACTS project (R&D line) - * - * (c) 2020-2024 CERN for the benefit of the ACTS project - * - * Mozilla Public License Version 2.0 - */ - -// Project include(s). -#include "detray/definitions/detail/algebra.hpp" -#include "detray/definitions/detail/containers.hpp" -#include "detray/definitions/detail/indexing.hpp" -#include "detray/definitions/units.hpp" -#include "detray/detectors/bfield.hpp" -#include "detray/geometry/shapes/rectangle2D.hpp" -#include "detray/navigation/navigator.hpp" -#include "detray/propagator/actor_chain.hpp" -#include "detray/propagator/actors/aborters.hpp" -#include "detray/propagator/actors/parameter_resetter.hpp" -#include "detray/propagator/actors/parameter_transporter.hpp" -#include "detray/propagator/actors/pointwise_material_interactor.hpp" -#include "detray/propagator/base_actor.hpp" -#include "detray/propagator/propagator.hpp" -#include "detray/propagator/rk_stepper.hpp" -#include "detray/tracks/tracks.hpp" -#include "detray/utils/grid/grid.hpp" - -// Detray test include(s). -#include "detray/test/utils/detectors/build_toy_detector.hpp" -#include "detray/test/utils/simulation/event_generator/track_generators.hpp" -#include "detray/test/utils/types.hpp" - -// VecMem include(s). -#include - -// Google benchmark include(s). -#include - -// System include(s) -#include -#include - -// Use the detray:: namespace implicitly. 
-using namespace detray; - -using matadata_t = test::toy_metadata; -using detector_host_type = detector; -using detector_device_type = detector; - -using algebra_t = typename detector_host_type::algebra_type; -using scalar = dscalar; - -using intersection_t = - intersection2D; - -using navigator_host_type = navigator; -using navigator_device_type = navigator; -using field_type = bfield::const_field_t; -using rk_stepper_type = rk_stepper; -using actor_chain_t = actor_chain, - pointwise_material_interactor, - parameter_resetter>; -using propagator_host_type = - propagator; -using propagator_device_type = - propagator; - -enum class propagate_option { - e_unsync = 0, - e_sync = 1, -}; - -// VecMem memory resource(s) -vecmem::host_memory_resource host_mr; - -// detector configuration -auto toy_cfg = - toy_det_config{}.n_brl_layers(4u).n_edc_layers(7u).do_check(false); - -void fill_tracks(vecmem::vector> &tracks, - const std::size_t n_tracks, bool do_sort = true) { - using scalar_t = dscalar; - using uniform_gen_t = - detail::random_numbers>; - using trk_generator_t = - random_track_generator, uniform_gen_t>; - - trk_generator_t::configuration trk_gen_cfg{}; - trk_gen_cfg.seed(42u); - trk_gen_cfg.n_tracks(n_tracks); - trk_gen_cfg.randomize_charge(true); - trk_gen_cfg.phi_range(-constant::pi, constant::pi); - trk_gen_cfg.eta_range(-3.f, 3.f); - trk_gen_cfg.mom_range(1.f * unit::GeV, - 100.f * unit::GeV); - trk_gen_cfg.origin(0.f, 0.f, 0.f); - trk_gen_cfg.origin_stddev(0.f, 0.f, 0.f); - - // Iterate through uniformly distributed momentum directions - for (auto traj : trk_generator_t{trk_gen_cfg}) { - tracks.push_back(traj); - } - - if (do_sort) { - // Sort by theta angle - const auto traj_comp = [](const auto &lhs, const auto &rhs) { - constexpr auto pi_2{constant::pi_2}; - return math::fabs(pi_2 - vector::theta(lhs.dir())) < - math::fabs(pi_2 - vector::theta(rhs.dir())); - }; - - std::ranges::sort(tracks, traj_comp); - } -} - -template -static void 
BM_PROPAGATOR_CPU(benchmark::State &state) { - - std::size_t n_tracks{static_cast(state.range(0)) * - static_cast(state.range(0))}; - - // Create the toy geometry and bfield - auto [det, names] = build_toy_detector(host_mr, toy_cfg); - test::vector3 B{0.f, 0.f, 2.f * unit::T}; - auto bfield = bfield::create_const_field(B); - - // Create propagator - propagation::config cfg{}; - cfg.navigation.search_window = {3u, 3u}; - propagator_host_type p{cfg}; - - std::size_t total_tracks = 0; - - for (auto _ : state) { - - // TODO: use fixture to build tracks - state.PauseTiming(); - - // Get tracks - vecmem::vector> tracks(&host_mr); - fill_tracks(tracks, n_tracks); - - total_tracks += tracks.size(); - - state.ResumeTiming(); - -#pragma omp parallel for - for (auto &track : tracks) { - - parameter_transporter::state transporter_state{}; - pointwise_material_interactor::state interactor_state{}; - parameter_resetter::state resetter_state{}; - - auto actor_states = - tie(transporter_state, interactor_state, resetter_state); - - // Create the propagator state - propagator_host_type::state p_state(track, bfield, det); - - // Run propagation - if constexpr (opt == propagate_option::e_unsync) { - p.propagate(p_state, actor_states); - } else if constexpr (opt == propagate_option::e_sync) { - p.propagate_sync(p_state, actor_states); - } - } - } - - state.counters["TracksPropagated"] = benchmark::Counter( - static_cast(total_tracks), benchmark::Counter::kIsRate); -} - -BENCHMARK_TEMPLATE(BM_PROPAGATOR_CPU, propagate_option::e_unsync) - ->Name("CPU unsync propagation") - ->RangeMultiplier(2) - ->Range(8, 256); -BENCHMARK_TEMPLATE(BM_PROPAGATOR_CPU, propagate_option::e_sync) - ->Name("CPU sync propagation") - ->RangeMultiplier(2) - ->Range(8, 256); - -BENCHMARK_MAIN(); diff --git a/tests/benchmarks/cpu/propagation.cpp b/tests/benchmarks/cpu/propagation.cpp new file mode 100644 index 000000000..2c638e746 --- /dev/null +++ b/tests/benchmarks/cpu/propagation.cpp @@ -0,0 +1,158 @@ +/** 
Detray library, part of the ACTS project (R&D line) + * + * (c) 2024 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +// Project include(s) +#include "detray/detectors/bfield.hpp" +#include "detray/navigation/navigator.hpp" +#include "detray/propagator/actor_chain.hpp" +#include "detray/propagator/actors/aborters.hpp" +#include "detray/propagator/actors/parameter_resetter.hpp" +#include "detray/propagator/actors/parameter_transporter.hpp" +#include "detray/propagator/actors/pointwise_material_interactor.hpp" +#include "detray/propagator/rk_stepper.hpp" +#include "detray/tracks/tracks.hpp" + +// Detray benchmark include(s) +#include "detray/benchmarks/cpu/propagation_benchmark.hpp" + +// Detray test include(s). +#include "detray/test/utils/detectors/build_toy_detector.hpp" +#include "detray/test/utils/detectors/build_wire_chamber.hpp" +#include "detray/test/utils/simulation/event_generator/track_generators.hpp" +#include "detray/test/utils/types.hpp" + +// Vecmem include(s) +#include + +// System include(s) +#include +#include + +using namespace detray; + +int main(int argc, char** argv) { + + using toy_detector_t = detector; + using test_algebra = typename toy_detector_t::algebra_type; + using scalar = dscalar; + using vector3 = dvector3D; + using free_track_parameters_t = free_track_parameters; + using uniform_gen_t = + detail::random_numbers>; + using track_generator_t = + random_track_generator; + + using field_t = bfield::const_field_t; + using stepper_t = rk_stepper; + using empty_chain_t = actor_chain<>; + using default_chain = + actor_chain, + pointwise_material_interactor, + parameter_resetter>; + + vecmem::host_memory_resource host_mr; + + // + // Configuration + // + + // Constant magnetic field + vector3 B{0.f, 0.f, 2.f * unit::T}; + + // Configure toy detector + toy_det_config toy_cfg{}; + toy_cfg.use_material_maps(false).n_brl_layers(4u).n_edc_layers(7u); + + std::cout << toy_cfg << std::endl; + + // Configure 
wire chamber + wire_chamber_config wire_chamber_cfg{}; + wire_chamber_cfg.half_z(500.f * unit::mm); + + std::cout << wire_chamber_cfg << std::endl; + + // Configure propagation + propagation::config prop_cfg{}; + prop_cfg.navigation.search_window = {3u, 3u}; + + std::cout << prop_cfg << std::endl; + + // Benchmark config + detray::benchmarks::benchmark_base::configuration bench_cfg{}; + + std::vector n_tracks{8 * 8, 16 * 16, 32 * 32, 64 * 64, + 128 * 128, 256 * 256, 512 * 512}; + + auto trk_cfg = + detray::benchmarks::get_default_trk_gen_config( + n_tracks); + + // Specific configuration for the random track generation + trk_cfg.seed(42u); + + // Add additional tracks for warmup + bench_cfg.n_warmup(static_cast( + std::ceil(0.1f * static_cast(trk_cfg.n_tracks())))); + + // + // Prepare data + // + auto track_samples = + detray::benchmarks::generate_track_samples( + &host_mr, n_tracks, trk_cfg); + + const auto [toy_det, names] = + build_toy_detector(host_mr, toy_cfg); + const auto [wire_chamber, _] = + build_wire_chamber(host_mr, wire_chamber_cfg); + + auto bfield = bfield::create_const_field(B); + + dtuple<> empty_state{}; + + parameter_transporter::state transporter_state{}; + pointwise_material_interactor::state interactor_state{}; + parameter_resetter::state resetter_state{}; + + auto actor_states = detail::make_tuple( + transporter_state, interactor_state, resetter_state); + + // + // Register benchmarks + // + std::cout << "Propagation Benchmarks\n" + << "----------------------\n\n"; + + prop_cfg.stepping.do_covariance_transport = true; + detray::benchmarks::register_benchmark< + detray::benchmarks::host_propagation_bm, stepper_t, default_chain>( + "TOY_DETECTOR_W_COV_TRANSPORT", bench_cfg, prop_cfg, toy_det, bfield, + &actor_states, track_samples, n_tracks); + + prop_cfg.stepping.do_covariance_transport = false; + detray::benchmarks::register_benchmark< + detray::benchmarks::host_propagation_bm, stepper_t, empty_chain_t>( + "TOY_DETECTOR", bench_cfg, 
prop_cfg, toy_det, bfield, &empty_state, + track_samples, n_tracks); + + prop_cfg.stepping.do_covariance_transport = true; + detray::benchmarks::register_benchmark< + detray::benchmarks::host_propagation_bm, stepper_t, default_chain>( + "WIRE_CHAMBER_W_COV_TRANSPORT", bench_cfg, prop_cfg, wire_chamber, + bfield, &actor_states, track_samples, n_tracks); + + prop_cfg.stepping.do_covariance_transport = false; + detray::benchmarks::register_benchmark< + detray::benchmarks::host_propagation_bm, stepper_t, empty_chain_t>( + "WIRE_CHAMBER", bench_cfg, prop_cfg, wire_chamber, bfield, &empty_state, + track_samples, n_tracks); + + // Run benchmarks + ::benchmark::Initialize(&argc, argv); + ::benchmark::RunSpecifiedBenchmarks(); + ::benchmark::Shutdown(); +} diff --git a/tests/benchmarks/cuda/CMakeLists.txt b/tests/benchmarks/cuda/CMakeLists.txt index 096c92ad0..a101f0854 100644 --- a/tests/benchmarks/cuda/CMakeLists.txt +++ b/tests/benchmarks/cuda/CMakeLists.txt @@ -26,26 +26,24 @@ if(DETRAY_EIGEN_PLUGIN) endif() foreach(algebra ${algebras}) - detray_add_executable(benchmark_cuda_${algebra} - "benchmark_propagator_cuda_kernel.hpp" - "benchmark_propagator_cuda.cpp" - "benchmark_propagator_cuda_kernel.cu" - LINK_LIBRARIES benchmark::benchmark detray::core detray::algebra_${algebra} vecmem::cuda detray::test_utils + detray_add_executable(benchmark_cuda_propagation_${algebra} + "propagation.cpp" + LINK_LIBRARIES detray::benchmark_cuda detray::core detray::algebra_${algebra} vecmem::cuda detray::test_utils ) target_compile_definitions( - detray_benchmark_cuda_${algebra} + detray_benchmark_cuda_propagation_${algebra} PRIVATE ${algebra}=${algebra} ) target_compile_options( - detray_benchmark_cuda_${algebra} + detray_benchmark_cuda_propagation_${algebra} PRIVATE "-march=native" "-ftree-vectorize" ) if(OpenMP_CXX_FOUND) target_link_libraries( - detray_benchmark_cuda_${algebra} + detray_benchmark_cuda_propagation_${algebra} PRIVATE OpenMP::OpenMP_CXX ) endif() diff --git 
a/tests/benchmarks/cuda/benchmark_propagator_cuda.cpp b/tests/benchmarks/cuda/benchmark_propagator_cuda.cpp deleted file mode 100644 index 325381ea1..000000000 --- a/tests/benchmarks/cuda/benchmark_propagator_cuda.cpp +++ /dev/null @@ -1,127 +0,0 @@ -/** Detray library, part of the ACTS project (R&D line) - * - * (c) 2022-2024 CERN for the benefit of the ACTS project - * - * Mozilla Public License Version 2.0 - */ - -// Project include(s) -#include "benchmark_propagator_cuda_kernel.hpp" - -// Detray test include(s). -#include "detray/test/utils/detectors/build_toy_detector.hpp" -#include "detray/test/utils/simulation/event_generator/track_generators.hpp" -#include "detray/test/utils/types.hpp" - -// Vecmem include(s) -#include -#include -#include -#include - -// Google include(s). -#include - -using namespace detray; - -// VecMem memory resource(s) -vecmem::host_memory_resource host_mr; -vecmem::cuda::managed_memory_resource mng_mr; -vecmem::cuda::device_memory_resource dev_mr; -vecmem::binary_page_memory_resource bp_mng_mr(mng_mr); - -// detector configuration -auto toy_cfg = - toy_det_config{}.n_brl_layers(4u).n_edc_layers(7u).do_check(false); - -void fill_tracks(vecmem::vector> &tracks, - const std::size_t n_tracks, bool do_sort = true) { - using scalar_t = dscalar; - using uniform_gen_t = - detail::random_numbers>; - using trk_generator_t = - random_track_generator, - uniform_gen_t>; - - trk_generator_t::configuration trk_gen_cfg{}; - trk_gen_cfg.seed(42u); - trk_gen_cfg.n_tracks(n_tracks); - trk_gen_cfg.randomize_charge(true); - trk_gen_cfg.phi_range(-constant::pi, constant::pi); - trk_gen_cfg.eta_range(-3.f, 3.f); - trk_gen_cfg.mom_range(1.f * unit::GeV, - 100.f * unit::GeV); - trk_gen_cfg.origin(0.f, 0.f, 0.f); - trk_gen_cfg.origin_stddev(0.f, 0.f, 0.f); - - // Iterate through uniformly distributed momentum directions - for (auto traj : trk_generator_t{trk_gen_cfg}) { - tracks.push_back(traj); - } - - if (do_sort) { - // Sort by theta angle - const auto 
traj_comp = [](const auto &lhs, const auto &rhs) { - constexpr auto pi_2{constant::pi_2}; - return math::fabs(pi_2 - vector::theta(lhs.dir())) < - math::fabs(pi_2 - vector::theta(rhs.dir())); - }; - - std::ranges::sort(tracks, traj_comp); - } -} - -template -static void BM_PROPAGATOR_CUDA(benchmark::State &state) { - - std::size_t n_tracks{static_cast(state.range(0)) * - static_cast(state.range(0))}; - - // Create the toy geometry - auto [det, names] = build_toy_detector(host_mr, toy_cfg); - test::vector3 B{0.f, 0.f, 2.f * unit::T}; - auto bfield = bfield::create_const_field(B); - - // vecmem copy helper object - vecmem::cuda::copy cuda_cpy; - - // Copy detector to device - auto det_buff = detray::get_buffer(det, dev_mr, cuda_cpy); - auto det_view = detray::get_data(det_buff); - - std::size_t total_tracks = 0; - - for (auto _ : state) { - - state.PauseTiming(); - - // Get tracks - vecmem::vector> tracks(&bp_mng_mr); - fill_tracks(tracks, n_tracks); - - total_tracks += tracks.size(); - - state.ResumeTiming(); - - // Get tracks data - auto tracks_data = vecmem::get_data(tracks); - - // Run the propagator test for GPU device - propagator_benchmark(det_view, bfield, tracks_data, opt); - } - - state.counters["TracksPropagated"] = benchmark::Counter( - static_cast(total_tracks), benchmark::Counter::kIsRate); -} - -BENCHMARK_TEMPLATE(BM_PROPAGATOR_CUDA, propagate_option::e_unsync) - ->Name("CUDA unsync propagation") - ->RangeMultiplier(2) - ->Range(8, 256); -BENCHMARK_TEMPLATE(BM_PROPAGATOR_CUDA, propagate_option::e_sync) - ->Name("CUDA sync propagation") - ->RangeMultiplier(2) - ->Range(8, 256); - -BENCHMARK_MAIN(); diff --git a/tests/benchmarks/cuda/benchmark_propagator_cuda_kernel.cu b/tests/benchmarks/cuda/benchmark_propagator_cuda_kernel.cu deleted file mode 100644 index 9c2de23f3..000000000 --- a/tests/benchmarks/cuda/benchmark_propagator_cuda_kernel.cu +++ /dev/null @@ -1,71 +0,0 @@ -/** Detray library, part of the ACTS project (R&D line) - * - * (c) 2022 CERN for 
the benefit of the ACTS project - * - * Mozilla Public License Version 2.0 - */ - -#include "benchmark_propagator_cuda_kernel.hpp" -#include "detray/definitions/detail/cuda_definitions.hpp" - -namespace detray { - -__global__ void __launch_bounds__(256, 4) propagator_benchmark_kernel( - typename detector_host_type::view_type det_data, - covfie::field_view> field_data, - vecmem::data::vector_view> tracks_data, - const propagate_option opt) { - - int gid = threadIdx.x + blockIdx.x * blockDim.x; - - detector_device_type det(det_data); - vecmem::device_vector> tracks( - tracks_data); - - if (gid >= tracks.size()) { - return; - } - - // Create propagator - propagation::config cfg{}; - cfg.navigation.search_window = {3u, 3u}; - propagator_device_type p{cfg}; - - parameter_transporter::state transporter_state{}; - pointwise_material_interactor::state interactor_state{}; - parameter_resetter::state resetter_state{}; - - // Create the actor states - auto actor_states = - detray::tie(transporter_state, interactor_state, resetter_state); - // Create the propagator state - propagator_device_type::state p_state(tracks.at(gid), field_data, det); - - // Run propagation - if (opt == propagate_option::e_unsync) { - p.propagate(p_state, actor_states); - } else if (opt == propagate_option::e_sync) { - p.propagate_sync(p_state, actor_states); - } -} - -void propagator_benchmark( - typename detector_host_type::view_type det_data, - covfie::field_view> field_data, - vecmem::data::vector_view>& tracks_data, - const propagate_option opt) { - - constexpr int thread_dim = 256; - int block_dim = - static_cast(tracks_data.size() + thread_dim - 1) / thread_dim; - - // run the test kernel - propagator_benchmark_kernel<<>>(det_data, field_data, - tracks_data, opt); - - // cuda error check - DETRAY_CUDA_ERROR_CHECK(cudaGetLastError()); - DETRAY_CUDA_ERROR_CHECK(cudaDeviceSynchronize()); -} - -} // namespace detray diff --git a/tests/benchmarks/cuda/benchmark_propagator_cuda_kernel.hpp 
b/tests/benchmarks/cuda/benchmark_propagator_cuda_kernel.hpp deleted file mode 100644 index 37ba65f18..000000000 --- a/tests/benchmarks/cuda/benchmark_propagator_cuda_kernel.hpp +++ /dev/null @@ -1,64 +0,0 @@ -/** Detray library, part of the ACTS project (R&D line) - * - * (c) 2022-2024 CERN for the benefit of the ACTS project - * - * Mozilla Public License Version 2.0 - */ - -#pragma once - -// Project include(s) -#include "detray/definitions/detail/algebra.hpp" -#include "detray/definitions/units.hpp" -#include "detray/detectors/bfield.hpp" -#include "detray/navigation/navigator.hpp" -#include "detray/propagator/actor_chain.hpp" -#include "detray/propagator/actors/aborters.hpp" -#include "detray/propagator/actors/parameter_resetter.hpp" -#include "detray/propagator/actors/parameter_transporter.hpp" -#include "detray/propagator/actors/pointwise_material_interactor.hpp" -#include "detray/propagator/base_actor.hpp" -#include "detray/propagator/propagator.hpp" -#include "detray/propagator/rk_stepper.hpp" -#include "detray/tracks/tracks.hpp" - -// Detray test include(s). 
-#include "detray/test/utils/types.hpp" - -namespace detray { - -using matadata_t = test::toy_metadata; -using test_algebra = matadata_t::algebra_type; -using scalar = detray::dscalar; - -using detector_host_type = - detray::detector; -using detector_device_type = - detray::detector; - -using navigator_host_type = detray::navigator; -using navigator_device_type = detray::navigator; -using field_type = detray::bfield::const_field_t; -using rk_stepper_type = detray::rk_stepper; -using actor_chain_t = - detray::actor_chain, - detray::pointwise_material_interactor, - detray::parameter_resetter>; -using propagator_host_type = - detray::propagator; -using propagator_device_type = - detray::propagator; - -enum class propagate_option { - e_unsync = 0, - e_sync = 1, -}; - -/// test function for propagator with single state -void propagator_benchmark( - typename detector_host_type::view_type det_data, - typename field_type::view_t field_data, - vecmem::data::vector_view>& tracks_data, - const propagate_option opt); - -} // namespace detray diff --git a/tests/benchmarks/cuda/propagation.cpp b/tests/benchmarks/cuda/propagation.cpp new file mode 100644 index 000000000..ef5721395 --- /dev/null +++ b/tests/benchmarks/cuda/propagation.cpp @@ -0,0 +1,164 @@ +/** Detray library, part of the ACTS project (R&D line) + * + * (c) 2024 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +// Project include(s) +#include "detray/detectors/bfield.hpp" +#include "detray/navigation/navigator.hpp" +#include "detray/propagator/actor_chain.hpp" +#include "detray/propagator/actors/aborters.hpp" +#include "detray/propagator/actors/parameter_resetter.hpp" +#include "detray/propagator/actors/parameter_transporter.hpp" +#include "detray/propagator/actors/pointwise_material_interactor.hpp" +#include "detray/propagator/rk_stepper.hpp" +#include "detray/tracks/tracks.hpp" + +// Detray benchmark include(s) +#include 
"detray/benchmarks/device/cuda/propagation_benchmark.hpp"
+
+// Detray test include(s).
+#include "detray/test/utils/detectors/build_toy_detector.hpp"
+#include "detray/test/utils/detectors/build_wire_chamber.hpp"
+#include "detray/test/utils/simulation/event_generator/track_generators.hpp"
+#include "detray/test/utils/types.hpp"
+
+// Vecmem include(s)
+#include
+#include
+
+// System include(s)
+#include
+#include
+
+using namespace detray;
+
+int main(int argc, char** argv) {
+
+    using toy_detector_t = detector;
+    using test_algebra = typename toy_detector_t::algebra_type;
+    using scalar = dscalar;
+    using vector3 = dvector3D;
+    using free_track_parameters_t = free_track_parameters;
+    using uniform_gen_t =
+        detail::random_numbers>;
+    using track_generator_t =
+        random_track_generator;
+    using field_bknd_t = bfield::const_bknd_t;
+
+    vecmem::host_memory_resource host_mr;
+    vecmem::cuda::device_memory_resource dev_mr;
+
+    //
+    // Configuration
+    //
+
+    // Constant magnetic field
+    vector3 B{0.f, 0.f, 2.f * unit::T};
+
+    // Configure toy detector
+    toy_det_config toy_cfg{};
+    toy_cfg.use_material_maps(false).n_brl_layers(4u).n_edc_layers(7u);
+
+    std::cout << toy_cfg << std::endl;
+
+    // Configure wire chamber
+    wire_chamber_config wire_chamber_cfg{};
+    wire_chamber_cfg.half_z(500.f * unit::mm);
+
+    std::cout << wire_chamber_cfg << std::endl;
+
+    // Configure propagation
+    propagation::config prop_cfg{};
+    prop_cfg.navigation.search_window = {3u, 3u};
+
+    std::cout << prop_cfg << std::endl;
+
+    // Benchmark config
+    detray::benchmarks::benchmark_base::configuration bench_cfg{};
+
+    std::vector n_tracks{8 * 8, 16 * 16, 32 * 32, 64 * 64,
+                         128 * 128, 256 * 256, 512 * 512};
+
+    auto trk_cfg =
+        detray::benchmarks::get_default_trk_gen_config(
+            n_tracks);
+
+    // Specific configuration for the random track generation
+    trk_cfg.seed(42u);
+
+    // Warm up with 10% (rounded up) of the largest track sample
+    bench_cfg.n_warmup(static_cast(
+        std::ceil(0.1f * static_cast(trk_cfg.n_tracks()))));
+
+    //
+    // Prepare data
+    //
+    auto track_samples =
+        detray::benchmarks::generate_track_samples(
+            &host_mr, n_tracks, trk_cfg, true);
+
+    const auto [toy_det, names] =
+        build_toy_detector(host_mr, toy_cfg);
+    const auto [wire_chamber, _] =
+        build_wire_chamber(host_mr, wire_chamber_cfg);
+
+    auto bfield = bfield::create_const_field(B);
+
+    dtuple<> empty_state{};
+
+    parameter_transporter::state transporter_state{};
+    pointwise_material_interactor::state interactor_state{};
+    parameter_resetter::state resetter_state{};
+
+    auto actor_states = detail::make_tuple(
+        transporter_state, interactor_state, resetter_state);
+
+    //
+    // Register benchmarks
+    //
+    std::cout << "Propagation Benchmarks\n"
+              << "----------------------\n\n";
+
+    prop_cfg.stepping.do_covariance_transport = true;
+    detray::benchmarks::register_benchmark<
+        detray::benchmarks::cuda_propagation_bm,
+        detray::benchmarks::cuda_propagator_type<
+            test::toy_metadata, field_bknd_t,
+            detray::benchmarks::default_chain>>(
+        "TOY_DETECTOR_W_COV_TRANSPORT", bench_cfg, prop_cfg, toy_det, bfield,
+        &actor_states, track_samples, n_tracks, &dev_mr);
+
+    prop_cfg.stepping.do_covariance_transport = false;
+    detray::benchmarks::register_benchmark<
+        detray::benchmarks::cuda_propagation_bm,
+        detray::benchmarks::cuda_propagator_type<
+            test::toy_metadata, field_bknd_t, detray::benchmarks::empty_chain>>(
+        "TOY_DETECTOR", bench_cfg, prop_cfg, toy_det, bfield, &empty_state,
+        track_samples, n_tracks, &dev_mr);
+
+    prop_cfg.stepping.do_covariance_transport = true;
+    detray::benchmarks::register_benchmark<
+        detray::benchmarks::cuda_propagation_bm,
+        detray::benchmarks::cuda_propagator_type<
+            test::default_metadata, field_bknd_t,
+            detray::benchmarks::default_chain>>(
+        "WIRE_CHAMBER_W_COV_TRANSPORT", bench_cfg, prop_cfg, wire_chamber,
+        bfield, &actor_states, track_samples, n_tracks, &dev_mr);
+
+    prop_cfg.stepping.do_covariance_transport = false;
+    detray::benchmarks::register_benchmark<
+        detray::benchmarks::cuda_propagation_bm,
+        detray::benchmarks::cuda_propagator_type<
+            test::default_metadata, field_bknd_t,
+            detray::benchmarks::empty_chain>>(
+        "WIRE_CHAMBER", bench_cfg, prop_cfg, wire_chamber, bfield, &empty_state,
+        track_samples, n_tracks, &dev_mr);
+
+    // Run benchmarks
+    ::benchmark::Initialize(&argc, argv);
+    ::benchmark::RunSpecifiedBenchmarks();
+    ::benchmark::Shutdown();
+}
diff --git a/tests/benchmarks/include/detray/benchmarks/benchmark_base.hpp b/tests/benchmarks/include/detray/benchmarks/benchmark_base.hpp
new file mode 100644
index 000000000..c867b6f19
--- /dev/null
+++ b/tests/benchmarks/include/detray/benchmarks/benchmark_base.hpp
@@ -0,0 +1,76 @@
+/** Detray library, part of the ACTS project (R&D line)
+ *
+ * (c) 2023-2024 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#pragma once
+
+// Benchmark include
+#include
+
+// System include(s)
+#include
+#include
+
+namespace detray::benchmarks {
+
+/// Base type for detray benchmarks with google benchmark
+struct benchmark_base {
+    /// Local configuration type
+    struct configuration {
+        /// Size of data sample to be used in benchmark
+        int m_samples{100};
+        /// Run a number of operations before the benchmark
+        bool m_warmup = true;
+        /// Size of data in warm-up round (initially 10% of the sample size)
+        int m_n_warmup{static_cast(0.1 * static_cast(m_samples))};
+
+        /// Setters
+        /// @{
+        configuration& n_samples(int n) {
+            m_samples = n;
+            return *this;
+        }
+        configuration& do_warmup(bool b) {
+            m_warmup = b;
+            return *this;
+        }
+        configuration& n_warmup(int n) {
+            m_n_warmup = n;
+            m_warmup = (n > 0);  // no warm-up without warm-up samples
+            return *this;
+        }
+        /// @}
+
+        /// Getters
+        /// @{
+        constexpr int n_samples() const { return m_samples; }
+        constexpr bool do_warmup() const { return m_warmup; }
+        constexpr int n_warmup() const { return m_n_warmup; }
+        /// @}
+
+        private:
+        /// Print the benchmark setup (the stream is not flushed here)
+        friend std::ostream& operator<<(std::ostream& os,
+                                        const configuration& cfg) {
+            os << " -> running:\t " << cfg.n_samples() << " samples"
+               << '\n';
+            if (cfg.do_warmup()) {
+                os << " -> warmup: \t " << cfg.n_warmup() << " samples"
+                   << '\n';
+            }
+            os << '\n';
+            return os;
+        }
+    };
+
+    /// Default construction
+    benchmark_base() = default;
+
+    /// Default destructor
+    virtual ~benchmark_base() = default;
+};
+
+}  // namespace detray::benchmarks
diff --git a/tests/benchmarks/include/detray/benchmarks/cpu/CMakeLists.txt b/tests/benchmarks/include/detray/benchmarks/cpu/CMakeLists.txt
new file mode 100644
index 000000000..c2bc1a0d8
--- /dev/null
+++ b/tests/benchmarks/include/detray/benchmarks/cpu/CMakeLists.txt
@@ -0,0 +1,15 @@
+# Detray library, part of the ACTS project (R&D line)
+#
+# (c) 2024 CERN for the benefit of the ACTS project
+#
+# Mozilla Public License Version 2.0
+
+# Set the CPU build flags.
+include(detray-compiler-options-cpp)
+
+# Set up a test library, which the "new style" benchmarks and tests could use.
+add_library(detray_benchmark_cpu INTERFACE "propagation_benchmark.hpp")
+
+add_library(detray::benchmark_cpu ALIAS detray_benchmark_cpu)
+
+target_link_libraries(detray_benchmark_cpu INTERFACE detray::benchmarks)
diff --git a/tests/benchmarks/include/detray/benchmarks/cpu/propagation_benchmark.hpp b/tests/benchmarks/include/detray/benchmarks/cpu/propagation_benchmark.hpp
new file mode 100644
index 000000000..94c78c950
--- /dev/null
+++ b/tests/benchmarks/include/detray/benchmarks/cpu/propagation_benchmark.hpp
@@ -0,0 +1,131 @@
+/** Detray library, part of the ACTS project (R&D line)
+ *
+ * (c) 2023-2024 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#pragma once
+
+// Project include(s)
+#include "detray/definitions/detail/algebra.hpp"
+#include "detray/tracks/tracks.hpp"
+
+// Detray benchmark include(s)
+#include "detray/benchmarks/benchmark_base.hpp"
+#include "detray/benchmarks/propagation_benchmark_config.hpp"
+#include "detray/benchmarks/propagation_benchmark_utils.hpp"
+
+// Benchmark include
+#include
+
+// System include(s)
+#include
+#include
+#include
+#include
+
+namespace detray::benchmarks {
+
+template
+struct host_propagation_bm : public benchmark_base {
+    /// Detector dependent types
+    using algebra_t = typename propagator_t::detector_type::algebra_type;
+    using scalar_t = dscalar;
+    using vector3_t = dvector3D;
+
+    /// Local configuration type
+    using configuration = propagation_benchmark_config;
+
+    /// The benchmark configuration
+    configuration m_cfg{};
+
+    /// Default construction
+    host_propagation_bm() = default;
+
+    /// Construct from an externally provided configuration @param cfg
+    explicit host_propagation_bm(const configuration &cfg) : m_cfg{cfg} {}
+
+    /// @return the benchmark configuration
+    configuration &config() { return m_cfg; }
+
+    /// Run an optional warm-up, then propagate the first 'n_samples' tracks
+    inline void operator()(::benchmark::State &state,
+                           dvector> *tracks,
+                           const typename propagator_t::detector_type *det,
+                           const bfield_t *bfield,
+                           typename propagator_t::actor_chain_type::state_tuple
+                               *input_actor_states) const {
+        using actor_chain_t = typename propagator_t::actor_chain_type;
+        using actor_states_t = typename actor_chain_t::state_tuple;
+
+        assert(tracks != nullptr);
+        assert(det != nullptr);
+        assert(bfield != nullptr);
+        assert(input_actor_states != nullptr);
+
+        const int n_samples{m_cfg.benchmark().n_samples()};
+        const int n_warmup{m_cfg.benchmark().n_warmup()};
+
+        assert(static_cast(n_samples) <= tracks->size());
+
+        // Create propagator
+        propagator_t p{m_cfg.propagation()};
+
+        // Propagate a single track on the host
+        auto run_propagation = [&p, det, bfield, input_actor_states](
+                                   free_track_parameters &track) {
+            // Fresh copy of actor states
+            actor_states_t actor_states(*input_actor_states);
+            // Tuple of references to pass to the propagator
+            typename actor_chain_t::state actor_state_refs =
+                actor_chain_t::setup_actor_states(actor_states);
+
+            typename propagator_t::state p_state(track, *bfield, *det);
+            // Particle hypothesis
+            auto &ptc = p_state._stepping.particle_hypothesis();
+            p_state.set_particle(update_particle_hypothesis(ptc, track));
+
+            // Run propagation
+            if constexpr (kOPT ==
+                          detray::benchmarks::propagation_opt::e_unsync) {
+                ::benchmark::DoNotOptimize(
+                    p.propagate(p_state, actor_state_refs));
+            } else if constexpr (kOPT ==
+                                 detray::benchmarks::propagation_opt::e_sync) {
+                ::benchmark::DoNotOptimize(
+                    p.propagate_sync(p_state, actor_state_refs));
+            }
+        };
+
+        // Warm-up (skipped if no warm-up samples are configured)
+        if (m_cfg.benchmark().do_warmup() && n_warmup > 0) {
+            // Pick about 'n_warmup' tracks, spread over the whole sample
+            auto stride{n_samples / n_warmup};
+            stride = (stride == 0) ? 10 : stride;
+            assert(stride > 0);
+
+#pragma omp parallel for schedule(dynamic)
+            for (int i = 0; i < n_samples; i += stride) {
+                run_propagation((*tracks)[static_cast(i)]);
+            }
+        }
+
+        // Run the benchmark
+        std::size_t total_tracks = 0u;
+        for (auto _ : state) {
+#pragma omp parallel for schedule(dynamic)
+            for (int i = 0; i < n_samples; ++i) {
+                run_propagation((*tracks)[static_cast(i)]);
+            }
+            total_tracks += static_cast(n_samples);
+        }
+        // Report throughput
+        state.counters["TracksPropagated"] = benchmark::Counter(
+            static_cast(total_tracks), benchmark::Counter::kIsRate);
+    }
+};
+
+}  // namespace detray::benchmarks
diff --git a/tests/benchmarks/include/detray/benchmarks/device/CMakeLists.txt b/tests/benchmarks/include/detray/benchmarks/device/CMakeLists.txt
new file mode 100644
index 000000000..71bce8dfa
--- /dev/null
+++ b/tests/benchmarks/include/detray/benchmarks/device/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Detray library, part of the ACTS project (R&D line)
+#
+# (c) 2024 CERN for the benefit of the ACTS project
+#
+# Mozilla Public License Version 2.0
+
+if(DETRAY_BUILD_CUDA)
+    add_subdirectory(cuda)
+endif()
diff --git a/tests/benchmarks/include/detray/benchmarks/device/cuda/CMakeLists.txt b/tests/benchmarks/include/detray/benchmarks/device/cuda/CMakeLists.txt
new file mode 100644
index 000000000..b43cd29c6
--- /dev/null
+++
b/tests/benchmarks/include/detray/benchmarks/device/cuda/CMakeLists.txt
@@ -0,0 +1,29 @@
+# Detray library, part of the ACTS project (R&D line)
+#
+# (c) 2024 CERN for the benefit of the ACTS project
+#
+# Mozilla Public License Version 2.0
+
+# C++17 support for CUDA requires CMake 3.18.
+cmake_minimum_required(VERSION 3.18)
+
+# Enable CUDA as a language.
+enable_language(CUDA)
+
+# Set the CUDA build flags.
+include(detray-compiler-options-cuda)
+
+# Set up a benchmark library for CUDA
+add_library(
+    detray_benchmark_cuda
+    STATIC
+    "propagation_benchmark.hpp"
+    "propagation_benchmark.cu"
+)
+
+add_library(detray::benchmark_cuda ALIAS detray_benchmark_cuda)
+
+target_link_libraries(
+    detray_benchmark_cuda
+    PUBLIC vecmem::cuda detray::benchmarks detray::test_utils detray::core_array
+)
diff --git a/tests/benchmarks/include/detray/benchmarks/device/cuda/propagation_benchmark.cu b/tests/benchmarks/include/detray/benchmarks/device/cuda/propagation_benchmark.cu
new file mode 100644
index 000000000..200d9c295
--- /dev/null
+++ b/tests/benchmarks/include/detray/benchmarks/device/cuda/propagation_benchmark.cu
@@ -0,0 +1,148 @@
+/** Detray library, part of the ACTS project (R&D line)
+ *
+ * (c) 2022-2024 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#include "detray/benchmarks/device/cuda/propagation_benchmark.hpp"
+#include "detray/definitions/detail/cuda_definitions.hpp"
+
+namespace detray::benchmarks {
+
+template
+__global__ void __launch_bounds__(256, 4) propagator_benchmark_kernel(
+    propagation::config cfg,
+    typename propagator_t::detector_type::view_type det_view,
+    typename propagator_t::stepper_type::magnetic_field_type field_view,
+    typename propagator_t::actor_chain_type::state_tuple
+        *device_actor_state_ptr,
+    vecmem::data::vector_view<
+        free_track_parameters>
+        tracks_view) {
+
+    using detector_device_t =
+        detector;
+    using algebra_t = typename detector_device_t::algebra_type;
+    using actor_chain_t = typename propagator_t::actor_chain_type;
+    using propagator_device_t =
+        propagator, actor_chain_t>;
+
+    detector_device_t det(det_view);
+    vecmem::device_vector> tracks(tracks_view);
+
+    const unsigned int gid = threadIdx.x + blockIdx.x * blockDim.x;
+    if (gid >= tracks.size()) {
+        return;
+    }
+
+    // Create propagator
+    propagator_device_t p{cfg};
+
+    // Create the actor states on a fresh copy
+    typename actor_chain_t::state_tuple actor_states = *device_actor_state_ptr;
+    auto actor_state_refs = actor_chain_t::setup_actor_states(actor_states);
+
+    // Create the propagator state
+    typename propagator_device_t::state p_state(tracks.at(gid), field_view,
+                                                det);
+
+    // Particle hypothesis
+    auto &ptc = p_state._stepping.particle_hypothesis();
+    p_state.set_particle(update_particle_hypothesis(ptc, tracks.at(gid)));
+
+    // Run propagation
+    if constexpr (kOPT == detray::benchmarks::propagation_opt::e_unsync) {
+        p.propagate(p_state, actor_state_refs);
+    } else if constexpr (kOPT == detray::benchmarks::propagation_opt::e_sync) {
+        p.propagate_sync(p_state, actor_state_refs);
+    }
+}
+
+template
+typename propagator_t::actor_chain_type::state_tuple *setup_actor_states(
+    typename propagator_t::actor_chain_type::state_tuple *input_actor_states) {
+
+    // Copy the actor state blueprint to the device
+    using actor_state_t = typename propagator_t::actor_chain_type::state_tuple;
+    actor_state_t *device_actor_state_ptr{nullptr};
+
+    // Check the CUDA calls also in builds without assertions
+    DETRAY_CUDA_ERROR_CHECK(
+        cudaMalloc((void **)&device_actor_state_ptr, sizeof(actor_state_t)));
+
+    DETRAY_CUDA_ERROR_CHECK(
+        cudaMemcpy(device_actor_state_ptr, input_actor_states,
+                   sizeof(actor_state_t), cudaMemcpyHostToDevice));
+
+    return device_actor_state_ptr;
+}
+
+template
+void release_actor_states(typename propagator_t::actor_chain_type::state_tuple
+                              *device_actor_state_ptr) {
+    // Check the deallocation also in builds without assertions
+    DETRAY_CUDA_ERROR_CHECK(cudaFree(device_actor_state_ptr));
+}
+
+template
+void run_propagation_kernel(
+    const propagation::config &cfg,
+    typename propagator_t::detector_type::view_type det_view,
+    typename propagator_t::stepper_type::magnetic_field_type field_view,
+    typename propagator_t::actor_chain_type::state_tuple
+        *device_actor_state_ptr,
+    vecmem::data::vector_view<
+        free_track_parameters>
+        tracks_view,
+    const int n_samples) {
+
+    constexpr int thread_dim = 256;
+    int block_dim = (n_samples + thread_dim - 1) / thread_dim;
+
+    // Launch the benchmark kernel
+    propagator_benchmark_kernel<<>>(
+        cfg, det_view, field_view, device_actor_state_ptr, tracks_view);
+
+    // Check for launch errors, then wait for the kernel to finish
+    DETRAY_CUDA_ERROR_CHECK(cudaGetLastError());
+    DETRAY_CUDA_ERROR_CHECK(cudaDeviceSynchronize());
+}
+
+/// Macro declaring the template instantiations for the different detector types
+#define DECLARE_PROPAGATION_BENCHMARK(METADATA, CHAIN, FIELD, OPT)            \
+                                                                               \
+    template void                                                              \
+    run_propagation_kernel, OPT>(                                              \
+        const propagation::config &, detector::view_type,                      \
+        covfie::field_view,                                                    \
+        cuda_propagator_type::actor_chain_type::state_tuple *,                 \
+        vecmem::data::vector_view<                                             \
+            free_track_parameters::algebra_type>>,                             \
+        const int);                                                            \
+                                                                               \
+    template cuda_propagator_type::actor_chain_type::state_tuple *             \
+    setup_actor_states>(                                                       \
+        cuda_propagator_type::actor_chain_type::state_tuple *);                \
+                                                                               \
+    template void                                                              \
+    release_actor_states>(                                                     \
+        cuda_propagator_type::actor_chain_type::state_tuple *);
+
+DECLARE_PROPAGATION_BENCHMARK(test::default_metadata, empty_chain,
+                              const_field_t, propagation_opt::e_unsync)
+DECLARE_PROPAGATION_BENCHMARK(test::default_metadata, default_chain,
+                              const_field_t, propagation_opt::e_unsync)
+
+DECLARE_PROPAGATION_BENCHMARK(test::toy_metadata, empty_chain, const_field_t,
+                              propagation_opt::e_unsync)
+DECLARE_PROPAGATION_BENCHMARK(test::toy_metadata, default_chain, const_field_t,
+                              propagation_opt::e_unsync)
+
+}  // namespace detray::benchmarks
diff --git a/tests/benchmarks/include/detray/benchmarks/device/cuda/propagation_benchmark.hpp
b/tests/benchmarks/include/detray/benchmarks/device/cuda/propagation_benchmark.hpp
new file mode 100644
index 000000000..f5ef5e199
--- /dev/null
+++ b/tests/benchmarks/include/detray/benchmarks/device/cuda/propagation_benchmark.hpp
@@ -0,0 +1,184 @@
+/** Detray library, part of the ACTS project (R&D line)
+ *
+ * (c) 2024 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#pragma once
+
+// Project include(s)
+#include "detray/definitions/detail/algebra.hpp"
+#include "detray/detectors/bfield.hpp"
+#include "detray/navigation/navigator.hpp"
+#include "detray/propagator/actor_chain.hpp"
+#include "detray/propagator/actors/aborters.hpp"
+#include "detray/propagator/actors/parameter_resetter.hpp"
+#include "detray/propagator/actors/parameter_transporter.hpp"
+#include "detray/propagator/actors/pointwise_material_interactor.hpp"
+#include "detray/propagator/propagator.hpp"
+#include "detray/propagator/rk_stepper.hpp"
+#include "detray/tracks/tracks.hpp"
+
+// Detray test include(s).
+#include "detray/test/utils/types.hpp"
+
+// Detray benchmark include(s)
+#include "detray/benchmarks/benchmark_base.hpp"
+#include "detray/benchmarks/propagation_benchmark_config.hpp"
+#include "detray/benchmarks/propagation_benchmark_utils.hpp"
+
+// Vecmem include(s)
+#include
+#include
+#include
+#include
+
+// Benchmark include
+#include
+
+// System include(s)
+#include
+#include
+#include
+#include
+
+namespace detray::benchmarks {
+
+// Define propagator type
+template
+using empty_chain = actor_chain<>;
+
+template
+using default_chain = actor_chain,
+                                  pointwise_material_interactor,
+                                  parameter_resetter>;
+
+using const_field_t = bfield::const_bknd_t;
+
+template class actor_chain_t>
+using cuda_propagator_type =
+    propagator,
+               typename detector::algebra_type>,
+               navigator>,
+               actor_chain_t::algebra_type>>;
+
+/// Launch the propagation kernel for benchmarking. Which propagation to run
+/// (sync vs. unsync) is selected by the propagation option template parameter
+/// @param cfg the propagation configuration
+/// @param det_view the detector vecmem view
+/// @param field_view the magnetic field view (maybe an empty field)
+/// @param device_actor_state_ptr the actor state blueprint on the device
+/// @param tracks_view the track collection view
+/// @param n_samples number of tracks used to compute the number of blocks
+template
+void run_propagation_kernel(
+    const propagation::config &,
+    typename propagator_t::detector_type::view_type,
+    typename propagator_t::stepper_type::magnetic_field_type,
+    typename propagator_t::actor_chain_type::state_tuple *,
+    vecmem::data::vector_view<
+        free_track_parameters>,
+    const int);
+
+/// Allocate actor state blueprint on device
+template
+typename propagator_t::actor_chain_type::state_tuple *setup_actor_states(
+    typename propagator_t::actor_chain_type::state_tuple *);
+
+/// Release actor state blueprint
+template
+void release_actor_states(
+    typename propagator_t::actor_chain_type::state_tuple *);
+
+/// Device Propagation benchmark
+template
+struct cuda_propagation_bm : public benchmark_base {
+    /// Detector dependent types
+    using algebra_t = typename propagator_t::detector_type::algebra_type;
+    using scalar_t = dscalar;
+    using vector3_t = dvector3D;
+
+    /// Local configuration type
+    using configuration = propagation_benchmark_config;
+
+    /// The benchmark configuration
+    configuration m_cfg{};
+
+    /// Default construction
+    cuda_propagation_bm() = default;
+
+    /// Construct from an externally provided configuration @param cfg
+    explicit cuda_propagation_bm(const configuration &cfg) : m_cfg{cfg} {}
+
+    /// @return the benchmark configuration
+    configuration &config() { return m_cfg; }
+
+    /// Copy the data to the device and run the benchmark loop
+    inline void operator()(::benchmark::State &state,
+                           vecmem::memory_resource *dev_mr,
+                           dvector> *tracks,
+                           const typename propagator_t::detector_type *det,
+                           const bfield_bknd_t *bfield,
+                           typename propagator_t::actor_chain_type::state_tuple
+                               *input_actor_states) const {
+
+        assert(dev_mr != nullptr);
+        assert(tracks != nullptr);
+        assert(det != nullptr);
+        assert(bfield != nullptr);
+        assert(input_actor_states != nullptr);
+
+        // Helper object for performing memory copies (to CUDA devices)
+        vecmem::cuda::copy cuda_cpy;
+
+        const int n_samples{m_cfg.benchmark().n_samples()};
+        const int n_warmup{m_cfg.benchmark().n_warmup()};
+
+        assert(static_cast(n_samples) <= tracks->size());
+
+        // Copy the track collection to device
+        auto track_buffer =
+            detray::get_buffer(vecmem::get_data(*tracks), *dev_mr, cuda_cpy);
+
+        // Copy the detector to device and get its view
+        auto det_buffer = detray::get_buffer(*det, *dev_mr, cuda_cpy);
+        auto det_view = detray::get_data(det_buffer);
+
+        // Copy blueprint actor states to device
+        auto *device_actor_state_ptr =
+            setup_actor_states(input_actor_states);
+
+        // Do a small warm up run on a separate track buffer, if configured
+        if (m_cfg.benchmark().do_warmup() && n_warmup > 0) {
+            auto warmup_track_buffer = detray::get_buffer(
+                vecmem::get_data(*tracks), *dev_mr, cuda_cpy);
+            run_propagation_kernel(
+                m_cfg.propagation(), det_view, *bfield, device_actor_state_ptr,
+                warmup_track_buffer, math::min(n_warmup, n_samples));
+        }
+
+        std::size_t total_tracks = 0u;
+        for (auto _ : state) {
+            // Launch the propagator test for GPU device
+            run_propagation_kernel(
+                m_cfg.propagation(), det_view, *bfield, device_actor_state_ptr,
+                track_buffer, n_samples);
+
+            total_tracks += static_cast(n_samples);
+        }
+
+        // Report throughput
+        state.counters["TracksPropagated"] = benchmark::Counter(
+            static_cast(total_tracks), benchmark::Counter::kIsRate);
+
+        release_actor_states(device_actor_state_ptr);
+    }
+};
+
+}  // namespace detray::benchmarks
diff --git a/tests/benchmarks/include/detray/benchmarks/propagation_benchmark_config.hpp b/tests/benchmarks/include/detray/benchmarks/propagation_benchmark_config.hpp
new file mode 100644
index 000000000..32dbcc617
--- /dev/null
+++ b/tests/benchmarks/include/detray/benchmarks/propagation_benchmark_config.hpp
@@ -0,0 +1,57 @@
+/** Detray library, part of the ACTS project (R&D line)
+ *
+ * (c) 2024 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#pragma once
+
+// Project include(s)
+#include "detray/benchmarks/benchmark_base.hpp"
+#include "detray/propagator/propagation_config.hpp"
+
+// System include(s)
+#include
+#include
+
+namespace detray::benchmarks {
+
+/// Configuration for propagation benchmarks
+struct propagation_benchmark_config {
+    /// Prefix for the benchmark name
+    std::string m_name{"BM_PROPAGATION"};
+    /// Benchmark configuration
+    benchmark_base::configuration m_benchmark{};
+    /// Propagation configuration
+    propagation::config m_propagation{};
+
+    /// Default construction
+    propagation_benchmark_config() = default;
+
+    /// Construct from a base configuration
+    explicit propagation_benchmark_config(
+        const benchmark_base::configuration& bench_cfg)
+        : m_benchmark(bench_cfg) {}
+
+    /// Getters
+    /// @{
+    const std::string& name() const { return m_name; }
+    const propagation::config& propagation() const { return m_propagation; }
+    propagation::config& propagation() { return m_propagation; }
+    const benchmark_base::configuration& benchmark() const {
+        return m_benchmark;
+    }
+    benchmark_base::configuration& benchmark() { return m_benchmark; }
+    /// @}
+
+    /// Setters
+    /// @{
+    propagation_benchmark_config& name(const std::string_view n) {
+        m_name = n;
+        return *this;
+    }
+    /// @}
+};
+
+}  // namespace detray::benchmarks
diff --git a/tests/benchmarks/include/detray/benchmarks/propagation_benchmark_utils.hpp b/tests/benchmarks/include/detray/benchmarks/propagation_benchmark_utils.hpp
new file mode 100644
index 000000000..e3c4bffd8
--- /dev/null
+++ b/tests/benchmarks/include/detray/benchmarks/propagation_benchmark_utils.hpp
@@ -0,0 +1,222 @@
+/** Detray library, part of the ACTS project (R&D line)
+ *
+ * (c) 2024 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#pragma once
+
+// Project include(s)
+#include "detray/definitions/detail/algebra.hpp"
+#include "detray/navigation/navigator.hpp"
+#include "detray/propagator/actor_chain.hpp"
+#include "detray/propagator/propagator.hpp"
+#include "detray/tracks/tracks.hpp"
+#include "detray/utils/tuple.hpp"
+
+// Vecmem include(s)
+#include
+
+// Benchmark include
+#include
+
+// System
include(s)
+#include
+#include
+#include
+
+namespace detray::benchmarks {
+
+/// Which propagate function to run
+enum class propagation_opt {
+    e_unsync = 0,
+    e_sync = 1,
+};
+
+/// @returns the default track generation configuration for detray benchmarks
+template
+inline typename track_generator_t::configuration get_default_trk_gen_config(
+    const std::vector &n_tracks) {
+
+    using track_t = typename track_generator_t::track_type;
+    using scalar_t = dscalar;
+
+    int n_trks{n_tracks.empty() ? 0 : *std::ranges::max_element(n_tracks)};
+
+    // Configure the generator for the largest sample (zero, if none given)
+    typename track_generator_t::configuration trk_cfg{};
+    trk_cfg.n_tracks(static_cast(n_trks));
+    trk_cfg.randomize_charge(true);
+    trk_cfg.phi_range(-constant::pi, constant::pi);
+    trk_cfg.eta_range(-3.f, 3.f);
+    trk_cfg.mom_range(1.f * unit::GeV, 100.f * unit::GeV);
+    trk_cfg.origin(0.f, 0.f, 0.f);
+    trk_cfg.origin_stddev(0.f, 0.f, 0.f);
+
+    return trk_cfg;
+}
+
+/// Precompute the tracks
+///
+/// @param mr memory resource to allocate the track vector
+/// @param cfg the configuration of the track generator
+/// @param do_sort sort the tracks by theta angle
+template
+inline auto generate_tracks(
+    vecmem::memory_resource *mr,
+    const typename track_generator_t::configuration &cfg = {},
+    bool do_sort = true) {
+
+    using track_t = typename track_generator_t::track_type;
+    using scalar_t = dscalar;
+
+    // Track collection
+    dvector tracks(mr);
+
+    // Collect all tracks that the generator produces for this configuration
+    for (auto track : track_generator_t{cfg}) {
+        // Put it into vector of trajectories
+        tracks.push_back(track);
+    }
+
+    if (do_sort) {
+        // Sort by the distance of the theta angle to pi/2
+        const auto traj_comp = [](const auto &lhs, const auto &rhs) {
+            constexpr auto pi_2{constant::pi_2};
+            return math::fabs(pi_2 - vector::theta(lhs.dir())) <
+                   math::fabs(pi_2 - vector::theta(rhs.dir()));
+        };
+
+        std::ranges::sort(tracks, traj_comp);
+    }
+
+    return tracks;
+}
+
+/// Generate as many samples of track states as there are entries in the
+/// @param n_tracks vector.
+template
+inline auto generate_track_samples(
+    vecmem::memory_resource *mr, const std::vector &n_tracks,
+    const typename track_generator_t::configuration &cfg = {},
+    bool do_sort = true) {
+    using track_t = typename track_generator_t::track_type;
+
+    std::vector> track_samples{};
+    track_samples.reserve(n_tracks.size());
+
+    // Work on a copy, so that the passed configuration is left untouched
+    auto tmp_cfg{cfg};
+    for (const int n : n_tracks) {
+        tmp_cfg.n_tracks(static_cast(n));
+        track_samples.push_back(
+            generate_tracks(mr, tmp_cfg, do_sort));
+    }
+
+    return track_samples;
+}
+
+/// Register a propagation benchmark of type @tparam benchmark_t
+///
+/// @tparam benchmark_t the propagation benchmark functor
+/// @tparam propagator_t full propagator type
+/// @tparam detector_t host detector type
+/// @tparam bfield_t covfie magnetic field type
+///
+/// @param name name for the benchmark
+/// @param bench_cfg basic benchmark configuration
+/// @param prop_cfg propagation configuration
+/// @param det the detector
+/// @param bfield the covfie field
+/// @param actor_states tuple that contains all actor states (same order as in
+/// actor_chain_t)
+/// @param tracks the pre-computed test tracks
+/// @param n_samples the number of track to run
+template