diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8953a816..947dd69d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -165,11 +165,11 @@ endif()
 set(ADEPT_G4_INTEGRATION_SRCS
   src/G4HepEmTrackingManagerSpecialized.cc
   src/AdePTTrackingManager.cc
-  src/AdePTTrackingManager.cu
   src/AdePTPhysics.cc
   src/HepEMPhysics.cc
   src/AdePTGeant4Integration.cpp
   src/AdePTConfigurationMessenger.cc
+  src/AdePTTrackingManager.cu
 )
 
 add_library(CopCore INTERFACE)
@@ -187,6 +187,7 @@ target_include_directories(AdePT_G4_integration
     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
     $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/>
 )
+
 target_link_libraries(AdePT_G4_integration
   PUBLIC
     CopCore
@@ -207,6 +208,14 @@ set_target_properties(AdePT_G4_integration
     CUDA_RESOLVE_DEVICE_SYMBOLS ON
 )
 
+option(ASYNC_MODE OFF "Enable the async transport backend")
+if(ASYNC_MODE)
+  add_compile_definitions(ASYNC_MODE)
+  message(STATUS "${Green}Using the asynchronous transport backend${ColorReset}")
+else()
+  message(STATUS "${Red}Async backend is disabled${ColorReset}")
+endif()
+
 # Optional library to activate NVTX annotations for profiling:
 option(NVTX OFF "Add NVTX instrumentation for profiling (only for annotated examples)")
 add_library(NVTX INTERFACE)
diff --git a/examples/AsyncExample/AdeptIntegration.cpp b/examples/AsyncExample/AdeptIntegration.cpp
deleted file mode 100644
index 1666f6a2..00000000
--- a/examples/AsyncExample/AdeptIntegration.cpp
+++ /dev/null
@@ -1,273 +0,0 @@
-// SPDX-FileCopyrightText: 2022 CERN
-// SPDX-License-Identifier: Apache-2.0
-
-#include "AdeptIntegration.h"
-
-#include "TrackTransfer.h"
-#include "Histograms.h"
-
-#include <AdePT/integration/AdePTGeant4Integration.hh>
-
-#include <VecGeom/management/BVHManager.h>
-#include "VecGeom/management/GeoManager.h"
-
-#include <G4RunManager.hh>
-#include <G4Threading.hh>
-#include <G4Proton.hh>
-#include <G4Region.hh>
-#include <G4SDManager.hh>
-#include <G4VFastSimSensitiveDetector.hh>
-#include <G4MaterialCutsCouple.hh>
-#include <G4ProductionCutsTable.hh>
-#include <G4TransportationManager.hh>
-
-#include <G4HepEmData.hh>
-#include <G4HepEmState.hh>
-#include <G4HepEmStateInit.hh>
-#include <G4HepEmParameters.hh>
-#include <G4HepEmMatCutData.hh>
-#include <G4LogicalVolumeStore.hh>
-
-#include <iomanip>
-
-std::shared_ptr<AdePTTransportInterface> AdePTTransportFactory(unsigned int nThread, unsigned int nTrackSlot,
-                                                               unsigned int nHitSlot, int verbosity,
-                                                               std::vector<std::string> const *GPURegionNames,
-                                                               bool trackInAllRegions, int cudaStackSize)
-{
-  static std::shared_ptr<AsyncAdePT::AdeptIntegration> adePT{new AsyncAdePT::AdeptIntegration(
-      nThread, nTrackSlot, nHitSlot, verbosity, GPURegionNames, trackInAllRegions, cudaStackSize)};
-  return adePT;
-}
-
-namespace AsyncAdePT {
-
-TrackBuffer::TrackHandle TrackBuffer::createToDeviceSlot()
-{
-  bool warningIssued = false;
-  while (true) {
-    auto &toDevice = getActiveBuffer();
-    std::shared_lock lock{toDevice.mutex};
-    const auto slot = toDevice.nTrack.fetch_add(1, std::memory_order_relaxed);
-
-    if (slot < toDevice.maxTracks)
-      return TrackHandle{toDevice.tracks[slot], std::move(lock)};
-    else {
-      if (!warningIssued) {
-        std::cerr << __FILE__ << ':' << __LINE__ << " Contention in to-device queue; thread sleeping" << std::endl;
-        warningIssued = true;
-      }
-      using namespace std::chrono_literals;
-      std::this_thread::sleep_for(1ms);
-    }
-  }
-}
-
-namespace {
-template <typename T>
-std::size_t countTracks(int pdgToSelect, T const &container)
-{
-  return std::count_if(container.begin(), container.end(),
-                       [pdgToSelect](TrackDataWithIDs const &track) { return track.pdg == pdgToSelect; });
-}
-
-std::ostream &operator<<(std::ostream &stream, TrackDataWithIDs const &track)
-{
-  const auto flags = stream.flags();
-  stream << std::setw(5) << track.pdg << std::scientific << std::setw(15) << std::setprecision(6) << track.eKin << " ("
-         << std::setprecision(2) << std::setw(9) << track.position[0] << std::setw(9) << track.position[1]
-         << std::setw(9) << track.position[2] << ")";
-  stream.flags(flags);
-  return stream;
-}
-} // namespace
-
-void AdeptIntegration::AddTrack(int pdg, int parentID, double energy, double x, double y, double z, double dirx,
-                                double diry, double dirz, double globalTime, double localTime, double properTime,
-                                int threadId, unsigned int eventId, unsigned int trackId)
-{
-  if (pdg != 11 && pdg != -11 && pdg != 22) {
-    G4cerr << __FILE__ << ":" << __LINE__ << ": Only supporting EM tracks. Got pdgID=" << pdg << "\n";
-    return;
-  }
-
-  TrackDataWithIDs track{pdg,       parentID,   energy,  x,       y,
-                         z,         dirx,       diry,    dirz,    globalTime,
-                         localTime, properTime, eventId, trackId, static_cast<short>(threadId)};
-  if (fDebugLevel >= 2) {
-    fGPUNetEnergy[threadId] += energy;
-    if (fDebugLevel >= 6) {
-      G4cout << "\n[_in," << eventId << "," << trackId << "]: " << track << "\tGPU net energy " << std::setprecision(6)
-             << fGPUNetEnergy[threadId] << G4endl;
-    }
-  }
-
-  // Lock buffer and emplace the track
-  {
-    auto trackHandle  = fBuffer->createToDeviceSlot();
-    trackHandle.track = std::move(track);
-  }
-
-  fEventStates[threadId].store(EventState::NewTracksFromG4, std::memory_order_release);
-}
-
-void AdeptIntegration::FullInit()
-{
-  const auto numVolumes = vecgeom::GeoManager::Instance().GetRegisteredVolumesCount();
-  if (numVolumes == 0) throw std::runtime_error("AdeptIntegration::Initialize: Number of geometry volumes is zero.");
-
-  G4cout << "=== AdeptIntegration: initializing geometry and physics\n";
-  // Initialize geometry on device
-  if (!vecgeom::GeoManager::Instance().IsClosed())
-    throw std::runtime_error("AdeptIntegration::Initialize: VecGeom geometry not closed.");
-
-  const vecgeom::cxx::VPlacedVolume *world = vecgeom::GeoManager::Instance().GetWorld();
-  if (!InitializeGeometry(world))
-    throw std::runtime_error("AdeptIntegration::Initialize: Cannot initialize geometry on GPU");
-
-  // Initialize G4HepEm
-  const double bz = fG4Integrations.front().GetUniformFieldZ();
-  if (!InitializePhysics(bz)) throw std::runtime_error("AdeptIntegration::Initialize cannot initialize physics on GPU");
-
-  // Check VecGeom geometry matches Geant4. Initialize auxiliary per-LV data. Initialize scoring map.
-  fG4Integrations.front().CheckGeometry(fg4hepem_state.get());
-  adeptint::VolAuxData *auxData = new adeptint::VolAuxData[vecgeom::GeoManager::Instance().GetRegisteredVolumesCount()];
-  fG4Integrations.front().InitVolAuxData(auxData, fg4hepem_state.get(), fTrackInAllRegions, fGPURegionNames);
-
-  // Initialize volume auxiliary data on device
-  auto &volAuxArray       = adeptint::VolAuxArray::GetInstance();
-  volAuxArray.fNumVolumes = numVolumes;
-  volAuxArray.fAuxData    = auxData;
-  AsyncAdePT::InitVolAuxArray(volAuxArray);
-
-  for (auto &g4int : fG4Integrations) {
-    g4int.InitScoringData(volAuxArray.fAuxData);
-  }
-
-  // Allocate buffers to transport particles to/from device. Scale the size of the staging area
-  // with the number of threads.
-  fBuffer = std::make_unique<TrackBuffer>(8192 * fNThread, 1024 * fNThread, fNThread);
-
-  fGPUWorker = std::thread{&AdeptIntegration::TransportLoop, this};
-}
-
-void AdeptIntegration::InitBVH()
-{
-  vecgeom::cxx::BVHManager::Init();
-  vecgeom::cxx::BVHManager::DeviceInit();
-}
-
-void AdeptIntegration::Flush(G4int threadId, G4int eventId)
-{
-  if (fDebugLevel >= 3) {
-    G4cout << "\nFlushing AdePT for event " << eventId << G4endl;
-  }
-
-  auto xyHisto_count     = std::make_shared<TH2L>(("Event_" + std::to_string(eventId) + "_count_xy").c_str(),
-                                              "Number of hits;x;y", 500, -2500, 2500, 500, -2500, 2500);
-  auto xyHisto_e         = std::make_shared<TH2D>(("Event_" + std::to_string(eventId) + "_E_xy").c_str(),
-                                          "Energy deposition;x;y", 500, -2500, 2500, 500, -2500, 2500);
-  auto etaPhiHisto_count = std::make_shared<TH2L>(("Event_" + std::to_string(eventId) + "_count_etaPhi").c_str(),
-                                                  "Number of hits;#phi;#eta", 100, -180, 180, 300, -15, 15);
-  auto etaPhiHisto_e     = std::make_shared<TH2D>(("Event_" + std::to_string(eventId) + "_E_etaPhi").c_str(),
-                                              "Energy deposition;#phi;#eta", 100, -180, 180, 300, -15, 15);
-  AsyncExHistos::registerHisto(xyHisto_count);
-  AsyncExHistos::registerHisto(xyHisto_e);
-  AsyncExHistos::registerHisto(etaPhiHisto_count);
-  AsyncExHistos::registerHisto(etaPhiHisto_e);
-
-  assert(static_cast<unsigned int>(threadId) < fBuffer->fromDeviceBuffers.size());
-  fEventStates[threadId].store(EventState::G4RequestsFlush, std::memory_order_release);
-
-  AdePTGeant4Integration &g4Integration = fG4Integrations[threadId];
-
-  while (fEventStates[threadId].load(std::memory_order_acquire) < EventState::DeviceFlushed) {
-    {
-      std::unique_lock lock{fMutex_G4Workers};
-      fCV_G4Workers.wait(lock);
-    }
-
-    std::shared_ptr<const std::vector<GPUHit>> gpuHits;
-    while ((gpuHits = GetGPUHits(threadId)) != nullptr) {
-      GPUHit dummy;
-      dummy.fEventId = eventId;
-      auto range     = std::equal_range(gpuHits->begin(), gpuHits->end(), dummy,
-                                        [](const GPUHit &lhs, const GPUHit &rhs) { return lhs.fEventId < rhs.fEventId; });
-      for (auto it = range.first; it != range.second; ++it) {
-        assert(it->threadId == threadId);
-        const auto pos = it->fPostStepPoint.fPosition;
-        xyHisto_e->Fill(pos.x(), pos.y(), it->fTotalEnergyDeposit);
-        xyHisto_count->Fill(pos.x(), pos.y());
-        G4ThreeVector posg4(pos.x(), pos.y(), pos.z());
-        const auto phi = posg4.getPhi() / 2. / (2. * acos(0)) * 360;
-        etaPhiHisto_e->Fill(phi, posg4.getEta(), it->fTotalEnergyDeposit);
-        etaPhiHisto_count->Fill(phi, posg4.getEta());
-        g4Integration.ProcessGPUHit(*it);
-      }
-    }
-  }
-
-  // Now device should be flushed, so retrieve the tracks:
-  std::vector<TrackDataWithIDs> tracks;
-  {
-    auto handle = fBuffer->getTracksFromDevice(threadId);
-    tracks.swap(handle.tracks);
-    fEventStates[threadId].store(EventState::LeakedTracksRetrieved, std::memory_order_release);
-  }
-
-  // TODO: Sort tracks on device?
-#ifndef NDEBUG
-  for (auto const &track : tracks) {
-    bool error = false;
-    if (track.threadId != threadId || track.eventId != static_cast<unsigned int>(eventId)) error = true;
-    if (!(track.pdg == -11 || track.pdg == 11 || track.pdg == 22)) error = true;
-    if (error)
-      std::cerr << "Error in returning track: threadId=" << track.threadId << " eventId=" << track.eventId
-                << " pdg=" << track.pdg << "\n";
-    assert(!error);
-  }
-#endif
-  std::sort(tracks.begin(), tracks.end());
-
-  const auto oldEnergyTransferred = fGPUNetEnergy[threadId];
-  if (fDebugLevel >= 2) {
-    unsigned int trackId = 0;
-    for (const auto &track : tracks) {
-
-      fGPUNetEnergy[threadId] -= track.eKin;
-      if (fDebugLevel >= 5) {
-        G4cout << "\n[out," << track.eventId << "," << trackId++ << "]: " << track << "\tGPU net energy "
-               << std::setprecision(6) << fGPUNetEnergy[threadId] << G4endl;
-      }
-    }
-  }
-
-  if (fDebugLevel >= 2) {
-    std::stringstream str;
-    str << "\n[" << eventId << "]: Pushed " << tracks.size() << " tracks to G4";
-    str << "\tEnergy back to G4: " << std::setprecision(6)
-        << (oldEnergyTransferred - fGPUNetEnergy[threadId]) / CLHEP::GeV << "\tGPU net energy " << std::setprecision(6)
-        << fGPUNetEnergy[threadId] / CLHEP::GeV << " GeV";
-    str << "\t(" << countTracks(11, tracks) << ", " << countTracks(-11, tracks) << ", " << countTracks(22, tracks)
-        << ")";
-    G4cout << str.str() << G4endl;
-  }
-
-  if (tracks.empty()) {
-    AsyncAdePT::PerEventScoring &scoring = fScoring[threadId];
-    scoring.CopyToHost();
-    scoring.ClearGPU();
-    fGPUNetEnergy[threadId] = 0.;
-
-    if (fDebugLevel >= 2) {
-      G4cout << "\n\tScoring for event " << eventId << G4endl;
-      scoring.Print();
-    }
-
-    fEventStates[threadId].store(EventState::ScoringRetrieved, std::memory_order_release);
-  }
-
-  g4Integration.ReturnTracks(tracks.begin(), tracks.end(), fDebugLevel);
-}
-
-} // namespace AsyncAdePT
diff --git a/examples/AsyncExample/BasicScoring.cu b/examples/AsyncExample/BasicScoring.cu
deleted file mode 100644
index 12556e74..00000000
--- a/examples/AsyncExample/BasicScoring.cu
+++ /dev/null
@@ -1,249 +0,0 @@
-// SPDX-FileCopyrightText: 2024 CERN
-// SPDX-License-Identifier: Apache-2.0
-
-#include "BasicScoring.h"
-// #include "AdeptIntegration.h"
-
-#include <AdePT/copcore/Global.h>
-
-#include <cub/device/device_merge_sort.cuh>
-
-#include <chrono>
-#include <mutex>
-#include <thread>
-
-// Comparison for sorting tracks into events on device:
-struct CompareGPUHits {
-  __device__ bool operator()(const GPUHit &lhs, const GPUHit &rhs) const { return lhs.fEventId < rhs.fEventId; }
-};
-
-namespace AsyncAdePT {
-
-__device__ HitScoringBuffer gHitScoringBuffer_dev;
-
-__device__ GPUHit &HitScoringBuffer::GetNextSlot()
-{
-  const auto slotIndex = atomicAdd(&fSlotCounter, 1);
-  if (slotIndex >= fNSlot) {
-    printf("Trying to score hit #%d with only %d slots\n", slotIndex, fNSlot);
-    COPCORE_EXCEPTION("Out of slots in HitScoringBuffer::NextSlot");
-  }
-
-  return hitBuffer_dev[slotIndex];
-}
-
-HitScoring::HitScoring(unsigned int hitCapacity, unsigned int nThread) : fHitCapacity{hitCapacity}, fHitQueues(nThread)
-{
-  // We use a single allocation for both buffers:
-  GPUHit *gpuHits = nullptr;
-  COPCORE_CUDA_CHECK(cudaMallocHost(&gpuHits, sizeof(GPUHit) * 2 * fHitCapacity));
-  fGPUHitBuffer_host.reset(gpuHits);
-
-  auto result = cudaMalloc(&gpuHits, sizeof(GPUHit) * 2 * fHitCapacity);
-  if (result != cudaSuccess) throw std::invalid_argument{"No space to allocate hit buffer."};
-  fGPUHitBuffer_dev.reset(gpuHits);
-
-  // Init buffers for on-device sorting of hits:
-  // Determine device storage requirements for on-device sorting.
-  result = cub::DeviceMergeSort::SortKeys(nullptr, fGPUSortAuxMemorySize, fGPUHitBuffer_dev.get(), fHitCapacity,
-                                          CompareGPUHits{});
-  if (result != cudaSuccess) throw std::invalid_argument{"No space for hit sorting on device."};
-
-  std::byte *gpuSortingMem;
-  result = cudaMalloc(&gpuSortingMem, fGPUSortAuxMemorySize);
-  if (result != cudaSuccess) throw std::invalid_argument{"No space to allocate hit sorting buffer."};
-  fGPUSortAuxMemory.reset(gpuSortingMem);
-
-  // Store buffer data in structs
-  fBuffers[0].hitScoringInfo = HitScoringBuffer{fGPUHitBuffer_dev.get(), 0, fHitCapacity};
-  fBuffers[0].hostBuffer = fGPUHitBuffer_host.get();
-  fBuffers[0].state      = BufferHandle::State::OnDevice;
-  fBuffers[1].hitScoringInfo = HitScoringBuffer{fGPUHitBuffer_dev.get() + fHitCapacity, 0, fHitCapacity};
-  fBuffers[1].hostBuffer = fGPUHitBuffer_host.get() + fHitCapacity;
-  fBuffers[1].state      = BufferHandle::State::Free;
-
-  COPCORE_CUDA_CHECK(cudaGetSymbolAddress(&fHitScoringBuffer_deviceAddress, gHitScoringBuffer_dev));
-  assert(fHitScoringBuffer_deviceAddress != nullptr);
-  COPCORE_CUDA_CHECK(cudaMemcpy(fHitScoringBuffer_deviceAddress, &fBuffers[0].hitScoringInfo, sizeof(HitScoringBuffer),
-                                cudaMemcpyHostToDevice));
-}
-
-/// Place a new empty buffer on the GPU.
-/// The caller has to ensure that all scoring work on the device completes by making the cuda
-/// stream wait for all transport on the device.
-/// The function will block while the swap is running.
-void HitScoring::SwapDeviceBuffers(cudaStream_t cudaStream)
-{
-  // Ensure that host side has been processed:
-  auto &currentBuffer = fBuffers[fActiveBuffer];
-  if (currentBuffer.state != BufferHandle::State::OnDevice)
-    throw std::logic_error(__FILE__ + std::to_string(__LINE__) + ": On-device buffer in wrong state");
-
-  // Get new buffer info from device:
-  auto &currentHitInfo = currentBuffer.hitScoringInfo;
-  COPCORE_CUDA_CHECK(cudaMemcpyAsync(&currentHitInfo, fHitScoringBuffer_deviceAddress, sizeof(HitScoringBuffer),
-                                     cudaMemcpyDefault, cudaStream));
-
-  // Execute the swap:
-  fActiveBuffer          = (fActiveBuffer + 1) % fBuffers.size();
-  auto &nextDeviceBuffer = fBuffers[fActiveBuffer];
-  while (nextDeviceBuffer.state != BufferHandle::State::Free) {
-    std::cerr << __func__ << " Warning: Another thread should have processed the hits.\n";
-  }
-  assert(nextDeviceBuffer.state == BufferHandle::State::Free && nextDeviceBuffer.hitScoringInfo.fSlotCounter == 0);
-
-  nextDeviceBuffer.state = BufferHandle::State::OnDevice;
-  COPCORE_CUDA_CHECK(cudaMemcpyAsync(fHitScoringBuffer_deviceAddress, &nextDeviceBuffer.hitScoringInfo,
-                                     sizeof(HitScoringBuffer), cudaMemcpyDefault, cudaStream));
-  COPCORE_CUDA_CHECK(cudaStreamSynchronize(cudaStream));
-  currentBuffer.state = BufferHandle::State::OnDeviceNeedTransferToHost;
-}
-
-/// Copy the current contents of the GPU hit buffer to host.
-void HitScoring::TransferHitsToHost(cudaStream_t cudaStreamForHitCopy)
-{
-  for (auto &buffer : fBuffers) {
-    if (buffer.state != BufferHandle::State::OnDeviceNeedTransferToHost) continue;
-
-    buffer.state = BufferHandle::State::TransferToHost;
-    assert(buffer.hitScoringInfo.fSlotCounter < fHitCapacity);
-
-    auto bufferBegin = buffer.hitScoringInfo.hitBuffer_dev;
-
-    cub::DeviceMergeSort::SortKeys(fGPUSortAuxMemory.get(), fGPUSortAuxMemorySize, bufferBegin,
-                                   buffer.hitScoringInfo.fSlotCounter, CompareGPUHits{}, cudaStreamForHitCopy);
-
-    COPCORE_CUDA_CHECK(cudaMemcpyAsync(buffer.hostBuffer, bufferBegin,
-                                       sizeof(GPUHit) * buffer.hitScoringInfo.fSlotCounter, cudaMemcpyDefault,
-                                       cudaStreamForHitCopy));
-    COPCORE_CUDA_CHECK(cudaLaunchHostFunc(
-        cudaStreamForHitCopy,
-        [](void *arg) { static_cast<BufferHandle *>(arg)->state = BufferHandle::State::NeedHostProcessing; }, &buffer));
-  }
-}
-
-bool HitScoring::ProcessHits()
-{
-  std::unique_lock lock{fProcessingHitsMutex, std::defer_lock};
-  bool haveNewHits = false;
-
-  while (std::any_of(fBuffers.begin(), fBuffers.end(),
-                     [](auto &buffer) { return buffer.state >= BufferHandle::State::TransferToHost; })) {
-    for (auto &handle : fBuffers) {
-      if (handle.state == BufferHandle::State::NeedHostProcessing) {
-        if (!lock) lock.lock();
-        haveNewHits = true;
-        ProcessBuffer(handle);
-      }
-    }
-  }
-
-  return haveNewHits;
-}
-
-void HitScoring::ProcessBuffer(BufferHandle &handle)
-{
-  // We are assuming that the caller holds a lock on fProcessingHitsMutex.
-  if (handle.state == BufferHandle::State::NeedHostProcessing) {
-    auto hitVector = std::make_shared<std::vector<GPUHit>>();
-    hitVector->assign(handle.hostBuffer, handle.hostBuffer + handle.hitScoringInfo.fSlotCounter);
-    handle.hitScoringInfo.fSlotCounter = 0;
-    handle.state                   = BufferHandle::State::Free;
-
-    for (auto &hitQueue : fHitQueues) {
-      hitQueue.push_back(hitVector);
-    }
-  }
-}
-
-std::shared_ptr<const std::vector<GPUHit>> HitScoring::GetNextHitsVector(unsigned int threadId)
-{
-  assert(threadId < fHitQueues.size());
-  std::shared_lock lock{fProcessingHitsMutex};
-
-  if (fHitQueues[threadId].empty())
-    return nullptr;
-  else {
-    auto ret = fHitQueues[threadId].front();
-    fHitQueues[threadId].pop_front();
-    return ret;
-  }
-}
-
-/// Clear the device hits content
-void PerEventScoring::ClearGPU(cudaStream_t cudaStream)
-{
-  COPCORE_CUDA_CHECK(cudaMemsetAsync(fScoring_dev, 0, sizeof(GlobalCounters), cudaStream));
-  COPCORE_CUDA_CHECK(cudaStreamSynchronize(cudaStream));
-}
-
-/// Transfer scoring counters into host instance. Blocks until the operation completes.
-void PerEventScoring::CopyToHost(cudaStream_t cudaStream)
-{
-  const auto oldPointer = fScoring_dev;
-  COPCORE_CUDA_CHECK(
-      cudaMemcpyAsync(&fGlobalCounters, fScoring_dev, sizeof(GlobalCounters), cudaMemcpyDeviceToHost, cudaStream));
-  COPCORE_CUDA_CHECK(cudaStreamSynchronize(cudaStream));
-  assert(oldPointer == fScoring_dev);
-  (void)oldPointer;
-}
-
-} // namespace AsyncAdePT
-
-namespace {
-/// @brief Utility function to copy a 3D vector, used for filling the Step Points
-__device__ __forceinline__ void Copy3DVector(vecgeom::Vector3D<Precision> const &source,
-                                             vecgeom::Vector3D<Precision> &destination)
-{
-  destination = source;
-}
-} // namespace
-
-namespace adept_scoring {
-
-/// @brief Record a hit
-template <>
-__device__ void RecordHit(AsyncAdePT::PerEventScoring * /*scoring*/, int aParentID, char aParticleType,
-                          double aStepLength, double aTotalEnergyDeposit, vecgeom::NavigationState const *aPreState,
-                          vecgeom::Vector3D<Precision> const *aPrePosition,
-                          vecgeom::Vector3D<Precision> const *aPreMomentumDirection,
-                          vecgeom::Vector3D<Precision> const * /*aPrePolarization*/, double aPreEKin, double aPreCharge,
-                          vecgeom::NavigationState const *aPostState, vecgeom::Vector3D<Precision> const *aPostPosition,
-                          vecgeom::Vector3D<Precision> const *aPostMomentumDirection,
-                          vecgeom::Vector3D<Precision> const * /*aPostPolarization*/, double aPostEKin,
-                          double aPostCharge, unsigned int eventID, short threadID)
-{
-  // Acquire a hit slot
-  GPUHit &aGPUHit  = AsyncAdePT::gHitScoringBuffer_dev.GetNextSlot();
-  aGPUHit.fParentID = aParentID;
-  aGPUHit.fEventId = eventID;
-  aGPUHit.threadId = threadID;
-
-  // Fill the required data
-  aGPUHit.fParticleType       = aParticleType;
-  aGPUHit.fStepLength         = aStepLength;
-  aGPUHit.fTotalEnergyDeposit = aTotalEnergyDeposit;
-  // Pre step point
-  aGPUHit.fPreStepPoint.fNavigationState = *aPreState;
-  Copy3DVector(*aPrePosition, aGPUHit.fPreStepPoint.fPosition);
-  Copy3DVector(*aPreMomentumDirection, aGPUHit.fPreStepPoint.fMomentumDirection);
-  // Copy3DVector(aPrePolarization, aGPUHit.fPreStepPoint.fPolarization);
-  aGPUHit.fPreStepPoint.fEKin   = aPreEKin;
-  aGPUHit.fPreStepPoint.fCharge = aPreCharge;
-  // Post step point
-  aGPUHit.fPostStepPoint.fNavigationState = *aPostState;
-  Copy3DVector(*aPostPosition, aGPUHit.fPostStepPoint.fPosition);
-  Copy3DVector(*aPostMomentumDirection, aGPUHit.fPostStepPoint.fMomentumDirection);
-  // Copy3DVector(aPostPolarization, aGPUHit.fPostStepPoint.fPolarization);
-  aGPUHit.fPostStepPoint.fEKin   = aPostEKin;
-  aGPUHit.fPostStepPoint.fCharge = aPostCharge;
-}
-
-template <>
-__device__ void AccountProduced(AsyncAdePT::PerEventScoring *scoring, int num_ele, int num_pos, int num_gam)
-{
-  atomicAdd(&scoring->fGlobalCounters.numElectrons, num_ele);
-  atomicAdd(&scoring->fGlobalCounters.numPositrons, num_pos);
-  atomicAdd(&scoring->fGlobalCounters.numGammas, num_gam);
-}
-}
\ No newline at end of file
diff --git a/examples/AsyncExample/BasicScoring.h b/examples/AsyncExample/BasicScoring.h
deleted file mode 100644
index 3859337d..00000000
--- a/examples/AsyncExample/BasicScoring.h
+++ /dev/null
@@ -1,110 +0,0 @@
-// SPDX-FileCopyrightText: 2024 CERN
-// SPDX-License-Identifier: Apache-2.0
-
-#ifndef SCORING_H
-#define SCORING_H
-
-#include <AdePT/core/AdePTScoringTemplate.cuh>
-
-#include "ResourceManagement.h"
-#include <AdePT/core/HostScoringStruct.cuh>
-#include <VecGeom/navigation/NavStateIndex.h>
-
-#include <atomic>
-#include <deque>
-#include <shared_mutex>
-#include <array>
-
-namespace AsyncAdePT {
-
-/// Struct holding GPU hits to be used both on host and device.
-struct HitScoringBuffer {
-  GPUHit *hitBuffer_dev     = nullptr;
-  unsigned int fSlotCounter = 0;
-  unsigned int fNSlot       = 0;
-
-  __device__ GPUHit &GetNextSlot();
-};
-
-extern __device__ HitScoringBuffer gHitScoringBuffer_dev;
-
-struct BufferHandle {
-  HitScoringBuffer hitScoringInfo;
-  GPUHit *hostBuffer;
-  enum class State { Free, OnDevice, OnDeviceNeedTransferToHost, TransferToHost, NeedHostProcessing };
-  std::atomic<State> state;
-};
-
-class HitScoring {
-  unique_ptr_cuda<GPUHit> fGPUHitBuffer_dev;
-  unique_ptr_cuda<GPUHit, CudaHostDeleter<GPUHit>> fGPUHitBuffer_host;
-
-  std::array<BufferHandle, 2> fBuffers;
-
-  void *fHitScoringBuffer_deviceAddress = nullptr;
-  unsigned int fHitCapacity;
-  unsigned short fActiveBuffer = 0;
-  unique_ptr_cuda<std::byte> fGPUSortAuxMemory;
-  std::size_t fGPUSortAuxMemorySize;
-
-  std::vector<std::deque<std::shared_ptr<const std::vector<GPUHit>>>> fHitQueues;
-  mutable std::shared_mutex fProcessingHitsMutex;
-
-  void ProcessBuffer(BufferHandle &handle);
-
-public:
-  HitScoring(unsigned int hitCapacity, unsigned int nThread);
-  unsigned int HitCapacity() const { return fHitCapacity; }
-  void SwapDeviceBuffers(cudaStream_t cudaStream);
-  bool ProcessHits();
-  bool ReadyToSwapBuffers() const
-  {
-    return std::any_of(fBuffers.begin(), fBuffers.end(),
-                       [](const auto &handle) { return handle.state == BufferHandle::State::Free; });
-  }
-  void TransferHitsToHost(cudaStream_t cudaStreamForHitCopy);
-  std::shared_ptr<const std::vector<GPUHit>> GetNextHitsVector(unsigned int threadId);
-};
-
-struct PerEventScoring {
-  GlobalCounters fGlobalCounters;
-  PerEventScoring *const fScoring_dev;
-
-  PerEventScoring(PerEventScoring *gpuScoring) : fScoring_dev{gpuScoring} { ClearGPU(); }
-  PerEventScoring(PerEventScoring &&other) = default;
-  ~PerEventScoring()                       = default;
-
-  /// @brief Copy hits to host for a single event
-  void CopyToHost(cudaStream_t cudaStream = 0);
-
-  /// @brief Clear hits on device to reuse for next event
-  void ClearGPU(cudaStream_t cudaStream = 0);
-
-  /// @brief Print scoring info
-  void Print() { fGlobalCounters.Print(); };
-};
-
-} // namespace AsyncAdePT
-
-namespace adept_scoring {
-
-/// @brief Record a hit
-template <>
-__device__ void RecordHit(AsyncAdePT::PerEventScoring * /*scoring*/, int aParentID, char aParticleType,
-                          double aStepLength, double aTotalEnergyDeposit, vecgeom::NavigationState const *aPreState,
-                          vecgeom::Vector3D<Precision> const *aPrePosition,
-                          vecgeom::Vector3D<Precision> const *aPreMomentumDirection,
-                          vecgeom::Vector3D<Precision> const * /*aPrePolarization*/, double aPreEKin, double aPreCharge,
-                          vecgeom::NavigationState const *aPostState, vecgeom::Vector3D<Precision> const *aPostPosition,
-                          vecgeom::Vector3D<Precision> const *aPostMomentumDirection,
-                          vecgeom::Vector3D<Precision> const * /*aPostPolarization*/, double aPostEKin,
-                          double aPostCharge, unsigned int eventID, short threadID);
-
-/// @brief Account for the number of produced secondaries
-/// @details Atomically increase the number of produced secondaries.
-template <>
-__device__ void AccountProduced(AsyncAdePT::PerEventScoring *scoring, int num_ele, int num_pos, int num_gam);
-
-} // namespace adept_scoring
-
-#endif
diff --git a/examples/AsyncExample/CMakeLists.txt b/examples/AsyncExample/CMakeLists.txt
index 88f64e9d..699c1171 100644
--- a/examples/AsyncExample/CMakeLists.txt
+++ b/examples/AsyncExample/CMakeLists.txt
@@ -1,6 +1,8 @@
 # SPDX-FileCopyrightText: 2023 CERN
 # SPDX-License-Identifier: Apache-2.0
 
+SET(CMAKE_EXE_LINKER_FLAGS "-Wl,--disable-new-dtags")
+
 if(HepMC3_FOUND)
   message(STATUS "HepMC3 found ${HEPMC3_INCLUDE_DIR}")
 else()
@@ -71,36 +73,13 @@ endif()
 # Include the C++ side of the AdePT-G4 integration from core AdePT:
 list(TRANSFORM ADEPT_G4_INTEGRATION_SRCS PREPEND ../../)
 
-add_library(AdePT_G4_integration_async SHARED
-  ${ADEPT_G4_INTEGRATION_SRCS}
-  AdeptIntegration.cpp
-  AdeptIntegration.cu
-  BasicScoring.cu
-  ResourceManagement.cu)
-target_link_libraries(AdePT_G4_integration_async
-PUBLIC
-  CopCore
-  VecGeom::vgdml
-  VecGeom::vecgeomcuda_static
-  G4HepEm::g4HepEm
-  )
-target_include_directories(AdePT_G4_integration_async
-  PUBLIC
-    $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/include>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/>
-)
-set_target_properties(AdePT_G4_integration_async PROPERTIES
-  CUDA_SEPARABLE_COMPILATION ON
-  CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-
-option(AsyncTransport "Link to AdePT_G4_integration_async" ON)
-
 add_executable(AsyncExample
     example.cpp
     ${sources_g4})
 target_include_directories(AsyncExample PRIVATE
   ${CMAKE_CURRENT_SOURCE_DIR}/include
-  ${CMAKE_CURRENT_SOURCE_DIR})
+  ${CMAKE_CURRENT_SOURCE_DIR}
+)
 
 target_link_libraries(AsyncExample
   PRIVATE
@@ -111,11 +90,7 @@ target_link_libraries(AsyncExample
     ROOT::RIO
 )
 
-if(AsyncTransport)
-  target_link_libraries(AsyncExample PRIVATE AdePT_G4_integration_async)
-else()
-  target_link_libraries(AsyncExample PRIVATE AdePT_G4_integration)
-endif()
+target_link_libraries(AsyncExample PRIVATE AdePT_G4_integration)
 
 target_compile_options(AsyncExample PRIVATE -Wall -Wextra)
 
diff --git a/examples/AsyncExample/Histograms.h b/examples/AsyncExample/Histograms.h
index 55d01e37..bb78a299 100644
--- a/examples/AsyncExample/Histograms.h
+++ b/examples/AsyncExample/Histograms.h
@@ -68,4 +68,4 @@ void registerHisto(std::shared_ptr<Histo> histo)
   HistoWriter::GetInstance().RegisterHisto(histo);
 }
 
-} // namespace AsyncExHistos
+} // namespace AsyncExHistos
\ No newline at end of file
diff --git a/examples/AsyncExample/README.md b/examples/AsyncExample/README.md
index bcec2bf7..59fd0903 100644
--- a/examples/AsyncExample/README.md
+++ b/examples/AsyncExample/README.md
@@ -20,32 +20,4 @@ This example is based on Example14 with the following modifications:
   - The G4 workers communicate with the AdePT thread via state machines, so it is clear when events finish or need to start transporting.
   - As long as the G4 workers have CPU work to do, they don't block while the GPU transport is running.
   - Each track knows which G4 worker it came from, and the scoring structures are replicated for each G4 worker that is active.
-- AdePT not only runs in the region that is named as GPU region in the config, it also transports particles in all daughter regions of the "AdePT region". This required refactoring the geometry visitor that sets up the GPU region.
-
-The example can be run both as full Geant4 simulation or as Geant4 + AdePT simulation where the simulation of the EM calorimeter + daughter regions is done on the GPU. The selection of the two running modes can be done at run time using a command.
-
-To activate AdePT:
-/param/ActivateModel AdePT
-
-and to inactivate it (run full Geant4):
-/adeptint/adept/activate true
-/param/InActivateModel AdePT
-
-The selection of the detector region in which AdePT is suppossed to be used, is done through:
-/adeptint/detector/regionname EcalRegion
-
-The selection of sensitive volumes can be done through:
-/adeptint/detector/addsensitivevolume EAPD_01
-/adeptint/detector/addsensitivevolume EAPD_02
-etc.
-
-(the selection of sensitive volumes through auxiliary information in the GDML file is not yet supported)
-
-The interfacing to Geant4 is done through the standard 'fast simulation' hooks. The EMShowerModel class implements the DoIt method which passes particles from Geant4 tracking to AdePT tracking. The particles are added to a buffer and asynchronously enqueued for GPU transport. The output are energy depositions per sensitive volume, which are converted into Geant4 hits and any 'outgoing' particles, which are put back on the Geant4 stack. The 'Flush' method is called when there are no more Geant4 particles on the stack (but before the event is finished) to block until the GPU transport for the G4 event in question has finished.
-
-The SensitiveDetector class implements two 'ProcessHits' methods. One is the standard one used by the Geant4 scoring, while the second one is used to convert G4FastHits (so energy deposition from AdePT scoring) into the Geant4 hits (directly comparable to full simulation). The scoring consists of basic 'calorimeter scoring' where we have one hit per active calorimeter cell and we record the total energy deposited in that cell in the event.
-
-The EventAction class prints all the hits at the end of the event, if
-`/adeptint/event/verbose > 1`
-
-Higher verbosity levels will print considerably more information about the G4 <--> AdePT communication.
\ No newline at end of file
+- AdePT not only runs in the region that is named as GPU region in the config, it also transports particles in all daughter regions of the "AdePT region". This required refactoring the geometry visitor that sets up the GPU region.
\ No newline at end of file
diff --git a/examples/AsyncExample/TrackTransfer.h b/examples/AsyncExample/TrackTransfer.h
deleted file mode 100644
index 7ac172bd..00000000
--- a/examples/AsyncExample/TrackTransfer.h
+++ /dev/null
@@ -1,100 +0,0 @@
-// SPDX-FileCopyrightText: 2024 CERN
-// SPDX-License-Identifier: Apache-2.0
-
-#ifndef TRACK_TRANSFER_H
-#define TRACK_TRANSFER_H
-
-#include <AdePT/core/TrackData.h>
-
-#include "ResourceManagement.h"
-
-#include <atomic>
-#include <array>
-#include <mutex>
-#include <shared_mutex>
-#include <new>
-#include <vector>
-
-namespace AsyncAdePT {
-
-struct TrackDataWithIDs : public adeptint::TrackData {
-  unsigned int eventId{0};
-  unsigned int trackId{0};
-  short threadId{-1};
-
-  TrackDataWithIDs(int pdg_id, int parentID, double ene, double x, double y, double z, double dirx, double diry,
-                   double dirz, double gTime, double lTime, double pTime, unsigned int eventId = 0,
-                   unsigned int trackId = 0, short threadId = -1)
-      : TrackData{pdg_id, parentID, ene, x, y, z, dirx, diry, dirz, gTime, lTime, pTime}, eventId{eventId},
-        trackId{trackId}, threadId{threadId}
-  {
-  }
-  friend bool operator==(TrackDataWithIDs const &a, TrackDataWithIDs const &b)
-  {
-    return a.threadId != b.threadId || a.eventId != b.eventId || static_cast<adeptint::TrackData const &>(a) == b;
-  }
-  bool operator<(TrackDataWithIDs const &other)
-  {
-    if (threadId != other.threadId) return threadId < other.threadId;
-    if (eventId != other.eventId) return eventId < other.eventId;
-    return TrackData::operator<(other);
-  }
-};
-
-/// @brief Buffer holding input tracks to be transported on GPU and output tracks to be
-/// re-injected in the Geant4 stack
-struct TrackBuffer {
-  struct alignas(64) ToDeviceBuffer {
-    TrackDataWithIDs *tracks;
-    unsigned int maxTracks;
-    std::atomic_uint nTrack;
-    mutable std::shared_mutex mutex;
-  };
-  std::array<ToDeviceBuffer, 2> toDeviceBuffer;
-  std::atomic_short toDeviceIndex{0};
-
-  unsigned int fNumToDevice{0};   ///< number of slots in the toDevice buffer
-  unsigned int fNumFromDevice{0}; ///< number of slots in the fromDevice buffer
-  unique_ptr_cuda<TrackDataWithIDs, CudaHostDeleter<TrackDataWithIDs>>
-      toDevice_host;                              ///< Tracks to be transported to the device
-  unique_ptr_cuda<TrackDataWithIDs> toDevice_dev; ///< toDevice buffer of tracks
-  unique_ptr_cuda<TrackDataWithIDs, CudaHostDeleter<TrackDataWithIDs>> fromDevice_host; ///< Tracks from device
-  unique_ptr_cuda<TrackDataWithIDs> fromDevice_dev;                                     ///< fromDevice buffer of tracks
-  unique_ptr_cuda<unsigned int, CudaHostDeleter<unsigned int>>
-      nFromDevice_host; ///< Number of tracks collected on device
-
-  std::vector<std::vector<TrackDataWithIDs>> fromDeviceBuffers;
-  std::mutex fromDeviceMutex;
-
-  TrackBuffer(unsigned int numToDevice, unsigned int numFromDevice, unsigned short nThread);
-
-  ToDeviceBuffer &getActiveBuffer() { return toDeviceBuffer[toDeviceIndex]; }
-  void swapToDeviceBuffers() { toDeviceIndex = (toDeviceIndex + 1) % 2; }
-
-  /// A handle to access TrackData vectors while holding a lock
-  struct TrackHandle {
-    TrackDataWithIDs &track;
-    std::shared_lock<std::shared_mutex> lock;
-  };
-
-  /// @brief Create a handle with lock for tracks that go to the device.
-  /// Create a shared_lock and a reference to a track
-  /// @return TrackHandle with lock and reference to track slot.
-  TrackHandle createToDeviceSlot();
-
-  struct FromDeviceHandle {
-    std::vector<TrackDataWithIDs> &tracks;
-    std::scoped_lock<std::mutex> lock;
-  };
-
-  /// @brief Create a handle with lock for tracks that return from the device.
-  /// @return BufferHandle with lock and reference to track vector.
-  FromDeviceHandle getTracksFromDevice(int threadId)
-  {
-    return {fromDeviceBuffers[threadId], std::scoped_lock{fromDeviceMutex}};
-  }
-};
-
-} // namespace AsyncAdePT
-
-#endif
diff --git a/examples/AsyncExample/example.cpp b/examples/AsyncExample/example.cpp
index 241105b2..39376c22 100644
--- a/examples/AsyncExample/example.cpp
+++ b/examples/AsyncExample/example.cpp
@@ -13,11 +13,9 @@
 #include "FTFP_BERT_AdePT.hh"
 #include "G4HadronicProcessStore.hh"
 #include "G4EmParameters.hh"
-#include "G4FastSimulationPhysics.hh"
 #include "G4VisExecutive.hh"
 #include "G4UIExecutive.hh"
 
-#include "AdeptIntegration.h"
 #include <AdePT/core/AdePTConfiguration.hh>
 
 #include <memory>
@@ -46,8 +44,9 @@ int main(int argc, char **argv)
       AdePT = false;
     } else if (argument == "--output") {
       AsyncExHistos::HistoWriter::GetInstance().SetFilename(argv[++i]);
-    } else if (argument == "--seed") {
-      AsyncAdePT::AdeptIntegration::fAdePTSeed = std::stoll(argv[++i]);
+    // TODO: Properly pass the seed to AdePT with a macro command
+    // } else if (argument == "--seed") {
+    //   AsyncAdePT::AdeptIntegration::fAdePTSeed = std::stoll(argv[++i]);
     } else {
       G4Exception("main", "Unknown argument", FatalErrorInArgument,
                   ("Unknown argument passed to " + G4String(argv[0]) + " : " + argument + "\n" + helpMsg).c_str());
@@ -63,16 +62,12 @@ int main(int argc, char **argv)
   auto detector = new DetectorConstruction();
   runManager->SetUserInitialization(detector);
 
-  std::unique_ptr<AdePTConfiguration> adeptConfig;
-
   // Physics list
   G4VUserPhysicsList *physicsList = nullptr;
   if (AdePT) {
     physicsList = new FTFP_BERT_AdePT();
   } else {
     physicsList = new FTFP_BERT_HepEm();
-    // Create a dummy config instance, so the AdePT-specific macros don't lead to errors:
-    adeptConfig.reset(new AdePTConfiguration());
   }
 
   runManager->SetUserInitialization(physicsList);
diff --git a/examples/AsyncExample/include/EMShowerMessenger.hh b/examples/AsyncExample/include/EMShowerMessenger.hh
deleted file mode 100644
index 7ec49132..00000000
--- a/examples/AsyncExample/include/EMShowerMessenger.hh
+++ /dev/null
@@ -1,82 +0,0 @@
-// SPDX-FileCopyrightText: 2022 CERN
-// SPDX-License-Identifier: Apache-2.0
-//
-// ********************************************************************
-// * License and Disclaimer                                           *
-// *                                                                  *
-// * The  Geant4 software  is  copyright of the Copyright Holders  of *
-// * the Geant4 Collaboration.  It is provided  under  the terms  and *
-// * conditions of the Geant4 Software License,  included in the file *
-// * LICENSE and available at  http://cern.ch/geant4/license .  These *
-// * include a list of copyright holders.                             *
-// *                                                                  *
-// * Neither the authors of this software system, nor their employing *
-// * institutes,nor the agencies providing financial support for this *
-// * work  make  any representation or  warranty, express or implied, *
-// * regarding  this  software system or assume any liability for its *
-// * use.  Please see the license in the file  LICENSE  and URL above *
-// * for the full disclaimer and the limitation of liability.         *
-// *                                                                  *
-// * This  code  implementation is the result of  the  scientific and *
-// * technical work of the GEANT4 collaboration.                      *
-// * By using,  copying,  modifying or  distributing the software (or *
-// * any work based  on the software)  you  agree  to acknowledge its *
-// * use  in  resulting  scientific  publications,  and indicate your *
-// * acceptance of all terms of the Geant4 Software license.          *
-// ********************************************************************
-//
-#ifndef EMSHOWERMESSENGER_HH
-#define EMSHOWERMESSENGER_HH
-
-class EMShowerModel;
-class G4UIcommand;
-class G4UIdirectory;
-class G4UIcmdWithoutParameter;
-class G4UIcmdWithADouble;
-class G4UIcmdWithADoubleAndUnit;
-class G4UIcmdWithAnInteger;
-
-#include "G4UImessenger.hh"
-
-/**
- * @brief Messenger for the example fast simulation model.
- *
- * Allows to set the parameters of the EMShowerModel: parameters of the
- * distributions used in the parametrisation, number of created (same) energy
- * deposits, as well as the maximum depth of an EM shower.
- *
- */
-
-class EMShowerMessenger : public G4UImessenger {
-public:
-  EMShowerMessenger(EMShowerModel *aModel);
-  ~EMShowerMessenger();
-
-public:
-  /// Invokes appropriate methods based on the typed command
-  virtual void SetNewValue(G4UIcommand *aCommand, G4String aNewValues) final;
-
-private:
-  /// Model to setup
-  EMShowerModel *fModel;
-  /// Command to set the up a directory for model settings /adeptint/fastSim
-  G4UIdirectory *fDirectory;
-  /// Command printing current settings
-  G4UIcmdWithoutParameter *fPrintCmd;
-  /// Command to set the sigma parameter of the Gaussian distribution describing
-  /// the transverse profile
-  // G4UIcmdWithADoubleAndUnit *fSigmaCmd;
-  /// Command to set the alpha parameter of the Gamma distribution describing
-  /// the longitudinal profile
-  // G4UIcmdWithADouble *fAlphaCmd;
-  /// Command to set the beta parameter of the Gamma distribution describing the
-  /// longitudinal profile
-  // G4UIcmdWithADouble *fBetaCmd;
-  /// Command to set the number of (same energy) deposits to be created by
-  /// fast simulation
-  G4UIcmdWithAnInteger *fNbOfHitsCmd;
-  /// Command to set the maximum shower depth
-  // G4UIcmdWithADouble *fLongMaxDepthCmd;
-};
-
-#endif /* EMSHOWERMESSENGER_HH */
\ No newline at end of file
diff --git a/examples/AsyncExample/include/EMShowerModel.hh b/examples/AsyncExample/include/EMShowerModel.hh
deleted file mode 100644
index 8df74da3..00000000
--- a/examples/AsyncExample/include/EMShowerModel.hh
+++ /dev/null
@@ -1,97 +0,0 @@
-// SPDX-FileCopyrightText: 2022 CERN
-// SPDX-License-Identifier: Apache-2.0
-//
-// ********************************************************************
-// * License and Disclaimer                                           *
-// *                                                                  *
-// * The  Geant4 software  is  copyright of the Copyright Holders  of *
-// * the Geant4 Collaboration.  It is provided  under  the terms  and *
-// * conditions of the Geant4 Software License,  included in the file *
-// * LICENSE and available at  http://cern.ch/geant4/license .  These *
-// * include a list of copyright holders.                             *
-// *                                                                  *
-// * Neither the authors of this software system, nor their employing *
-// * institutes,nor the agencies providing financial support for this *
-// * work  make  any representation or  warranty, express or implied, *
-// * regarding  this  software system or assume any liability for its *
-// * use.  Please see the license in the file  LICENSE  and URL above *
-// * for the full disclaimer and the limitation of liability.         *
-// *                                                                  *
-// * This  code  implementation is the result of  the  scientific and *
-// * technical work of the GEANT4 collaboration.                      *
-// * By using,  copying,  modifying or  distributing the software (or *
-// * any work based  on the software)  you  agree  to acknowledge its *
-// * use  in  resulting  scientific  publications,  and indicate your *
-// * acceptance of all terms of the Geant4 Software license.          *
-// ********************************************************************
-//
-#ifndef EMSHOWERMODEL_HH
-#define EMSHOWERMODEL_HH
-
-#include "G4VFastSimulationModel.hh"
-#include <AdePT/copcore/SystemOfUnits.h>
-
-#include <G4HepEmData.hh>
-#include <G4HepEmParameters.hh>
-#include <G4HepEmState.hh>
-#include <G4HepEmStateInit.hh>
-
-#include <memory>
-#include <unordered_map>
-
-class EMShowerMessenger;
-class G4FastSimHitMaker;
-class G4VPhysicalVolume;
-class AdeptIntegration;
-
-/**
- * @brief Example fast simulation model for EM showers.
- *
- * Parametrisation of electrons, positrons, and gammas. It is triggered if those
- * particles enter the detector so that there is sufficient length for the
- * shower development (max depth, controlled by the UI command).
- *
- * Using AdePT
- */
-
-class EMShowerModel : public G4VFastSimulationModel {
-public:
-  EMShowerModel(G4String, G4Region *, std::shared_ptr<AdeptIntegration>);
-  EMShowerModel(G4String);
-  ~EMShowerModel();
-
-  /// There are no kinematics constraints. True is returned.
-  G4bool ModelTrigger(const G4FastTrack &) final override;
-  /// Model is applicable to electrons, positrons, and photons.
-  G4bool IsApplicable(const G4ParticleDefinition &) final override;
-
-  /// Take particle out of the full simulation (kill it at the entrance
-  /// depositing all the energy). Simulate the full shower using AdePT library.
-  void DoIt(const G4FastTrack &, G4FastStep &) final override;
-
-  void Flush() final override;
-
-  /// Print current settings.
-  void Print() const;
-
-  /// Set verbosity for integration
-  void SetVerbosity(int verbosity) { fVerbosity = verbosity; }
-
-private:
-  /// Messenger for configuration
-  std::unique_ptr<EMShowerMessenger> fMessenger;
-
-  /// AdePT integration (shared with all workers)
-  std::shared_ptr<AdeptIntegration> fAdept;
-
-  /// Verbosity
-  int fVerbosity{0};
-
-  /// @brief Counts number of tracks passed to AdePT
-  unsigned int fTrackCounter{0};
-  /// @brief Save last event ID to determine when new event starts
-  int fLastEventId{-1};
-  /// @brief Cycle number (injection/flush) for each event
-  unsigned short fCycleNumber{0};
-};
-#endif /* EMSHOWERMODEL_HH */
\ No newline at end of file
diff --git a/examples/AsyncExample/include/SteppingAction.hh b/examples/AsyncExample/include/SteppingAction.hh
index 3b165567..42786753 100644
--- a/examples/AsyncExample/include/SteppingAction.hh
+++ b/examples/AsyncExample/include/SteppingAction.hh
@@ -8,6 +8,7 @@
 #include "G4Step.hh"
 #include "G4EventManager.hh"
 #include "G4SystemOfUnits.hh"
+#include "G4Event.hh"
 
 class SteppingAction final : public G4UserSteppingAction {
 
diff --git a/examples/AsyncExample/src/EMShowerMessenger.cc b/examples/AsyncExample/src/EMShowerMessenger.cc
deleted file mode 100644
index 9b66da28..00000000
--- a/examples/AsyncExample/src/EMShowerMessenger.cc
+++ /dev/null
@@ -1,68 +0,0 @@
-// SPDX-FileCopyrightText: 2022 CERN
-// SPDX-License-Identifier: Apache-2.0
-//
-// ********************************************************************
-// * License and Disclaimer                                           *
-// *                                                                  *
-// * The  Geant4 software  is  copyright of the Copyright Holders  of *
-// * the Geant4 Collaboration.  It is provided  under  the terms  and *
-// * conditions of the Geant4 Software License,  included in the file *
-// * LICENSE and available at  http://cern.ch/geant4/license .  These *
-// * include a list of copyright holders.                             *
-// *                                                                  *
-// * Neither the authors of this software system, nor their employing *
-// * institutes,nor the agencies providing financial support for this *
-// * work  make  any representation or  warranty, express or implied, *
-// * regarding  this  software system or assume any liability for its *
-// * use.  Please see the license in the file  LICENSE  and URL above *
-// * for the full disclaimer and the limitation of liability.         *
-// *                                                                  *
-// * This  code  implementation is the result of  the  scientific and *
-// * technical work of the GEANT4 collaboration.                      *
-// * By using,  copying,  modifying or  distributing the software (or *
-// * any work based  on the software)  you  agree  to acknowledge its *
-// * use  in  resulting  scientific  publications,  and indicate your *
-// * acceptance of all terms of the Geant4 Software license.          *
-// ********************************************************************
-//
-#include "EMShowerModel.hh"
-#include "EMShowerMessenger.hh"
-
-#include "G4UIdirectory.hh"
-#include "G4UIcmdWithoutParameter.hh"
-#include "G4UIcmdWithADoubleAndUnit.hh"
-#include "G4UIcmdWithADouble.hh"
-#include "G4UIcmdWithAnInteger.hh"
-
-EMShowerMessenger::EMShowerMessenger(EMShowerModel *aModel) : fModel(aModel)
-{
-  fDirectory = new G4UIdirectory("/adeptint/fastSim/");
-  fDirectory->SetGuidance("Set mesh parameters for the example fast sim model.");
-
-  fPrintCmd = new G4UIcmdWithoutParameter("/adeptint/fastSim/print", this);
-  fPrintCmd->SetGuidance("Print current settings.");
-
-  fNbOfHitsCmd = new G4UIcmdWithAnInteger("/adeptint/fastSim/numberOfHits", this);
-  fNbOfHitsCmd->SetGuidance("Set number of (same energy) energy deposits created in fast simulation. "
-                            "Those deposits will be scored in the detector according to the readout of "
-                            "the sensitive detector.");
-  fNbOfHitsCmd->SetParameterName("Number", false);
-}
-
-//....oooOO0OOooo........oooOO0OOooo........oooOO0OOooo........oooOO0OOooo......
-
-EMShowerMessenger::~EMShowerMessenger()
-{
-  delete fPrintCmd;
-  delete fNbOfHitsCmd;
-  delete fDirectory;
-}
-
-//....oooOO0OOooo........oooOO0OOooo........oooOO0OOooo........oooOO0OOooo......
-
-void EMShowerMessenger::SetNewValue(G4UIcommand *aCommand, G4String /*aNewValues*/)
-{
-  if (aCommand == fPrintCmd) {
-    fModel->Print();
-  }
-}
diff --git a/examples/AsyncExample/src/EMShowerModel.cc b/examples/AsyncExample/src/EMShowerModel.cc
deleted file mode 100644
index b4f9b97f..00000000
--- a/examples/AsyncExample/src/EMShowerModel.cc
+++ /dev/null
@@ -1,140 +0,0 @@
-// SPDX-FileCopyrightText: 2022 CERN
-// SPDX-License-Identifier: Apache-2.0
-
-#include "EMShowerMessenger.hh"
-
-#include <G4RunManager.hh>
-#include "G4Electron.hh"
-#include "G4Positron.hh"
-#include "G4Gamma.hh"
-#include "G4SystemOfUnits.hh"
-#include "G4UnitsTable.hh"
-#include "G4FastHit.hh"
-#include "Randomize.hh"
-#include "G4FastSimHitMaker.hh"
-#include "G4EventManager.hh"
-#include "G4Event.hh"
-
-#include "EMShowerModel.hh"
-#include "G4GlobalFastSimulationManager.hh"
-
-#include <VecGeom/base/Config.h>
-#include <VecGeom/management/GeoManager.h>
-
-#include <G4MaterialCutsCouple.hh>
-#include <G4ProductionCutsTable.hh>
-
-EMShowerModel::EMShowerModel(G4String aModelName, G4Region *aEnvelope, std::shared_ptr<AdeptIntegration> adept)
-    : G4VFastSimulationModel{aModelName, aEnvelope}, fMessenger{new EMShowerMessenger(this)}, fAdept{adept}
-{
-}
-
-//....oooOO0OOooo........oooOO0OOooo........oooOO0OOooo........oooOO0OOooo......
-
-EMShowerModel::EMShowerModel(G4String aModelName)
-    : G4VFastSimulationModel(aModelName), fMessenger(new EMShowerMessenger(this))
-{
-}
-
-//....oooOO0OOooo........oooOO0OOooo........oooOO0OOooo........oooOO0OOooo......
-
-EMShowerModel::~EMShowerModel() {}
-
-//....oooOO0OOooo........oooOO0OOooo........oooOO0OOooo........oooOO0OOooo......
-
-G4bool EMShowerModel::IsApplicable(const G4ParticleDefinition &aParticleType)
-{
-  return fAdept && (&aParticleType == G4Electron::ElectronDefinition() ||
-                    &aParticleType == G4Positron::PositronDefinition() || &aParticleType == G4Gamma::GammaDefinition());
-}
-
-//....oooOO0OOooo........oooOO0OOooo........oooOO0OOooo........oooOO0OOooo......
-
-G4bool EMShowerModel::ModelTrigger(const G4FastTrack & /*aFastTrack*/)
-{
-
-  // The model is invoked for e/e-/gamma, so this has to return true
-  return true;
-}
-
-//....oooOO0OOooo........oooOO0OOooo........oooOO0OOooo........oooOO0OOooo......
-
-void EMShowerModel::DoIt(const G4FastTrack &aFastTrack, G4FastStep &aFastStep)
-{
-  auto g4track           = aFastTrack.GetPrimaryTrack();
-  auto particlePosition  = g4track->GetPosition();
-  auto particleDirection = g4track->GetMomentumDirection();
-
-#ifndef NDEBUG
-  G4VSolid const *solid = aFastTrack.GetEnvelopeSolid();
-  const auto localPos   = aFastTrack.GetPrimaryTrackLocalPosition();
-  const auto localDir   = aFastTrack.GetPrimaryTrackLocalDirection();
-  const auto dist       = solid->DistanceToOut(localPos, localDir);
-  // if (dist < millimeter) std::cout << "DistanceToOut: " << std::setw(9) << dist / millimeter << "\n";
-  if (dist < micrometer) {
-    G4cout << "Error: AdePT will reject a particle that's immediately leaving the GPU region.\n";
-    G4cout << "Rejected pos: " << aFastTrack.GetPrimaryTrack()->GetPosition() << "\n";
-    G4cout << "Distance: " << solid->DistanceToOut(localPos, localDir) << "\n";
-    G4cout << *solid << "\n";
-    return;
-  }
-
-  const G4ThreeVector pos{-282.494219, -1126.731331, -3156.200000};
-  if ((particlePosition - pos).mag() < 1.E-5) {
-    G4cout << "TAG: The solid close to " << particlePosition << " is:\n";
-    G4cout << *solid << "\n";
-    G4cout << "LVol=" << aFastTrack.GetEnvelopeLogicalVolume()->GetName()
-           << " NDaugher=" << aFastTrack.GetEnvelopeLogicalVolume()->GetNoDaughters() << "\n";
-    G4cout << "DistanceToOut=" << solid->DistanceToOut(localPos, localDir) / millimeter << G4endl;
-  }
-
-  const G4ThreeVector posKickOut{129.079191, 1231.252437, 888.105959};
-  if ((particlePosition - posKickOut).mag() < 1.E-4) {
-    G4cout << "TAG: The solid close to the kick-out position " << particlePosition << " is:\n";
-    G4cout << *solid << "\n";
-    G4cout << "LVol=" << aFastTrack.GetEnvelopeLogicalVolume()->GetName()
-           << " NDaugher=" << aFastTrack.GetEnvelopeLogicalVolume()->GetNoDaughters() << "\n";
-    G4cout << "DistanceToOut=" << solid->DistanceToOut(localPos, localDir) / millimeter << G4endl;
-  }
-#endif
-
-  // Remove particle from further processing by G4
-  aFastStep.KillPrimaryTrack();
-  aFastStep.SetPrimaryTrackPathLength(0.0);
-  G4double energy = aFastTrack.GetPrimaryTrack()->GetKineticEnergy();
-  // No need to create any deposit, it will be handled by this model (and
-  // G4FastSimHitMaker that will call the sensitive detector)
-  aFastStep.SetTotalEnergyDeposited(0);
-
-  const auto pdg    = aFastTrack.GetPrimaryTrack()->GetParticleDefinition()->GetPDGEncoding();
-  const auto thread = G4Threading::G4GetThreadId();
-  const auto event  = G4EventManager::GetEventManager()->GetConstCurrentEvent()->GetEventID();
-  if (event != fLastEventId) {
-    fLastEventId  = event;
-    fTrackCounter = 0;
-    fCycleNumber  = 0;
-  }
-
-  fAdept->AddTrack(pdg, energy, particlePosition[0], particlePosition[1], particlePosition[2], particleDirection[0],
-                   particleDirection[1], particleDirection[2], g4track->GetGlobalTime(), g4track->GetLocalTime(),
-                   g4track->GetProperTime(), thread, event, fCycleNumber, fTrackCounter);
-}
-
-void EMShowerModel::Flush()
-{
-  const auto threadId = G4Threading::G4GetThreadId();
-  const auto event    = G4EventManager::GetEventManager()->GetConstCurrentEvent()->GetEventID();
-  if (fVerbosity > 0)
-    G4cout << "Waiting for AdePT to finish the transport for thread " << threadId << " event " << event << " cycle "
-           << fCycleNumber << G4endl;
-
-  fAdept->Flush(threadId, event, fCycleNumber);
-  fCycleNumber++;
-}
-
-//....oooOO0OOooo........oooOO0OOooo........oooOO0OOooo........oooOO0OOooo......
-
-void EMShowerModel::Print() const
-{
-  G4cout << "EMShowerModel (AdePT)=" << fAdept << G4endl;
-}
diff --git a/examples/AsyncExample/src/SensitiveDetector.cc b/examples/AsyncExample/src/SensitiveDetector.cc
index 958759fd..8ee5aef0 100644
--- a/examples/AsyncExample/src/SensitiveDetector.cc
+++ b/examples/AsyncExample/src/SensitiveDetector.cc
@@ -86,10 +86,8 @@ G4bool SensitiveDetector::ProcessHits(G4Step *aStep, G4TouchableHistory *)
 
 //....oooOO0OOooo........oooOO0OOooo........oooOO0OOooo........oooOO0OOooo......
 
-SimpleHit *SensitiveDetector::RetrieveAndSetupHit(G4TouchableHistory *aTouchable)
+SimpleHit *SensitiveDetector::RetrieveAndSetupHit(std::size_t hitID)
 {
-  std::size_t hitID = fScoringMap[aTouchable->GetHistory()->GetTopVolume()->GetInstanceID()];
-
   assert(hitID < static_cast<std::size_t>(fNumSensitive));
 
   if (hitID >= fHitsCollection->entries()) {
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index d900b5fe..8243d81b 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -9,4 +9,4 @@ configure_file(data/ppttbar.hepmc3 ${PROJECT_BINARY_DIR}/ppttbar.hepmc3)
 # - Subprojects
 add_subdirectory(Example1)
 add_subdirectory(IntegrationBenchmark)
-# add_subdirectory(AsyncExample)
+add_subdirectory(AsyncExample)
diff --git a/examples/Example1/CMakeLists.txt b/examples/Example1/CMakeLists.txt
index bbaa1898..8b07cd05 100644
--- a/examples/Example1/CMakeLists.txt
+++ b/examples/Example1/CMakeLists.txt
@@ -71,6 +71,7 @@ configure_file("macros/example1_large_stack.mac.in" "${PROJECT_BINARY_DIR}/examp
 configure_file("macros/example1_ttbar.mac.in" "${PROJECT_BINARY_DIR}/example1_ttbar.mac")
 configure_file("macros/example1_ttbar_LHCb.mac.in" "${PROJECT_BINARY_DIR}/example1_ttbar_LHCb.mac")
 configure_file("macros/example1_ttbar_noadept.mac.in" "${PROJECT_BINARY_DIR}/example1_ttbar_noadept.mac")
+configure_file("macros/async_mode.mac.in" "${PROJECT_BINARY_DIR}/async_mode.mac")
 
 # Tests
 
diff --git a/examples/Example1/macros/async_mode.mac.in b/examples/Example1/macros/async_mode.mac.in
new file mode 100644
index 00000000..004406f2
--- /dev/null
+++ b/examples/Example1/macros/async_mode.mac.in
@@ -0,0 +1,62 @@
+# SPDX-FileCopyrightText: 2023 CERN
+# SPDX-License-Identifier: Apache-2.0
+#  example23.in
+#
+
+## =============================================================================
+## Geant4 macro for modelling simplified sampling calorimeters
+## =============================================================================
+##
+/run/numberOfThreads 1
+/control/verbose 0
+/run/verbose 0
+/process/verbose 0
+/tracking/verbose 0
+/event/verbose 0
+
+/detector/filename cms2018_sd.gdml
+# Temporary workaround since we don't have a G4 to VecGeom converter
+/adept/setVecGeomGDML cms2018_sd.gdml
+/adept/setVerbosity 0
+## Threshold for buffering tracks before sending to GPU
+/adept/setTransportBufferThreshold 2000
+## Total number of GPU track slots (not per thread)
+/adept/setMillionsOfTrackSlots 4
+/adept/setMillionsOfHitSlots 4
+/adept/setCUDAStackLimit 16384
+
+# If true, particles are transported on the GPU across the whole geometry, GPU regions are ignored
+/adept/setTrackInAllRegions true
+# In order to do the GPU transport only in specific regions
+#/adept/addGPURegion EcalRegion
+#/adept/addGPURegion HcalRegion
+
+
+## -----------------------------------------------------------------------------
+## Optionally, set a constant magnetic filed:
+## -----------------------------------------------------------------------------
+/detector/setField 0 0 0 tesla
+#/detector/setField 0 0 3.8 tesla
+
+## -----------------------------------------------------------------------------
+## Set secondary production threshold, init. the run and set primary properties
+## -----------------------------------------------------------------------------
+/run/setCut 0.7 mm
+/run/initialize
+
+## User-defined Event verbosity: 1 = total edep, 2 = energy deposit per placed sensitive volume
+/eventAction/verbose 2
+
+/gun/hepmc
+/generator/hepmcAscii/maxevents 256
+/generator/hepmcAscii/firstevent 0
+/generator/hepmcAscii/open ppttbar.hepmc3
+/generator/hepmcAscii/verbose 0
+
+## -----------------------------------------------------------------------------
+## Run the simulation with the given number of events and print list of processes
+## -----------------------------------------------------------------------------
+
+# run events with parametrised simulation
+# by default all created models are active
+/run/beamOn 8
diff --git a/examples/AsyncExample/ResourceManagement.cu b/include/AdePT/base/ResourceManagement.cuh
similarity index 54%
rename from examples/AsyncExample/ResourceManagement.cu
rename to include/AdePT/base/ResourceManagement.cuh
index 6d00999a..a4df1069 100644
--- a/examples/AsyncExample/ResourceManagement.cu
+++ b/include/AdePT/base/ResourceManagement.cuh
@@ -1,7 +1,10 @@
 // SPDX-FileCopyrightText: 2023 CERN
 // SPDX-License-Identifier: Apache-2.0
-#include "ResourceManagement.h"
+#ifndef RESOURCE_MANAGEMENT_CUH
+#define RESOURCE_MANAGEMENT_CUH
 
+#include <memory>
+#include <AdePT/base/ResourceManagement.hh>
 #include "AdePT/copcore/Global.h"
 
 namespace AsyncAdePT {
@@ -26,4 +29,18 @@ void freeCudaEvent(void *event)
   if (event) COPCORE_CUDA_CHECK(cudaEventDestroy(*static_cast<cudaEvent_t *>(event)));
 }
 
+// Instantiate the deleters for specific types.
+#ifdef __CUDACC__
+template <>
+struct CudaDeleter<cudaStream_t> {
+  void operator()(cudaStream_t *stream) const { freeCudaStream(stream); }
+};
+template <>
+struct CudaDeleter<cudaEvent_t> {
+  void operator()(cudaEvent_t *event) const { freeCudaEvent(event); }
+};
+#endif
+
 } // namespace AsyncAdePT
+
+#endif // RESOURCE_MANAGEMENT_CUH
\ No newline at end of file
diff --git a/examples/AsyncExample/ResourceManagement.h b/include/AdePT/base/ResourceManagement.hh
similarity index 61%
rename from examples/AsyncExample/ResourceManagement.h
rename to include/AdePT/base/ResourceManagement.hh
index 2fedbe54..299f7b76 100644
--- a/examples/AsyncExample/ResourceManagement.h
+++ b/include/AdePT/base/ResourceManagement.hh
@@ -1,13 +1,16 @@
 // SPDX-FileCopyrightText: 2023 CERN
 // SPDX-License-Identifier: Apache-2.0
-#ifndef RESOURCE_MANAGEMENT_CUH
-#define RESOURCE_MANAGEMENT_CUH
+
+#ifndef RESOURCE_MANAGEMENT_HH
+#define RESOURCE_MANAGEMENT_HH
 
 #include <memory>
 
 namespace AsyncAdePT {
 void freeCuda(void *ptr);
 void freeCudaHost(void *ptr);
+void freeCudaStream(void *stream);
+void freeCudaEvent(void *event);
 
 template <class T>
 struct CudaDeleter {
@@ -18,22 +21,9 @@ struct CudaHostDeleter {
   void operator()(T *ptr) const { freeCudaHost(ptr); }
 };
 
-void freeCudaStream(void *stream);
-void freeCudaEvent(void *event);
-
-#ifdef __CUDACC__
-template <>
-struct CudaDeleter<cudaStream_t> {
-  void operator()(cudaStream_t *stream) const { freeCudaStream(stream); }
-};
-template <>
-struct CudaDeleter<cudaEvent_t> {
-  void operator()(cudaEvent_t *event) const { freeCudaEvent(event); }
-};
-#endif
 template <typename T = void, typename Deleter = CudaDeleter<T>>
 using unique_ptr_cuda = std::unique_ptr<T, Deleter>;
 
 } // namespace AsyncAdePT
 
-#endif // RESOURCE_MANAGEMENT_CUH
\ No newline at end of file
+#endif // RESOURCE_MANAGEMENT_HH
\ No newline at end of file
diff --git a/examples/AsyncExample/SlotManager.cuh b/include/AdePT/base/SlotManager.cuh
similarity index 100%
rename from examples/AsyncExample/SlotManager.cuh
rename to include/AdePT/base/SlotManager.cuh
diff --git a/include/AdePT/core/AdePTScoringTemplate.cuh b/include/AdePT/core/AdePTScoringTemplate.cuh
index 2cfd8eb3..e436640e 100644
--- a/include/AdePT/core/AdePTScoringTemplate.cuh
+++ b/include/AdePT/core/AdePTScoringTemplate.cuh
@@ -9,39 +9,35 @@
 // Templates for the AdePTScoring CUDA methods
 
 // These methods are meant to be templated on a struct containing the necessary information
-namespace adept_scoring
-{
+namespace adept_scoring {
 template <typename Scoring>
 Scoring *InitializeOnGPU(Scoring *scoring);
 
 template <typename Scoring>
 void FreeGPU(Scoring *scoring, Scoring *scoring_dev);
 
-  template <typename Scoring>
-  __device__ void RecordHit(Scoring *scoring_dev, int aParentID, char aParticleType, double aStepLength,
-                            double aTotalEnergyDeposit, vecgeom::NavigationState const *aPreState,
-                            vecgeom::Vector3D<Precision> const *aPrePosition,
-                            vecgeom::Vector3D<Precision> const *aPreMomentumDirection,
-                            vecgeom::Vector3D<Precision> const *aPrePolarization, double aPreEKin, double aPreCharge,
-                            vecgeom::NavigationState const *aPostState,
-                            vecgeom::Vector3D<Precision> const *aPostPosition,
-                            vecgeom::Vector3D<Precision> const *aPostMomentumDirection,
-                            vecgeom::Vector3D<Precision> const *aPostPolarization, double aPostEKin, double aPostCharge,
-                            unsigned int eventId, short threadId);
-
-  template <typename Scoring>
-  __device__ void AccountProduced(Scoring *scoring_dev, int num_ele, int num_pos, int num_gam);
-
-  template <typename Scoring>
-  __device__ __forceinline__ void EndOfIterationGPU(Scoring *scoring_dev);
-
-  template <typename Scoring, typename IntegrationLayer>
-  inline void EndOfIteration(Scoring &scoring, Scoring *scoring_dev, cudaStream_t &stream,
-                             IntegrationLayer &integration);
-
-  template <typename Scoring, typename IntegrationLayer>
-  inline void EndOfTransport(Scoring &scoring, Scoring *scoring_dev, cudaStream_t &stream,
-                             IntegrationLayer &integration);
-}
-
-#endif
\ No newline at end of file
+template <typename Scoring>
+__device__ void RecordHit(Scoring *scoring_dev, int aParentID, char aParticleType, double aStepLength,
+                          double aTotalEnergyDeposit, vecgeom::NavigationState const *aPreState,
+                          vecgeom::Vector3D<Precision> const *aPrePosition,
+                          vecgeom::Vector3D<Precision> const *aPreMomentumDirection,
+                          vecgeom::Vector3D<Precision> const *aPrePolarization, double aPreEKin, double aPreCharge,
+                          vecgeom::NavigationState const *aPostState, vecgeom::Vector3D<Precision> const *aPostPosition,
+                          vecgeom::Vector3D<Precision> const *aPostMomentumDirection,
+                          vecgeom::Vector3D<Precision> const *aPostPolarization, double aPostEKin, double aPostCharge,
+                          unsigned int eventId, short threadId);
+
+template <typename Scoring>
+__device__ void AccountProduced(Scoring *scoring_dev, int num_ele, int num_pos, int num_gam);
+
+template <typename Scoring>
+__device__ __forceinline__ void EndOfIterationGPU(Scoring *scoring_dev);
+
+template <typename Scoring, typename IntegrationLayer>
+inline void EndOfIteration(Scoring &scoring, Scoring *scoring_dev, cudaStream_t &stream, IntegrationLayer &integration);
+
+template <typename Scoring, typename IntegrationLayer>
+inline void EndOfTransport(Scoring &scoring, Scoring *scoring_dev, cudaStream_t &stream, IntegrationLayer &integration);
+} // namespace adept_scoring
+
+#endif
diff --git a/include/AdePT/core/AdePTTransport.cuh b/include/AdePT/core/AdePTTransport.cuh
index e4c6e948..3245ea2f 100644
--- a/include/AdePT/core/AdePTTransport.cuh
+++ b/include/AdePT/core/AdePTTransport.cuh
@@ -7,7 +7,6 @@
 #include <AdePT/core/AdePTScoringTemplate.cuh>
 #include <AdePT/core/HostScoringStruct.cuh>
 #include <AdePT/core/HostScoringImpl.cuh>
-
 #include <AdePT/core/AdePTTransportStruct.cuh>
 #include <AdePT/base/Atomic.h>
 #include <AdePT/navigation/AdePTNavigator.h>
diff --git a/include/AdePT/core/AdePTTransport.h b/include/AdePT/core/AdePTTransport.h
index 71cdc174..f5305595 100644
--- a/include/AdePT/core/AdePTTransport.h
+++ b/include/AdePT/core/AdePTTransport.h
@@ -5,10 +5,10 @@
 ///   - filling the buffer with tracks to be transported on the GPU
 ///   - Calling the Shower method transporting a buffer on the GPU
 
-#ifndef ADEPT_INTEGRATION_H
-#define ADEPT_INTEGRATION_H
+#ifndef ADEPT_TRANSPORT_H
+#define ADEPT_TRANSPORT_H
+
 
-#include "AdePTTransportInterface.hh"
 
 #include <unordered_map>
 #include <VecGeom/base/Config.h>
@@ -16,7 +16,8 @@
 #include <VecGeom/management/CudaManager.h> // forward declares vecgeom::cxx::VPlacedVolume
 #endif
 
-#include "CommonStruct.h"
+#include <AdePT/core/CommonStruct.h>
+#include <AdePT/core/AdePTTransportInterface.hh>
 #include <AdePT/core/AdePTScoringTemplate.cuh>
 #include <AdePT/core/HostScoringStruct.cuh>
 #include <AdePT/core/AdePTConfiguration.hh>
diff --git a/include/AdePT/core/AdePTTransport.icc b/include/AdePT/core/AdePTTransport.icc
index 9cf53ebd..e980644d 100644
--- a/include/AdePT/core/AdePTTransport.icc
+++ b/include/AdePT/core/AdePTTransport.icc
@@ -75,7 +75,7 @@ void AdePTTransport<IntegrationLayer>::AddTrack(int pdg, int parent_id, double e
                                                 vecgeom::NavigationState &&state)
 {
   fBuffer.toDevice.emplace_back(pdg, parent_id, energy, x, y, z, dirx, diry, dirz, globalTime, localTime, properTime,
-                                state);
+                                std::move(state));
   if (pdg == 11)
     fBuffer.nelectrons++;
   else if (pdg == -11)
diff --git a/include/AdePT/core/AdePTTransportStruct.cuh b/include/AdePT/core/AdePTTransportStruct.cuh
index 0a552c69..1766c89b 100644
--- a/include/AdePT/core/AdePTTransportStruct.cuh
+++ b/include/AdePT/core/AdePTTransportStruct.cuh
@@ -111,4 +111,4 @@ extern __constant__ __device__ adeptint::VolAuxData *gVolAuxData;
 extern __constant__ __device__ double BzFieldValue;
 extern __constant__ __device__ bool ApplyCuts;
 } // namespace adept_impl
-#endif
+#endif
\ No newline at end of file
diff --git a/examples/AsyncExample/AdeptIntegration.cu b/include/AdePT/core/AsyncAdePTTransport.cuh
similarity index 73%
rename from examples/AsyncExample/AdeptIntegration.cu
rename to include/AdePT/core/AsyncAdePTTransport.cuh
index f8993072..a8602a66 100644
--- a/examples/AsyncExample/AdeptIntegration.cu
+++ b/include/AdePT/core/AsyncAdePTTransport.cuh
@@ -1,29 +1,30 @@
 // SPDX-FileCopyrightText: 2022 CERN
 // SPDX-License-Identifier: Apache-2.0
 
-#include "AdeptIntegration.h"
-#include "AdeptIntegration.cuh"
-
-#include "TrackTransfer.h"
-
-#include <VecGeom/base/Config.h>
-#ifdef VECGEOM_ENABLE_CUDA
-#include <VecGeom/backend/cuda/Interface.h>
-#endif
+#ifndef ASYNC_ADEPT_TRANSPORT_CUH
+#define ASYNC_ADEPT_TRANSPORT_CUH
 
+#include <AdePT/core/AsyncAdePTTransportStruct.cuh>
+#include <AdePT/core/AsyncAdePTTransportStruct.hh>
+#include <AdePT/core/CommonStruct.h>
+#include <AdePT/core/AdePTConfiguration.hh>
 #include <AdePT/base/Atomic.h>
 #include <AdePT/base/MParray.h>
-#include <AdePT/navigation/BVHNavigator.h>
-#include <AdePT/integration/AdePTGeant4Integration.hh>
-
 #include <AdePT/copcore/Global.h>
 #include <AdePT/copcore/PhysicalConstants.h>
 #include <AdePT/copcore/Ranluxpp.h>
+#include <AdePT/kernels/electrons_async.cuh>
+#include <AdePT/kernels/gammas_async.cuh>
 
-#include <G4Threading.hh>
-#include <G4TransportationManager.hh>
-#include <G4UniformMagField.hh>
-#include <G4FieldManager.hh>
+#include <AdePT/navigation/BVHNavigator.h>
+#include <AdePT/integration/AdePTGeant4Integration.hh>
+
+// #include <AdePT/benchmarking/NVTX.h>
+
+#include <VecGeom/base/Config.h>
+#ifdef VECGEOM_ENABLE_CUDA
+#include <VecGeom/backend/cuda/Interface.h>
+#endif
 
 #include <G4HepEmState.hh>
 #include <G4HepEmData.hh>
@@ -37,36 +38,14 @@
 #include <stdio.h>
 #include <vector>
 #include <numeric>
+// #include <cuda/std/numeric>
 #include <algorithm>
 #include <sstream>
 #include <stdexcept>
 #include <type_traits>
 
-#include "electrons.cuh"
-#include "gammas.cuh"
-
-#include <AdePT/benchmarking/NVTX.h>
-
 using namespace AsyncAdePT;
 
-namespace AsyncAdePT {
-
-__constant__ __device__ struct G4HepEmParameters g4HepEmPars;
-__constant__ __device__ struct G4HepEmData g4HepEmData;
-
-__constant__ __device__ adeptint::VolAuxData *gVolAuxData = nullptr;
-__constant__ __device__ double BzFieldValue               = 0;
-
-/// Transfer volume auxiliary data to GPU
-void InitVolAuxArray(adeptint::VolAuxArray &array)
-{
-  using adeptint::VolAuxData;
-  COPCORE_CUDA_CHECK(cudaMalloc(&array.fAuxData_dev, sizeof(VolAuxData) * array.fNumVolumes));
-  COPCORE_CUDA_CHECK(
-      cudaMemcpy(array.fAuxData_dev, array.fAuxData, sizeof(VolAuxData) * array.fNumVolumes, cudaMemcpyHostToDevice));
-  COPCORE_CUDA_CHECK(cudaMemcpyToSymbol(gVolAuxData, &array.fAuxData_dev, sizeof(VolAuxData *)));
-}
-
 /// Communication with the hit processing thread.
 struct HitProcessingContext {
   cudaStream_t hitTransferStream;
@@ -75,98 +54,6 @@ struct HitProcessingContext {
   std::atomic_bool keepRunning = true;
 };
 
-/// Initialise the track buffers used to communicate between host and device.
-TrackBuffer::TrackBuffer(unsigned int numToDevice, unsigned int numFromDevice, unsigned short nThread)
-    : fNumToDevice{numToDevice}, fNumFromDevice{numFromDevice}, fromDeviceBuffers(nThread)
-{
-  TrackDataWithIDs *devPtr, *hostPtr;
-  // Double buffer for lock-free host runs:
-  COPCORE_CUDA_CHECK(cudaMallocHost(&hostPtr, 2 * numToDevice * sizeof(TrackDataWithIDs)));
-  COPCORE_CUDA_CHECK(cudaMalloc(&devPtr, numToDevice * sizeof(TrackDataWithIDs)));
-  toDevice_host.reset(hostPtr);
-  toDevice_dev.reset(devPtr);
-
-  COPCORE_CUDA_CHECK(cudaMallocHost(&hostPtr, numFromDevice * sizeof(TrackDataWithIDs)));
-  COPCORE_CUDA_CHECK(cudaMalloc(&devPtr, numFromDevice * sizeof(TrackDataWithIDs)));
-  fromDevice_host.reset(hostPtr);
-  fromDevice_dev.reset(devPtr);
-
-  unsigned int *nFromDevice = nullptr;
-  COPCORE_CUDA_CHECK(cudaMallocHost(&nFromDevice, sizeof(unsigned int)));
-  nFromDevice_host.reset(nFromDevice);
-
-  toDeviceBuffer[0].tracks    = toDevice_host.get();
-  toDeviceBuffer[0].maxTracks = numToDevice;
-  toDeviceBuffer[0].nTrack    = 0;
-  toDeviceBuffer[1].tracks    = toDevice_host.get() + numToDevice;
-  toDeviceBuffer[1].maxTracks = numToDevice;
-  toDeviceBuffer[1].nTrack    = 0;
-}
-
-} // namespace AsyncAdePT
-
-AdeptIntegration::AdeptIntegration(unsigned short nThread, unsigned int trackCapacity, unsigned int hitBufferCapacity,
-                                   int debugLevel, std::vector<std::string> const *GPURegionNames,
-                                   bool trackInAllRegions, int cudaStackSize)
-    : fNThread{nThread}, fTrackCapacity{trackCapacity}, fScoringCapacity{hitBufferCapacity}, fDebugLevel{debugLevel},
-      fG4Integrations(nThread), fEventStates(nThread), fGPUNetEnergy(nThread, 0.),
-      fTrackInAllRegions{trackInAllRegions}, fGPURegionNames{GPURegionNames}
-{
-  if (nThread > kMaxThreads)
-    throw std::invalid_argument("AdeptIntegration limited to " + std::to_string(kMaxThreads) + " threads");
-
-  for (auto &eventState : fEventStates) {
-    std::atomic_init(&eventState, EventState::ScoringRetrieved);
-  }
-
-  if (cudaStackSize > 0) {
-    G4cout << "Instantiate AdePT with cuda stack size " << cudaStackSize << "\n";
-    COPCORE_CUDA_CHECK(cudaDeviceSetLimit(cudaLimitStackSize, cudaStackSize));
-  }
-
-  AdeptIntegration::FullInit();
-}
-
-AdeptIntegration::~AdeptIntegration()
-{
-  FreeGPU();
-}
-
-static G4HepEmState *InitG4HepEm()
-{
-  auto state = new G4HepEmState;
-  InitG4HepEmState(state);
-
-  G4HepEmMatCutData *cutData = state->fData->fTheMatCutData;
-  G4cout << "fNumG4MatCuts = " << cutData->fNumG4MatCuts << ", fNumMatCutData = " << cutData->fNumMatCutData << G4endl;
-
-  // Copy to GPU.
-  CopyG4HepEmDataToGPU(state->fData);
-  COPCORE_CUDA_CHECK(cudaMemcpyToSymbol(g4HepEmPars, state->fParameters, sizeof(G4HepEmParameters)));
-
-  // Create G4HepEmData with the device pointers.
-  G4HepEmData dataOnDevice;
-  dataOnDevice.fTheMatCutData   = state->fData->fTheMatCutData_gpu;
-  dataOnDevice.fTheMaterialData = state->fData->fTheMaterialData_gpu;
-  dataOnDevice.fTheElementData  = state->fData->fTheElementData_gpu;
-  dataOnDevice.fTheElectronData = state->fData->fTheElectronData_gpu;
-  dataOnDevice.fThePositronData = state->fData->fThePositronData_gpu;
-  dataOnDevice.fTheSBTableData  = state->fData->fTheSBTableData_gpu;
-  dataOnDevice.fTheGammaData    = state->fData->fTheGammaData_gpu;
-  // The other pointers should never be used.
-  dataOnDevice.fTheMatCutData_gpu   = nullptr;
-  dataOnDevice.fTheMaterialData_gpu = nullptr;
-  dataOnDevice.fTheElementData_gpu  = nullptr;
-  dataOnDevice.fTheElectronData_gpu = nullptr;
-  dataOnDevice.fThePositronData_gpu = nullptr;
-  dataOnDevice.fTheSBTableData_gpu  = nullptr;
-  dataOnDevice.fTheGammaData_gpu    = nullptr;
-
-  COPCORE_CUDA_CHECK(cudaMemcpyToSymbol(g4HepEmData, &dataOnDevice, sizeof(G4HepEmData)));
-
-  return state;
-}
-
 // Kernel to initialize the set of queues per particle type.
 __global__ void InitParticleQueues(ParticleQueues queues, size_t CapacityTransport, size_t CapacityLeaked)
 {
@@ -188,7 +75,7 @@ __global__ void InjectTracks(AsyncAdePT::TrackDataWithIDs *trackinfo, int ntrack
                              const vecgeom::VPlacedVolume *world, adept::MParrayT<QueueIndexPair> *toBeEnqueued,
                              uint64_t initialSeed)
 {
-  constexpr double tolerance = 10. * vecgeom::kTolerance;
+  // constexpr double tolerance = 10. * vecgeom::kTolerance;
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < ntracks; i += blockDim.x * gridDim.x) {
     ParticleGenerator *generator = nullptr;
     const auto &trackInfo        = trackinfo[i];
@@ -214,45 +101,48 @@ __global__ void InjectTracks(AsyncAdePT::TrackDataWithIDs *trackinfo, int ntrack
                                            trackInfo.globalTime, static_cast<float>(trackInfo.localTime),
                                            static_cast<float>(trackInfo.properTime), trackInfo.position,
                                            trackInfo.direction, trackInfo.eventId, trackInfo.parentID, trackInfo.threadId);
+    track.navState.Clear();
+    track.navState = trackinfo[i].navState;
     toBeEnqueued->push_back(QueueIndexPair{slot, queueIndex});
 
-    // We locate the pushed point because we run the risk that the
-    // point is not located in the GPU region
-#ifdef NDEBUG
-    constexpr int maxAttempt = 2;
-#else
-    constexpr int maxAttempt = 10;
-#endif
-    for (int attempt = 1; attempt < maxAttempt; ++attempt) {
-      const auto amount = attempt < 5 ? attempt : (attempt - 5) * -1;
-      track.navState.Clear();
-      const auto pushedPosition = track.pos + amount * tolerance * track.dir;
-      BVHNavigator::LocatePointIn(world, pushedPosition, track.navState, true);
-      // The track must be on boundary at this point
-      track.navState.SetBoundaryState(true);
-      // nextState is initialized as needed.
-#ifndef NDEBUG
-      const vecgeom::VPlacedVolume *volume = track.navState.Top();
-      int lvolID                           = volume->GetLogicalVolume()->id();
-      adeptint::VolAuxData const &auxData  = AsyncAdePT::gVolAuxData[lvolID];
-      if (auxData.fGPUregion && attempt == 1) {
-        break;
-      } else {
-        printf("Error [%d, %d]: ev=%d track=%d thread=%d: GPUregion=%d volID=%d "
-               "x=(%18.15f, %18.15f, %18.15f) dir=(%f, %f, %f) "
-               "Safety=%17.15f DistanceToOut=%f shiftAmount=%d\n",
-               blockIdx.x, threadIdx.x, trackInfo.eventId, trackInfo.trackId, trackInfo.threadId, auxData.fGPUregion,
-               volume->id(), pushedPosition[0], pushedPosition[1], pushedPosition[2], track.dir[0], track.dir[1],
-               track.dir[2], BVHNavigator::ComputeSafety(pushedPosition, track.navState),
-               volume->DistanceToOut(track.pos, track.dir), amount);
-        track.navState.Print();
-        if (auxData.fGPUregion) {
-          printf("Success in attempt %d shiftAmount %d\n", attempt, amount);
-          break;
-        }
-      }
-#endif
-    }
+    // FIXME KEEP OLD IMPLEMENTATION SINCE THIS HAS NOT BEEN TESTED THOROUGLY, THEN REMOVE
+    //     // We locate the pushed point because we run the risk that the
+    //     // point is not located in the GPU region
+    // #ifdef NDEBUG
+    //     constexpr int maxAttempt = 2;
+    // #else
+    //     constexpr int maxAttempt = 10;
+    // #endif
+    //     for (int attempt = 1; attempt < maxAttempt; ++attempt) {
+    //       const auto amount = attempt < 5 ? attempt : (attempt - 5) * -1;
+    //       track.navState.Clear();
+    //       const auto pushedPosition = track.pos + amount * tolerance * track.dir;
+    //       BVHNavigator::LocatePointIn(world, pushedPosition, track.navState, true);
+    //       // The track must be on boundary at this point
+    //       track.navState.SetBoundaryState(true);
+    //       // nextState is initialized as needed.
+    // #ifndef NDEBUG
+    //       const vecgeom::VPlacedVolume *volume = track.navState.Top();
+    //       int lvolID                           = volume->GetLogicalVolume()->id();
+    //       adeptint::VolAuxData const &auxData  = AsyncAdePT::gVolAuxData[lvolID];
+    //       if (auxData.fGPUregion && attempt == 1) {
+    //         break;
+    //       } else {
+    //         printf("Error [%d, %d]: ev=%d track=%d thread=%d: GPUregion=%d volID=%d "
+    //                "x=(%18.15f, %18.15f, %18.15f) dir=(%f, %f, %f) "
+    //                "Safety=%17.15f DistanceToOut=%f shiftAmount=%d\n",
+    //                blockIdx.x, threadIdx.x, trackInfo.eventId, trackInfo.trackId, trackInfo.threadId,
+    //                auxData.fGPUregion, volume->id(), pushedPosition[0], pushedPosition[1], pushedPosition[2],
+    //                track.dir[0], track.dir[1], track.dir[2], BVHNavigator::ComputeSafety(pushedPosition,
+    //                track.navState), volume->DistanceToOut(track.pos, track.dir), amount);
+    //         track.navState.Print();
+    //         if (auxData.fGPUregion) {
+    //           printf("Success in attempt %d shiftAmount %d\n", attempt, amount);
+    //           break;
+    //         }
+    //       }
+    // #endif
+    //     }
   }
 }
 
@@ -302,7 +192,7 @@ __global__ void FillFromDeviceBuffer(AllLeaked all, AsyncAdePT::TrackDataWithIDs
       pdg          = -11;
     }
 
-    const auto trackSlot = (*leakedTracks->fLeakedQueue)[queueSlot];
+    const auto trackSlot     = (*leakedTracks->fLeakedQueue)[queueSlot];
     Track const *const track = leakedTracks->fTracks + trackSlot;
 
     if (i >= maxFromDeviceBuffer) {
@@ -315,7 +205,7 @@ __global__ void FillFromDeviceBuffer(AllLeaked all, AsyncAdePT::TrackDataWithIDs
       fromDevice[i].direction[0] = track->dir[0];
       fromDevice[i].direction[1] = track->dir[1];
       fromDevice[i].direction[2] = track->dir[2];
-      fromDevice[i].eKin         = track->energy;
+      fromDevice[i].eKin         = track->eKin;
       fromDevice[i].globalTime   = track->globalTime;
       fromDevice[i].localTime    = track->localTime;
       fromDevice[i].properTime   = track->properTime;
@@ -333,6 +223,7 @@ __global__ void FreeSlots1(Ts... slotManagers)
 {
   (slotManagers->FreeMarkedSlotsStage1(), ...);
 }
+
 template <typename... Ts>
 __global__ void FreeSlots2(Ts... slotManagers)
 {
@@ -347,8 +238,8 @@ __global__ void FinishIteration(AllParticleQueues all, Stats *stats, TracksAndSl
     // Clear queues and write statistics
     for (int i = threadIdx.x; i < ParticleType::NumParticleTypes; i += blockDim.x) {
       all.queues[i].currentlyActive->clear();
-      stats->inFlight[i]     = all.queues[i].nextActive->size();
-      stats->leakedTracks[i] = all.queues[i].leakedTracksCurrent->size() + all.queues[i].leakedTracksNext->size();
+      stats->inFlight[i]       = all.queues[i].nextActive->size();
+      stats->leakedTracks[i]   = all.queues[i].leakedTracksCurrent->size() + all.queues[i].leakedTracksNext->size();
       stats->queueFillLevel[i] = float(all.queues[i].nextActive->size()) / all.queues[i].nextActive->max_size();
     }
   } else if (blockIdx.x == 1 && threadIdx.x == 0) {
@@ -407,7 +298,7 @@ __global__ void ZeroEventCounters(Stats *stats)
  */
 __global__ void CountCurrentPopulation(AllParticleQueues all, Stats *stats, TracksAndSlots tracksAndSlots)
 {
-  constexpr unsigned int N = AdeptIntegration::kMaxThreads;
+  constexpr unsigned int N = kMaxThreads;
   __shared__ unsigned int sharedCount[N];
 
   for (unsigned int particleType = blockIdx.x; particleType < ParticleType::NumParticleTypes;
@@ -479,37 +370,73 @@ __global__ void InitSlotManagers(SlotManager *mgr, std::size_t N)
   }
 }
 
-bool AdeptIntegration::InitializeGeometry(const vecgeom::cxx::VPlacedVolume *world)
+/// WIP: Free functions implementing the CUDA parts
+namespace async_adept_impl {
+
+G4HepEmState *InitG4HepEm()
 {
-  // Upload geometry to GPU.
-  auto &cudaManager = vecgeom::cxx::CudaManager::Instance();
-  cudaManager.LoadGeometry(world);
-  auto world_dev = cudaManager.Synchronize();
-  // Initialize BVH
-  InitBVH();
-
-  return (world_dev != nullptr);
+  auto state = new G4HepEmState;
+  InitG4HepEmState(state);
+
+  G4HepEmMatCutData *cutData = state->fData->fTheMatCutData;
+  G4cout << "fNumG4MatCuts = " << cutData->fNumG4MatCuts << ", fNumMatCutData = " << cutData->fNumMatCutData << G4endl;
+
+  // Copy to GPU.
+  CopyG4HepEmDataToGPU(state->fData);
+  COPCORE_CUDA_CHECK(cudaMemcpyToSymbol(g4HepEmPars, state->fParameters, sizeof(G4HepEmParameters)));
+
+  // Create G4HepEmData with the device pointers.
+  G4HepEmData dataOnDevice;
+  dataOnDevice.fTheMatCutData   = state->fData->fTheMatCutData_gpu;
+  dataOnDevice.fTheMaterialData = state->fData->fTheMaterialData_gpu;
+  dataOnDevice.fTheElementData  = state->fData->fTheElementData_gpu;
+  dataOnDevice.fTheElectronData = state->fData->fTheElectronData_gpu;
+  dataOnDevice.fThePositronData = state->fData->fThePositronData_gpu;
+  dataOnDevice.fTheSBTableData  = state->fData->fTheSBTableData_gpu;
+  dataOnDevice.fTheGammaData    = state->fData->fTheGammaData_gpu;
+  // The other pointers should never be used.
+  dataOnDevice.fTheMatCutData_gpu   = nullptr;
+  dataOnDevice.fTheMaterialData_gpu = nullptr;
+  dataOnDevice.fTheElementData_gpu  = nullptr;
+  dataOnDevice.fTheElectronData_gpu = nullptr;
+  dataOnDevice.fThePositronData_gpu = nullptr;
+  dataOnDevice.fTheSBTableData_gpu  = nullptr;
+  dataOnDevice.fTheGammaData_gpu    = nullptr;
+
+  COPCORE_CUDA_CHECK(cudaMemcpyToSymbol(g4HepEmData, &dataOnDevice, sizeof(G4HepEmData)));
+
+  return state;
 }
 
-bool AdeptIntegration::InitializePhysics(double bz)
+bool InitializeField(double bz)
 {
-  // Initialize shared physics data
-  fg4hepem_state.reset(InitG4HepEm());
-
   // Initialize field
   COPCORE_CUDA_CHECK(cudaMemcpyToSymbol(BzFieldValue, &bz, sizeof(double)));
+  return true;
+}
 
+bool InitializeApplyCuts(bool applycuts)
+{
+  // Initialize ApplyCut
+  COPCORE_CUDA_CHECK(cudaMemcpyToSymbol(ApplyCuts, &applycuts, sizeof(bool)));
   return true;
 }
 
+void FlushScoring(AdePTScoring &scoring)
+{
+  scoring.CopyToHost();
+  scoring.ClearGPU();
+}
+
 /// Allocate memory on device, as well as streams and cuda events to synchronise kernels.
 /// If successful, this will initialise the member fGPUState.
 /// If memory allocation fails, an exception is thrown. In this case, the caller has to
 /// try again after some wait time or with less transport slots.
-void AdeptIntegration::InitializeGPU()
+std::unique_ptr<GPUstate, GPUstateDeleter> InitializeGPU(int trackCapacity, int scoringCapacity, int numThreads,
+                                                         TrackBuffer &trackBuffer, std::vector<AdePTScoring> &scoring)
 {
-  auto gpuStateTmp   = std::make_unique<GPUstate>();
-  GPUstate &gpuState = *gpuStateTmp;
+  auto gpuState_ptr  = std::unique_ptr<GPUstate, GPUstateDeleter>(new GPUstate());
+  GPUstate &gpuState = *gpuState_ptr;
 
   // Allocate structures to manage tracks of an implicit type:
   //  * memory to hold the actual Track elements,
@@ -537,8 +464,8 @@ void AdeptIntegration::InitializeGPU()
     if (emplaceForAutoDelete) gpuState.allCudaPointers.push_back(devPtr);
   };
 
-  gpuState.slotManager_host = SlotManager{static_cast<SlotManager::value_type>(fTrackCapacity),
-                                          static_cast<SlotManager::value_type>(fTrackCapacity)};
+  gpuState.slotManager_host = SlotManager{static_cast<SlotManager::value_type>(trackCapacity),
+                                          static_cast<SlotManager::value_type>(trackCapacity)};
   gpuState.slotManager_dev  = nullptr;
   gpuMalloc(gpuState.slotManager_dev, gpuState.nSlotManager_dev);
   COPCORE_CUDA_CHECK(
@@ -550,7 +477,7 @@ void AdeptIntegration::InitializeGPU()
     ParticleType &particleType = gpuState.particles[i];
     // Provide 20% more queue slots than track slots, so a large cluster of a specific particle type
     // doesn't exhaust the queues.
-    const size_t nSlot              = fTrackCapacity * ParticleType::relativeQueueSize[i] * 1.2;
+    const size_t nSlot              = trackCapacity * ParticleType::relativeQueueSize[i] * 1.2;
     const size_t sizeOfQueueStorage = adept::MParray::SizeOfInstance(nSlot);
     const size_t sizeOfLeakQueue    = adept::MParray::SizeOfInstance(nSlot / 10);
 
@@ -573,7 +500,7 @@ void AdeptIntegration::InitializeGPU()
 
   // init gamma interaction queues
   for (unsigned int i = 0; i < GammaInteractions::NInt; ++i) {
-    const auto capacity     = fTrackCapacity / 6;
+    const auto capacity     = trackCapacity / 6;
     const auto instanceSize = adept::MParrayT<GammaInteractions::Data>::SizeOfInstance(capacity);
     void *gpuPtr            = nullptr;
     gpuMalloc(gpuPtr, instanceSize);
@@ -586,105 +513,93 @@ void AdeptIntegration::InitializeGPU()
   COPCORE_CUDA_CHECK(cudaMallocHost(&gpuState.stats, sizeof(Stats)));
 
   // init scoring structures
-  gpuMalloc(gpuState.fScoring_dev, fNThread);
+  gpuMalloc(gpuState.fScoring_dev, numThreads);
 
-  fScoring.clear();
-  fScoring.reserve(fNThread);
-  for (unsigned int i = 0; i < fNThread; ++i) {
-    fScoring.emplace_back(gpuState.fScoring_dev + i);
+  scoring.clear();
+  scoring.reserve(numThreads);
+  for (unsigned int i = 0; i < numThreads; ++i) {
+    scoring.emplace_back(gpuState.fScoring_dev + i);
   }
-  gpuState.fHitScoring.reset(new HitScoring(fScoringCapacity, fNThread));
+  gpuState.fHitScoring.reset(new HitScoring(scoringCapacity, numThreads));
 
-  const auto injectQueueSize = adept::MParrayT<QueueIndexPair>::SizeOfInstance(fBuffer->fNumToDevice);
+  const auto injectQueueSize = adept::MParrayT<QueueIndexPair>::SizeOfInstance(trackBuffer.fNumToDevice);
   void *gpuPtr               = nullptr;
   gpuMalloc(gpuPtr, injectQueueSize);
   gpuState.injectionQueue = static_cast<adept::MParrayT<QueueIndexPair> *>(gpuPtr);
-  InitQueue<QueueIndexPair><<<1, 1>>>(gpuState.injectionQueue, fBuffer->fNumToDevice);
+  InitQueue<QueueIndexPair><<<1, 1>>>(gpuState.injectionQueue, trackBuffer.fNumToDevice);
 
   // This is the largest allocation. If it does not fit, we need to try again:
   Track *trackStorage_dev = nullptr;
-  gpuMalloc(trackStorage_dev, fTrackCapacity);
+  gpuMalloc(trackStorage_dev, trackCapacity);
 
   for (auto &partType : gpuState.particles) {
     partType.tracks = trackStorage_dev;
   }
 
-  fGPUstate = std::move(gpuStateTmp);
+  return gpuState_ptr;
 }
 
-void AdeptIntegration::FreeGPU()
+void AdvanceEventStates(EventState oldState, EventState newState, std::vector<std::atomic<EventState>> &eventStates)
 {
-  fGPUstate->runTransport = false;
-  fGPUWorker.join();
-
-  adeptint::VolAuxData *volAux = nullptr;
-  COPCORE_CUDA_CHECK(cudaMemcpyFromSymbol(&volAux, AsyncAdePT::gVolAuxData, sizeof(adeptint::VolAuxData *)));
-  COPCORE_CUDA_CHECK(cudaFree(volAux));
-
-  // Free resources.
-  fGPUstate.reset();
-
-  // Free G4HepEm data
-  FreeG4HepEmData(AdeptIntegration::fg4hepem_state->fData);
+  for (auto &eventState : eventStates) {
+    EventState expected = oldState;
+    eventState.compare_exchange_strong(expected, newState, std::memory_order_release, std::memory_order_relaxed);
+  }
 }
 
-void AdeptIntegration::ReturnTracksToG4()
+__host__ void ReturnTracksToG4(TrackBuffer &trackBuffer, GPUstate &gpuState,
+                               std::vector<std::atomic<EventState>> &eventStates)
 {
-  std::scoped_lock lock{fBuffer->fromDeviceMutex};
-  const auto &fromDevice                      = fBuffer->fromDevice_host.get();
-  TrackDataWithIDs const *const fromDeviceEnd = fromDevice + *fBuffer->nFromDevice_host;
+  std::scoped_lock lock{trackBuffer.fromDeviceMutex};
+  const auto &fromDevice                      = trackBuffer.fromDevice_host.get();
+  TrackDataWithIDs const *const fromDeviceEnd = fromDevice + *trackBuffer.nFromDevice_host;
 
   for (TrackDataWithIDs *trackIt = fromDevice; trackIt < fromDeviceEnd; ++trackIt) {
-    assert(0 <= trackIt->threadId && trackIt->threadId <= fNThread);
-    fBuffer->fromDeviceBuffers[trackIt->threadId].push_back(*trackIt);
+    // TODO: Pass numThreads here, only used in debug mode however
+    // assert(0 <= trackIt->threadId && trackIt->threadId <= numThreads);
+    trackBuffer.fromDeviceBuffers[trackIt->threadId].push_back(*trackIt);
   }
 
-  AdvanceEventStates(EventState::FlushingTracks, EventState::DeviceFlushed);
-  fGPUstate->extractState = GPUstate::ExtractState::Idle;
+  AdvanceEventStates(EventState::FlushingTracks, EventState::DeviceFlushed, eventStates);
+  gpuState.extractState = GPUstate::ExtractState::Idle;
 
 #ifndef NDEBUG
-  for (const auto &trackBuffer : fBuffer->fromDeviceBuffers) {
-    if (trackBuffer.empty()) continue;
-    const auto eventId = trackBuffer.front().eventId;
-    assert(std::all_of(trackBuffer.begin(), trackBuffer.end(),
+  for (const auto &buffer : trackBuffer.fromDeviceBuffers) {
+    if (buffer.empty()) continue;
+    const auto eventId = buffer.front().eventId;
+    assert(std::all_of(buffer.begin(), buffer.end(),
                        [eventId](const TrackDataWithIDs &track) { return eventId == track.eventId; }));
   }
 #endif
 }
 
-void AdeptIntegration::AdvanceEventStates(EventState oldState, EventState newState)
+void HitProcessingLoop(HitProcessingContext *const context, GPUstate &gpuState,
+                       std::vector<std::atomic<EventState>> &eventStates, std::condition_variable &cvG4Workers)
 {
-  for (auto &eventState : fEventStates) {
-    EventState expected = oldState;
-    eventState.compare_exchange_strong(expected, newState, std::memory_order_release, std::memory_order_relaxed);
+  while (context->keepRunning) {
+    std::unique_lock lock(context->mutex);
+    context->cv.wait(lock);
+
+    gpuState.fHitScoring->TransferHitsToHost(context->hitTransferStream);
+    const bool haveNewHits = gpuState.fHitScoring->ProcessHits();
+
+    if (haveNewHits) {
+      AdvanceEventStates(EventState::FlushingHits, EventState::HitsFlushed, eventStates);
+      cvG4Workers.notify_all();
+    }
   }
 }
 
-void AdeptIntegration::TransportLoop()
+void TransportLoop(int trackCapacity, int scoringCapacity, int numThreads, TrackBuffer &trackBuffer, GPUstate &gpuState,
+                   std::vector<std::atomic<EventState>> &eventStates, std::condition_variable &cvG4Workers,
+                   std::vector<AdePTScoring> &scoring, int adeptSeed)
 {
-  NVTXTracer tracer{"TransportLoop"};
-
-  // Initialise the transport engine:
-  do {
-    try {
-      InitializeGPU();
-    } catch (std::invalid_argument &exc) {
-      // Clear error state:
-      auto result = cudaGetLastError();
-      std::cerr << "\nError: AdePT failed to initialise the device (" << cudaGetErrorName(result) << "):\n"
-                << exc.what() << "\nReducing track capacity: " << fTrackCapacity << " --> " << fTrackCapacity * 0.9
-                << '\n';
-      fTrackCapacity *= 0.9;
-
-      if (fTrackCapacity < 10000) throw std::runtime_error{"AdePT is unable to allocate GPU memory."};
-    }
-  } while (!fGPUstate);
+  // NVTXTracer tracer{"TransportLoop"};
 
-  using InjectState  = GPUstate::InjectState;
+  using InjectState                             = GPUstate::InjectState;
   using ExtractState                            = GPUstate::ExtractState;
   auto &cudaManager                             = vecgeom::cxx::CudaManager::Instance();
   const vecgeom::cuda::VPlacedVolume *world_dev = cudaManager.world_gpu();
-  GPUstate &gpuState                            = *fGPUstate;
 
   ParticleType &electrons = gpuState.particles[ParticleType::Electron];
   ParticleType &positrons = gpuState.particles[ParticleType::Positron];
@@ -708,7 +623,8 @@ void AdeptIntegration::TransportLoop()
   };
 
   std::unique_ptr<HitProcessingContext> hitProcessing{new HitProcessingContext{transferStream}};
-  std::thread hitProcessingThread{&AdeptIntegration::HitProcessingLoop, this, hitProcessing.get()};
+  std::thread hitProcessingThread{&HitProcessingLoop, (HitProcessingContext *)hitProcessing.get(), std::ref(gpuState),
+                                  std::ref(eventStates), std::ref(cvG4Workers)};
 
   auto computeThreadsAndBlocks = [](unsigned int nParticles) -> std::pair<unsigned int, unsigned int> {
     constexpr int TransportThreads             = 256;
@@ -723,7 +639,7 @@ void AdeptIntegration::TransportLoop()
   };
 
   while (gpuState.runTransport) {
-    NVTXTracer nvtx1{"Setup"}, nvtx2{"Setup2"};
+    // NVTXTracer nvtx1{"Setup"}, nvtx2{"Setup2"};
     InitSlotManagers<<<80, 256, 0, gpuState.stream>>>(gpuState.slotManager_dev, gpuState.nSlotManager_dev);
     COPCORE_CUDA_CHECK(cudaMemsetAsync(gpuState.stats_dev, 0, sizeof(Stats), gpuState.stream));
 
@@ -735,41 +651,42 @@ void AdeptIntegration::TransportLoop()
       return state.load(std::memory_order_acquire) < EventState::LeakedTracksRetrieved;
     };
     // Wait for work from G4 workers:
-    while (gpuState.runTransport && std::none_of(fEventStates.begin(), fEventStates.end(), needTransport)) {
+    while (gpuState.runTransport && std::none_of(eventStates.begin(), eventStates.end(), needTransport)) {
       using namespace std::chrono_literals;
       std::this_thread::sleep_for(10ms);
     }
 
-    if (fDebugLevel > 2) {
-      G4cout << "GPU transport starting" << std::endl;
-    }
+    // TODO: Pass debug level here
+    // if (fDebugLevel > 2) {
+    //   G4cout << "GPU transport starting" << std::endl;
+    // }
 
     COPCORE_CUDA_CHECK(cudaStreamSynchronize(gpuState.stream));
 
-#ifdef USE_NVTX
-    std::map<AdeptIntegration::EventState, std::string> stateMap{
-        {EventState::NewTracksFromG4, "NewTracksFromG4"},
-        {EventState::G4RequestsFlush, "G4RequestsFlush"},
-        {EventState::Inject, "Inject"},
-        {EventState::InjectionCompleted, "InjectionCompleted"},
-        {EventState::Transporting, "Transporting"},
-        {EventState::WaitingForTransportToFinish, "WaitingForTransportToFinish"},
-        {EventState::RequestHitFlush, "RequestHitFlush"},
-        {EventState::HitsFlushed, "HitsFlushed"},
-        {EventState::FlushingTracks, "FlushingTracks"},
-        {EventState::DeviceFlushed, "DeviceFlushed"},
-        {EventState::LeakedTracksRetrieved, "LeakedTracksRetrieved"},
-        {EventState::ScoringRetrieved, "ScoringRetrieved"}};
-#endif
+    // #ifdef USE_NVTX
+    //     std::map<AsyncAdePTTransport::EventState, std::string> stateMap{
+    //         {EventState::NewTracksFromG4, "NewTracksFromG4"},
+    //         {EventState::G4RequestsFlush, "G4RequestsFlush"},
+    //         {EventState::Inject, "Inject"},
+    //         {EventState::InjectionCompleted, "InjectionCompleted"},
+    //         {EventState::Transporting, "Transporting"},
+    //         {EventState::WaitingForTransportToFinish, "WaitingForTransportToFinish"},
+    //         {EventState::RequestHitFlush, "RequestHitFlush"},
+    //         {EventState::HitsFlushed, "HitsFlushed"},
+    //         {EventState::FlushingTracks, "FlushingTracks"},
+    //         {EventState::DeviceFlushed, "DeviceFlushed"},
+    //         {EventState::LeakedTracksRetrieved, "LeakedTracksRetrieved"},
+    //         {EventState::ScoringRetrieved, "ScoringRetrieved"}};
+    // #endif
 
     for (unsigned int iteration = 0;
          inFlight > 0 || gpuState.injectState != InjectState::Idle || gpuState.extractState != ExtractState::Idle ||
-         std::any_of(fEventStates.begin(), fEventStates.end(), needTransport);
+         std::any_of(eventStates.begin(), eventStates.end(), needTransport);
          ++iteration) {
-#ifdef USE_NVTX
-      nvtx1.setTag(stateMap[fEventStates[0].load()].data());
-      nvtx2.setTag(stateMap[fEventStates[1].load()].data());
-#endif
+      // #ifdef USE_NVTX
+      //       nvtx1.setTag(stateMap[eventStates[0].load()].data());
+      //       nvtx2.setTag(stateMap[eventStates[1].load()].data());
+      // #endif
 
       // Swap the queues for the next iteration.
       electrons.queues.SwapActive();
@@ -789,7 +706,7 @@ void AdeptIntegration::TransportLoop()
       // *** Particle injection ***
       // --------------------------
       if (gpuState.injectState == InjectState::Idle) {
-        for (auto &eventState : fEventStates) {
+        for (auto &eventState : eventStates) {
           if (const auto state = eventState.load(std::memory_order_acquire); state == EventState::G4RequestsFlush) {
             eventState = EventState::Inject;
           } else if (state == EventState::Inject) {
@@ -797,18 +714,19 @@ void AdeptIntegration::TransportLoop()
           }
         }
 
-        if (auto &toDevice = fBuffer->getActiveBuffer(); toDevice.nTrack > 0) {
+        if (auto &toDevice = trackBuffer.getActiveBuffer(); toDevice.nTrack > 0) {
           gpuState.injectState = InjectState::CreatingSlots;
 
-          fBuffer->swapToDeviceBuffers();
+          trackBuffer.swapToDeviceBuffers();
           std::scoped_lock lock{toDevice.mutex};
           const auto nInject = std::min(toDevice.nTrack.load(), toDevice.maxTracks);
           toDevice.nTrack    = 0;
 
-          if (fDebugLevel > 3) std::cout << "Injecting " << nInject << " to GPU\n";
+          // TODO: Pass debug level here
+          // if (fDebugLevel > 3) std::cout << "Injecting " << nInject << " to GPU\n";
 
           // copy buffer of tracks to device
-          COPCORE_CUDA_CHECK(cudaMemcpyAsync(fBuffer->toDevice_dev.get(), toDevice.tracks,
+          COPCORE_CUDA_CHECK(cudaMemcpyAsync(trackBuffer.toDevice_dev.get(), toDevice.tracks,
                                              nInject * sizeof(TrackDataWithIDs), cudaMemcpyHostToDevice,
                                              transferStream));
           // Mark end of copy operation:
@@ -818,7 +736,7 @@ void AdeptIntegration::TransportLoop()
           constexpr auto injectThreads = 128u;
           const auto injectBlocks      = (nInject + injectThreads - 1) / injectThreads;
           InjectTracks<<<injectBlocks, injectThreads, 0, transferStream>>>(
-              fBuffer->toDevice_dev.get(), nInject, secondaries, world_dev, gpuState.injectionQueue, fAdePTSeed);
+              trackBuffer.toDevice_dev.get(), nInject, secondaries, world_dev, gpuState.injectionQueue, adeptSeed);
           COPCORE_CUDA_CHECK(cudaLaunchHostFunc(
               transferStream,
               [](void *arg) { (*static_cast<decltype(GPUstate::injectState) *>(arg)) = InjectState::ReadyToEnqueue; },
@@ -886,8 +804,8 @@ void AdeptIntegration::TransportLoop()
       // *** Count detailed event statistics ***
       // ---------------------------------------
       {
-        AdvanceEventStates(EventState::Transporting, EventState::WaitingForTransportToFinish);
-        AdvanceEventStates(EventState::InjectionCompleted, EventState::Transporting);
+        AdvanceEventStates(EventState::Transporting, EventState::WaitingForTransportToFinish, eventStates);
+        AdvanceEventStates(EventState::InjectionCompleted, EventState::Transporting, eventStates);
 
         // Reset all counters count the currently flying population
         ZeroEventCounters<<<1, 256, 0, statsStream>>>(gpuState.stats_dev);
@@ -910,11 +828,11 @@ void AdeptIntegration::TransportLoop()
       // -------------------------
 
       if (gpuState.extractState == ExtractState::Idle &&
-          std::any_of(fEventStates.begin(), fEventStates.end(), [](const auto &eventState) {
+          std::any_of(eventStates.begin(), eventStates.end(), [](const auto &eventState) {
             return eventState.load(std::memory_order_acquire) == EventState::HitsFlushed;
           })) {
         gpuState.extractState = ExtractState::FreeingSlots;
-        AdvanceEventStates(EventState::HitsFlushed, EventState::FlushingTracks);
+        AdvanceEventStates(EventState::HitsFlushed, EventState::FlushingTracks, eventStates);
 
         const AllLeaked allLeaked = {.leakedElectrons = {electrons.tracks, electrons.queues.leakedTracksCurrent,
                                                          electrons.queues.leakedTracksNext, electrons.slotManager},
@@ -930,10 +848,10 @@ void AdeptIntegration::TransportLoop()
 
         // Populate the staging buffer and copy to host
         constexpr unsigned int block_size = 128;
-        const unsigned int grid_size      = (fBuffer->fNumFromDevice + block_size - 1) / block_size;
-        FillFromDeviceBuffer<<<grid_size, block_size, 0, transferStream>>>(allLeaked, fBuffer->fromDevice_dev.get(),
-                                                                           fBuffer->fNumFromDevice);
-        COPCORE_CUDA_CHECK(cudaMemcpyFromSymbolAsync(fBuffer->nFromDevice_host.get(), nFromDevice_dev,
+        const unsigned int grid_size      = (trackBuffer.fNumFromDevice + block_size - 1) / block_size;
+        FillFromDeviceBuffer<<<grid_size, block_size, 0, transferStream>>>(allLeaked, trackBuffer.fromDevice_dev.get(),
+                                                                           trackBuffer.fNumFromDevice);
+        COPCORE_CUDA_CHECK(cudaMemcpyFromSymbolAsync(trackBuffer.nFromDevice_host.get(), nFromDevice_dev,
                                                      sizeof(unsigned int), 0, cudaMemcpyDeviceToHost, transferStream));
         COPCORE_CUDA_CHECK(cudaLaunchHostFunc(
             transferStream,
@@ -950,11 +868,28 @@ void AdeptIntegration::TransportLoop()
 
       if (gpuState.extractState == ExtractState::ReadyToCopy) {
         gpuState.extractState = ExtractState::CopyToHost;
-        COPCORE_CUDA_CHECK(cudaMemcpyAsync(fBuffer->fromDevice_host.get(), fBuffer->fromDevice_dev.get(),
-                                           (*fBuffer->nFromDevice_host) * sizeof(TrackDataWithIDs),
+        COPCORE_CUDA_CHECK(cudaMemcpyAsync(trackBuffer.fromDevice_host.get(), trackBuffer.fromDevice_dev.get(),
+                                           (*trackBuffer.nFromDevice_host) * sizeof(TrackDataWithIDs),
                                            cudaMemcpyDeviceToHost, transferStream));
+
+        struct CallbackData {
+          TrackBuffer *trackBuffer;
+          GPUstate *gpuState;
+          std::vector<std::atomic<EventState>> *eventStates;
+        };
+
+        // Needs to be dynamically allocated, since the callback may execute after
+        // the current scope has ended.
+        CallbackData *data = new CallbackData{&trackBuffer, &gpuState, &eventStates};
+
         COPCORE_CUDA_CHECK(cudaLaunchHostFunc(
-            transferStream, [](void *thisPtr) { static_cast<AdeptIntegration *>(thisPtr)->ReturnTracksToG4(); }, this));
+            transferStream,
+            [](void *userData) {
+              CallbackData *data = static_cast<CallbackData *>(userData);
+              ReturnTracksToG4(*data->trackBuffer, *data->gpuState, *data->eventStates);
+              delete data;
+            },
+            data));
       }
 
       // -------------------------
@@ -1006,10 +941,10 @@ void AdeptIntegration::TransportLoop()
           particlesInFlight[i] = gpuState.stats->inFlight[i];
         }
 
-        for (unsigned short threadId = 0; threadId < fNThread; ++threadId) {
-          const auto state = fEventStates[threadId].load(std::memory_order_acquire);
+        for (unsigned short threadId = 0; threadId < numThreads; ++threadId) {
+          const auto state = eventStates[threadId].load(std::memory_order_acquire);
           if (state == EventState::WaitingForTransportToFinish && gpuState.stats->perEventInFlight[threadId] == 0) {
-            fEventStates[threadId] = EventState::RequestHitFlush;
+            eventStates[threadId] = EventState::RequestHitFlush;
           }
           if (EventState::RequestHitFlush <= state && state < EventState::LeakedTracksRetrieved &&
               gpuState.stats->perEventInFlight[threadId] != 0) {
@@ -1023,9 +958,9 @@ void AdeptIntegration::TransportLoop()
           hitProcessing->cv.notify_one();
         } else {
           if (gpuState.stats->hitBufferOccupancy >= gpuState.fHitScoring->HitCapacity() / 2 ||
-              std::any_of(fEventStates.begin(), fEventStates.end(),
+              std::any_of(eventStates.begin(), eventStates.end(),
                           [](const auto &state) { return state == EventState::RequestHitFlush; })) {
-            AdvanceEventStates(EventState::RequestHitFlush, EventState::FlushingHits);
+            AdvanceEventStates(EventState::RequestHitFlush, EventState::FlushingHits, eventStates);
             gpuState.fHitScoring->SwapDeviceBuffers(gpuState.stream);
             hitProcessing->cv.notify_one();
           }
@@ -1033,33 +968,36 @@ void AdeptIntegration::TransportLoop()
       }
 
       // *** Notify G4 workers if their events completed ***
-      if (std::any_of(fEventStates.begin(), fEventStates.end(),
+      if (std::any_of(eventStates.begin(), eventStates.end(),
                       [](const EventState &state) { return state == EventState::DeviceFlushed; })) {
-        fCV_G4Workers.notify_all();
+        cvG4Workers.notify_all();
       }
 
-      if (fDebugLevel >= 3 && inFlight > 0 || (fDebugLevel >= 2 && iteration % 500 == 0)) {
-        std::cerr << inFlight << " in flight ";
-        std::cerr << "(" << gpuState.stats->inFlight[ParticleType::Electron] << " "
-                  << gpuState.stats->inFlight[ParticleType::Positron] << " "
-                  << gpuState.stats->inFlight[ParticleType::Gamma] << "),\tqueues:(" << std::setprecision(3)
-                  << gpuState.stats->queueFillLevel[ParticleType::Electron] << " "
-                  << gpuState.stats->queueFillLevel[ParticleType::Positron] << " "
-                  << gpuState.stats->queueFillLevel[ParticleType::Gamma] << ")";
-        std::cerr << "\t slots:" << gpuState.stats->slotFillLevel << ", " << numLeaked << " leaked."
-                  << "\tInjectState: " << static_cast<unsigned int>(gpuState.injectState.load())
-                  << "\tExtractState: " << static_cast<unsigned int>(gpuState.extractState.load())
-                  << "\tHitBuffer: " << gpuState.stats->hitBufferOccupancy;
-        if (fDebugLevel >= 4) {
-          std::cerr << "\n\tper event: ";
-          for (unsigned int i = 0; i < fNThread; ++i) {
-            std::cerr << i << ": " << gpuState.stats->perEventInFlight[i]
-                      << " (s=" << static_cast<unsigned short>(fEventStates[i].load(std::memory_order_acquire))
-                      << ")\t";
-          }
-        }
-        std::cerr << std::endl;
-      }
+      // TODO: get fDebugLevel correctly and put prints back in.
+      // int fDebugLevel = 0;
+      // int fNThread = numThreads;
+      // if (fDebugLevel >= 3 && inFlight > 0 || (fDebugLevel >= 2 && iteration % 500 == 0)) {
+      //   std::cerr << inFlight << " in flight ";
+      //   std::cerr << "(" << gpuState.stats->inFlight[ParticleType::Electron] << " "
+      //             << gpuState.stats->inFlight[ParticleType::Positron] << " "
+      //             << gpuState.stats->inFlight[ParticleType::Gamma] << "),\tqueues:(" << std::setprecision(3)
+      //             << gpuState.stats->queueFillLevel[ParticleType::Electron] << " "
+      //             << gpuState.stats->queueFillLevel[ParticleType::Positron] << " "
+      //             << gpuState.stats->queueFillLevel[ParticleType::Gamma] << ")";
+      //   std::cerr << "\t slots:" << gpuState.stats->slotFillLevel << ", " << numLeaked << " leaked."
+      //             << "\tInjectState: " << static_cast<unsigned int>(gpuState.injectState.load())
+      //             << "\tExtractState: " << static_cast<unsigned int>(gpuState.extractState.load())
+      //             << "\tHitBuffer: " << gpuState.stats->hitBufferOccupancy;
+      //   if (fDebugLevel >= 4) {
+      //     std::cerr << "\n\tper event: ";
+      //     for (unsigned int i = 0; i < fNThread; ++i) {
+      //       std::cerr << i << ": " << gpuState.stats->perEventInFlight[i]
+      //                 << " (s=" << static_cast<unsigned short>(eventStates[i].load(std::memory_order_acquire))
+      //                 << ")\t";
+      //     }
+      //   }
+      //   std::cerr << std::endl;
+      // }
 
 #ifndef NDEBUG
       // *** Check slots ***
@@ -1099,7 +1037,8 @@ void AdeptIntegration::TransportLoop()
     ClearAllQueues<<<1, 1, 0, gpuState.stream>>>(queues);
     COPCORE_CUDA_CHECK(cudaStreamSynchronize(gpuState.stream));
 
-    if (fDebugLevel > 2) std::cout << "End transport loop.\n";
+    // TODO
+    // if (fDebugLevel > 2) std::cout << "End transport loop.\n";
   }
 
   hitProcessing->keepRunning = false;
@@ -1107,26 +1046,101 @@ void AdeptIntegration::TransportLoop()
   hitProcessingThread.join();
 
   // Free device memory:
-  fGPUstate = nullptr;
+  // TODO: Do this from the caller if possible, doesn't make sense here
+  // fGPUstate = nullptr;
 }
 
-std::shared_ptr<const std::vector<GPUHit>> AdeptIntegration::GetGPUHits(unsigned int threadId) const
+std::shared_ptr<const std::vector<GPUHit>> GetGPUHits(unsigned int threadId, GPUstate &gpuState)
 {
-  return fGPUstate->fHitScoring->GetNextHitsVector(threadId);
+  return gpuState.fHitScoring->GetNextHitsVector(threadId);
 }
 
-void AdeptIntegration::HitProcessingLoop(HitProcessingContext *const context)
+// TODO: Make it clear that this will initialize and return the GPUState or make a
+// separate init function that will compile here and be called from the .icc
+std::thread LaunchGPUWorker(int trackCapacity, int scoringCapacity, int numThreads, TrackBuffer &trackBuffer,
+                            GPUstate &gpuState, std::vector<std::atomic<EventState>> &eventStates,
+                            std::condition_variable &cvG4Workers, std::vector<AdePTScoring> &scoring, int adeptSeed)
 {
-  while (context->keepRunning) {
-    std::unique_lock lock(context->mutex);
-    context->cv.wait(lock);
+  return std::thread{&TransportLoop,        trackCapacity,      scoringCapacity,       numThreads,
+                     std::ref(trackBuffer), std::ref(gpuState), std::ref(eventStates), std::ref(cvG4Workers),
+                     std::ref(scoring),     adeptSeed};
+}
 
-    fGPUstate->fHitScoring->TransferHitsToHost(context->hitTransferStream);
-    const bool haveNewHits = fGPUstate->fHitScoring->ProcessHits();
+void FreeGPU(std::unique_ptr<AsyncAdePT::GPUstate, AsyncAdePT::GPUstateDeleter> &gpuState, G4HepEmState &g4hepem_state,
+             std::thread &gpuWorker)
+{
+  gpuState->runTransport = false;
+  gpuWorker.join();
 
-    if (haveNewHits) {
-      AdvanceEventStates(EventState::FlushingHits, EventState::HitsFlushed);
-      fCV_G4Workers.notify_all();
-    }
-  }
-}
\ No newline at end of file
+  adeptint::VolAuxData *volAux = nullptr;
+  COPCORE_CUDA_CHECK(cudaMemcpyFromSymbol(&volAux, AsyncAdePT::gVolAuxData, sizeof(adeptint::VolAuxData *)));
+  COPCORE_CUDA_CHECK(cudaFree(volAux));
+
+  // Free resources.
+  gpuState.reset();
+
+  // Free G4HepEm data
+  FreeG4HepEmData(g4hepem_state.fData);
+}
+
+} // namespace async_adept_impl
+
+namespace AsyncAdePT {
+
+__constant__ __device__ struct G4HepEmParameters g4HepEmPars;
+__constant__ __device__ struct G4HepEmData g4HepEmData;
+
+__constant__ __device__ adeptint::VolAuxData *gVolAuxData = nullptr;
+__constant__ __device__ double BzFieldValue               = 0;
+__constant__ __device__ bool ApplyCuts                    = false;
+
+/// Transfer volume auxiliary data to GPU
+void InitVolAuxArray(adeptint::VolAuxArray &array)
+{
+  using adeptint::VolAuxData;
+  COPCORE_CUDA_CHECK(cudaMalloc(&array.fAuxData_dev, sizeof(VolAuxData) * array.fNumVolumes));
+  COPCORE_CUDA_CHECK(
+      cudaMemcpy(array.fAuxData_dev, array.fAuxData, sizeof(VolAuxData) * array.fNumVolumes, cudaMemcpyHostToDevice));
+  COPCORE_CUDA_CHECK(cudaMemcpyToSymbol(gVolAuxData, &array.fAuxData_dev, sizeof(VolAuxData *)));
+}
+
+/// Initialise the track buffers used to communicate between host and device.
+TrackBuffer::TrackBuffer(unsigned int numToDevice, unsigned int numFromDevice, unsigned short nThread)
+    : fNumToDevice{numToDevice}, fNumFromDevice{numFromDevice}, fromDeviceBuffers(nThread)
+{
+  TrackDataWithIDs *devPtr, *hostPtr;
+  // Double buffer for lock-free host runs:
+  // toDevice_host = std::make_unique<TrackDataWithIDs[]>(2 * numToDevice);
+  COPCORE_CUDA_CHECK(cudaMallocHost(&hostPtr, 2 * numToDevice * sizeof(TrackDataWithIDs)));
+  COPCORE_CUDA_CHECK(cudaMalloc(&devPtr, numToDevice * sizeof(TrackDataWithIDs)));
+
+  toDevice_host.reset(hostPtr);
+  toDevice_dev.reset(devPtr);
+
+  // fromDevice_host = std::make_unique<TrackDataWithIDs[]>(numFromDevice);
+  COPCORE_CUDA_CHECK(cudaMallocHost(&hostPtr, numFromDevice * sizeof(TrackDataWithIDs)));
+  COPCORE_CUDA_CHECK(cudaMalloc(&devPtr, numFromDevice * sizeof(TrackDataWithIDs)));
+
+  // TODO: Check whether we can use ResourceManager
+  fromDevice_host.reset(hostPtr);
+  fromDevice_dev.reset(devPtr);
+
+  unsigned int *nFromDevice = nullptr;
+  COPCORE_CUDA_CHECK(cudaMallocHost(&nFromDevice, sizeof(unsigned int)));
+  // nFromDevice_host = std::make_unique<unsigned int[]>(nFromDevice);
+  // nFromDevice_host = nFromDevice;
+  nFromDevice_host.reset(nFromDevice);
+
+  // toDeviceBuffer[0].tracks = toDevice_host;
+  toDeviceBuffer[0].tracks    = toDevice_host.get();
+  toDeviceBuffer[0].maxTracks = numToDevice;
+  toDeviceBuffer[0].nTrack    = 0;
+  // toDeviceBuffer[1].tracks    = toDevice_host + numToDevice;
+  toDeviceBuffer[1].tracks    = toDevice_host.get() + numToDevice;
+  toDeviceBuffer[1].maxTracks = numToDevice;
+  toDeviceBuffer[1].nTrack    = 0;
+}
+
+} // namespace AsyncAdePT
+
+#endif // ASYNC_ADEPT_TRANSPORT_CUH
diff --git a/examples/AsyncExample/AdeptIntegration.h b/include/AdePT/core/AsyncAdePTTransport.hh
similarity index 68%
rename from examples/AsyncExample/AdeptIntegration.h
rename to include/AdePT/core/AsyncAdePTTransport.hh
index 415c8d94..3a253879 100644
--- a/examples/AsyncExample/AdeptIntegration.h
+++ b/include/AdePT/core/AsyncAdePTTransport.hh
@@ -6,15 +6,16 @@
 /// - filling the buffer with tracks to be transported on the GPU
 /// - Calling the Shower method transporting a buffer on the GPU
 
-#ifndef ADEPT_INTEGRATION_H
-#define ADEPT_INTEGRATION_H
+#ifndef ASYNC_ADEPT_TRANSPORT_HH
+#define ASYNC_ADEPT_TRANSPORT_HH
 
 #define ADEPT_SAVE_IDs
-#include "TrackTransfer.h"
-#include "BasicScoring.h"
 
 #include <AdePT/core/AdePTTransportInterface.hh>
 #include <AdePT/core/CommonStruct.h>
+#include <AdePT/core/AdePTConfiguration.hh>
+#include <AdePT/core/AsyncAdePTTransportStruct.hh>
+#include <AdePT/core/PerEventScoringStruct.cuh>
 
 #include <VecGeom/base/Config.h>
 #include <VecGeom/management/CudaManager.h> // forward declares vecgeom::cxx::VPlacedVolume
@@ -28,42 +29,26 @@
 class G4Region;
 class G4VPhysicalVolume;
 struct G4HepEmState;
-class AdePTGeant4Integration;
 namespace AsyncAdePT {
 struct TrackBuffer;
 struct GPUstate;
-struct HitProcessingContext;
+
 void InitVolAuxArray(adeptint::VolAuxArray &array);
 
-class AdeptIntegration : public AdePTTransportInterface {
+template <typename IntegrationLayer>
+class AsyncAdePTTransport : public AdePTTransportInterface {
 public:
-  static constexpr int kMaxThreads = 256;
   static inline uint64_t fAdePTSeed = 1234567;
 
 private:
-  enum class EventState : unsigned char {
-    NewTracksFromG4,
-    G4RequestsFlush,
-    Inject,
-    InjectionCompleted,
-    Transporting,
-    WaitingForTransportToFinish,
-    RequestHitFlush,
-    FlushingHits,
-    HitsFlushed,
-    FlushingTracks,
-    DeviceFlushed,
-    LeakedTracksRetrieved,
-    ScoringRetrieved
-  };
-
   unsigned short fNThread{0};       ///< Number of G4 workers
   unsigned int fTrackCapacity{0};   ///< Number of track slots to allocate on device
   unsigned int fScoringCapacity{0}; ///< Number of hit slots to allocate on device
   int fDebugLevel{1};               ///< Debug level
-  std::vector<AdePTGeant4Integration> fG4Integrations;
-  std::unique_ptr<GPUstate> fGPUstate;               ///< CUDA state placeholder
-  std::vector<PerEventScoring> fScoring;             ///< User scoring objects per G4 worker
+  int fCUDAStackLimit{0};           ///< CUDA device stack limit
+  std::vector<IntegrationLayer> fIntegrationLayerObjects;
+  std::unique_ptr<GPUstate, GPUstateDeleter> fGPUstate{nullptr}; ///< CUDA state placeholder
+  std::vector<AdePTScoring> fScoring;                ///< User scoring objects per G4 worker
   std::unique_ptr<TrackBuffer> fBuffer{nullptr};     ///< Buffers for transferring tracks between host and device
   std::unique_ptr<G4HepEmState> fg4hepem_state;      ///< The HepEm state singleton
   std::thread fGPUWorker;                            ///< Thread to manage GPU
@@ -74,29 +59,21 @@ class AdeptIntegration : public AdePTTransportInterface {
   bool fTrackInAllRegions = false;
   std::vector<std::string> const *fGPURegionNames;
 
-  void FullInit();
+  void Initialize();
   void InitBVH();
+  bool InitializeField(double bz);
   bool InitializeGeometry(const vecgeom::cxx::VPlacedVolume *world);
-  bool InitializePhysics(double bz);
-  void InitializeGPU();
-  void FreeGPU();
-  /// @brief Asynchronous loop for transporting particles on GPU.
-  void TransportLoop();
-  void HitProcessingLoop(HitProcessingContext *const);
-  void ReturnTracksToG4();
-  void AdvanceEventStates(EventState oldState, EventState newState);
-  std::shared_ptr<const std::vector<GPUHit>> GetGPUHits(unsigned int threadId) const;
+  bool InitializePhysics();
 
 public:
-  AdeptIntegration(unsigned short nThread, unsigned int trackCapacity, unsigned int hitBufferCapacity, int debugLevel,
-                   std::vector<std::string> const *GPURegionNames, bool trackInAllRegions, int cudaStackSize);
-  AdeptIntegration(const AdeptIntegration &other) = delete;
-  ~AdeptIntegration();
+  AsyncAdePTTransport(AdePTConfiguration &configuration);
+  AsyncAdePTTransport(const AsyncAdePTTransport &other) = delete;
+  ~AsyncAdePTTransport();
 
   /// @brief Adds a track to the buffer
   void AddTrack(int pdg, int parentID, double energy, double x, double y, double z, double dirx, double diry,
                 double dirz, double globalTime, double localTime, double properTime, int threadId, unsigned int eventId,
-                unsigned int trackIndex) override;
+                unsigned int trackIndex, vecgeom::NavigationState &&state) override;
   /// @brief Set track capacity on GPU
   void SetTrackCapacity(size_t capacity) override { fTrackCapacity = capacity; }
   /// @brief Set Hit buffer capacity on GPU and Host
@@ -110,11 +87,17 @@ class AdeptIntegration : public AdePTTransportInterface {
   void SetTrackInAllRegions(bool trackInAllRegions) override { fTrackInAllRegions = trackInAllRegions; }
   bool GetTrackInAllRegions() const override { return fTrackInAllRegions; }
   void SetGPURegionNames(std::vector<std::string> const *regionNames) override { fGPURegionNames = regionNames; }
-  void SetCUDAStackLimit(int limit) override{};
+  void SetCUDAStackLimit(int limit) override {};
   std::vector<std::string> const *GetGPURegionNames() override { return fGPURegionNames; }
   /// No effect
   void Initialize(bool) override {}
+  /// @brief Initializes the ApplyCut flag. Can only be called after G4 Physics is build
+  bool InitializeApplyCuts(bool applycuts);
   /// @brief Finish GPU transport, bring hits and tracks to host
+  /// @details The shower call exists to maintain the same interface as the
+  /// synchronous AdePT mode, since in this case the transport loop is always
+  /// running. The only call to Shower() from G4 is done when the tracking
+  /// manager needs to flush an event.
   void Shower(int event, int threadId) override { Flush(threadId, event); }
   /// Block until transport of the given event is done.
   void Flush(int threadId, int eventId);
@@ -123,4 +106,6 @@ class AdeptIntegration : public AdePTTransportInterface {
 
 } // namespace AsyncAdePT
 
+#include "AsyncAdePTTransport.icc"
+
 #endif
diff --git a/include/AdePT/core/AsyncAdePTTransport.icc b/include/AdePT/core/AsyncAdePTTransport.icc
new file mode 100644
index 00000000..39a41aaa
--- /dev/null
+++ b/include/AdePT/core/AsyncAdePTTransport.icc
@@ -0,0 +1,334 @@
+// SPDX-FileCopyrightText: 2022 CERN
+// SPDX-License-Identifier: Apache-2.0
+
+#include <AdePT/core/AsyncAdePTTransport.hh>
+
+#include <AdePT/integration/AdePTGeant4Integration.hh>
+
+#include <VecGeom/management/BVHManager.h>
+#include "VecGeom/management/GeoManager.h"
+
+#include <G4RunManager.hh>
+#include <G4Threading.hh>
+#include <G4Proton.hh>
+#include <G4Region.hh>
+#include <G4SDManager.hh>
+#include <G4VFastSimSensitiveDetector.hh>
+#include <G4MaterialCutsCouple.hh>
+#include <G4ProductionCutsTable.hh>
+#include <G4TransportationManager.hh>
+
+#include <G4HepEmData.hh>
+#include <G4HepEmState.hh>
+#include <G4HepEmStateInit.hh>
+#include <G4HepEmParameters.hh>
+#include <G4HepEmMatCutData.hh>
+#include <G4LogicalVolumeStore.hh>
+
+#include <iomanip>
+
+// std::shared_ptr<AdePTTransportInterface> AdePTTransportFactory(unsigned int nThread, unsigned int nTrackSlot,
+//                                                                unsigned int nHitSlot, int verbosity,
+//                                                                std::vector<std::string> const *GPURegionNames,
+//                                                                bool trackInAllRegions, int cudaStackSize)
+// {
+//   static std::shared_ptr<AsyncAdePT::AsyncAdePTTransport> adePT{new AsyncAdePT::AsyncAdePTTransport(
+//       nThread, nTrackSlot, nHitSlot, verbosity, GPURegionNames, trackInAllRegions, cudaStackSize)};
+//   return adePT;
+// }
+
+/// Forward declarations
+namespace async_adept_impl {
+bool InitializeField(double);
+bool InitializeApplyCuts(bool applycuts);
+G4HepEmState *InitG4HepEm();
+void FlushScoring(AdePTScoring &);
+std::shared_ptr<const std::vector<GPUHit>> GetGPUHits(unsigned int, AsyncAdePT::GPUstate &);
+std::thread LaunchGPUWorker(int, int, int, AsyncAdePT::TrackBuffer &, AsyncAdePT::GPUstate &,
+                            std::vector<std::atomic<AsyncAdePT::EventState>> &, std::condition_variable &,
+                            std::vector<AdePTScoring> &, int);
+std::unique_ptr<AsyncAdePT::GPUstate, AsyncAdePT::GPUstateDeleter> InitializeGPU(int trackCapacity, int scoringCapacity,
+                                                                                 int numThreads,
+                                                                                 AsyncAdePT::TrackBuffer &trackBuffer,
+                                                                                 std::vector<AdePTScoring> &scoring);
+void FreeGPU(std::unique_ptr<AsyncAdePT::GPUstate, AsyncAdePT::GPUstateDeleter> &, G4HepEmState &, std::thread &);
+} // namespace async_adept_impl
+
+namespace AsyncAdePT {
+
+namespace {
+template <typename T>
+std::size_t countTracks(int pdgToSelect, T const &container)
+{
+  return std::count_if(container.begin(), container.end(),
+                       [pdgToSelect](TrackDataWithIDs const &track) { return track.pdg == pdgToSelect; });
+}
+
+std::ostream &operator<<(std::ostream &stream, TrackDataWithIDs const &track)
+{
+  const auto flags = stream.flags();
+  stream << std::setw(5) << track.pdg << std::scientific << std::setw(15) << std::setprecision(6) << track.eKin << " ("
+         << std::setprecision(2) << std::setw(9) << track.position[0] << std::setw(9) << track.position[1]
+         << std::setw(9) << track.position[2] << ")";
+  stream.flags(flags);
+  return stream;
+}
+} // namespace
+
+template <typename IntegrationLayer>
+AsyncAdePTTransport<IntegrationLayer>::AsyncAdePTTransport(AdePTConfiguration &configuration)
+    : fNThread{(ushort)configuration.GetNumThreads()},
+      fTrackCapacity{(uint)(1024 * 1024 * configuration.GetMillionsOfTrackSlots())},
+      fScoringCapacity{(uint)(1024 * 1024 * configuration.GetMillionsOfHitSlots())}, fDebugLevel{0},
+      fIntegrationLayerObjects(fNThread), fEventStates(fNThread), fGPUNetEnergy(fNThread, 0.0),
+      fTrackInAllRegions{configuration.GetTrackInAllRegions()}, fGPURegionNames{configuration.GetGPURegionNames()},
+      fCUDAStackLimit{configuration.GetCUDAStackLimit()}
+{
+  if (fNThread > kMaxThreads)
+    throw std::invalid_argument("AsyncAdePTTransport limited to " + std::to_string(kMaxThreads) + " threads");
+
+  for (auto &eventState : fEventStates) {
+    std::atomic_init(&eventState, EventState::ScoringRetrieved);
+  }
+
+  AsyncAdePTTransport::Initialize();
+}
+
+template <typename IntegrationLayer>
+AsyncAdePTTransport<IntegrationLayer>::~AsyncAdePTTransport()
+{
+  async_adept_impl::FreeGPU(std::ref(fGPUstate), *fg4hepem_state, fGPUWorker);
+}
+
+template <typename IntegrationLayer>
+void AsyncAdePTTransport<IntegrationLayer>::AddTrack(int pdg, int parentID, double energy, double x, double y, double z,
+                                                     double dirx, double diry, double dirz, double globalTime,
+                                                     double localTime, double properTime, int threadId,
+                                                     unsigned int eventId, unsigned int trackId,
+                                                     vecgeom::NavigationState &&state)
+{
+  if (pdg != 11 && pdg != -11 && pdg != 22) {
+    G4cerr << __FILE__ << ":" << __LINE__ << ": Only supporting EM tracks. Got pdgID=" << pdg << "\n";
+    return;
+  }
+
+  TrackDataWithIDs track{pdg,
+                         parentID,
+                         energy,
+                         x,
+                         y,
+                         z,
+                         dirx,
+                         diry,
+                         dirz,
+                         globalTime,
+                         localTime,
+                         properTime,
+                         std::move(state),
+                         eventId,
+                         trackId,
+                         static_cast<short>(threadId)};
+  if (fDebugLevel >= 2) {
+    fGPUNetEnergy[threadId] += energy;
+    if (fDebugLevel >= 6) {
+      G4cout << "\n[_in," << eventId << "," << trackId << "]: " << track << "\tGPU net energy " << std::setprecision(6)
+             << fGPUNetEnergy[threadId] << G4endl;
+    }
+  }
+
+  // Lock buffer and emplace the track
+  {
+    auto trackHandle  = fBuffer->createToDeviceSlot();
+    trackHandle.track = std::move(track);
+  }
+
+  fEventStates[threadId].store(EventState::NewTracksFromG4, std::memory_order_release);
+}
+
+template <typename IntegrationLayer>
+bool AsyncAdePTTransport<IntegrationLayer>::InitializeField(double bz)
+{
+  return async_adept_impl::InitializeField(bz);
+}
+
+template <typename IntegrationLayer>
+bool AsyncAdePTTransport<IntegrationLayer>::InitializeApplyCuts(bool applycuts)
+{
+  return async_adept_impl::InitializeApplyCuts(applycuts);
+}
+
+template <typename IntegrationLayer>
+bool AsyncAdePTTransport<IntegrationLayer>::InitializeGeometry(const vecgeom::cxx::VPlacedVolume *world)
+{
+  // Upload geometry to GPU.
+  auto &cudaManager = vecgeom::cxx::CudaManager::Instance();
+  if (fCUDAStackLimit > 0) {
+    std::cout << "CUDA Device stack limit: " << fCUDAStackLimit << "\n";
+    cudaDeviceSetLimit(cudaLimitStackSize, fCUDAStackLimit);
+  }
+  cudaManager.LoadGeometry(world);
+  auto world_dev = cudaManager.Synchronize();
+  // Initialize BVH
+  InitBVH();
+
+  return (world_dev != nullptr);
+}
+
+template <typename IntegrationLayer>
+bool AsyncAdePTTransport<IntegrationLayer>::InitializePhysics()
+{
+  // Initialize shared physics data
+  fg4hepem_state.reset(async_adept_impl::InitG4HepEm());
+  return true;
+}
+
+template <typename IntegrationLayer>
+void AsyncAdePTTransport<IntegrationLayer>::Initialize()
+{
+  const auto numVolumes = vecgeom::GeoManager::Instance().GetRegisteredVolumesCount();
+  if (numVolumes == 0) throw std::runtime_error("AsyncAdePTTransport::Initialize: Number of geometry volumes is zero.");
+
+  G4cout << "=== AsyncAdePTTransport: initializing geometry and physics\n";
+  // Initialize geometry on device
+  if (!vecgeom::GeoManager::Instance().IsClosed())
+    throw std::runtime_error("AsyncAdePTTransport::Initialize: VecGeom geometry not closed.");
+
+  const vecgeom::cxx::VPlacedVolume *world = vecgeom::GeoManager::Instance().GetWorld();
+  if (!InitializeGeometry(world))
+    throw std::runtime_error("AsyncAdePTTransport::Initialize: Cannot initialize geometry on GPU");
+
+  // Initialize G4HepEm
+  fIntegrationLayerObjects.front().GetUniformFieldZ();
+  if (!InitializePhysics())
+    throw std::runtime_error("AsyncAdePTTransport::Initialize cannot initialize physics on GPU");
+
+  // Initialize field
+  const double bz =
+      fIntegrationLayerObjects[0].GetUniformFieldZ(); // Get the field value from one of the worker threads
+  if (!InitializeField(bz))
+    throw std::runtime_error("AdePTTransport<IntegrationLayer>::Initialize cannot initialize field on GPU");
+
+  // Check VecGeom geometry matches Geant4. Initialize auxiliary per-LV data. Initialize scoring map.
+  fIntegrationLayerObjects.front().CheckGeometry(fg4hepem_state.get());
+  adeptint::VolAuxData *auxData = new adeptint::VolAuxData[vecgeom::GeoManager::Instance().GetRegisteredVolumesCount()];
+  fIntegrationLayerObjects.front().InitVolAuxData(auxData, fg4hepem_state.get(), fTrackInAllRegions, fGPURegionNames);
+
+  // Initialize volume auxiliary data on device
+  auto &volAuxArray       = adeptint::VolAuxArray::GetInstance();
+  volAuxArray.fNumVolumes = numVolumes;
+  volAuxArray.fAuxData    = auxData;
+  AsyncAdePT::InitVolAuxArray(volAuxArray);
+
+  for (auto &g4int : fIntegrationLayerObjects) {
+    g4int.InitScoringData();
+  }
+
+  // TODO: Make this configurable or figure out a reasonable value (Currently, it seems that the
+  // to device buffer is often full)
+  // Allocate buffers to transport particles to/from device. Scale the size of the staging area
+  // with the number of threads.
+  fBuffer = std::make_unique<TrackBuffer>(8192 * fNThread, 1024 * fNThread, fNThread);
+
+  assert(fBuffer != nullptr);
+
+  fGPUstate  = async_adept_impl::InitializeGPU(fTrackCapacity, fScoringCapacity, fNThread, *fBuffer, fScoring);
+  fGPUWorker = async_adept_impl::LaunchGPUWorker(fTrackCapacity, fScoringCapacity, fNThread, *fBuffer, *fGPUstate,
+                                                 fEventStates, fCV_G4Workers, fScoring, fAdePTSeed);
+}
+
+template <typename IntegrationLayer>
+void AsyncAdePTTransport<IntegrationLayer>::InitBVH()
+{
+  vecgeom::cxx::BVHManager::Init();
+  vecgeom::cxx::BVHManager::DeviceInit();
+}
+
+template <typename IntegrationLayer>
+void AsyncAdePTTransport<IntegrationLayer>::Flush(G4int threadId, G4int eventId)
+{
+  if (fDebugLevel >= 3) {
+    G4cout << "\nFlushing AdePT for event " << eventId << G4endl;
+  }
+
+  assert(static_cast<unsigned int>(threadId) < fBuffer->fromDeviceBuffers.size());
+  fEventStates[threadId].store(EventState::G4RequestsFlush, std::memory_order_release);
+
+  AdePTGeant4Integration &integrationInstance = fIntegrationLayerObjects[threadId];
+
+  while (fEventStates[threadId].load(std::memory_order_acquire) < EventState::DeviceFlushed) {
+    {
+      std::unique_lock lock{fMutex_G4Workers};
+      fCV_G4Workers.wait(lock);
+    }
+
+    std::shared_ptr<const std::vector<GPUHit>> gpuHits;
+    while ((gpuHits = async_adept_impl::GetGPUHits(threadId, *fGPUstate)) != nullptr) {
+      GPUHit dummy;
+      dummy.fEventId = eventId;
+      auto range     = std::equal_range(gpuHits->begin(), gpuHits->end(), dummy,
+                                        [](const GPUHit &lhs, const GPUHit &rhs) { return lhs.fEventId < rhs.fEventId; });
+      for (auto it = range.first; it != range.second; ++it) {
+        assert(it->threadId == threadId);
+        integrationInstance.ProcessGPUHit(*it);
+      }
+    }
+  }
+
+  // Now device should be flushed, so retrieve the tracks:
+  std::vector<TrackDataWithIDs> tracks;
+  {
+    auto handle = fBuffer->getTracksFromDevice(threadId);
+    tracks.swap(handle.tracks);
+    fEventStates[threadId].store(EventState::LeakedTracksRetrieved, std::memory_order_release);
+  }
+
+  // TODO: Sort tracks on device?
+#ifndef NDEBUG
+  for (auto const &track : tracks) {
+    bool error = false;
+    if (track.threadId != threadId || track.eventId != static_cast<unsigned int>(eventId)) error = true;
+    if (!(track.pdg == -11 || track.pdg == 11 || track.pdg == 22)) error = true;
+    if (error)
+      std::cerr << "Error in returning track: threadId=" << track.threadId << " eventId=" << track.eventId
+                << " pdg=" << track.pdg << "\n";
+    assert(!error);
+  }
+#endif
+  std::sort(tracks.begin(), tracks.end());
+
+  const auto oldEnergyTransferred = fGPUNetEnergy[threadId];
+  if (fDebugLevel >= 2) {
+    unsigned int trackId = 0;
+    for (const auto &track : tracks) {
+
+      fGPUNetEnergy[threadId] -= track.eKin;
+      if (fDebugLevel >= 5) {
+        G4cout << "\n[out," << track.eventId << "," << trackId++ << "]: " << track << "\tGPU net energy "
+               << std::setprecision(6) << fGPUNetEnergy[threadId] << G4endl;
+      }
+    }
+  }
+
+  if (fDebugLevel >= 2) {
+    std::stringstream str;
+    str << "\n[" << eventId << "]: Pushed " << tracks.size() << " tracks to G4";
+    str << "\tEnergy back to G4: " << std::setprecision(6)
+        << (oldEnergyTransferred - fGPUNetEnergy[threadId]) / CLHEP::GeV << "\tGPU net energy " << std::setprecision(6)
+        << fGPUNetEnergy[threadId] / CLHEP::GeV << " GeV";
+    str << "\t(" << countTracks(11, tracks) << ", " << countTracks(-11, tracks) << ", " << countTracks(22, tracks)
+        << ")";
+    G4cout << str.str() << G4endl;
+  }
+
+  if (tracks.empty()) {
+    async_adept_impl::FlushScoring(fScoring[threadId]);
+
+    // TODO: Most likely this needs to go to AsyncAdePTTransport in a function that
+    // we will forward-declare here (Otherwise AdePTScoring is undefined)
+    fEventStates[threadId].store(EventState::ScoringRetrieved, std::memory_order_release);
+  }
+
+  integrationInstance.ReturnTracks(tracks.begin(), tracks.end(), fDebugLevel);
+}
+
+} // namespace AsyncAdePT
diff --git a/examples/AsyncExample/AdeptIntegration.cuh b/include/AdePT/core/AsyncAdePTTransportStruct.cuh
similarity index 88%
rename from examples/AsyncExample/AdeptIntegration.cuh
rename to include/AdePT/core/AsyncAdePTTransportStruct.cuh
index 950a67e2..71c70d8a 100644
--- a/examples/AsyncExample/AdeptIntegration.cuh
+++ b/include/AdePT/core/AsyncAdePTTransportStruct.cuh
@@ -1,15 +1,16 @@
 // SPDX-FileCopyrightText: 2022 CERN
 // SPDX-License-Identifier: Apache-2.0
 
-#ifndef ADEPT_INTEGRATION_CUH
-#define ADEPT_INTEGRATION_CUH
+#ifndef ASYNC_ADEPT_TRANSPORT_STRUCT_CUH
+#define ASYNC_ADEPT_TRANSPORT_STRUCT_CUH
 
-#include "AdeptIntegration.h"
-
-#include "Track.cuh"
-#include "TrackTransfer.h"
-#include "SlotManager.cuh"
-#include "ResourceManagement.h"
+#include <AdePT/core/CommonStruct.h>
+#include <AdePT/core/AsyncAdePTTransportStruct.hh>
+// #include <AdePT/core/AsyncAdePTTransport.hh>
+#include <AdePT/core/PerEventScoringImpl.cuh>
+#include "AsyncTrack.cuh"
+#include <AdePT/base/SlotManager.cuh>
+#include <AdePT/base/ResourceManagement.cuh>
 
 #include <G4HepEmData.hh>
 #include <G4HepEmParameters.hh>
@@ -82,7 +83,7 @@ struct GammaInteractions {
     double geometryStepLength;
     double PEmxSec; // Only used for photoelectric process
     unsigned int slot;
-    vecgeom::NavStateIndex preStepNavState;
+    vecgeom::NavigationState preStepNavState;
     vecgeom::Vector3D<Precision> preStepPos;
     vecgeom::Vector3D<Precision> preStepDir;
     double preStepEnergy;
@@ -97,6 +98,7 @@ struct Secondaries {
   ParticleGenerator gammas;
 };
 
+// Holds the leaked track structs for all three particle types
 struct AllLeaked {
   LeakedTracks leakedElectrons;
   LeakedTracks leakedPositrons;
@@ -115,6 +117,7 @@ struct ParticleQueues {
   void SwapLeakedQueue() { std::swap(leakedTracksCurrent, leakedTracksNext); }
 };
 
+// Holds all information needed to manage in-flight tracks of one type
 struct ParticleType {
   Track *tracks;
   SlotManager *slotManager;
@@ -149,8 +152,8 @@ struct Stats {
   int leakedTracks[ParticleType::NumParticleTypes];
   float queueFillLevel[ParticleType::NumParticleTypes];
   float slotFillLevel;
-  unsigned int perEventInFlight[AdeptIntegration::kMaxThreads];
-  unsigned int perEventLeaked[AdeptIntegration::kMaxThreads];
+  unsigned int perEventInFlight[kMaxThreads];
+  unsigned int perEventLeaked[kMaxThreads];
   unsigned int hitBufferOccupancy;
 };
 
@@ -200,6 +203,12 @@ struct GPUstate {
   }
 };
 
+// Implementation of the GPUstate deleter
+void GPUstateDeleter::operator()(GPUstate *ptr)
+{
+  delete ptr;
+}
+
 // Constant data structures from G4HepEm accessed by the kernels.
 // (defined in TestEm3.cu)
 extern __constant__ __device__ struct G4HepEmParameters g4HepEmPars;
@@ -210,6 +219,7 @@ extern __constant__ __device__ adeptint::VolAuxData *gVolAuxData;
 
 // constexpr float BzFieldValue = 0.1 * copcore::units::tesla;
 extern __constant__ __device__ double BzFieldValue;
+extern __constant__ __device__ bool ApplyCuts;
 constexpr double kPush = 1.e-8 * copcore::units::cm;
 
 } // namespace AsyncAdePT
diff --git a/include/AdePT/core/AsyncAdePTTransportStruct.hh b/include/AdePT/core/AsyncAdePTTransportStruct.hh
new file mode 100644
index 00000000..6c36e0d7
--- /dev/null
+++ b/include/AdePT/core/AsyncAdePTTransportStruct.hh
@@ -0,0 +1,36 @@
+// SPDX-FileCopyrightText: 2024 CERN
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ASYNC_ADEPT_TRANSPORT_STRUCT_HH
+#define ASYNC_ADEPT_TRANSPORT_STRUCT_HH
+
+namespace AsyncAdePT {
+
+struct GPUstate;
+static constexpr int kMaxThreads = 256;
+
+// We need a deleter for the unique_ptr to the GPUstate
+// This deleter is implemented in AsyncAdePTTransportStruct.cuh
+struct GPUstateDeleter {
+  void operator()(GPUstate *ptr);
+};
+
+enum class EventState : unsigned char {
+  NewTracksFromG4,
+  G4RequestsFlush,
+  Inject,
+  InjectionCompleted,
+  Transporting,
+  WaitingForTransportToFinish,
+  RequestHitFlush,
+  FlushingHits,
+  HitsFlushed,
+  FlushingTracks,
+  DeviceFlushed,
+  LeakedTracksRetrieved,
+  ScoringRetrieved
+};
+
+} // namespace AsyncAdePT
+
+#endif
diff --git a/examples/AsyncExample/Track.cuh b/include/AdePT/core/AsyncTrack.cuh
similarity index 70%
rename from examples/AsyncExample/Track.cuh
rename to include/AdePT/core/AsyncTrack.cuh
index 1195d9c3..e66ac1c0 100644
--- a/examples/AsyncExample/Track.cuh
+++ b/include/AdePT/core/AsyncTrack.cuh
@@ -1,23 +1,26 @@
 // SPDX-FileCopyrightText: 2022 CERN
 // SPDX-License-Identifier: Apache-2.0
 
-#ifndef ADEPT_TRACK_CUH
-#define ADEPT_TRACK_CUH
+#ifndef ASYNC_ADEPT_TRACK_CUH
+#define ASYNC_ADEPT_TRACK_CUH
 
 #include <AdePT/base/MParray.h>
 #include <AdePT/copcore/SystemOfUnits.h>
 #include <AdePT/copcore/Ranluxpp.h>
 
 #include <VecGeom/base/Vector3D.h>
-#include <VecGeom/navigation/NavStateIndex.h>
+#include <VecGeom/navigation/NavigationState.h>
+
+// TODO: This needs to be unified with the other Track struct, however due to the slot manager
+// approach, this can't be done before introducing the SlotManager for all kernels
 
 // A data structure to represent a particle track. The particle type is implicit
 // by the queue and not stored in memory.
 struct Track {
   using Precision = vecgeom::Precision;
   RanluxppDouble rngState;
-  double energy            = 0;
-  float numIALeft[3]       = {-1., -1., -1.};
+  double eKin              = 0;
+  float numIALeft[4]       = {-1.f, -1.f, -1.f, -1.f};
   float initialRange       = -1.f; // Only for e-?
   float dynamicRangeFactor = -1.f; // Only for e-?
   float tlimitMin          = -1.f; // Only for e-?
@@ -28,7 +31,7 @@ struct Track {
 
   vecgeom::Vector3D<Precision> pos;
   vecgeom::Vector3D<Precision> dir;
-  vecgeom::NavStateIndex navState;
+  vecgeom::NavigationState navState;
   unsigned int eventId{0};
   int parentId{-1};
   short threadId{-1};
@@ -42,7 +45,7 @@ struct Track {
   __device__ Track(uint64_t rngSeed, double eKin, double globalTime, float localTime, float properTime,
                    double const position[3], double const direction[3], unsigned int eventId, int parentId,
                    short threadId)
-      : energy{eKin}, globalTime{globalTime}, localTime{localTime}, properTime{properTime}, eventId{eventId},
+      : eKin{eKin}, globalTime{globalTime}, localTime{localTime}, properTime{properTime}, eventId{eventId},
         parentId{parentId}, threadId{threadId}
   {
     rngState.SetSeed(rngSeed);
@@ -53,10 +56,10 @@ struct Track {
 
   /// Construct a secondary from a parent track.
   /// NB: The caller is responsible to branch a new RNG state.
-  __device__ Track(RanluxppDouble const &rngState, double energy, const vecgeom::Vector3D<Precision> &parentPos,
-                   const vecgeom::Vector3D<Precision> &newDirection, const vecgeom::NavStateIndex &newNavState,
+  __device__ Track(RanluxppDouble const &rngState, double eKin, const vecgeom::Vector3D<Precision> &parentPos,
+                   const vecgeom::Vector3D<Precision> &newDirection, const vecgeom::NavigationState &newNavState,
                    const Track &parentTrack)
-      : rngState{rngState}, energy{energy}, globalTime{parentTrack.globalTime}, pos{parentPos}, dir{newDirection},
+      : rngState{rngState}, eKin{eKin}, globalTime{parentTrack.globalTime}, pos{parentPos}, dir{newDirection},
         navState{newNavState}, eventId{parentTrack.eventId}, parentId{parentTrack.parentId},
         threadId{parentTrack.threadId}
   {
diff --git a/include/AdePT/core/CommonStruct.h b/include/AdePT/core/CommonStruct.h
index 8e337017..3db53283 100644
--- a/include/AdePT/core/CommonStruct.h
+++ b/include/AdePT/core/CommonStruct.h
@@ -8,6 +8,16 @@
 #include <AdePT/base/MParray.h>
 #include <AdePT/core/TrackData.h>
 
+#include <AdePT/base/ResourceManagement.hh>
+
+#include <atomic>
+#include <array>
+#include <mutex>
+#include <shared_mutex>
+#include <new>
+#include <vector>
+#include <thread>
+
 // Common data structures used by the integration with Geant4
 namespace adeptint {
 
@@ -47,8 +57,10 @@ struct VolAuxArray {
 /// re-injected in the Geant4 stack
 struct TrackBuffer {
   std::vector<TrackData> toDevice;    ///< Tracks to be transported on the device
-  std::vector<TrackData> fromDevice;  ///< Tracks coming from device to be transported on the CPU
-  TrackData *fromDeviceBuff{nullptr}; ///< Buffer of leaked tracks from device
+  std::vector<TrackData> fromDevice;  ///< Tracks coming from device to be transported on the CPU.
+                                      ///< Initialized from "fromDeviceBuff" after the copy
+  TrackData *fromDeviceBuff{nullptr}; ///< Buffer of leaked tracks from device.
+                                      ///< Used as the destination for the Cuda copy
   int buffSize{0};                    ///< Size of buffer collecting tracks from device
   int eventId{-1};                    ///< Index of current transported event
   int startTrack{0};                  ///< Track counter for the current event
@@ -65,4 +77,107 @@ struct TrackBuffer {
 };
 
 } // end namespace adeptint
+
+namespace AsyncAdePT {
+
+struct TrackDataWithIDs : public adeptint::TrackData {
+  unsigned int eventId{0};
+  unsigned int trackId{0};
+  short threadId{-1};
+
+  TrackDataWithIDs(int pdg_id, int parentID, double ene, double x, double y, double z, double dirx, double diry,
+                   double dirz, double gTime, double lTime, double pTime, vecgeom::NavigationState &&state,
+                   unsigned int eventId = 0, unsigned int trackId = 0, short threadId = -1)
+      : TrackData{pdg_id, parentID, ene, x, y, z, dirx, diry, dirz, gTime, lTime, pTime, std::move(state)},
+        eventId{eventId}, trackId{trackId}, threadId{threadId}
+  {
+  }
+  friend bool operator==(TrackDataWithIDs const &a, TrackDataWithIDs const &b)
+  {
+    return a.threadId != b.threadId || a.eventId != b.eventId || static_cast<adeptint::TrackData const &>(a) == b;
+  }
+  bool operator<(TrackDataWithIDs const &other)
+  {
+    if (threadId != other.threadId) return threadId < other.threadId;
+    if (eventId != other.eventId) return eventId < other.eventId;
+    return TrackData::operator<(other);
+  }
+};
+
+/// @brief Buffer holding input tracks to be transported on GPU and output tracks to be
+/// re-injected in the Geant4 stack
+struct TrackBuffer {
+  struct alignas(64) ToDeviceBuffer {
+    TrackDataWithIDs *tracks;
+    unsigned int maxTracks;
+    std::atomic_uint nTrack;
+    mutable std::shared_mutex mutex;
+  };
+  std::array<ToDeviceBuffer, 2> toDeviceBuffer;
+  std::atomic_short toDeviceIndex{0};
+
+  unsigned int fNumToDevice{0};   ///< number of slots in the toDevice buffer
+  unsigned int fNumFromDevice{0}; ///< number of slots in the fromDevice buffer
+  unique_ptr_cuda<TrackDataWithIDs, CudaHostDeleter<TrackDataWithIDs>>
+      toDevice_host;                              ///< Tracks to be transported to the device
+  unique_ptr_cuda<TrackDataWithIDs> toDevice_dev; ///< toDevice buffer of tracks
+  unique_ptr_cuda<TrackDataWithIDs, CudaHostDeleter<TrackDataWithIDs>> fromDevice_host; ///< Tracks from device
+  unique_ptr_cuda<TrackDataWithIDs> fromDevice_dev;                                     ///< fromDevice buffer of tracks
+  unique_ptr_cuda<unsigned int, CudaHostDeleter<unsigned int>>
+      nFromDevice_host; ///< Number of tracks collected on device
+
+  std::vector<std::vector<TrackDataWithIDs>> fromDeviceBuffers;
+  std::mutex fromDeviceMutex;
+
+  TrackBuffer(unsigned int numToDevice, unsigned int numFromDevice, unsigned short nThread);
+
+  ToDeviceBuffer &getActiveBuffer() { return toDeviceBuffer[toDeviceIndex]; }
+  void swapToDeviceBuffers() { toDeviceIndex = (toDeviceIndex + 1) % 2; }
+
+  /// A handle to access TrackData vectors while holding a lock
+  struct TrackHandle {
+    TrackDataWithIDs &track;
+    std::shared_lock<std::shared_mutex> lock;
+  };
+
+  /// @brief Create a handle with lock for tracks that go to the device.
+  /// Create a shared_lock and a reference to a track
+  /// @return TrackHandle with lock and reference to track slot.
+  // TrackHandle createToDeviceSlot();
+  TrackHandle createToDeviceSlot()
+  {
+    bool warningIssued = false;
+    while (true) {
+      auto &toDevice = getActiveBuffer();
+      std::shared_lock lock{toDevice.mutex};
+      const auto slot = toDevice.nTrack.fetch_add(1, std::memory_order_relaxed);
+
+      if (slot < toDevice.maxTracks)
+        return TrackHandle{toDevice.tracks[slot], std::move(lock)};
+      else {
+        if (!warningIssued) {
+          std::cerr << __FILE__ << ':' << __LINE__ << " Contention in to-device queue; thread sleeping" << std::endl;
+          warningIssued = true;
+        }
+        using namespace std::chrono_literals;
+        std::this_thread::sleep_for(1ms);
+      }
+    }
+  }
+
+  struct FromDeviceHandle {
+    std::vector<TrackDataWithIDs> &tracks;
+    std::scoped_lock<std::mutex> lock;
+  };
+
+  /// @brief Create a handle with lock for tracks that return from the device.
+  /// @return BufferHandle with lock and reference to track vector.
+  FromDeviceHandle getTracksFromDevice(int threadId)
+  {
+    return {fromDeviceBuffers[threadId], std::scoped_lock{fromDeviceMutex}};
+  }
+};
+
+} // namespace AsyncAdePT
+
 #endif
diff --git a/include/AdePT/core/HostScoringStruct.cuh b/include/AdePT/core/HostScoringStruct.cuh
index bc4a4364..d46a0d5f 100644
--- a/include/AdePT/core/HostScoringStruct.cuh
+++ b/include/AdePT/core/HostScoringStruct.cuh
@@ -4,58 +4,12 @@
 #ifndef HOSTSCORING_H
 #define HOSTSCORING_H
 
-#include "VecGeom/navigation/NavigationState.h"
+#include <AdePT/core/ScoringCommons.hh>
+// #include "VecGeom/navigation/NavigationState.h"
 #include <AdePT/base/Atomic.h>
 #include <G4ios.hh>
 
-struct GPUStepPoint {
-  vecgeom::Vector3D<Precision> fPosition;
-  vecgeom::Vector3D<Precision> fMomentumDirection;
-  vecgeom::Vector3D<Precision> fPolarization;
-  double fEKin;
-  double fCharge;
-  // Data needed to reconstruct G4 Touchable history
-  vecgeom::NavigationState fNavigationState{0}; // VecGeom navigation state, used to identify the touchable
-};
-
-// Stores the necessary data to reconstruct GPU hits on the host , and
-// call the user-defined Geant4 sensitive detector code
-struct GPUHit {
-  int fParentID{0}; // Track ID
-  // Data needed to reconstruct G4 Step
-  double fStepLength{0};
-  double fTotalEnergyDeposit{0};
-  double fNonIonizingEnergyDeposit{0};
-  // bool fFirstStepInVolume{false};
-  // bool fLastStepInVolume{false};
-  // Data needed to reconstruct pre-post step points
-  GPUStepPoint fPreStepPoint;
-  GPUStepPoint fPostStepPoint;
-  unsigned int fEventId{0};
-  short threadId{-1};
-  char fParticleType{0}; // Particle type ID
-};
 
-/// @brief Stores information used for comparison with Geant4 (Number of steps, Number of produced particles, etc)
-struct GlobalCounters {
-  double energyDeposit;
-  // Not int to avoid overflows for more than 100,000 events; unsigned long long
-  // is the only other data type available for atomicAdd().
-  unsigned long long chargedSteps;
-  unsigned long long neutralSteps;
-  unsigned long long hits;
-  unsigned long long numGammas;
-  unsigned long long numElectrons;
-  unsigned long long numPositrons;
-  // Not used on the device, filled in by the host.
-  unsigned long long numKilled;
-
-  void Print()
-  {
-    printf("Global scoring: stpChg=%llu stpNeu=%llu hits=%llu numGam=%llu numEle=%llu numPos=%llu numKilled=%llu\n",
-           chargedSteps, neutralSteps, hits, numGammas, numElectrons, numPositrons, numKilled);
-  }
-};
 // Contains the necessary information for recording hits on GPU, and reconstructing them
 // on the host, in order to call the user-defined sensitive detector code
 struct HostScoring {
diff --git a/include/AdePT/core/PerEventScoringImpl.cuh b/include/AdePT/core/PerEventScoringImpl.cuh
new file mode 100644
index 00000000..07c462b8
--- /dev/null
+++ b/include/AdePT/core/PerEventScoringImpl.cuh
@@ -0,0 +1,300 @@
+// SPDX-FileCopyrightText: 2024 CERN
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef PER_EVENT_SCORING_CUH
+#define PER_EVENT_SCORING_CUH
+
+#include <AdePT/core/PerEventScoringStruct.cuh>
+#include <AdePT/base/ResourceManagement.cuh>
+#include <AdePT/core/AdePTScoringTemplate.cuh>
+#include <AdePT/core/ScoringCommons.hh>
+#include <AdePT/copcore/Global.h>
+
+#include <VecGeom/navigation/NavigationState.h>
+
+#include <atomic>
+#include <deque>
+#include <mutex>
+#include <shared_mutex>
+#include <array>
+#include <chrono>
+#include <thread>
+
+#include <cub/device/device_merge_sort.cuh>
+
+// Comparison for sorting tracks into events on device:
+struct CompareGPUHits {
+  __device__ bool operator()(const GPUHit &lhs, const GPUHit &rhs) const { return lhs.fEventId < rhs.fEventId; }
+};
+
+namespace AsyncAdePT {
+
+/// Struct holding GPU hits to be used both on host and device.
+struct HitScoringBuffer {
+  GPUHit *hitBuffer_dev     = nullptr;
+  unsigned int fSlotCounter = 0;
+  unsigned int fNSlot       = 0;
+
+  __device__ GPUHit &GetNextSlot()
+  {
+    const auto slotIndex = atomicAdd(&fSlotCounter, 1);
+    if (slotIndex >= fNSlot) {
+      printf("Trying to score hit #%d with only %d slots\n", slotIndex, fNSlot);
+      COPCORE_EXCEPTION("Out of slots in HitScoringBuffer::NextSlot");
+    }
+    return hitBuffer_dev[slotIndex];
+  }
+};
+
+__device__ HitScoringBuffer gHitScoringBuffer_dev;
+
+struct BufferHandle {
+  HitScoringBuffer hitScoringInfo;
+  GPUHit *hostBuffer;
+  enum class State { Free, OnDevice, OnDeviceNeedTransferToHost, TransferToHost, NeedHostProcessing };
+  std::atomic<State> state;
+};
+
+// TODO: Rename this. Maybe ScoringState? Check usage in GPUstate
+class HitScoring {
+  unique_ptr_cuda<GPUHit> fGPUHitBuffer_dev;
+  unique_ptr_cuda<GPUHit, CudaHostDeleter<GPUHit>> fGPUHitBuffer_host;
+
+  std::array<BufferHandle, 2> fBuffers;
+
+  void *fHitScoringBuffer_deviceAddress = nullptr;
+  unsigned int fHitCapacity;
+  unsigned short fActiveBuffer = 0;
+  unique_ptr_cuda<std::byte> fGPUSortAuxMemory;
+  std::size_t fGPUSortAuxMemorySize;
+
+  std::vector<std::deque<std::shared_ptr<const std::vector<GPUHit>>>> fHitQueues;
+  mutable std::shared_mutex fProcessingHitsMutex;
+
+  void ProcessBuffer(BufferHandle &handle)
+  {
+    // We are assuming that the caller holds a lock on fProcessingHitsMutex.
+    if (handle.state == BufferHandle::State::NeedHostProcessing) {
+      auto hitVector = std::make_shared<std::vector<GPUHit>>();
+      hitVector->assign(handle.hostBuffer, handle.hostBuffer + handle.hitScoringInfo.fSlotCounter);
+      handle.hitScoringInfo.fSlotCounter = 0;
+      handle.state                       = BufferHandle::State::Free;
+
+      for (auto &hitQueue : fHitQueues) {
+        hitQueue.push_back(hitVector);
+      }
+    }
+  }
+
+public:
+  HitScoring(unsigned int hitCapacity, unsigned int nThread) : fHitCapacity{hitCapacity}, fHitQueues(nThread)
+  {
+    // We use a single allocation for both buffers:
+    GPUHit *gpuHits = nullptr;
+    COPCORE_CUDA_CHECK(cudaMallocHost(&gpuHits, sizeof(GPUHit) * 2 * fHitCapacity));
+    fGPUHitBuffer_host.reset(gpuHits);
+
+    auto result = cudaMalloc(&gpuHits, sizeof(GPUHit) * 2 * fHitCapacity);
+    if (result != cudaSuccess) throw std::invalid_argument{"No space to allocate hit buffer."};
+    fGPUHitBuffer_dev.reset(gpuHits);
+
+    // Init buffers for on-device sorting of hits:
+    // Determine device storage requirements for on-device sorting.
+    result = cub::DeviceMergeSort::SortKeys(nullptr, fGPUSortAuxMemorySize, fGPUHitBuffer_dev.get(), fHitCapacity,
+                                            CompareGPUHits{});
+    if (result != cudaSuccess) throw std::invalid_argument{"No space for hit sorting on device."};
+
+    std::byte *gpuSortingMem;
+    result = cudaMalloc(&gpuSortingMem, fGPUSortAuxMemorySize);
+    if (result != cudaSuccess) throw std::invalid_argument{"No space to allocate hit sorting buffer."};
+    fGPUSortAuxMemory.reset(gpuSortingMem);
+
+    // Store buffer data in structs
+    fBuffers[0].hitScoringInfo = HitScoringBuffer{fGPUHitBuffer_dev.get(), 0, fHitCapacity};
+    fBuffers[0].hostBuffer     = fGPUHitBuffer_host.get();
+    fBuffers[0].state          = BufferHandle::State::OnDevice;
+    fBuffers[1].hitScoringInfo = HitScoringBuffer{fGPUHitBuffer_dev.get() + fHitCapacity, 0, fHitCapacity};
+    fBuffers[1].hostBuffer     = fGPUHitBuffer_host.get() + fHitCapacity;
+    fBuffers[1].state          = BufferHandle::State::Free;
+
+    COPCORE_CUDA_CHECK(cudaGetSymbolAddress(&fHitScoringBuffer_deviceAddress, gHitScoringBuffer_dev));
+    assert(fHitScoringBuffer_deviceAddress != nullptr);
+    COPCORE_CUDA_CHECK(cudaMemcpy(fHitScoringBuffer_deviceAddress, &fBuffers[0].hitScoringInfo,
+                                  sizeof(HitScoringBuffer), cudaMemcpyHostToDevice));
+  }
+
+  unsigned int HitCapacity() const { return fHitCapacity; }
+
+  void SwapDeviceBuffers(cudaStream_t cudaStream)
+  {
+    // Ensure that host side has been processed:
+    auto &currentBuffer = fBuffers[fActiveBuffer];
+    if (currentBuffer.state != BufferHandle::State::OnDevice)
+      throw std::logic_error(__FILE__ + std::to_string(__LINE__) + ": On-device buffer in wrong state");
+
+    // Get new buffer info from device:
+    auto &currentHitInfo = currentBuffer.hitScoringInfo;
+    COPCORE_CUDA_CHECK(cudaMemcpyAsync(&currentHitInfo, fHitScoringBuffer_deviceAddress, sizeof(HitScoringBuffer),
+                                       cudaMemcpyDefault, cudaStream));
+
+    // Execute the swap:
+    fActiveBuffer          = (fActiveBuffer + 1) % fBuffers.size();
+    auto &nextDeviceBuffer = fBuffers[fActiveBuffer];
+    while (nextDeviceBuffer.state != BufferHandle::State::Free) {
+      std::cerr << __func__ << " Warning: Another thread should have processed the hits.\n";
+    }
+    assert(nextDeviceBuffer.state == BufferHandle::State::Free && nextDeviceBuffer.hitScoringInfo.fSlotCounter == 0);
+
+    nextDeviceBuffer.state = BufferHandle::State::OnDevice;
+    COPCORE_CUDA_CHECK(cudaMemcpyAsync(fHitScoringBuffer_deviceAddress, &nextDeviceBuffer.hitScoringInfo,
+                                       sizeof(HitScoringBuffer), cudaMemcpyDefault, cudaStream));
+    COPCORE_CUDA_CHECK(cudaStreamSynchronize(cudaStream));
+    currentBuffer.state = BufferHandle::State::OnDeviceNeedTransferToHost;
+  }
+
+  bool ProcessHits()
+  {
+    std::unique_lock lock{fProcessingHitsMutex, std::defer_lock};
+    bool haveNewHits = false;
+
+    while (std::any_of(fBuffers.begin(), fBuffers.end(),
+                       [](auto &buffer) { return buffer.state >= BufferHandle::State::TransferToHost; })) {
+      for (auto &handle : fBuffers) {
+        if (handle.state == BufferHandle::State::NeedHostProcessing) {
+          if (!lock) lock.lock();
+          haveNewHits = true;
+          ProcessBuffer(handle);
+        }
+      }
+    }
+
+    return haveNewHits;
+  }
+
+  bool ReadyToSwapBuffers() const
+  {
+    return std::any_of(fBuffers.begin(), fBuffers.end(),
+                       [](const auto &handle) { return handle.state == BufferHandle::State::Free; });
+  }
+
+  /// Copy the current contents of the GPU hit buffer to host.
+  void TransferHitsToHost(cudaStream_t cudaStreamForHitCopy)
+  {
+    for (auto &buffer : fBuffers) {
+      if (buffer.state != BufferHandle::State::OnDeviceNeedTransferToHost) continue;
+
+      buffer.state = BufferHandle::State::TransferToHost;
+      assert(buffer.hitScoringInfo.fSlotCounter < fHitCapacity);
+
+      auto bufferBegin = buffer.hitScoringInfo.hitBuffer_dev;
+
+      cub::DeviceMergeSort::SortKeys(fGPUSortAuxMemory.get(), fGPUSortAuxMemorySize, bufferBegin,
+                                     buffer.hitScoringInfo.fSlotCounter, CompareGPUHits{}, cudaStreamForHitCopy);
+
+      COPCORE_CUDA_CHECK(cudaMemcpyAsync(buffer.hostBuffer, bufferBegin,
+                                         sizeof(GPUHit) * buffer.hitScoringInfo.fSlotCounter, cudaMemcpyDefault,
+                                         cudaStreamForHitCopy));
+      COPCORE_CUDA_CHECK(cudaLaunchHostFunc(
+          cudaStreamForHitCopy,
+          [](void *arg) { static_cast<BufferHandle *>(arg)->state = BufferHandle::State::NeedHostProcessing; },
+          &buffer));
+    }
+  }
+
+  std::shared_ptr<const std::vector<GPUHit>> GetNextHitsVector(unsigned int threadId)
+  {
+    assert(threadId < fHitQueues.size());
+    std::shared_lock lock{fProcessingHitsMutex};
+
+    if (fHitQueues[threadId].empty())
+      return nullptr;
+    else {
+      auto ret = fHitQueues[threadId].front();
+      fHitQueues[threadId].pop_front();
+      return ret;
+    }
+  }
+};
+
+// Implement Cuda-dependent functionality from PerEventScoring
+
+void PerEventScoring::ClearGPU(cudaStream_t cudaStream)
+{
+  COPCORE_CUDA_CHECK(cudaMemsetAsync(fScoring_dev, 0, sizeof(GlobalCounters), cudaStream));
+  COPCORE_CUDA_CHECK(cudaStreamSynchronize(cudaStream));
+}
+
+void PerEventScoring::CopyToHost(cudaStream_t cudaStream)
+{
+  const auto oldPointer = fScoring_dev;
+  COPCORE_CUDA_CHECK(
+      cudaMemcpyAsync(&fGlobalCounters, fScoring_dev, sizeof(GlobalCounters), cudaMemcpyDeviceToHost, cudaStream));
+  COPCORE_CUDA_CHECK(cudaStreamSynchronize(cudaStream));
+  assert(oldPointer == fScoring_dev);
+  (void)oldPointer;
+}
+
+} // namespace AsyncAdePT
+
+namespace adept_scoring {
+
+/// @brief Utility function to copy a 3D vector, used for filling the Step Points
+__device__ __forceinline__ void Copy3DVector(vecgeom::Vector3D<Precision> const *source,
+                                             vecgeom::Vector3D<Precision> *destination)
+{
+  destination->x() = source->x();
+  destination->y() = source->y();
+  destination->z() = source->z();
+}
+
+/// @brief Record a hit
+template <>
+__device__ void RecordHit(AsyncAdePT::PerEventScoring * /*scoring*/, int aParentID, char aParticleType,
+                          double aStepLength, double aTotalEnergyDeposit, vecgeom::NavigationState const *aPreState,
+                          vecgeom::Vector3D<Precision> const *aPrePosition,
+                          vecgeom::Vector3D<Precision> const *aPreMomentumDirection,
+                          vecgeom::Vector3D<Precision> const * /*aPrePolarization*/, double aPreEKin, double aPreCharge,
+                          vecgeom::NavigationState const *aPostState, vecgeom::Vector3D<Precision> const *aPostPosition,
+                          vecgeom::Vector3D<Precision> const *aPostMomentumDirection,
+                          vecgeom::Vector3D<Precision> const * /*aPostPolarization*/, double aPostEKin,
+                          double aPostCharge, unsigned int eventID, short threadID)
+{
+  // Acquire a hit slot
+  GPUHit &aGPUHit  = AsyncAdePT::gHitScoringBuffer_dev.GetNextSlot();
+  aGPUHit.fEventId = eventID;
+  aGPUHit.threadId = threadID;
+
+  // Fill the required data
+  aGPUHit.fParentID           = aParentID;
+  aGPUHit.fParticleType       = aParticleType;
+  aGPUHit.fStepLength         = aStepLength;
+  aGPUHit.fTotalEnergyDeposit = aTotalEnergyDeposit;
+  // Pre step point
+  aGPUHit.fPreStepPoint.fNavigationState = *aPreState;
+  Copy3DVector(aPrePosition, &(aGPUHit.fPreStepPoint.fPosition));
+  Copy3DVector(aPreMomentumDirection, &(aGPUHit.fPreStepPoint.fMomentumDirection));
+  // Copy3DVector(aPrePolarization, aGPUHit.fPreStepPoint.fPolarization);
+  aGPUHit.fPreStepPoint.fEKin   = aPreEKin;
+  aGPUHit.fPreStepPoint.fCharge = aPreCharge;
+  // Post step point
+  aGPUHit.fPostStepPoint.fNavigationState = *aPostState;
+  Copy3DVector(aPostPosition, &(aGPUHit.fPostStepPoint.fPosition));
+  Copy3DVector(aPostMomentumDirection, &(aGPUHit.fPostStepPoint.fMomentumDirection));
+  // Copy3DVector(aPostPolarization, aGPUHit.fPostStepPoint.fPolarization);
+  aGPUHit.fPostStepPoint.fEKin   = aPostEKin;
+  aGPUHit.fPostStepPoint.fCharge = aPostCharge;
+}
+
+/// @brief Account for the number of produced secondaries
+/// @details Atomically increase the number of produced secondaries.
+template <>
+__device__ void AccountProduced(AsyncAdePT::PerEventScoring *scoring, int num_ele, int num_pos, int num_gam)
+{
+  atomicAdd(&scoring->fGlobalCounters.numElectrons, num_ele);
+  atomicAdd(&scoring->fGlobalCounters.numPositrons, num_pos);
+  atomicAdd(&scoring->fGlobalCounters.numGammas, num_gam);
+}
+
+} // namespace adept_scoring
+
+#endif
diff --git a/include/AdePT/core/PerEventScoringStruct.cuh b/include/AdePT/core/PerEventScoringStruct.cuh
new file mode 100644
index 00000000..de19b626
--- /dev/null
+++ b/include/AdePT/core/PerEventScoringStruct.cuh
@@ -0,0 +1,35 @@
+// SPDX-FileCopyrightText: 2024 CERN
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef PER_EVENT_SCORING_STRUCT_CUH
+#define PER_EVENT_SCORING_STRUCT_CUH
+
+#include <AdePT/core/ScoringCommons.hh>
+
+namespace AsyncAdePT {
+
+struct PerEventScoring {
+  GlobalCounters fGlobalCounters;
+  PerEventScoring *const fScoring_dev;
+
+  PerEventScoring(PerEventScoring *gpuScoring) : fScoring_dev{gpuScoring} { 
+    ClearGPU(); 
+  }
+  PerEventScoring(PerEventScoring &&other) = default;
+  ~PerEventScoring()                       = default;
+
+  /// @brief Copy hits to host for a single event
+  void CopyToHost(cudaStream_t cudaStream = 0);
+
+  /// @brief Clear hits on device to reuse for next event
+  void ClearGPU(cudaStream_t cudaStream = 0);
+
+  /// @brief Print scoring info
+  void Print() { fGlobalCounters.Print(); };
+};
+
+}
+
+using AdePTScoring = AsyncAdePT::PerEventScoring;
+
+#endif
\ No newline at end of file
diff --git a/include/AdePT/core/ScoringCommons.hh b/include/AdePT/core/ScoringCommons.hh
new file mode 100644
index 00000000..d686ceb8
--- /dev/null
+++ b/include/AdePT/core/ScoringCommons.hh
@@ -0,0 +1,58 @@
+// SPDX-FileCopyrightText: 2024 CERN
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SCORING_COMMONS_HH
+#define SCORING_COMMONS_HH
+
+#include "VecGeom/navigation/NavigationState.h"
+
+struct GPUStepPoint {
+  vecgeom::Vector3D<Precision> fPosition;
+  vecgeom::Vector3D<Precision> fMomentumDirection;
+  vecgeom::Vector3D<Precision> fPolarization;
+  double fEKin;
+  double fCharge;
+  // Data needed to reconstruct G4 Touchable history
+  vecgeom::NavigationState fNavigationState{0}; // VecGeom navigation state, used to identify the touchable
+};
+
+// Stores the necessary data to reconstruct GPU hits on the host , and
+// call the user-defined Geant4 sensitive detector code
+struct GPUHit {
+  int fParentID{0}; // Track ID
+  // Data needed to reconstruct G4 Step
+  double fStepLength{0};
+  double fTotalEnergyDeposit{0};
+  double fNonIonizingEnergyDeposit{0};
+  // bool fFirstStepInVolume{false};
+  // bool fLastStepInVolume{false};
+  // Data needed to reconstruct pre-post step points
+  GPUStepPoint fPreStepPoint;
+  GPUStepPoint fPostStepPoint;
+  unsigned int fEventId{0};
+  short threadId{-1};
+  char fParticleType{0}; // Particle type ID
+};
+
+/// @brief Stores information used for comparison with Geant4 (Number of steps, Number of produced particles, etc)
+struct GlobalCounters {
+  double energyDeposit;
+  // Not int to avoid overflows for more than 100,000 events; unsigned long long
+  // is the only other data type available for atomicAdd().
+  unsigned long long chargedSteps;
+  unsigned long long neutralSteps;
+  unsigned long long hits;
+  unsigned long long numGammas;
+  unsigned long long numElectrons;
+  unsigned long long numPositrons;
+  // Not used on the device, filled in by the host.
+  unsigned long long numKilled;
+
+  void Print()
+  {
+    printf("Global scoring: stpChg=%llu stpNeu=%llu hits=%llu numGam=%llu numEle=%llu numPos=%llu numKilled=%llu\n",
+           chargedSteps, neutralSteps, hits, numGammas, numElectrons, numPositrons, numKilled);
+  }
+};
+
+#endif
diff --git a/include/AdePT/core/TrackData.h b/include/AdePT/core/TrackData.h
index 1808463a..ddef7fce 100644
--- a/include/AdePT/core/TrackData.h
+++ b/include/AdePT/core/TrackData.h
@@ -26,9 +26,9 @@ struct TrackData {
 
   TrackData() = default;
   TrackData(int pdg_id, int parentID, double ene, double x, double y, double z, double dirx, double diry, double dirz,
-            double gTime, double lTime, double pTime, vecgeom::NavigationState state)
-      : navState{state}, position{x, y, z}, direction{dirx, diry, dirz}, eKin{ene}, globalTime{gTime}, localTime{lTime},
-        properTime{pTime}, pdg{pdg_id}, parentID{parentID}
+            double gTime, double lTime, double pTime, vecgeom::NavigationState &&state)
+      : navState{std::move(state)}, position{x, y, z}, direction{dirx, diry, dirz}, eKin{ene}, globalTime{gTime},
+        localTime{lTime}, properTime{pTime}, pdg{pdg_id}, parentID{parentID}
   {
   }
 
diff --git a/include/AdePT/integration/AdePTGeant4Integration.hh b/include/AdePT/integration/AdePTGeant4Integration.hh
index 70c1e744..5dea67cf 100644
--- a/include/AdePT/integration/AdePTGeant4Integration.hh
+++ b/include/AdePT/integration/AdePTGeant4Integration.hh
@@ -28,8 +28,6 @@ struct Deleter {
 
 class AdePTGeant4Integration {
 public:
-  static constexpr G4int kAdePTTrackID =
-      std::numeric_limits<G4int>::min() + 2; // TrackID to signify that the track came from AdePT
   AdePTGeant4Integration() = default;
   ~AdePTGeant4Integration();
 
diff --git a/include/AdePT/integration/AdePTTrackingManager.hh b/include/AdePT/integration/AdePTTrackingManager.hh
index ed7def56..69cddda0 100644
--- a/include/AdePT/integration/AdePTTrackingManager.hh
+++ b/include/AdePT/integration/AdePTTrackingManager.hh
@@ -9,7 +9,11 @@
 
 #include "globals.hh"
 #include <AdePT/core/AdePTTransportInterface.hh>
+#ifndef ASYNC_MODE
 #include <AdePT/core/AdePTTransport.h>
+#else
+#include <AdePT/core/AsyncAdePTTransport.hh>
+#endif
 #include "AdePT/copcore/SystemOfUnits.h"
 #include <AdePT/integration/AdePTGeant4Integration.hh>
 #include <AdePT/core/AdePTConfiguration.hh>
@@ -60,7 +64,12 @@ private:
   static inline int fNumThreads{0};
   std::set<G4Region const *> fGPURegions{};
   int fVerbosity{0};
+  // std::shared_ptr<AdePTTransportInterface> fAdeptTransport;
+  #ifndef ASYNC_MODE
+  std::unique_ptr<AdePTTransportInterface> fAdeptTransport;
+  #else
   std::shared_ptr<AdePTTransportInterface> fAdeptTransport;
+  #endif
   AdePTConfiguration *fAdePTConfiguration;
   unsigned int fTrackCounter{0};
   int fCurrentEventID{0};
diff --git a/examples/AsyncExample/electrons.cuh b/include/AdePT/kernels/electrons_async.cuh
similarity index 80%
rename from examples/AsyncExample/electrons.cuh
rename to include/AdePT/kernels/electrons_async.cuh
index e48e4f88..38abfe1f 100644
--- a/examples/AsyncExample/electrons.cuh
+++ b/include/AdePT/kernels/electrons_async.cuh
@@ -1,7 +1,7 @@
-// SPDX-FileCopyrightText: 2022 CERN
-// SPDX-License-Identifier: Apache-2.0
+// // SPDX-FileCopyrightText: 2022 CERN
+// // SPDX-License-Identifier: Apache-2.0
 
-#include "AdeptIntegration.cuh"
+#include <AdePT/core/AsyncAdePTTransportStruct.cuh>
 
 #include <AdePT/navigation/BVHNavigator.h>
 #include <AdePT/magneticfield/fieldPropagatorConstBz.h>
@@ -9,8 +9,6 @@
 #include <AdePT/copcore/PhysicalConstants.h>
 #include <AdePT/core/AdePTScoringTemplate.cuh>
 
-#define NOFLUCTUATION
-
 #include <G4HepEmElectronManager.hh>
 #include <G4HepEmElectronTrack.hh>
 #include <G4HepEmElectronInteractionBrem.hh>
@@ -66,23 +64,24 @@ static __device__ __forceinline__ void TransportElectrons(Track *electrons, cons
     const int slot      = (*active)[i];
     Track &currentTrack = electrons[slot];
     currentTrack.stepCounter++; // step counter is already increased below when checking for maxSteps
-    auto eKin                = currentTrack.energy;
+    auto eKin                = currentTrack.eKin;
     const auto preStepEnergy = eKin;
-    auto pos            = currentTrack.pos;
+    auto pos                 = currentTrack.pos;
     const auto preStepPos{pos};
-    auto dir            = currentTrack.dir;
+    auto dir = currentTrack.dir;
     vecgeom::Vector3D<Precision> preStepDir(dir);
-    auto navState       = currentTrack.navState;
-    const auto volume   = navState.Top();
+    auto navState     = currentTrack.navState;
+    const auto volume = navState.Top();
     // the MCC vector is indexed by the logical volume id
-    const int lvolID          = volume->GetLogicalVolume()->id();
+    const int lvolID = volume->GetLogicalVolume()->id();
+    // TODO: Why do we have a specific AsyncAdePT VolAuxData?
     VolAuxData const &auxData = AsyncAdePT::gVolAuxData[lvolID];
 
     assert(auxData.fGPUregion > 0); // make sure we don't get inconsistent region here
     SlotManager &slotManager = IsElectron ? *secondaries.electrons.fSlotManager : *secondaries.positrons.fSlotManager;
 
     auto survive = [&](bool leak = false) {
-      currentTrack.energy   = eKin;
+      currentTrack.eKin     = eKin;
       currentTrack.pos      = pos;
       currentTrack.dir      = dir;
       currentTrack.navState = navState;
@@ -97,7 +96,7 @@ static __device__ __forceinline__ void TransportElectrons(Track *electrons, cons
                       "evt=%d thread=%d "
                       "e=%f safety=%f out of GPU\n",
              __LINE__, IsElectron ? -11 : 11, lvolID, pos[0], pos[1], pos[2], dir[0], dir[1], dir[2],
-             currentTrack.eventId, currentTrack.threadId, currentTrack.energy,
+             currentTrack.eventId, currentTrack.threadId, currentTrack.eKin,
              BVHNavigator::ComputeSafety(pos, navState));
       // pos += kPushOutRegion * dir;
       // survive(true);
@@ -138,11 +137,12 @@ static __device__ __forceinline__ void TransportElectrons(Track *electrons, cons
     G4HepEmRandomEngine rnge(&currentTrack.rngState);
 
     // Sample the `number-of-interaction-left` and put it into the track.
-    for (int ip = 0; ip < 3; ++ip) {
+    for (int ip = 0; ip < 4; ++ip) {
       double numIALeft = currentTrack.numIALeft[ip];
       if (numIALeft <= 0) {
         numIALeft = -std::log(currentTrack.Uniform());
       }
+      if (ip == 3) numIALeft = vecgeom::kInfLength; // suppress lepton nuclear by infinite length
       theTrack->SetNumIALeft(numIALeft, ip);
     }
 
@@ -188,13 +188,14 @@ static __device__ __forceinline__ void TransportElectrons(Track *electrons, cons
     // also need to carry them over!
 
     // Check if there's a volume boundary in between.
-    bool propagated = true;
+    bool propagated    = true;
     long hitsurf_index = -1;
     double geometryStepLength;
-    vecgeom::NavStateIndex nextState;
+    vecgeom::NavigationState nextState;
     if (BzFieldValue != 0) {
       geometryStepLength = fieldPropagatorBz.ComputeStepAndNextVolume<BVHNavigator>(
-          eKin, restMass, Charge, geometricalStepLengthFromPhysics, pos, dir, navState, nextState, propagated, safety, /*max_i_in_stepper=*/10);
+          eKin, restMass, Charge, geometricalStepLengthFromPhysics, pos, dir, navState, nextState, hitsurf_index,
+          propagated, safety, /*max_i_in_stepper=*/10);
     } else {
       geometryStepLength = BVHNavigator::ComputeStepAndNextVolume(pos, dir, geometricalStepLengthFromPhysics, navState,
                                                                   nextState, kPush);
@@ -263,6 +264,18 @@ static __device__ __forceinline__ void TransportElectrons(Track *electrons, cons
     currentTrack.localTime += deltaTime;
     currentTrack.properTime += deltaTime * (restMass / eKin);
 
+    if (nextState.IsOnBoundary()) {
+      // if the particle hit a boundary, and is neither stopped or outside, relocate to have the correct next state
+      // before RecordHit is called
+      if (!stopped && !nextState.IsOutside()) {
+#ifdef ADEPT_USE_SURF
+        AdePTNavigator::RelocateToNextVolume(pos, dir, hitsurf_index, nextState);
+#else
+        AdePTNavigator::RelocateToNextVolume(pos, dir, nextState);
+#endif
+      }
+    }
+
     if (auxData.fSensIndex >= 0)
       adept_scoring::RecordHit(&userScoring[currentTrack.threadId], currentTrack.parentId,
                                static_cast<char>(IsElectron ? 0 : 1), // Particle type
@@ -283,7 +296,7 @@ static __device__ __forceinline__ void TransportElectrons(Track *electrons, cons
                                currentTrack.eventId, currentTrack.threadId);
 
     // Save the `number-of-interaction-left` in our track.
-    for (int ip = 0; ip < 3; ++ip) {
+    for (int ip = 0; ip < 4; ++ip) {
       double numIALeft           = theTrack->GetNumIALeft(ip);
       currentTrack.numIALeft[ip] = numIALeft;
     }
@@ -316,26 +329,19 @@ static __device__ __forceinline__ void TransportElectrons(Track *electrons, cons
     }
 
     if (nextState.IsOnBoundary()) {
-      if (++currentTrack.looperCounter > 256) {
-        // Kill loopers that are scraping a boundary
-        printf("Looper scraping a boundary going to G4: E=%E event=%d loop=%d energyDeposit=%E geoStepLength=%E "
-               "physicsStepLength=%E "
-               "safety=%E\n",
-               eKin, currentTrack.eventId, currentTrack.looperCounter, energyDeposit, geometryStepLength,
-               geometricalStepLengthFromPhysics, safety);
-        atomicAdd(&userScoring[currentTrack.threadId].fGlobalCounters.numKilled, 1);
-        // survive(/*leak=*/true); // don't send particles back at this point, G4 seems to be struggling a lot with those tracks too!
-        continue;
-      } else if (nextState.Top() != nullptr) {
-        // Go to next volume
-        BVHNavigator::RelocateToNextVolume(pos, dir, nextState);
+      // For now, just count that we hit something.
+
+      // Kill the particle if it left the world.
+      if (!nextState.IsOutside()) {
 
         // Move to the next boundary.
         navState = nextState;
-        currentTrack.looperCounter = 0;
         // Check if the next volume belongs to the GPU region and push it to the appropriate queue
-        const auto nextvolume         = navState.Top();
-        const int nextlvolID          = nextvolume->GetLogicalVolume()->id();
+#ifndef ADEPT_USE_SURF
+        const int nextlvolID = navState.Top()->GetLogicalVolume()->id();
+#else
+        const int nextlvolID = navState.GetLogicalId();
+#endif
         VolAuxData const &nextauxData = AsyncAdePT::gVolAuxData[nextlvolID];
         if (nextauxData.fGPUregion > 0)
           survive();
@@ -353,27 +359,77 @@ static __device__ __forceinline__ void TransportElectrons(Track *electrons, cons
     } else if (!propagated || restrictedPhysicalStepLength) {
       // Did not yet reach the interaction point due to error in the magnetic
       // field propagation. Try again next time.
-      if (++currentTrack.looperCounter > 1024) {
-        // Kill loopers that are failing to propagate
-        printf("Looper with trouble in field propagation going to G4: E=%E event=%d loop=%d energyDeposit=%E "
-               "geoStepLength=%E physicsStepLength=%E "
-               "safety=%E\n",
-               eKin, currentTrack.eventId, currentTrack.looperCounter, energyDeposit, geometryStepLength,
-               geometricalStepLengthFromPhysics, safety);
-        atomicAdd(&userScoring[currentTrack.threadId].fGlobalCounters.numKilled, 1);
-        // survive(/*leak=*/true);  // don't send particles back at this point, G4 seems to be struggling a lot with those tracks too!
-
-      } else {
-        survive();
-      }
+      survive();
       continue;
     } else if (winnerProcessIndex < 0) {
       // No discrete process, move on.
-      currentTrack.looperCounter = 0;
       survive();
       continue;
     }
-    currentTrack.looperCounter = 0;
+
+    // if (nextState.IsOnBoundary()) {
+    //   // TODO: We need to return these particles to G4, also check if we
+    //   // want to have this logic here
+    //   if (++currentTrack.looperCounter > 256) {
+    //     // Kill loopers that are scraping a boundary
+    //     printf("Looper scraping a boundary going to G4: E=%E event=%d loop=%d energyDeposit=%E geoStepLength=%E "
+    //            "physicsStepLength=%E "
+    //            "safety=%E\n",
+    //            eKin, currentTrack.eventId, currentTrack.looperCounter, energyDeposit, geometryStepLength,
+    //            geometricalStepLengthFromPhysics, safety);
+    //     atomicAdd(&userScoring[currentTrack.threadId].fGlobalCounters.numKilled, 1);
+    //     // survive(/*leak=*/true); // don't send particles back at this point, G4 seems to be struggling a lot with
+    //     // those tracks too!
+    //     continue;
+    //   } else if (nextState.Top() != nullptr) {
+    //     // Go to next volume
+    //     BVHNavigator::RelocateToNextVolume(pos, dir, nextState);
+
+    //     // Move to the next boundary.
+    //     navState                   = nextState;
+    //     currentTrack.looperCounter = 0;
+    //     // Check if the next volume belongs to the GPU region and push it to the appropriate queue
+    //     const auto nextvolume         = navState.Top();
+    //     const int nextlvolID          = nextvolume->GetLogicalVolume()->id();
+    //     VolAuxData const &nextauxData = AsyncAdePT::gVolAuxData[nextlvolID];
+    //     if (nextauxData.fGPUregion > 0)
+    //       survive();
+    //     else {
+    //       // To be safe, just push a bit the track exiting the GPU region to make sure
+    //       // Geant4 does not relocate it again inside the same region
+    //       pos += kPushOutRegion * dir;
+    //       survive(/*leak*/ true);
+    //     }
+    //   } else {
+    //     // Kill the particle if it left the world.
+    //     slotManager.MarkSlotForFreeing(slot);
+    //   }
+    //   continue;
+    // } else if (!propagated || restrictedPhysicalStepLength) {
+    //   // Did not yet reach the interaction point due to error in the magnetic
+    //   // field propagation. Try again next time.
+    //   if (++currentTrack.looperCounter > 1024) {
+    //     // Kill loopers that are failing to propagate
+    //     printf("Looper with trouble in field propagation going to G4: E=%E event=%d loop=%d energyDeposit=%E "
+    //            "geoStepLength=%E physicsStepLength=%E "
+    //            "safety=%E\n",
+    //            eKin, currentTrack.eventId, currentTrack.looperCounter, energyDeposit, geometryStepLength,
+    //            geometricalStepLengthFromPhysics, safety);
+    //     atomicAdd(&userScoring[currentTrack.threadId].fGlobalCounters.numKilled, 1);
+    //     // survive(/*leak=*/true);  // don't send particles back at this point, G4 seems to be struggling a lot with
+    //     // those tracks too!
+
+    //   } else {
+    //     survive();
+    //   }
+    //   continue;
+    // } else if (winnerProcessIndex < 0) {
+    //   // No discrete process, move on.
+    //   currentTrack.looperCounter = 0;
+    //   survive();
+    //   continue;
+    // }
+    // currentTrack.looperCounter = 0;
 
     // Reset number of interaction left for the winner discrete process.
     // (Will be resampled in the next iteration.)
@@ -465,6 +521,11 @@ static __device__ __forceinline__ void TransportElectrons(Track *electrons, cons
       slotManager.MarkSlotForFreeing(slot);
       break;
     }
+    case 3: {
+      // leptop-nuclear needs to be handled by G4, just keep the track
+      survive();
+      break;
+    }
     }
   }
 }
diff --git a/examples/AsyncExample/gammas.cuh b/include/AdePT/kernels/gammas_async.cuh
similarity index 84%
rename from examples/AsyncExample/gammas.cuh
rename to include/AdePT/kernels/gammas_async.cuh
index 9cccecc3..840e07c7 100644
--- a/examples/AsyncExample/gammas.cuh
+++ b/include/AdePT/kernels/gammas_async.cuh
@@ -1,7 +1,7 @@
 // SPDX-FileCopyrightText: 2022 CERN
 // SPDX-License-Identifier: Apache-2.0
 
-#include "AdeptIntegration.cuh"
+#include <AdePT/core/AsyncAdePTTransportStruct.cuh>
 
 #include <AdePT/navigation/BVHNavigator.h>
 
@@ -35,17 +35,17 @@ __global__ void __launch_bounds__(256, 1)
   constexpr Precision kPushOutRegion = 10 * vecgeom::kTolerance;
   const int activeSize               = active->size();
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < activeSize; i += blockDim.x * gridDim.x) {
-    const int slot            = (*active)[i];
-    Track &currentTrack       = gammas[slot];
-    const auto energy         = currentTrack.energy;
-    const auto preStepEnergy  = energy;
-    auto pos = currentTrack.pos;
+    const int slot           = (*active)[i];
+    Track &currentTrack      = gammas[slot];
+    const auto eKin          = currentTrack.eKin;
+    const auto preStepEnergy = eKin;
+    auto pos                 = currentTrack.pos;
     const auto preStepPos{pos};
-    const auto dir            = currentTrack.dir;
+    const auto dir = currentTrack.dir;
     const auto preStepDir{dir};
-    auto navState             = currentTrack.navState;
+    auto navState              = currentTrack.navState;
     const auto preStepNavState = navState;
-    VolAuxData const &auxData = AsyncAdePT::gVolAuxData[currentTrack.navState.Top()->GetLogicalVolume()->id()];
+    VolAuxData const &auxData  = AsyncAdePT::gVolAuxData[currentTrack.navState.Top()->GetLogicalVolume()->id()];
     assert(auxData.fGPUregion > 0); // make sure we don't get inconsistent region here
     auto &slotManager = *secondaries.gammas.fSlotManager;
 
@@ -60,16 +60,16 @@ __global__ void __launch_bounds__(256, 1)
     // Init a track with the needed data to call into G4HepEm.
     G4HepEmGammaTrack gammaTrack;
     G4HepEmTrack *theTrack = gammaTrack.GetTrack();
-    theTrack->SetEKin(energy);
+    theTrack->SetEKin(eKin);
     theTrack->SetMCIndex(auxData.fMCIndex);
 
-    // Sample the `number-of-interaction-left` and put it into the track.
-    for (int ip = 0; ip < 3; ++ip) {
-      double numIALeft = currentTrack.numIALeft[ip];
-      if (numIALeft <= 0) {
-        numIALeft = -std::log(currentTrack.Uniform());
-      }
-      theTrack->SetNumIALeft(numIALeft, ip);
+    // Re-sample the `number-of-interaction-left` (if needed, otherwise use stored numIALeft) and put it into the
+    // G4HepEmTrack. Use index 0 since numIALeft for gammas is based only on the total macroscopic cross section. The
+    // currentTrack.numIALeft[0] are updated later
+    if (currentTrack.numIALeft[0] <= 0.0) {
+      theTrack->SetNumIALeft(-std::log(currentTrack.Uniform()), 0);
+    } else {
+      theTrack->SetNumIALeft(currentTrack.numIALeft[0], 0);
     }
 
     // Call G4HepEm to compute the physics step limit.
@@ -82,7 +82,7 @@ __global__ void __launch_bounds__(256, 1)
     // also need to carry them over!
 
     // Check if there's a volume boundary in between.
-    vecgeom::NavStateIndex nextState;
+    vecgeom::NavigationState nextState;
     double geometryStepLength =
         BVHNavigator::ComputeStepAndNextVolume(pos, dir, geometricalStepLengthFromPhysics, navState, nextState, kPush);
     pos += geometryStepLength * dir;
@@ -96,17 +96,21 @@ __global__ void __launch_bounds__(256, 1)
     theTrack->SetGStepLength(geometryStepLength);
     theTrack->SetOnBoundary(nextState.IsOnBoundary());
 
-    G4HepEmGammaManager::UpdateNumIALeft(theTrack);
-
-    // Save the `number-of-interaction-left` in our track.
-    for (int ip = 0; ip < 3; ++ip) {
-      double numIALeft           = theTrack->GetNumIALeft(ip);
-      currentTrack.numIALeft[ip] = numIALeft;
-    }
+    // Update the flight times of the particle
+    const double deltaTime = theTrack->GetGStepLength() / copcore::units::kCLight;
+    currentTrack.globalTime += deltaTime;
+    currentTrack.localTime += deltaTime;
 
     if (nextState.IsOnBoundary()) {
       // Kill the particle if it left the world.
       if (nextState.Top() != nullptr) {
+
+        G4HepEmGammaManager::UpdateNumIALeft(theTrack);
+
+        // Save the `number-of-interaction-left` in our track.
+        double numIALeft          = theTrack->GetNumIALeft(0);
+        currentTrack.numIALeft[0] = numIALeft; // double to float conversion
+
         BVHNavigator::RelocateToNextVolume(pos, dir, nextState);
 
         // Move to the next boundary.
@@ -127,21 +131,25 @@ __global__ void __launch_bounds__(256, 1)
         slotManager.MarkSlotForFreeing(slot);
       }
       continue;
-    } else if (winnerProcessIndex < 0) {
-      // No discrete process, move on.
-      survive(activeQueue);
-      continue;
+    } else {
+
+      G4HepEmGammaManager::SampleInteraction(&g4HepEmData, &gammaTrack, currentTrack.Uniform());
+      winnerProcessIndex = theTrack->GetWinnerProcessIndex();
+      // NOTE: no simple re-drawing is possible for gamma-nuclear, since HowFar returns now smaller steps due to the
+      // gamma-nuclear reactions in comparison to without gamma-nuclear reactions. Thus, an empty step without a
+      // reaction is needed to compensate for the smaller step size returned by HowFar.
+
+      // Reset number of interaction left for the winner discrete process also in the currentTrack (SampleInteraction()
+      // resets it for theTrack), will be resampled in the next iteration.
+      currentTrack.numIALeft[0] = -1.0;
+
+      if (winnerProcessIndex == 3) {
+        // Gamma-nuclear must be handled by G4, move on.
+        survive(activeQueue);
+        continue;
+      }
     }
 
-    // Reset number of interaction left for the winner discrete process.
-    // (Will be resampled in the next iteration.)
-    currentTrack.numIALeft[winnerProcessIndex] = -1.0;
-
-    // Update the flight times of the particle
-    const double deltaTime = theTrack->GetGStepLength() / copcore::units::kCLight;
-    currentTrack.globalTime += deltaTime;
-    currentTrack.localTime += deltaTime;
-
     assert(winnerProcessIndex < gammaInteractions.NInt);
 
     // Enqueue track in special interaction queue
@@ -174,7 +182,7 @@ __global__ void __launch_bounds__(256, 1)
       const double geometryStepLength = queue[i].geometryStepLength;
       Track &currentTrack             = gammas[slot];
       auto &slotManager               = *secondaries.gammas.fSlotManager;
-      const auto energy               = currentTrack.energy;
+      const auto eKin                 = currentTrack.eKin;
       const auto &dir                 = currentTrack.dir;
 
       VolAuxData const &auxData = AsyncAdePT::gVolAuxData[currentTrack.navState.Top()->GetLogicalVolume()->id()];
@@ -188,14 +196,14 @@ __global__ void __launch_bounds__(256, 1)
 
       if (interactionType == GammaInteractions::PairCreation) {
         // Invoke gamma conversion to e-/e+ pairs, if the energy is above the threshold.
-        if (energy < 2 * copcore::units::kElectronMassC2) {
+        if (eKin < 2 * copcore::units::kElectronMassC2) {
           survive();
           continue;
         }
 
-        const double logEnergy = std::log(energy);
+        const double logEnergy = std::log(eKin);
         double elKinEnergy, posKinEnergy;
-        G4HepEmGammaInteractionConversion::SampleKinEnergies(&g4HepEmData, energy, logEnergy, auxData.fMCIndex,
+        G4HepEmGammaInteractionConversion::SampleKinEnergies(&g4HepEmData, eKin, logEnergy, auxData.fMCIndex,
                                                              elKinEnergy, posKinEnergy, &rnge);
 
         double dirPrimary[] = {dir.x(), dir.y(), dir.z()};
@@ -223,24 +231,24 @@ __global__ void __launch_bounds__(256, 1)
       if (interactionType == GammaInteractions::ComptonScattering) {
         // Invoke Compton scattering of gamma.
         constexpr double LowEnergyThreshold = 100 * copcore::units::eV;
-        if (energy < LowEnergyThreshold) {
+        if (eKin < LowEnergyThreshold) {
           survive();
           continue;
         }
         const double origDirPrimary[] = {dir.x(), dir.y(), dir.z()};
         double dirPrimary[3];
         const double newEnergyGamma =
-            G4HepEmGammaInteractionCompton::SamplePhotonEnergyAndDirection(energy, dirPrimary, origDirPrimary, &rnge);
+            G4HepEmGammaInteractionCompton::SamplePhotonEnergyAndDirection(eKin, dirPrimary, origDirPrimary, &rnge);
         vecgeom::Vector3D<double> newDirGamma(dirPrimary[0], dirPrimary[1], dirPrimary[2]);
 
-        const double energyEl = energy - newEnergyGamma;
+        const double energyEl = eKin - newEnergyGamma;
         if (energyEl > LowEnergyThreshold) {
           // Create a secondary electron and sample/compute directions.
           adept_scoring::AccountProduced(userScoring + currentTrack.threadId, /*numElectrons*/ 1, /*numPositrons*/ 0,
                                          /*numGammas*/ 0);
 
           Track &electron = secondaries.electrons.NextTrack(newRNG, energyEl, currentTrack.pos,
-                                                            energy * dir - newEnergyGamma * newDirGamma,
+                                                            eKin * dir - newEnergyGamma * newDirGamma,
                                                             currentTrack.navState, currentTrack);
           electron.dir.Normalize();
         } else {
@@ -266,8 +274,8 @@ __global__ void __launch_bounds__(256, 1)
 
         // Check the new gamma energy and deposit if below threshold.
         if (newEnergyGamma > LowEnergyThreshold) {
-          currentTrack.energy = newEnergyGamma;
-          currentTrack.dir    = newDirGamma;
+          currentTrack.eKin = newEnergyGamma;
+          currentTrack.dir  = newDirGamma;
           survive();
         } else {
           if (auxData.fSensIndex >= 0)
@@ -299,10 +307,10 @@ __global__ void __launch_bounds__(256, 1)
         constexpr double theLowEnergyThreshold = 1 * copcore::units::eV;
 
         const double bindingEnergy = G4HepEmGammaInteractionPhotoelectric::SelectElementBindingEnergy(
-            &g4HepEmData, auxData.fMCIndex, queue[i].PEmxSec, energy, &rnge);
+            &g4HepEmData, auxData.fMCIndex, queue[i].PEmxSec, eKin, &rnge);
 
         double edep             = bindingEnergy;
-        const double photoElecE = energy - edep;
+        const double photoElecE = eKin - edep;
         if (photoElecE > theLowEnergyThreshold) {
           // Create a secondary electron and sample directions.
           adept_scoring::AccountProduced(userScoring + currentTrack.threadId, /*numElectrons*/ 1, /*numPositrons*/ 0,
@@ -317,7 +325,7 @@ __global__ void __launch_bounds__(256, 1)
               vecgeom::Vector3D<Precision>{dirPhotoElec[0], dirPhotoElec[1], dirPhotoElec[2]}, currentTrack.navState,
               currentTrack);
         } else {
-          edep = energy;
+          edep = eKin;
         }
         if (auxData.fSensIndex >= 0)
           adept_scoring::RecordHit(userScoring + currentTrack.threadId, currentTrack.parentId,
diff --git a/include/AdePT/navigation/AdePTNavigator.h b/include/AdePT/navigation/AdePTNavigator.h
index 8c1be581..1e558b43 100644
--- a/include/AdePT/navigation/AdePTNavigator.h
+++ b/include/AdePT/navigation/AdePTNavigator.h
@@ -9,13 +9,12 @@
 #ifndef ADEPT_NAVIGATOR_H_
 #define ADEPT_NAVIGATOR_H_
 
-#include <AdePT/navigation/LoopNavigator.h>
 #include <AdePT/navigation/BVHNavigator.h>
 #ifdef ADEPT_USE_SURF
 #include <AdePT/navigation/SurfNavigator.h>
 #endif
 
-inline namespace COPCORE_IMPL {
+// inline namespace COPCORE_IMPL {
 #ifdef ADEPT_USE_SURF
 #ifdef ADEPT_USE_SURF_SINGLE
 using AdePTNavigator = SurfNavigator<float>;
@@ -25,5 +24,5 @@ using AdePTNavigator = SurfNavigator<double>;
 #else
 using AdePTNavigator = BVHNavigator;
 #endif
-} // End namespace COPCORE_IMPL
+// } // End namespace COPCORE_IMPL
 #endif // ADEPT_NAVIGATOR_H_
diff --git a/include/AdePT/navigation/BVHNavigator.h b/include/AdePT/navigation/BVHNavigator.h
index c10bc237..7ec060f6 100644
--- a/include/AdePT/navigation/BVHNavigator.h
+++ b/include/AdePT/navigation/BVHNavigator.h
@@ -12,9 +12,9 @@
 #include <VecGeom/base/Global.h>
 #include <VecGeom/navigation/BVHNavigator.h>
 
-inline namespace COPCORE_IMPL {
+// inline namespace COPCORE_IMPL {
 
 using BVHNavigator = vecgeom::BVHNavigator;
 
-} // End namespace COPCORE_IMPL
+// } // End namespace COPCORE_IMPL
 #endif // RT_LOOP_NAVIGATOR_H_
\ No newline at end of file
diff --git a/src/AdePTTrackingManager.cc b/src/AdePTTrackingManager.cc
index 33a6e94c..4271a5bf 100644
--- a/src/AdePTTrackingManager.cc
+++ b/src/AdePTTrackingManager.cc
@@ -17,6 +17,15 @@
 
 #include <algorithm>
 
+#ifdef ASYNC_MODE
+std::shared_ptr<AdePTTransportInterface> InstantiateAdePT(AdePTConfiguration &conf)
+{
+  static std::shared_ptr<AsyncAdePT::AsyncAdePTTransport<AdePTGeant4Integration>> adePT{
+      new AsyncAdePT::AsyncAdePTTransport<AdePTGeant4Integration>(conf)};
+  return adePT;
+}
+#endif
+
 //....oooOO0OOooo........oooOO0OOooo........oooOO0OOooo........oooOO0OOooo......
 
 AdePTTrackingManager::AdePTTrackingManager()
@@ -49,9 +58,15 @@ void AdePTTrackingManager::InitializeAdePT()
     // Load the VecGeom world in memory
     AdePTGeant4Integration::CreateVecGeomWorld(fAdePTConfiguration->GetVecGeomGDML());
 
-    // Create an instance of an AdePT transport engine. This can either be one engine per thread or a shared engine for
-    // all threads.
-    fAdeptTransport = std::make_shared<AdePTTransport<AdePTGeant4Integration>>(*fAdePTConfiguration);
+// Create an instance of an AdePT transport engine. This can either be one engine per thread or a shared engine for
+// all threads.
+#ifndef ASYNC_MODE
+    fAdeptTransport = std::make_unique<AdePTTransport<AdePTGeant4Integration>>(*fAdePTConfiguration);
+#else
+    // fAdeptTransport =
+    // std::make_shared<AsyncAdePT::AsyncAdePTTransport<AdePTGeant4Integration>>(*fAdePTConfiguration);
+    fAdeptTransport = InstantiateAdePT(*fAdePTConfiguration);
+#endif
 
     // Initialize common data:
     // G4HepEM, Upload VecGeom geometry to GPU, Geometry check, Create volume auxiliary data
@@ -64,7 +79,13 @@ void AdePTTrackingManager::InitializeAdePT()
     // Create an instance of an AdePT transport engine. This can either be one engine per thread or a shared engine for
     // all threads.
     fAdePTConfiguration->SetNumThreads(fNumThreads);
-    fAdeptTransport = std::make_shared<AdePTTransport<AdePTGeant4Integration>>(*fAdePTConfiguration);
+#ifndef ASYNC_MODE
+    fAdeptTransport = std::make_unique<AdePTTransport<AdePTGeant4Integration>>(*fAdePTConfiguration);
+#else
+    // fAdeptTransport =
+    // std::make_shared<AsyncAdePT::AsyncAdePTTransport<AdePTGeant4Integration>>(*fAdePTConfiguration);
+    fAdeptTransport = InstantiateAdePT(*fAdePTConfiguration);
+#endif
     // Initialize per-thread data
     fAdeptTransport->Initialize();
   }
diff --git a/src/AdePTTrackingManager.cu b/src/AdePTTrackingManager.cu
index ba77bdbe..86d9b5c6 100644
--- a/src/AdePTTrackingManager.cu
+++ b/src/AdePTTrackingManager.cu
@@ -1,11 +1,20 @@
 // SPDX-FileCopyrightText: 2023 CERN
 // SPDX-License-Identifier: Apache-2.0
 
+#ifndef ASYNC_MODE
 #include <AdePT/core/AdePTTransport.cuh>
 #include <AdePT/integration/AdePTGeant4Integration.hh>
+#else
+#include <AdePT/core/AsyncAdePTTransport.cuh>
+#endif
+
+#ifndef ASYNC_MODE
 
 // Explicit instantiation of the ShowerGPU<AdePTGeant4Integration> function
 namespace adept_impl {
 template void ShowerGPU<AdePTGeant4Integration>(AdePTGeant4Integration &, int, adeptint::TrackBuffer &, GPUstate &,
                                                 HostScoring *, HostScoring *);
 } // namespace adept_impl
+
+#endif
+