From dc7e8e9625a4b61888d1f5bcd383e9712e999423 Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Tue, 4 Feb 2025 01:44:02 +0100
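Subject: [PATCH] GPU: Remove support for host helper threads (no longer used)

Remove the pthread-based host helper-thread machinery from
GPUReconstructionDeviceBase, the corresponding no-op virtual hooks in
GPUReconstructionCPU, the forwarding wrappers in GPUChain, and the
GPUReconstructionHelpers.h header (also dropped from HDRS_INSTALL).

GPUChainTracking no longer derives from helperDelegateBase, and
HelperReadEvent() / HelperOutput() are gone. The main-thread slice loops
in RunTPCTrackingSlices_internal() now cover every slice instead of only
those not assigned to a helper, and the atomic_flag lock around
WriteOutputPrepare() is dropped.

The nDeviceHelperThreads processing setting is removed, and the
-GPUHelperThreads flag is dropped from the prototype topology.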

---
 Common/Topologies/o2prototype_topology.xml    |   2 +-
 GPU/GPUTracking/Base/GPUReconstruction.cxx    |   3 -
 GPU/GPUTracking/Base/GPUReconstructionCPU.h   |  10 +-
 .../Base/GPUReconstructionDeviceBase.cxx      | 139 ------------------
 .../Base/GPUReconstructionDeviceBase.h        |  17 +--
 .../Base/GPUReconstructionHelpers.h           |  50 -------
 GPU/GPUTracking/CMakeLists.txt                |   1 -
 GPU/GPUTracking/Definitions/GPUSettingsList.h |   1 -
 GPU/GPUTracking/Global/GPUChain.h             |  13 --
 GPU/GPUTracking/Global/GPUChainTracking.h     |   7 +-
 .../Global/GPUChainTrackingSliceTracker.cxx   |  65 +-------
 11 files changed, 8 insertions(+), 300 deletions(-)
 delete mode 100644 GPU/GPUTracking/Base/GPUReconstructionHelpers.h
diff --git a/Common/Topologies/o2prototype_topology.xml b/Common/Topologies/o2prototype_topology.xml
index 240b8d87d469a..8d53c9eb0127a 100644
--- a/Common/Topologies/o2prototype_topology.xml
+++ b/Common/Topologies/o2prototype_topology.xml
@@ -74,7 +74,7 @@ The following parameters need adjustment when extending the FLP-EPN configuratio
     </decltask>
 
     <decltask id="tracker">
-        <exe reachable="true">$ALICEO2_INSTALL_DIR/bin/aliceHLTWrapper Tracker_%collectionIndex%_%taskIndex% 1 --dds --poll-period 100 --input type=pull,size=5000,method=connect,property=EPNReceiverOutputAddress,count=1 --output type=push,size=500,method=bind,property=TrackingOutputAddress,min-port=48000 --library libAliHLTTPC.so --component TPCCATracker --run 167808 --parameter '-GlobalTracking -allowGPU -GPUHelperThreads 4 -loglevel=0x7c'</exe>
+        <exe reachable="true">$ALICEO2_INSTALL_DIR/bin/aliceHLTWrapper Tracker_%collectionIndex%_%taskIndex% 1 --dds --poll-period 100 --input type=pull,size=5000,method=connect,property=EPNReceiverOutputAddress,count=1 --output type=push,size=500,method=bind,property=TrackingOutputAddress,min-port=48000 --library libAliHLTTPC.so --component TPCCATracker --run 167808 --parameter '-GlobalTracking -allowGPU -loglevel=0x7c'</exe>
         <!-- <requirement></requirement> -->
         <properties>
             <id access="read">EPNReceiverOutputAddress</id>
diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx
index 1496300818fd8..270f092a1fd29 100644
--- a/GPU/GPUTracking/Base/GPUReconstruction.cxx
+++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx
@@ -278,9 +278,6 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice()
   if (!(mRecoSteps.stepsGPUMask & GPUDataTypes::RecoStep::TPCMerging)) {
     mProcessingSettings.mergerSortTracks = false;
   }
-  if (!IsGPU()) {
-    mProcessingSettings.nDeviceHelperThreads = 0;
-  }
 
   if (mProcessingSettings.debugLevel > 3 || !IsGPU() || mProcessingSettings.deterministicGPUReconstruction) {
     mProcessingSettings.delayedOutput = false;
diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h
index 8cc753731d074..27959382e7b67 100644
--- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h
+++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h
@@ -16,7 +16,6 @@
 #define GPURECONSTRUCTIONICPU_H
 
 #include "GPUReconstruction.h"
-#include "GPUReconstructionHelpers.h"
 #include "GPUConstantMem.h"
 #include <stdexcept>
 #include "utils/timer.h"
@@ -117,13 +116,6 @@ class GPUReconstructionCPU : public GPUReconstructionKernels<GPUReconstructionCP
   virtual void RecordMarker(deviceEvent* ev, int32_t stream) {}
   virtual void SynchronizeGPU() {}
   virtual void ReleaseEvent(deviceEvent ev) {}
-  virtual int32_t StartHelperThreads() { return 0; }
-  virtual int32_t StopHelperThreads() { return 0; }
-  virtual void RunHelperThreads(int32_t (GPUReconstructionHelpers::helperDelegateBase::*function)(int32_t, int32_t, GPUReconstructionHelpers::helperParam*), GPUReconstructionHelpers::helperDelegateBase* functionCls, int32_t count) {}
-  virtual void WaitForHelperThreads() {}
-  virtual int32_t HelperError(int32_t iThread) const { return 0; }
-  virtual int32_t HelperDone(int32_t iThread) const { return 0; }
-  virtual void ResetHelperThreads(int32_t helpers) {}
 
   size_t TransferMemoryResourceToGPU(GPUMemoryResource* res, int32_t stream = -1, deviceEvent* ev = nullptr, deviceEvent* evList = nullptr, int32_t nEvents = 1) { return TransferMemoryInternal(res, stream, ev, evList, nEvents, true, res->Ptr(), res->PtrDevice()); }
   size_t TransferMemoryResourceToHost(GPUMemoryResource* res, int32_t stream = -1, deviceEvent* ev = nullptr, deviceEvent* evList = nullptr, int32_t nEvents = 1) { return TransferMemoryInternal(res, stream, ev, evList, nEvents, false, res->PtrDevice(), res->Ptr()); }
@@ -294,7 +286,7 @@ HighResTimer& GPUReconstructionCPU::getTimer(const char* name, int32_t num)
   static int32_t id = getNextTimerId();
   timerMeta* timer = getTimerById(id);
   if (timer == nullptr) {
-    int32_t max = std::max<int32_t>({getOMPMaxThreads(), mProcessingSettings.nDeviceHelperThreads + 1, mProcessingSettings.nStreams});
+    int32_t max = std::max<int32_t>({getOMPMaxThreads(), mProcessingSettings.nStreams});
     timer = insertTimer(id, name, J, max, 1, RecoStep::NoRecoStep);
   }
   if (num == -1) {
diff --git a/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.cxx b/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.cxx
index 3522095622ad4..91715fab4f668 100644
--- a/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.cxx
+++ b/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.cxx
@@ -41,57 +41,6 @@ GPUReconstructionDeviceBase::GPUReconstructionDeviceBase(const GPUSettingsDevice
 
 GPUReconstructionDeviceBase::~GPUReconstructionDeviceBase() = default;
 
-void* GPUReconstructionDeviceBase::helperWrapper_static(void* arg)
-{
-  GPUReconstructionHelpers::helperParam* par = (GPUReconstructionHelpers::helperParam*)arg;
-  GPUReconstructionDeviceBase* cls = par->cls;
-  return cls->helperWrapper(par);
-}
-
-void* GPUReconstructionDeviceBase::helperWrapper(GPUReconstructionHelpers::helperParam* par)
-{
-  if (mProcessingSettings.debugLevel >= 3) {
-    GPUInfo("\tHelper thread %d starting", par->num);
-  }
-
-  // cpu_set_t mask; //TODO add option
-  // CPU_ZERO(&mask);
-  // CPU_SET(par->num * 2 + 2, &mask);
-  // sched_setaffinity(0, sizeof(mask), &mask);
-
-  par->mutex[0].lock();
-  while (par->terminate == false) {
-    for (int32_t i = par->num + 1; i < par->count; i += mProcessingSettings.nDeviceHelperThreads + 1) {
-      // if (mProcessingSettings.debugLevel >= 3) GPUInfo("\tHelper Thread %d Running, Slice %d+%d, Phase %d", par->num, i, par->phase);
-      if ((par->functionCls->*par->function)(i, par->num + 1, par)) {
-        par->error = 1;
-      }
-      if (par->reset) {
-        break;
-      }
-      par->done = i + 1;
-      // if (mProcessingSettings.debugLevel >= 3) GPUInfo("\tHelper Thread %d Finished, Slice %d+%d, Phase %d", par->num, i, par->phase);
-    }
-    ResetThisHelperThread(par);
-    par->mutex[0].lock();
-  }
-  if (mProcessingSettings.debugLevel >= 3) {
-    GPUInfo("\tHelper thread %d terminating", par->num);
-  }
-  par->mutex[1].unlock();
-  pthread_exit(nullptr);
-  return (nullptr);
-}
-
-void GPUReconstructionDeviceBase::ResetThisHelperThread(GPUReconstructionHelpers::helperParam* par)
-{
-  if (par->reset) {
-    GPUImportant("GPU Helper Thread %d reseting", par->num);
-  }
-  par->reset = false;
-  par->mutex[1].unlock();
-}
-
 int32_t GPUReconstructionDeviceBase::GetGlobalLock(void*& pLock)
 {
 #ifdef _WIN32
@@ -138,86 +87,6 @@ void GPUReconstructionDeviceBase::ReleaseGlobalLock(void* sem)
 #endif
 }
 
-void GPUReconstructionDeviceBase::ResetHelperThreads(int32_t helpers)
-{
-  GPUImportant("Error occurred, GPU tracker helper threads will be reset (Number of threads %d (%d))", mProcessingSettings.nDeviceHelperThreads, mNSlaveThreads);
-  SynchronizeGPU();
-  for (int32_t i = 0; i < mProcessingSettings.nDeviceHelperThreads; i++) {
-    mHelperParams[i].reset = true;
-    if (helpers || i >= mProcessingSettings.nDeviceHelperThreads) {
-      pthread_mutex_lock(&((pthread_mutex_t*)mHelperParams[i].mutex)[1]);
-    }
-  }
-  GPUImportant("GPU Tracker helper threads have ben reset");
-}
-
-int32_t GPUReconstructionDeviceBase::StartHelperThreads()
-{
-  int32_t nThreads = mProcessingSettings.nDeviceHelperThreads;
-  if (nThreads) {
-    mHelperParams = new GPUReconstructionHelpers::helperParam[nThreads];
-    if (mHelperParams == nullptr) {
-      GPUError("Memory allocation error");
-      ExitDevice();
-      return (1);
-    }
-    for (int32_t i = 0; i < nThreads; i++) {
-      mHelperParams[i].cls = this;
-      mHelperParams[i].terminate = false;
-      mHelperParams[i].reset = false;
-      mHelperParams[i].num = i;
-      for (int32_t j = 0; j < 2; j++) {
-        mHelperParams[i].mutex[j].lock();
-      }
-
-      if (pthread_create(&mHelperParams[i].threadId, nullptr, helperWrapper_static, &mHelperParams[i])) {
-        GPUError("Error starting slave thread");
-        ExitDevice();
-        return (1);
-      }
-    }
-  }
-  mNSlaveThreads = nThreads;
-  return (0);
-}
-
-int32_t GPUReconstructionDeviceBase::StopHelperThreads()
-{
-  if (mNSlaveThreads) {
-    for (int32_t i = 0; i < mNSlaveThreads; i++) {
-      mHelperParams[i].terminate = true;
-      mHelperParams[i].mutex[0].unlock();
-      mHelperParams[i].mutex[1].lock();
-      if (pthread_join(mHelperParams[i].threadId, nullptr)) {
-        GPUError("Error waiting for thread to terminate");
-        return (1);
-      }
-    }
-    delete[] mHelperParams;
-  }
-  mNSlaveThreads = 0;
-  return (0);
-}
-
-void GPUReconstructionDeviceBase::WaitForHelperThreads()
-{
-  for (int32_t i = 0; i < mProcessingSettings.nDeviceHelperThreads; i++) {
-    pthread_mutex_lock(&((pthread_mutex_t*)mHelperParams[i].mutex)[1]);
-  }
-}
-
-void GPUReconstructionDeviceBase::RunHelperThreads(int32_t (GPUReconstructionHelpers::helperDelegateBase::*function)(int32_t i, int32_t t, GPUReconstructionHelpers::helperParam* p), GPUReconstructionHelpers::helperDelegateBase* functionCls, int32_t count)
-{
-  for (int32_t i = 0; i < mProcessingSettings.nDeviceHelperThreads; i++) {
-    mHelperParams[i].done = 0;
-    mHelperParams[i].error = 0;
-    mHelperParams[i].function = function;
-    mHelperParams[i].functionCls = functionCls;
-    mHelperParams[i].count = count;
-    pthread_mutex_unlock(&((pthread_mutex_t*)mHelperParams[i].mutex)[0]);
-  }
-}
-
 int32_t GPUReconstructionDeviceBase::InitDevice()
 {
   // cpu_set_t mask;
@@ -262,10 +131,6 @@ int32_t GPUReconstructionDeviceBase::InitDevice()
   mProcShadow.mMemoryResProcessors = RegisterMemoryAllocation(&mProcShadow, &GPUProcessorProcessors::SetPointersDeviceProcessor, GPUMemoryResource::MEMORY_PERMANENT | GPUMemoryResource::MEMORY_HOST, "Processors");
   AllocateRegisteredMemory(mProcShadow.mMemoryResProcessors);
 
-  if (StartHelperThreads()) {
-    return (1);
-  }
-
   if (mMaster == nullptr || mProcessingSettings.debugLevel >= 2) {
     GPUInfo("GPU Tracker initialization successfull"); // Verbosity reduced because GPU backend will print GPUImportant message!
   }
@@ -282,10 +147,6 @@ void* GPUReconstructionDeviceBase::GPUProcessorProcessors::SetPointersDeviceProc
 
 int32_t GPUReconstructionDeviceBase::ExitDevice()
 {
-  if (StopHelperThreads()) {
-    return (1);
-  }
-
   int32_t retVal = ExitDevice_Runtime();
   mProcessorsShadow = nullptr;
   mHostMemoryPool = mHostMemoryBase = mDeviceMemoryPool = mDeviceMemoryBase = mHostMemoryPoolEnd = mDeviceMemoryPoolEnd = mHostMemoryPermanent = mDeviceMemoryPermanent = nullptr;
diff --git a/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.h b/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.h
index 215615f558442..1381fd0f76981 100644
--- a/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.h
+++ b/GPU/GPUTracking/Base/GPUReconstructionDeviceBase.h
@@ -17,7 +17,6 @@
 
 #include "GPUReconstructionCPU.h"
 #include <pthread.h>
-#include "GPUReconstructionHelpers.h"
 #include "GPUChain.h"
 #include <vector>
 
@@ -61,24 +60,10 @@ class GPUReconstructionDeviceBase : public GPUReconstructionCPU
   size_t GPUMemCpyAlways(bool onGpu, void* dst, const void* src, size_t size, int32_t stream, int32_t toGPU, deviceEvent* ev = nullptr, deviceEvent* evList = nullptr, int32_t nEvents = 1) override;
   size_t WriteToConstantMemory(size_t offset, const void* src, size_t size, int32_t stream = -1, deviceEvent* ev = nullptr) override = 0;
 
-  int32_t StartHelperThreads() override;
-  int32_t StopHelperThreads() override;
-  void RunHelperThreads(int32_t (GPUReconstructionHelpers::helperDelegateBase::*function)(int32_t, int32_t, GPUReconstructionHelpers::helperParam*), GPUReconstructionHelpers::helperDelegateBase* functionCls, int32_t count) override;
-  int32_t HelperError(int32_t iThread) const override { return mHelperParams[iThread].error; }
-  int32_t HelperDone(int32_t iThread) const override { return mHelperParams[iThread].done; }
-  void WaitForHelperThreads() override;
-  void ResetHelperThreads(int32_t helpers) override;
-  void ResetThisHelperThread(GPUReconstructionHelpers::helperParam* par);
-
   int32_t GetGlobalLock(void*& pLock);
   void ReleaseGlobalLock(void* sem);
 
-  static void* helperWrapper_static(void* arg);
-  void* helperWrapper(GPUReconstructionHelpers::helperParam* par);
-
-  int32_t mDeviceId = -1;                                         // Device ID used by backend
-  GPUReconstructionHelpers::helperParam* mHelperParams = nullptr; // Control Struct for helper threads
-  int32_t mNSlaveThreads = 0;                                     // Number of slave threads currently active
+  int32_t mDeviceId = -1; // Device ID used by backend
 
   struct DebugEvents {
     deviceEvent DebugStart, DebugStop; // Debug timer events
diff --git a/GPU/GPUTracking/Base/GPUReconstructionHelpers.h b/GPU/GPUTracking/Base/GPUReconstructionHelpers.h
deleted file mode 100644
index c55e81905f32f..0000000000000
--- a/GPU/GPUTracking/Base/GPUReconstructionHelpers.h
+++ /dev/null
@@ -1,50 +0,0 @@
-// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
-// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
-// All rights not expressly granted are reserved.
-//
-// This software is distributed under the terms of the GNU General Public
-// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
-//
-// In applying this license CERN does not waive the privileges and immunities
-// granted to it by virtue of its status as an Intergovernmental Organization
-// or submit itself to any jurisdiction.
-
-/// \file GPUReconstructionHelpers.h
-/// \author David Rohr
-
-#ifndef GPURECONSTRUCTIONHELPERS_H
-#define GPURECONSTRUCTIONHELPERS_H
-
-#include <mutex>
-
-namespace o2
-{
-namespace gpu
-{
-class GPUReconstructionDeviceBase;
-class GPUReconstructionHelpers
-{
- public:
-  class helperDelegateBase
-  {
-  };
-
-  struct helperParam {
-    pthread_t threadId;
-    GPUReconstructionDeviceBase* cls;
-    int32_t num;
-    std::mutex mutex[2];
-    int8_t terminate;
-    helperDelegateBase* functionCls;
-    int32_t (helperDelegateBase::*function)(int32_t, int32_t, helperParam*);
-    int32_t phase;
-    int32_t count;
-    volatile int32_t done;
-    volatile int8_t error;
-    volatile int8_t reset;
-  };
-};
-} // namespace gpu
-} // namespace o2
-
-#endif
diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt
index 5dd92d41db29b..6acc7fd1dd537 100644
--- a/GPU/GPUTracking/CMakeLists.txt
+++ b/GPU/GPUTracking/CMakeLists.txt
@@ -104,7 +104,6 @@ set(HDRS_INSTALL
     Base/GPUConstantMem.h
     Base/GPUParam.inc
     Base/GPUParamRTC.h
-    Base/GPUReconstructionHelpers.h
     Base/GPUReconstructionIncludes.h
     Base/GPUReconstructionIncludesITS.h
     Base/GPUReconstructionKernelMacros.h
diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h
index c10793975453d..ca6f2f370300e 100644
--- a/GPU/GPUTracking/Definitions/GPUSettingsList.h
+++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -252,7 +252,6 @@ AddOption(registerStandaloneInputMemory, bool, false, "registerInputMemory", 0,
 AddOption(ompThreads, int32_t, -1, "omp", 't', "Number of OMP threads to run (-1: all)", min(-1), message("Using %s OMP threads"))
 AddOption(ompKernels, uint8_t, 2, "", 0, "Parallelize with OMP inside kernels instead of over slices, 2 for nested parallelization over TPC sectors and inside kernels")
 AddOption(ompAutoNThreads, bool, true, "", 0, "Auto-adjust number of OMP threads, decreasing the number for small input data")
-AddOption(nDeviceHelperThreads, int32_t, 1, "", 0, "Number of CPU helper threads for CPU processing")
 AddOption(nStreams, int8_t, 8, "", 0, "Number of GPU streams / command queues")
 AddOption(nTPCClustererLanes, int8_t, -1, "", 0, "Number of TPC clusterers that can run in parallel (-1 = autoset)")
 AddOption(overrideClusterizerFragmentLen, int32_t, -1, "", 0, "Force the cluster max fragment len to a certain value (-1 = autodetect)")
diff --git a/GPU/GPUTracking/Global/GPUChain.h b/GPU/GPUTracking/Global/GPUChain.h
index 06650f9d9c733..0981fea43810a 100644
--- a/GPU/GPUTracking/Global/GPUChain.h
+++ b/GPU/GPUTracking/Global/GPUChain.h
@@ -16,7 +16,6 @@
 #define GPUCHAIN_H
 
 #include "GPUReconstructionCPU.h"
-#include "GPUReconstructionHelpers.h"
 
 namespace o2
 {
@@ -111,12 +110,6 @@ class GPUChain
     }
   }
   inline void StreamWaitForEvents(int32_t stream, deviceEvent* evList, int32_t nEvents = 1) { mRec->StreamWaitForEvents(stream, evList, nEvents); }
-  template <class T>
-  void RunHelperThreads(T function, GPUReconstructionHelpers::helperDelegateBase* functionCls, int32_t count);
-  inline void WaitForHelperThreads() { mRec->WaitForHelperThreads(); }
-  inline int32_t HelperError(int32_t iThread) const { return mRec->HelperError(iThread); }
-  inline int32_t HelperDone(int32_t iThread) const { return mRec->HelperDone(iThread); }
-  inline void ResetHelperThreads(int32_t helpers) { mRec->ResetHelperThreads(helpers); }
   inline int32_t GPUDebug(const char* state = "UNKNOWN", int32_t stream = -1) { return mRec->GPUDebug(state, stream); }
   // nEvents is forced to 0 if evList ==  nullptr
   inline void TransferMemoryResourceToGPU(RecoStep step, GPUMemoryResource* res, int32_t stream = -1, deviceEvent* ev = nullptr, deviceEvent* evList = nullptr, int32_t nEvents = 1) { timeCpy(step, true, &GPUReconstructionCPU::TransferMemoryResourceToGPU, res, stream, ev, evList, nEvents); }
@@ -242,12 +235,6 @@ class GPUChain
   void timeCpy(RecoStep step, int32_t toGPU, S T::*func, Args... args);
 };
 
-template <class T>
-inline void GPUChain::RunHelperThreads(T function, GPUReconstructionHelpers::helperDelegateBase* functionCls, int32_t count)
-{
-  mRec->RunHelperThreads((int32_t(GPUReconstructionHelpers::helperDelegateBase::*)(int32_t, int32_t, GPUReconstructionHelpers::helperParam*))function, functionCls, count);
-}
-
 template <bool Always, class T, class S, typename... Args>
 inline void GPUChain::timeCpy(RecoStep step, int32_t toGPU, S T::*func, Args... args)
 {
diff --git a/GPU/GPUTracking/Global/GPUChainTracking.h b/GPU/GPUTracking/Global/GPUChainTracking.h
index 6d6d82b518097..d827b095773b1 100644
--- a/GPU/GPUTracking/Global/GPUChainTracking.h
+++ b/GPU/GPUTracking/Global/GPUChainTracking.h
@@ -16,7 +16,6 @@
 #define GPUCHAINTRACKING_H
 
 #include "GPUChain.h"
-#include "GPUReconstructionHelpers.h"
 #include "GPUDataTypes.h"
 #include <atomic>
 #include <mutex>
@@ -68,7 +67,7 @@ struct GPUTPCCFChainContext;
 struct GPUNewCalibValues;
 struct GPUTriggerOutputs;
 
-class GPUChainTracking : public GPUChain, GPUReconstructionHelpers::helperDelegateBase
+class GPUChainTracking : public GPUChain
 {
   friend class GPUReconstruction;
 
@@ -314,15 +313,11 @@ class GPUChainTracking : public GPUChain, GPUReconstructionHelpers::helperDelega
   void RunTPCClusterFilter(o2::tpc::ClusterNativeAccess* clusters, std::function<o2::tpc::ClusterNative*(size_t)> allocator, bool applyClusterCuts);
   bool NeedTPCClustersOnGPU();
 
-  std::atomic_flag mLockAtomicOutputBuffer = ATOMIC_FLAG_INIT;
   std::mutex mMutexUpdateCalib;
   std::unique_ptr<GPUChainTrackingFinalContext> mPipelineFinalizationCtx;
   GPUChainTrackingFinalContext* mPipelineNotifyCtx = nullptr;
   std::function<void()> mWaitForFinalInputs;
 
-  int32_t HelperReadEvent(int32_t iSlice, int32_t threadId, GPUReconstructionHelpers::helperParam* par);
-  int32_t HelperOutput(int32_t iSlice, int32_t threadId, GPUReconstructionHelpers::helperParam* par);
-
   int32_t OutputStream() const { return mRec->NStreams() - 2; }
 };
 } // namespace gpu
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx b/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx
index 35a8c6c455048..174b3757d3307 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx
@@ -55,9 +55,6 @@ int32_t GPUChainTracking::RunTPCTrackingSlices()
   if (retVal) {
     SynchronizeGPU();
   }
-  if (retVal >= 2) {
-    ResetHelperThreads(retVal >= 3);
-  }
   return (retVal != 0);
 }
 
@@ -114,9 +111,6 @@ int32_t GPUChainTracking::RunTPCTrackingSlices_internal()
       processorsShadow()->tpcTrackers[iSlice].SetGPUTextureBase(mRec->DeviceMemoryBase());
     }
 
-    if (!doSliceDataOnGPU) {
-      RunHelperThreads(&GPUChainTracking::HelperReadEvent, this, NSLICES);
-    }
     if (PrepareTextures()) {
       return (2);
     }
@@ -183,22 +177,12 @@ int32_t GPUChainTracking::RunTPCTrackingSlices_internal()
       TransferMemoryResourcesToGPU(RecoStep::TPCSliceTracking, &trk, useStream);
       runKernel<GPUTPCCreateSliceData>({GetGridBlk(GPUCA_ROW_COUNT, useStream), {iSlice}, {nullptr, streamInit[useStream] ? nullptr : &mEvents->init}});
       streamInit[useStream] = true;
-    } else if (!doGPU || iSlice % (GetProcessingSettings().nDeviceHelperThreads + 1) == 0) {
+    } else {
       if (ReadEvent(iSlice, 0)) {
         GPUError("Error reading event");
         error = 1;
         continue;
       }
-    } else {
-      if (GetProcessingSettings().debugLevel >= 3) {
-        GPUInfo("Waiting for helper thread %d", iSlice % (GetProcessingSettings().nDeviceHelperThreads + 1) - 1);
-      }
-      while (HelperDone(iSlice % (GetProcessingSettings().nDeviceHelperThreads + 1) - 1) < (int32_t)iSlice) {
-      }
-      if (HelperError(iSlice % (GetProcessingSettings().nDeviceHelperThreads + 1) - 1)) {
-        error = 1;
-        continue;
-      }
     }
     if (GetProcessingSettings().deterministicGPUReconstruction) {
       runKernel<GPUTPCSectorDebugSortKernels, GPUTPCSectorDebugSortKernels::hitData>({GetGridBlk(GPUCA_ROW_COUNT, useStream), {iSlice}});
@@ -297,9 +281,6 @@ int32_t GPUChainTracking::RunTPCTrackingSlices_internal()
     if (doGPU) {
       ReleaseEvent(mEvents->init);
     }
-    if (!doSliceDataOnGPU) {
-      WaitForHelperThreads();
-    }
 
     if (!GetProcessingSettings().trackletSelectorInPipeline) {
       if (GetProcessingSettings().trackletConstructorInPipeline) {
@@ -359,7 +340,6 @@ int32_t GPUChainTracking::RunTPCTrackingSlices_internal()
       if (param().rec.tpc.globalTracking) {
         mWriteOutputDone.fill(0);
       }
-      RunHelperThreads(&GPUChainTracking::HelperOutput, this, NSLICES);
 
       uint32_t tmpSlice = 0;
       for (uint32_t iSlice = 0; iSlice < NSLICES; iSlice++) {
@@ -402,12 +382,12 @@ int32_t GPUChainTracking::RunTPCTrackingSlices_internal()
         }
 
         if (GetProcessingSettings().debugLevel >= 3) {
-          GPUInfo("Data ready for slice %d, helper thread %d", iSlice, iSlice % (GetProcessingSettings().nDeviceHelperThreads + 1));
+          GPUInfo("Data ready for slice %d", iSlice);
         }
         mSliceSelectorReady = iSlice;
 
         if (param().rec.tpc.globalTracking) {
-          for (uint32_t tmpSlice2a = 0; tmpSlice2a <= iSlice; tmpSlice2a += GetProcessingSettings().nDeviceHelperThreads + 1) {
+          for (uint32_t tmpSlice2a = 0; tmpSlice2a <= iSlice; tmpSlice2a++) {
             uint32_t tmpSlice2 = GPUTPCGlobalTracking::GlobalTrackingSliceOrder(tmpSlice2a);
             uint32_t sliceLeft, sliceRight;
             GPUTPCGlobalTracking::GlobalTrackingSliceLeftRight(tmpSlice2, sliceLeft, sliceRight);
@@ -419,12 +399,9 @@ int32_t GPUChainTracking::RunTPCTrackingSlices_internal()
             }
           }
         } else {
-          if (iSlice % (GetProcessingSettings().nDeviceHelperThreads + 1) == 0) {
-            WriteOutput(iSlice, 0);
-          }
+          WriteOutput(iSlice, 0);
         }
       }
-      WaitForHelperThreads();
     }
     if (!(GetRecoStepsOutputs() & GPUDataTypes::InOutType::TPCSectorTracks) && param().rec.tpc.globalTracking) {
       std::vector<bool> blocking(NSLICES * mRec->NStreams());
@@ -518,43 +495,9 @@ void GPUChainTracking::WriteOutput(int32_t iSlice, int32_t threadId)
   if (GetProcessingSettings().debugLevel >= 5) {
     GPUInfo("Running WriteOutput for slice %d on thread %d\n", iSlice, threadId);
   }
-  if (GetProcessingSettings().nDeviceHelperThreads) {
-    while (mLockAtomicOutputBuffer.test_and_set(std::memory_order_acquire)) {
-    }
-  }
   processors()->tpcTrackers[iSlice].WriteOutputPrepare();
-  if (GetProcessingSettings().nDeviceHelperThreads) {
-    mLockAtomicOutputBuffer.clear();
-  }
   processors()->tpcTrackers[iSlice].WriteOutput();
   if (GetProcessingSettings().debugLevel >= 5) {
     GPUInfo("Finished WriteOutput for slice %d on thread %d\n", iSlice, threadId);
   }
 }
-
-int32_t GPUChainTracking::HelperReadEvent(int32_t iSlice, int32_t threadId, GPUReconstructionHelpers::helperParam* par) { return ReadEvent(iSlice, threadId); }
-
-int32_t GPUChainTracking::HelperOutput(int32_t iSlice, int32_t threadId, GPUReconstructionHelpers::helperParam* par)
-{
-  if (param().rec.tpc.globalTracking) {
-    uint32_t tmpSlice = GPUTPCGlobalTracking::GlobalTrackingSliceOrder(iSlice);
-    uint32_t sliceLeft, sliceRight;
-    GPUTPCGlobalTracking::GlobalTrackingSliceLeftRight(tmpSlice, sliceLeft, sliceRight);
-
-    while (mSliceSelectorReady < (int32_t)tmpSlice || mSliceSelectorReady < (int32_t)sliceLeft || mSliceSelectorReady < (int32_t)sliceRight) {
-      if (par->reset) {
-        return 1;
-      }
-    }
-    GlobalTracking(tmpSlice, 0);
-    WriteOutput(tmpSlice, 0);
-  } else {
-    while (mSliceSelectorReady < iSlice) {
-      if (par->reset) {
-        return 1;
-      }
-    }
-    WriteOutput(iSlice, threadId);
-  }
-  return 0;
-}