From c49f0748845ea24f436542373eac00c8bd3cff8e Mon Sep 17 00:00:00 2001
From: cima22
Date: Thu, 25 Jan 2024 10:07:21 +0100
Subject: [PATCH] Updated code for new kernel registration and fixed clang
 formats

---
 .../include/DataFormatsTPC/ClusterNative.h         |  13 ++-
 GPU/CMakeLists.txt                                 |   2 +
 .../Base/GPUReconstructionIncludesDevice.h         |   2 +-
 GPU/GPUTracking/CMakeLists.txt                     |   2 +-
 .../DataCompression/GPUTPCDecompression.cxx        |  46 +++-----
 .../DataCompression/GPUTPCDecompression.h          |  10 --
 .../DataCompression/GPUTPCDecompressionKernels.cxx |  69 +++++++-----
 .../DataCompression/GPUTPCDecompressionKernels.h   |   7 +-
 GPU/GPUTracking/DataTypes/GPUO2FakeClasses.h       |   3 +
 .../Global/GPUChainTrackingCompression.cxx         |  83 ++++++++++----
 .../Standalone/Benchmark/standalone.cxx            |  10 +-
 GPU/GPUTracking/kernels.cmake                      |   2 +
 cmake-build-debug/.cmake/api/v1/query/cache-v2     |   0
 .../.cmake/api/v1/query/cmakeFiles-v1              |   0
 .../.cmake/api/v1/query/codemodel-v2               |   0
 .../.cmake/api/v1/query/toolchains-v1              |   0
 cmake-build-debug/DartConfiguration.tcl            | 106 ------------------
 17 files changed, 149 insertions(+), 206 deletions(-)
 delete mode 100644 cmake-build-debug/.cmake/api/v1/query/cache-v2
 delete mode 100644 cmake-build-debug/.cmake/api/v1/query/cmakeFiles-v1
 delete mode 100644 cmake-build-debug/.cmake/api/v1/query/codemodel-v2
 delete mode 100644 cmake-build-debug/.cmake/api/v1/query/toolchains-v1
 delete mode 100644 cmake-build-debug/DartConfiguration.tcl

diff --git a/DataFormats/Detectors/TPC/include/DataFormatsTPC/ClusterNative.h b/DataFormats/Detectors/TPC/include/DataFormatsTPC/ClusterNative.h
index 49d61007092a8..a996f59f51f9e 100644
--- a/DataFormats/Detectors/TPC/include/DataFormatsTPC/ClusterNative.h
+++ b/DataFormats/Detectors/TPC/include/DataFormatsTPC/ClusterNative.h
@@ -31,7 +31,7 @@ template <typename TruthElement>
 class ConstMCTruthContainer;
 template <typename TruthElement>
 class ConstMCTruthContainerView;
-}
+} // namespace dataformats
 } // namespace o2

 namespace o2
@@ -156,6 +156,17 @@ struct ClusterNative {
       return (this->getFlags() < rhs.getFlags());
     }
   }
+
+  GPUd() bool operator==(const ClusterNative& rhs) const
+  {
+    return this->getTimePacked() == rhs.getTimePacked() &&
+           this->padPacked == rhs.padPacked &&
+           this->sigmaTimePacked == rhs.sigmaTimePacked &&
+           this->sigmaPadPacked == rhs.sigmaPadPacked &&
+           this->qMax == rhs.qMax &&
+           this->qTot == rhs.qTot &&
+           this->getFlags() == rhs.getFlags();
+  }
 };

 // This is an index struct to access TPC clusters inside sectors and rows. It shall not own the data, but just point to
diff --git a/GPU/CMakeLists.txt b/GPU/CMakeLists.txt
index 6e08e9582761a..0af7d998d2f0c 100644
--- a/GPU/CMakeLists.txt
+++ b/GPU/CMakeLists.txt
@@ -18,6 +18,8 @@
 # HDRS_CINT_O2: Headers for ROOT dictionary (only for O2) HDRS_INSTALL: Headers
 # for installation only

+#set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}} -fno-omit-frame-pointer") # to uncomment if needed, tired of typing this...
+
 if(NOT DEFINED GPUCA_NO_FAST_MATH)
   set(GPUCA_NO_FAST_MATH 0)
 endif()
diff --git a/GPU/GPUTracking/Base/GPUReconstructionIncludesDevice.h b/GPU/GPUTracking/Base/GPUReconstructionIncludesDevice.h
index 81fde1dbfb996..58e32253336de 100644
--- a/GPU/GPUTracking/Base/GPUReconstructionIncludesDevice.h
+++ b/GPU/GPUTracking/Base/GPUReconstructionIncludesDevice.h
@@ -75,7 +75,7 @@ using namespace GPUCA_NAMESPACE::gpu;
 #include "GPUTPCCompressionKernels.cxx"
 #include "GPUTPCCompressionTrackModel.cxx"

-//Files for TPC Decompression
+// Files for TPC Decompression
 #include "GPUTPCDecompressionKernels.cxx"

 // Files for TPC Cluster Finder
diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt
index 18a187bf5f64d..200ac832a433e 100644
--- a/GPU/GPUTracking/CMakeLists.txt
+++ b/GPU/GPUTracking/CMakeLists.txt
@@ -11,7 +11,7 @@

 set(MODULE GPUTracking)

-set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}} -fno-omit-frame-pointer") # to uncomment if needed, tired of typing this...
+# set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}} -fno-omit-frame-pointer") # to uncomment if needed, tired of typing this...

 include(cmake/helpers.cmake)
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
index 922bfc120d136..83420fa37feb1 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
@@ -65,45 +65,35 @@ void GPUTPCDecompression::SetPointersCompressedClusters(void*& mem, T& c, unsign
   computePointerWithAlignment(mem, c.nTrackClusters, nTr);
 }

-void* GPUTPCDecompression::SetPointersTmpNativeBuffersGPU(void* mem){
-  computePointerWithAlignment(mem,mTmpNativeClusters,NSLICES * GPUCA_ROW_COUNT * mMaxNativeClustersPerBuffer);
-  //computePointerWithAlignment(mem,mClusterNativeAccess);
+void* GPUTPCDecompression::SetPointersTmpNativeBuffersGPU(void* mem)
+{
+  computePointerWithAlignment(mem, mTmpNativeClusters, NSLICES * GPUCA_ROW_COUNT * mMaxNativeClustersPerBuffer);
   return mem;
 }

-void* GPUTPCDecompression::SetPointersTmpNativeBuffersOutput(void* mem){
-  computePointerWithAlignment(mem,mNativeClustersIndex,NSLICES * GPUCA_ROW_COUNT);
+void* GPUTPCDecompression::SetPointersTmpNativeBuffersOutput(void* mem)
+{
+  computePointerWithAlignment(mem, mNativeClustersIndex, NSLICES * GPUCA_ROW_COUNT);
   return mem;
 }

-void* GPUTPCDecompression::SetPointersTmpNativeBuffersInput(void* mem){
-  computePointerWithAlignment(mem,mUnattachedClustersOffsets,NSLICES * GPUCA_ROW_COUNT);
+void* GPUTPCDecompression::SetPointersTmpNativeBuffersInput(void* mem)
+{
+  computePointerWithAlignment(mem, mUnattachedClustersOffsets, NSLICES * GPUCA_ROW_COUNT);
   return mem;
 }

-void GPUTPCDecompression::RegisterMemoryAllocation() {
+void GPUTPCDecompression::RegisterMemoryAllocation()
+{
   AllocateAndInitializeLate();
   mMemoryResInputGPU = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputGPU, GPUMemoryResource::MEMORY_INPUT_FLAG | GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_CUSTOM, "TPCDecompressionInput");
-  mRec->RegisterMemoryAllocation(this,&GPUTPCDecompression::SetPointersTmpNativeBuffersGPU,GPUMemoryResource::MEMORY_GPU,"TPCDecompressionTmpBuffersGPU");
-  mResourceTmpIndexes = mRec->RegisterMemoryAllocation(this,&GPUTPCDecompression::SetPointersTmpNativeBuffersOutput,GPUMemoryResource::MEMORY_OUTPUT,"TPCDecompressionTmpBuffersOutput");
-  mResourceTmpClustersOffsets = mRec->RegisterMemoryAllocation(this,&GPUTPCDecompression::SetPointersTmpNativeBuffersInput,GPUMemoryResource::MEMORY_INPUT,"TPCDecompressionTmpBuffersInput");
+  mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersGPU, GPUMemoryResource::MEMORY_GPU, "TPCDecompressionTmpBuffersGPU");
+  mResourceTmpIndexes = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersOutput, GPUMemoryResource::MEMORY_OUTPUT, "TPCDecompressionTmpBuffersOutput");
+  mResourceTmpClustersOffsets = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersInput, GPUMemoryResource::MEMORY_INPUT, "TPCDecompressionTmpBuffersInput");
 }

-void GPUTPCDecompression::SetMaxData(const GPUTrackingInOutPointers& io){
-  //mMaxNativeClustersPerBuffer = 81760;
-  mMaxNativeClustersPerBuffer = 12000;
-}
-/*
-GPUTPCDecompression::ConcurrentClusterNativeBuffer::ConcurrentClusterNativeBuffer():
-mCmprClsBuffer{new o2::tpc::ClusterNative[mCapacity]},mIndex{0}
-{}
-
-void GPUTPCDecompression::ConcurrentClusterNativeBuffer::push_back(tpc::ClusterNative cluster)
+void GPUTPCDecompression::SetMaxData(const GPUTrackingInOutPointers& io)
 {
-  if(mIndex == mCapacity){
-    //reallocate?
-    return;
-  }
-  unsigned int current = CAMath::AtomicAdd(mIndex, 1u);
-  mTmpNativeClusters[current] = cluster;
-}*/
\ No newline at end of file
+  // mMaxNativeClustersPerBuffer = 81760;
+  mMaxNativeClustersPerBuffer = 12000;
+}
\ No newline at end of file
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
index e4b2cca65c007..ac63f698b8226 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
@@ -66,15 +66,6 @@ class GPUTPCDecompression : public GPUProcessor
   o2::tpc::ClusterNative* mTmpNativeClusters;
   o2::tpc::ClusterNativeAccess* mClusterNativeAccess;
   o2::tpc::ClusterNative* mNativeClustersBuffer;
-/*  class ConcurrentClusterNativeBuffer{
-    size_t mIndex;
-    size_t mCapacity = 10;
-    o2::tpc::ClusterNative* mCmprClsBuffer;
-   public:
-    ConcurrentClusterNativeBuffer();
-    void push_back(ClusterNative cluster);
-  };*/
-//  ConcurrentClusterNativeBuffer* tmpBuffer;

   template <class T>
   void SetPointersCompressedClusters(void*& mem, T& c, unsigned int nClA, unsigned int nTr, unsigned int nClU, bool reducedClA);
@@ -82,7 +73,6 @@ class GPUTPCDecompression : public GPUProcessor
   short mMemoryResInputGPU = -1;
   short mResourceTmpIndexes = -1;
   short mResourceTmpClustersOffsets = -1;
-
 };
 } // namespace GPUCA_NAMESPACE::gpu
 #endif // GPUTPCDECOMPRESSION_H
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
index 033a38391dec7..ac4dc2765fd11 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
@@ -17,12 +17,14 @@
 #include "GPUConstantMem.h"
 #include "GPUTPCCompressionTrackModel.h"
 #include "GPUCommonAlgorithm.h"
+#include <cstring>

 using namespace GPUCA_NAMESPACE::gpu;
 using namespace o2::tpc;

 template <>
-GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::step0attached>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors){
+GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::step0attached>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors)
+{
   GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor;
   CompressedClusters& GPUrestrict() cmprClusters = decompressor.mInputGPU;
   const GPUParam& GPUrestrict() param = processors.param;
@@ -39,46 +41,47 @@ GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::step0attached>
       unsigned char tmpSlice = cmprClusters.sliceLegDiffA[clusterOffset - trackIndex - 1];
       bool changeLeg = (tmpSlice >= GPUCA_NSLICES);
-      if(changeLeg){
+      if (changeLeg) {
         tmpSlice -= GPUCA_NSLICES;
       }
-      if(cmprClusters.nComppressionModes & GPUSettings::CompressionDifferences){
+      if (cmprClusters.nComppressionModes & GPUSettings::CompressionDifferences) {
         slice += tmpSlice;
-        if(slice >= GPUCA_NSLICES){
+        if (slice >= GPUCA_NSLICES) {
           slice -= GPUCA_NSLICES;
         }
-        row += cmprClusters.rowDiffA[clusterOffset -trackIndex -1];
-        if(row >= GPUCA_ROW_COUNT){
+        row += cmprClusters.rowDiffA[clusterOffset - trackIndex - 1];
+        if (row >= GPUCA_ROW_COUNT) {
           row -= GPUCA_ROW_COUNT;
         }
       } else {
         slice = tmpSlice;
-        row = cmprClusters.rowDiffA[clusterOffset -trackIndex -1];
+        row = cmprClusters.rowDiffA[clusterOffset - trackIndex - 1];
       }
       if (changeLeg && track.Mirror()) {
         break;
       }
-      if (track.Propagate(param.tpcGeometry.Row2X(row),param.SliceParam[slice].Alpha)){
+      if (track.Propagate(param.tpcGeometry.Row2X(row), param.SliceParam[slice].Alpha)) {
         break;
       }
-      unsigned int timeTmp = cmprClusters.timeResA[clusterOffset -trackIndex -1];
+      unsigned int timeTmp = cmprClusters.timeResA[clusterOffset - trackIndex - 1];
       if (timeTmp & 0x800000) {
         timeTmp |= 0xFF000000;
       }
-      time = timeTmp + ClusterNative::packTime(CAMath::Max(0.f,param.tpcGeometry.LinearZ2Time(slice,track.Z() + zOffset)));
+      time = timeTmp + ClusterNative::packTime(CAMath::Max(0.f, param.tpcGeometry.LinearZ2Time(slice, track.Z() + zOffset)));
       float tmpPad = CAMath::Max(0.f, CAMath::Min((float)param.tpcGeometry.NPads(GPUCA_ROW_COUNT - 1), param.tpcGeometry.LinearY2Pad(slice, row, track.Y())));
-      pad = cmprClusters.padResA[clusterOffset -trackIndex - 1] + ClusterNative::packPad(tmpPad);
+      pad = cmprClusters.padResA[clusterOffset - trackIndex - 1] + ClusterNative::packPad(tmpPad);
       time = time & 0xFFFFFF;
       pad = (unsigned short)pad;
       if (pad >= param.tpcGeometry.NPads(row) * ClusterNative::scalePadPacked) {
@@ -103,11 +106,11 @@ GPUdii() void GPUTPCDecompressionKernels::decompressTrack(CompressedClusters& cm
     const auto cluster = decompressTrackStore(cmprClusters, clusterOffset, slice, row, pad, time, decompressor, stored);
     float y = param.tpcGeometry.LinearPad2Y(slice, row, cluster.getPad());
     float z = param.tpcGeometry.LinearTime2Z(slice, cluster.getTime());
-    if(clusterIndex == 0){
+    if (clusterIndex == 0) {
       zOffset = z;
-      track.Init(param.tpcGeometry.Row2X(row), y, z - zOffset, param.SliceParam[slice].Alpha, cmprClusters.qPtA[trackIndex],param);
+      track.Init(param.tpcGeometry.Row2X(row), y, z - zOffset, param.SliceParam[slice].Alpha, cmprClusters.qPtA[trackIndex], param);
     }
-    if(clusterIndex + 1 < cmprClusters.nTrackClusters[trackIndex] && track.Filter(y,z-zOffset,row)){
+    if (clusterIndex + 1 < cmprClusters.nTrackClusters[trackIndex] && track.Filter(y, z - zOffset, row)) {
       break;
     }
     clusterOffset++;
@@ -115,28 +118,32 @@ GPUdii() void GPUTPCDecompressionKernels::decompressTrack(CompressedClusters& cm
   clusterOffset += cmprClusters.nTrackClusters[trackIndex] - clusterIndex;
 }

-GPUdii() ClusterNative GPUTPCDecompressionKernels::decompressTrackStore(const o2::tpc::CompressedClusters& cmprClusters, const unsigned int clusterOffset, unsigned int slice, unsigned int row, unsigned int pad, unsigned int time, GPUTPCDecompression& decompressor, bool& stored){
-  unsigned int tmpBufferIndex = computeLinearTmpBufferIndex(slice,row,decompressor.mMaxNativeClustersPerBuffer);
-  unsigned int currentClusterIndex = CAMath::AtomicAdd(decompressor.mNativeClustersIndex + (slice * GPUCA_ROW_COUNT + row),1u);
+GPUdii() ClusterNative GPUTPCDecompressionKernels::decompressTrackStore(const o2::tpc::CompressedClusters& cmprClusters, const unsigned int clusterOffset, unsigned int slice, unsigned int row, unsigned int pad, unsigned int time, GPUTPCDecompression& decompressor, bool& stored)
+{
+  unsigned int tmpBufferIndex = computeLinearTmpBufferIndex(slice, row, decompressor.mMaxNativeClustersPerBuffer);
+  unsigned int currentClusterIndex = CAMath::AtomicAdd(decompressor.mNativeClustersIndex + (slice * GPUCA_ROW_COUNT + row), 1u);
   const ClusterNative c(time, cmprClusters.flagsA[clusterOffset], pad, cmprClusters.sigmaTimeA[clusterOffset], cmprClusters.sigmaPadA[clusterOffset], cmprClusters.qMaxA[clusterOffset], cmprClusters.qTotA[clusterOffset]);
   stored = currentClusterIndex < decompressor.mMaxNativeClustersPerBuffer;
-  if(stored){
+  if (stored) {
     decompressor.mTmpNativeClusters[tmpBufferIndex + currentClusterIndex] = c;
   }
   return c;
 }

 template <>
-GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::step1unattached>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors){
+GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::step1unattached>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors)
+{
   GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor;
   CompressedClusters& GPUrestrict() cmprClusters = decompressor.mInputGPU;
   ClusterNative* GPUrestrict() clusterBuffer = decompressor.mNativeClustersBuffer;
+  const ClusterNativeAccess* outputAccess = processors.ioPtrs.clustersNative;
+
   unsigned int* offsets = decompressor.mUnattachedClustersOffsets;
-  for (unsigned int i = get_global_id(0); i < GPUCA_NSLICES * GPUCA_ROW_COUNT; i += get_global_size(0)){
+  for (unsigned int i = get_global_id(0); i < GPUCA_NSLICES * GPUCA_ROW_COUNT; i += get_global_size(0)) {
     unsigned int slice = i / GPUCA_ROW_COUNT;
     unsigned int row = i % GPUCA_ROW_COUNT;
-    unsigned int tmpBufferIndex = computeLinearTmpBufferIndex(slice,row,decompressor.mMaxNativeClustersPerBuffer);
-    ClusterNative* buffer = clusterBuffer + processors.ioPtrs.clustersNative->clusterOffset[slice][row];
+    unsigned int tmpBufferIndex = computeLinearTmpBufferIndex(slice, row, decompressor.mMaxNativeClustersPerBuffer);
+    ClusterNative* buffer = clusterBuffer + outputAccess->clusterOffset[slice][row];
     if (decompressor.mNativeClustersIndex[i] != 0) {
       memcpy((void*)buffer, (const void*)(decompressor.mTmpNativeClusters + tmpBufferIndex), decompressor.mNativeClustersIndex[i] * sizeof(clusterBuffer[0]));
     }
@@ -144,7 +151,7 @@ GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::step1unattached>
     unsigned int end = offsets[i] + ((i >= decompressor.mInputGPU.nSliceRows) ? 0 : decompressor.mInputGPU.nSliceRowClusters[i]);
     decompressHits(cmprClusters, offsets[i], end, clout);
     if (processors.param.rec.tpc.clustersShiftTimebins != 0.f) {
-      for (unsigned int k = 0; k < processors.ioPtrs.clustersNative->nClusters[slice][row]; k++) {
+      for (unsigned int k = 0; k < outputAccess->nClusters[slice][row]; k++) {
         auto& cl = buffer[k];
         float t = cl.getTime() + processors.param.rec.tpc.clustersShiftTimebins;
         if (t < 0) {
@@ -156,12 +163,12 @@ GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::step1unattached>
-    GPUCommonAlgorithm::sort(buffer, buffer + processors.ioPtrs.clustersNative->nClusters[slice][row]);
+    GPUCommonAlgorithm::sort(buffer, buffer + outputAccess->nClusters[slice][row]);
   }
-
 }

-GPUdii() void GPUTPCDecompressionKernels::decompressHits(const o2::tpc::CompressedClusters& cmprClusters, const unsigned int start, const unsigned int end, ClusterNative* clusterNativeBuffer){
+GPUdii() void GPUTPCDecompressionKernels::decompressHits(const o2::tpc::CompressedClusters& cmprClusters, const unsigned int start, const unsigned int end, ClusterNative* clusterNativeBuffer)
+{
   unsigned int time = 0;
   unsigned short pad = 0;
   for (unsigned int k = start; k < end; k++) {
@@ -176,7 +183,7 @@ GPUdii() void GPUTPCDecompressionKernels::decompressHits(const o2::tpc::Compress
       time = cmprClusters.timeDiffU[k];
       pad = cmprClusters.padDiffU[k];
     }
-    *(clusterNativeBuffer++) =  ClusterNative(time, cmprClusters.flagsU[k], pad, cmprClusters.sigmaTimeU[k], cmprClusters.sigmaPadU[k], cmprClusters.qMaxU[k], cmprClusters.qTotU[k]);
+    *(clusterNativeBuffer++) = ClusterNative(time, cmprClusters.flagsU[k], pad, cmprClusters.sigmaTimeU[k], cmprClusters.sigmaPadU[k], cmprClusters.qMaxU[k], cmprClusters.qTotU[k]);
   }
 }
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h
index e70f7486bebbf..ee2046b4ab0f6 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h
@@ -41,7 +41,7 @@ class GPUTPCDecompressionKernels : public GPUKernelTemplate
   enum K : int {
     step0attached = 0,
     step1unattached = 1,
-    prepareAccess = 2
+    // prepareAccess = 2
   };

   template <int iKernel = defaultKernel>
@@ -50,10 +51,11 @@ class GPUTPCDecompressionKernels : public GPUKernelTemplate
   GPUdi() static o2::tpc::ClusterNative decompressTrackStore(const o2::tpc::CompressedClusters& cmprClusters, const unsigned int clusterOffset, unsigned int slice, unsigned int row, unsigned int pad, unsigned int time, GPUTPCDecompression& decompressor, bool& stored);
   GPUdi() static void decompressHits(const o2::tpc::CompressedClusters& cmprClusters, const unsigned int start, const unsigned int end, o2::tpc::ClusterNative* clusterNativeBuffer);

-  GPUd() static unsigned int computeLinearTmpBufferIndex(unsigned int slice, unsigned int row, unsigned int maxClustersPerBuffer){
+  GPUd() static unsigned int computeLinearTmpBufferIndex(unsigned int slice, unsigned int row, unsigned int maxClustersPerBuffer)
+  {
     return slice * (GPUCA_ROW_COUNT * maxClustersPerBuffer) + row * maxClustersPerBuffer;
   }
 };
-}
+} // namespace GPUCA_NAMESPACE::gpu

 #endif // GPUTPCDECOMPRESSIONKERNELS_H
diff --git a/GPU/GPUTracking/DataTypes/GPUO2FakeClasses.h b/GPU/GPUTracking/DataTypes/GPUO2FakeClasses.h
index 02bb8b9775ad9..9a1db75b11185 100644
--- a/GPU/GPUTracking/DataTypes/GPUO2FakeClasses.h
+++ b/GPU/GPUTracking/DataTypes/GPUO2FakeClasses.h
@@ -116,6 +116,9 @@ class GPUTPCCompression
 public:
   GPUFakeEmpty* mOutput;
 };
+class GPUTPCDecompression
+{
+};
 class GPUTPCClusterFinder
 {
 };
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index f924602baaa5d..05e8135353f2c 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -28,8 +28,7 @@ using namespace o2::tpc;

 int GPUChainTracking::RunTPCCompression()
 {
-  LOGP(info, "====== Compression");
-//#ifdef GPUCA_HAVE_O2HEADERS
+#ifdef GPUCA_HAVE_O2HEADERS
   mRec->PushNonPersistentMemory(qStr2Tag("TPCCOMPR"));
   RecoStep myStep = RecoStep::TPCCompression;
   bool doGPU = GetRecoStepsGPU() & RecoStep::TPCCompression;
@@ -200,16 +199,20 @@ int GPUChainTracking::RunTPCCompression()
     ((GPUChainTracking*)GetNextChainInQueue())->mRec->BlockStackedMemory(mRec);
   }
   mRec->PopNonPersistentMemory(RecoStep::TPCCompression, qStr2Tag("TPCCOMPR"));
-//#endif
+#endif
   return 0;
 }

 int GPUChainTracking::RunTPCDecompression()
 {
-  LOGP(info, "====== Decompression");
+#ifdef GPUCA_HAVE_O2HEADERS
+  // mRec->PushNonPersistentMemory(qStr2Tag("TPCDCMPR"));
+  ClusterNativeAccess* original = new ClusterNativeAccess;
+  original->clustersLinear = new ClusterNative[mIOPtrs.clustersNative->nClustersTotal];
+  memcpy((void*)original->clustersLinear, mIOPtrs.clustersNative->clustersLinear, mIOPtrs.clustersNative->nClustersTotal * sizeof(mIOPtrs.clustersNative->clustersLinear[0]));
+  memcpy((void*)original->nClusters, mIOPtrs.clustersNative->nClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(mIOPtrs.clustersNative->nClusters[0][0]));
+  original->setOffsetPtrs();

-//#ifdef GPUCA_HAVE_O2HEADERS
-  // mRec->PushNonPersistentMemory(qStr2Tag("TPCDCMPR"));
   RecoStep myStep = RecoStep::TPCDecompression;
   bool doGPU = GetRecoStepsGPU() & RecoStep::TPCDecompression; // with -g gives true
   GPUTPCDecompression& Decompressor = processors()->tpcDecompressor;
@@ -230,7 +233,6 @@ int GPUChainTracking::RunTPCDecompression()
   size_t copySize = AllocateRegisteredMemory(Decompressor.mMemoryResInputGPU);
   WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), 0);
-  //TransferMemoryResourcesToGPU(myStep, &Decompressor, 0);

   int outputStream = 0;
   bool toGPU = true;
@@ -287,12 +289,12 @@ int GPUChainTracking::RunTPCDecompression()
       decodedAttachedClusters += Decompressor.mNativeClustersIndex[linearIndex];
     }
   }
-  LOGP(info,"decoded = {}",decodedAttachedClusters);
-  TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression,Decompressor.mResourceTmpClustersOffsets,0);
+  LOGP(info, "decoded = {}", decodedAttachedClusters);
+  TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, Decompressor.mResourceTmpClustersOffsets, 0);
   if (decodedAttachedClusters != cmprClsHost.nAttachedClusters) {
-     GPUWarning("My version: %u / %u clusters failed track model decoding (%f %%)", cmprClsHost.nAttachedClusters - decodedAttachedClusters, cmprClsHost.nAttachedClusters, 100.f * (float)(cmprClsHost.nAttachedClusters - decodedAttachedClusters) / (float)cmprClsHost.nAttachedClusters);
+    GPUWarning("My version: %u / %u clusters failed track model decoding (%f %%)", cmprClsHost.nAttachedClusters - decodedAttachedClusters, cmprClsHost.nAttachedClusters, 100.f * (float)(cmprClsHost.nAttachedClusters - decodedAttachedClusters) / (float)cmprClsHost.nAttachedClusters);
   } else {
-     GPUInfo("My version: all attached clusters have been decoded");
+    GPUInfo("My version: all attached clusters have been decoded");
   }

   Decompressor.mNativeClustersBuffer = mInputsHost->mPclusterNativeOutput;
@@ -313,20 +315,20 @@ int GPUChainTracking::RunTPCDecompression()
   runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::step1unattached>(GetGridAutoStep(0, RecoStep::TPCDecompression), krnlRunRangeNone, krnlEventNone);

   ClusterNative* tmpBuffer = new ClusterNative[mInputsHost->mNClusterNative];
-  //GPUMemCpy(RecoStep::TPCDecompression,mInputsHost->mPclusterNativeOutput,mInputsShadow->mPclusterNativeBuffer, sizeof(mInputsShadow->mPclusterNativeBuffer[0]) * mIOPtrs.clustersNative->nClustersTotal,0,false);
-  GPUMemCpy(RecoStep::TPCDecompression,tmpBuffer,mInputsShadow->mPclusterNativeBuffer, sizeof(mInputsShadow->mPclusterNativeBuffer[0]) * mIOPtrs.clustersNative->nClustersTotal,0,false);
+  ClusterNativeAccess gpuBuffer = *mInputsHost->mPclusterNativeAccess;
+  gpuBuffer.clustersLinear = tmpBuffer;
+  // GPUMemCpy(RecoStep::TPCDecompression,mInputsHost->mPclusterNativeOutput,mInputsShadow->mPclusterNativeBuffer, sizeof(mInputsShadow->mPclusterNativeBuffer[0]) * mIOPtrs.clustersNative->nClustersTotal,0,false);
+  GPUMemCpy(RecoStep::TPCDecompression, tmpBuffer, mInputsShadow->mPclusterNativeBuffer, sizeof(mInputsShadow->mPclusterNativeBuffer[0]) * mIOPtrs.clustersNative->nClustersTotal, 0, false);
+  gpuBuffer.setOffsetPtrs();

   TPCClusterDecompressor decomp;
   auto allocator = [this](size_t size) {
     this->mInputsHost->mNClusterNative = this->mInputsShadow->mNClusterNative = size;
-    //this->AllocateRegisteredMemory(this->mInputsHost->mResourceClusterNativeOutput, this->mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
+    // this->AllocateRegisteredMemory(this->mInputsHost->mResourceClusterNativeOutput, this->mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
     return this->mInputsHost->mPclusterNativeOutput;
   };
   auto& gatherTimer = getTimer<TPCClusterDecompressor>("TPCDecompression", 0);
   gatherTimer.Start();
-
-  LOGP(info,"==== mIOPtrs.compressed.nAttCl = {}, nUnAttCl = {}, nTracks = {}",cmprClsHost.nAttachedClusters,cmprClsHost.nUnattachedClusters,cmprClsHost.nTracks);
-
   if (decomp.decompress(mIOPtrs.tpcCompressedClusters, *mClusterNativeAccess, allocator, param())) {
     GPUError("Error decompressing clusters");
     return 1;
@@ -334,7 +336,7 @@ int GPUChainTracking::RunTPCDecompression()
   gatherTimer.Stop();
   mIOPtrs.clustersNative = mClusterNativeAccess.get();
   if (mRec->IsGPU()) {
-    //AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
+    // AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
     processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
     WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), 0);
     *mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative;
@@ -345,10 +347,49 @@ int GPUChainTracking::RunTPCDecompression()
     SynchronizeStream(0);
   }

-  LOGP(info,"==== My version/cpu version: {}/{}", tmpBuffer[0].qTot,mInputsHost->mPclusterNativeOutput[0].qTot);
-
+  const ClusterNativeAccess* decoded = &gpuBuffer; // mIOPtrs.clustersNative;
+  // original = (ClusterNativeAccess*)mIOPtrs.clustersNative;
+  unsigned int decodingErrors = 0;
+  std::vector<ClusterNative> tmpClusters;
+  if (param().rec.tpc.rejectionStrategy == GPUSettings::RejectionNone) { // verification does not make sense if we reject clusters during compression
+    for (unsigned int i = 0; i < NSLICES; i++) {
+      for (unsigned int j = 0; j < GPUCA_ROW_COUNT; j++) {
+        if (original->nClusters[i][j] != decoded->nClusters[i][j]) {
+          GPUError("Number of clusters mismatch slice %u row %u: expected %d vs. decoded %d", i, j, original->nClusters[i][j], decoded->nClusters[i][j]);
decoded %d", i, j, original->nClusters[i][j], decoded->nClusters[i][j]); + decodingErrors++; + continue; + } + tmpClusters.resize(original->nClusters[i][j]); + for (unsigned int k = 0; k < original->nClusters[i][j]; k++) { + tmpClusters[k] = original->clusters[i][j][k]; + if (param().rec.tpc.compressionTypeMask & GPUSettings::CompressionTruncate) { + GPUTPCCompression::truncateSignificantBitsChargeMax(tmpClusters[k].qMax, param()); + GPUTPCCompression::truncateSignificantBitsCharge(tmpClusters[k].qTot, param()); + GPUTPCCompression::truncateSignificantBitsWidth(tmpClusters[k].sigmaPadPacked, param()); + GPUTPCCompression::truncateSignificantBitsWidth(tmpClusters[k].sigmaTimePacked, param()); + } + } + std::sort(tmpClusters.begin(), tmpClusters.end()); + for (unsigned int k = 0; k < original->nClusters[i][j]; k++) { + const o2::tpc::ClusterNative& c1 = tmpClusters[k]; + const o2::tpc::ClusterNative& c2 = decoded->clusters[i][j][k]; + if (!(c1 == c2)) { + if (decodingErrors++ < 100) { + // GPUWarning("Cluster mismatch: slice %2u row %3u hit %5u: %6d %3d %4d %3d %3d %4d %4d", i, j, k, (int)c1.getTimePacked(), (int)c1.getFlags(), (int)c1.padPacked, (int)c1.sigmaTimePacked, (int)c1.sigmaPadPacked, (int)c1.qMax, (int)c1.qTot); + // GPUWarning("%45s %6d %3d %4d %3d %3d %4d %4d", "", (int)c2.getTimePacked(), (int)c2.getFlags(), (int)c2.padPacked, (int)c2.sigmaTimePacked, (int)c2.sigmaPadPacked, (int)c2.qMax, (int)c2.qTot); + } + } + } + } + } + if (decodingErrors) { + GPUWarning("Errors during cluster decoding %u\n", decodingErrors); + } else { + GPUInfo("Cluster decoding verification: PASSED"); + } + } delete[] tmpBuffer; // mRec->PopNonPersistentMemory(RecoStep::TPCCompression, qStr2Tag("TPCDCMPR")); -//#endif +#endif return 0; } \ No newline at end of file diff --git a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx index 04bf75f39bab3..ebb3c3def1679 100644 --- a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx +++ b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx @@ -612,7 +612,8 @@ int RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingUse, if (configStandalone.testSyncAsync) { printf("Running synchronous phase\n"); } - const GPUTrackingInOutPointers& ioPtrs = ioPtrEvents[!configStandalone.preloadEvents ? 0 : configStandalone.proc.doublePipeline ? (iteration % ioPtrEvents.size()) : (iEvent - configStandalone.StartEvent)]; + const GPUTrackingInOutPointers& ioPtrs = ioPtrEvents[!configStandalone.preloadEvents ? 0 : configStandalone.proc.doublePipeline ? (iteration % ioPtrEvents.size()) + : (iEvent - configStandalone.StartEvent)]; chainTrackingUse->mIOPtrs = ioPtrs; if (iteration == (configStandalone.proc.doublePipeline ? 
       if (configStandalone.proc.doublePipeline && timerPipeline) {
@@ -667,7 +668,7 @@ int RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingUse,
           chainTrackingAsync->mIOPtrs.rawClusters[i] = nullptr;
           chainTrackingAsync->mIOPtrs.nRawClusters[i] = 0;
         }
-        chainTrackingAsync->mIOPtrs.clustersNative = nullptr;
+        chainTrackingAsync->mIOPtrs.clustersNative = chainTrackingUse->mIOPtrs.clustersNative; // todo: revert back to nullptr
        recAsync->SetResetTimers(iRun < configStandalone.runsInit);
        tmpRetVal = recAsync->RunChains();
        if (tmpRetVal == 0 || tmpRetVal == 2) {
@@ -864,7 +865,8 @@ int main(int argc, char** argv)
       if (grp.continuousMaxTimeBin == 0) {
         printf("Cannot override max time bin for non-continuous data!\n");
       } else {
-        grp.continuousMaxTimeBin = chainTracking->mIOPtrs.tpcZS ? GPUReconstructionConvert::GetMaxTimeBin(*chainTracking->mIOPtrs.tpcZS) : chainTracking->mIOPtrs.tpcPackedDigits ? GPUReconstructionConvert::GetMaxTimeBin(*chainTracking->mIOPtrs.tpcPackedDigits) : GPUReconstructionConvert::GetMaxTimeBin(*chainTracking->mIOPtrs.clustersNative);
+        grp.continuousMaxTimeBin = chainTracking->mIOPtrs.tpcZS ? GPUReconstructionConvert::GetMaxTimeBin(*chainTracking->mIOPtrs.tpcZS) : chainTracking->mIOPtrs.tpcPackedDigits ? GPUReconstructionConvert::GetMaxTimeBin(*chainTracking->mIOPtrs.tpcPackedDigits)
+                                                                                                                                                                                   : GPUReconstructionConvert::GetMaxTimeBin(*chainTracking->mIOPtrs.clustersNative);
         printf("Max time bin set to %d\n", (int)grp.continuousMaxTimeBin);
         rec->UpdateSettings(&grp);
         if (recAsync) {
@@ -948,7 +950,7 @@ int main(int argc, char** argv)
       printf("Error unregistering memory\n");
     }
   }
-  //exit(0);
+  // exit(0);
   rec->Exit();

   if (!configStandalone.noprompt) {
diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake
index b06749010d2d9..d7302f17271d2 100644
--- a/GPU/GPUTracking/kernels.cmake
+++ b/GPU/GPUTracking/kernels.cmake
@@ -73,6 +73,8 @@ o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, buffered32" LB simple)
 o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, buffered64" LB simple)
 o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, buffered128" LB simple)
 o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, multiBlock" LB simple)
+o2_gpu_add_kernel("GPUTPCDecompressionKernels, step0attached" LB simple)
+o2_gpu_add_kernel("GPUTPCDecompressionKernels, step1unattached" LB simple)
 o2_gpu_add_kernel("GPUTPCCFCheckPadBaseline" LB single)
 o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, fillIndexMap" LB single)
 o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, fillFromDigits" LB single)
diff --git a/cmake-build-debug/.cmake/api/v1/query/cache-v2 b/cmake-build-debug/.cmake/api/v1/query/cache-v2
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/cmake-build-debug/.cmake/api/v1/query/cmakeFiles-v1 b/cmake-build-debug/.cmake/api/v1/query/cmakeFiles-v1
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/cmake-build-debug/.cmake/api/v1/query/codemodel-v2 b/cmake-build-debug/.cmake/api/v1/query/codemodel-v2
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/cmake-build-debug/.cmake/api/v1/query/toolchains-v1 b/cmake-build-debug/.cmake/api/v1/query/toolchains-v1
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/cmake-build-debug/DartConfiguration.tcl b/cmake-build-debug/DartConfiguration.tcl
deleted file mode 100644
index 09b3136cfee5f..0000000000000
--- a/cmake-build-debug/DartConfiguration.tcl
+++ /dev/null
@@ -1,106 +0,0 @@
-# This file is configured by CMake automatically as DartConfiguration.tcl
-# If you choose not to use CMake, this file may be hand configured, by
-# filling in the required variables.
-
-
-# Configuration directories and files
-SourceDirectory: /experiments/alice/cimag/o2/O2
-BuildDirectory: /experiments/alice/cimag/o2/O2/cmake-build-debug
-
-# Where to place the cost data store
-CostDataFile:
-
-# Site is something like machine.domain, i.e. pragmatic.crd
-Site: gr3srv.ts.infn.it
-
-# Build name is osname-revision-compiler, i.e. Linux-2.4.2-2smp-c++
-BuildName: Linux-c++
-
-# Subprojects
-LabelsForSubprojects:
-
-# Submission information
-SubmitURL: http://
-SubmitInactivityTimeout:
-
-# Dashboard start time
-NightlyStartTime: 00:00:00 EDT
-
-# Commands for the build/test/submit cycle
-ConfigureCommand: "/home/cimag/.cache/JetBrains/RemoteDev/dist/b21c2b5ff1f19_CLion-2023.2.2/bin/cmake/linux/x64/bin/cmake" "/experiments/alice/cimag/o2/O2"
-MakeCommand: /home/cimag/.cache/JetBrains/RemoteDev/dist/b21c2b5ff1f19_CLion-2023.2.2/bin/cmake/linux/x64/bin/cmake --build . --config "${CTEST_CONFIGURATION_TYPE}"
-DefaultCTestConfigurationType: Release
-
-# version control
-UpdateVersionOnly:
-
-# CVS options
-# Default is "-d -P -A"
-CVSCommand:
-CVSUpdateOptions:
-
-# Subversion options
-SVNCommand:
-SVNOptions:
-SVNUpdateOptions:
-
-# Git options
-GITCommand: /usr/bin/git
-GITInitSubmodules:
-GITUpdateOptions:
-GITUpdateCustom:
-
-# Perforce options
-P4Command:
-P4Client:
-P4Options:
-P4UpdateOptions:
-P4UpdateCustom:
-
-# Generic update command
-UpdateCommand: /usr/bin/git
-UpdateOptions:
-UpdateType: git
-
-# Compiler info
-Compiler: /usr/bin/c++
-CompilerVersion: 4.8.5
-
-# Dynamic analysis (MemCheck)
-PurifyCommand:
-ValgrindCommand:
-ValgrindCommandOptions:
-DrMemoryCommand:
-DrMemoryCommandOptions:
-CudaSanitizerCommand:
-CudaSanitizerCommandOptions:
-MemoryCheckType:
-MemoryCheckSanitizerOptions:
-MemoryCheckCommand: /usr/bin/valgrind
-MemoryCheckCommandOptions:
-MemoryCheckSuppressionFile:
-
-# Coverage
-CoverageCommand: /usr/bin/gcov
-CoverageExtraFlags: -l
-
-# Testing options
-# TimeOut is the amount of time in seconds to wait for processes
-# to complete during testing.  After TimeOut seconds, the
-# process will be summarily terminated.
-# Currently set to 25 minutes
-TimeOut: 1500
-
-# During parallel testing CTest will not start a new test if doing
-# so would cause the system load to exceed this value.
-TestLoad:
-
-UseLaunchers:
-CurlOptions:
-# warning, if you add new options here that have to do with submit,
-# you have to update cmCTestSubmitCommand.cxx
-
-# For CTest submissions that timeout, these options
-# specify behavior for retrying the submission
-CTestSubmitRetryDelay: 5
-CTestSubmitRetryCount: 3
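
The operator== added to ClusterNative above makes equality mean bit-for-bit identity of every packed member, which is exactly what the decoding check in RunTPCDecompression needs: after truncation and sorting, each decoded cluster must reproduce the original field by field. A minimal standalone sketch of the same semantics, using a stand-in struct (the 24-bit time / 8-bit flags packing follows the O2 header; the widths of the other fields are assumptions here):

```cpp
#include <cassert>
#include <cstdint>

// Stand-in for o2::tpc::ClusterNative, reduced to the members compared above.
struct ClusterSketch {
  uint32_t timeFlagsPacked; // time in the lower 24 bits, flags in the upper 8
  uint16_t padPacked;
  uint8_t sigmaTimePacked;
  uint8_t sigmaPadPacked;
  uint16_t qMax;
  uint16_t qTot;

  uint32_t getTimePacked() const { return timeFlagsPacked & 0xFFFFFF; }
  uint8_t getFlags() const { return timeFlagsPacked >> 24; }

  // Mirrors the patch: equal only if every packed field matches exactly.
  bool operator==(const ClusterSketch& rhs) const
  {
    return getTimePacked() == rhs.getTimePacked() && padPacked == rhs.padPacked &&
           sigmaTimePacked == rhs.sigmaTimePacked && sigmaPadPacked == rhs.sigmaPadPacked &&
           qMax == rhs.qMax && qTot == rhs.qTot && getFlags() == rhs.getFlags();
  }
};

int main()
{
  ClusterSketch a{0x01000123u, 42, 3, 4, 100, 250};
  ClusterSketch b = a;
  assert(a == b);
  b.qTot++; // any single-field difference breaks equality
  assert(!(a == b));
  return 0;
}
```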
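In the attached-cluster decoder, timeTmp holds a residual packed into 24 bits in two's complement, so bit 23 is the sign bit and has to be propagated into the top byte before the residual is added to the track-model prediction. Note that the mask must be the hex literal 0x800000; the decimal literal 800000 tests an unrelated bit pattern. A self-contained sketch of the sign extension:

```cpp
#include <cstdint>
#include <cstdio>

// Sign-extend a 24-bit two's-complement residual to 32 bits, as done for
// cmprClusters.timeResA values in decompressTrack above.
static int32_t signExtend24(uint32_t timeTmp)
{
  if (timeTmp & 0x800000) { // bit 23: sign bit of the 24-bit residual
    timeTmp |= 0xFF000000u; // propagate the sign into the upper byte
  }
  return (int32_t)timeTmp;
}

int main()
{
  printf("%d\n", (int)signExtend24(0x000005u)); // prints 5
  printf("%d\n", (int)signExtend24(0xFFFFFBu)); // prints -5
  return 0;
}
```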
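decompressTrackStore appends each decoded attached cluster to a fixed-capacity segment of one linear temporary buffer, one segment per (slice, row), claiming slots with an atomic counter so that concurrent GPU threads never collide; clusters beyond the capacity are counted but dropped, which is what the stored flag reports back. A host-side sketch of the same pattern with std::atomic, for a single slice (the capacity is the 12000 from SetMaxData; the Cluster type is a placeholder):

```cpp
#include <atomic>
#include <cstdio>
#include <vector>

struct Cluster { unsigned int time, pad; }; // placeholder payload

constexpr unsigned int kRows = 152;        // GPUCA_ROW_COUNT
constexpr unsigned int kMaxPerRow = 12000; // mMaxNativeClustersPerBuffer in the patch

std::vector<Cluster> tmpClusters(kRows * kMaxPerRow); // linear temporary buffer
std::atomic<unsigned int> rowIndex[kRows];            // zero-initialized (static storage)

// Mirrors decompressTrackStore: claim a slot atomically, drop on overflow.
bool store(unsigned int row, const Cluster& c)
{
  unsigned int slot = rowIndex[row].fetch_add(1); // CAMath::AtomicAdd in the kernel
  if (slot >= kMaxPerRow) {
    return false; // segment full: the cluster is counted but not stored
  }
  tmpClusters[row * kMaxPerRow + slot] = c; // computeLinearTmpBufferIndex, one slice
  return true;
}

int main()
{
  store(10, Cluster{123, 45});
  printf("row 10 holds %u cluster(s)\n", rowIndex[10].load());
  return 0;
}
```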
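The verification block added to RunTPCDecompression only runs when no clusters were rejected during compression, because rejection makes a one-to-one comparison meaningless. Per (slice, row) it checks the cluster count, applies the same significant-bit truncation the compressor would, sorts the originals (the decoded rows come out of step1unattached already sorted), and compares element-wise. A condensed host-only sketch of that per-row check, with stand-in types and the truncation step left as a comment:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

struct Cl {
  unsigned int timePacked;
  unsigned short padPacked, qMax, qTot;
  bool operator<(const Cl& o) const
  {
    return timePacked < o.timePacked || (timePacked == o.timePacked && padPacked < o.padPacked);
  }
  bool operator==(const Cl& o) const
  {
    return timePacked == o.timePacked && padPacked == o.padPacked && qMax == o.qMax && qTot == o.qTot;
  }
};

// Returns the number of mismatches for one (slice, row) partition.
unsigned int verifyRow(const std::vector<Cl>& original, const std::vector<Cl>& decoded)
{
  if (original.size() != decoded.size()) {
    return (unsigned int)std::max(original.size(), decoded.size()); // whole row counts as bad
  }
  std::vector<Cl> tmp = original; // copy so truncation never touches the input
  // (truncateSignificantBits{ChargeMax,Charge,Width} would be applied to tmp here
  //  when CompressionTruncate is enabled)
  std::sort(tmp.begin(), tmp.end()); // decoded rows are already sorted
  unsigned int errors = 0;
  for (size_t k = 0; k < tmp.size(); k++) {
    if (!(tmp[k] == decoded[k])) {
      errors++;
    }
  }
  return errors;
}

int main()
{
  std::vector<Cl> orig = {{5, 1, 10, 20}, {3, 2, 11, 22}};
  std::vector<Cl> dec = {{3, 2, 11, 22}, {5, 1, 10, 20}}; // sorted by (time, pad)
  printf("errors: %u\n", verifyRow(orig, dec));           // prints 0
  return 0;
}
```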
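The two o2_gpu_add_kernel entries register step0attached and step1unattached with the framework's generated launch boilerplate; call sites then select the variant at compile time through the kernel class and its K enum, as in the runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::step1unattached>(...) call in the chain above (a matching step0attached launch is presumed in the unchanged part of the chain). A toy model of that compile-time dispatch; the names only imitate the O2 pattern, and nothing here is the real generated code:

```cpp
#include <cstdio>

// Toy kernel class: an enum of variants plus a templated Thread entry point,
// shaped like GPUTPCDecompressionKernels in the patch.
struct DecompressionKernelsToy {
  enum K : int { step0attached = 0,
                 step1unattached = 1 };
  template <int iKernel>
  static void Thread(int iThread)
  {
    if (iKernel == step0attached) {
      printf("thread %d: decode attached clusters via the track model\n", iThread);
    } else {
      printf("thread %d: decode unattached clusters and gather rows\n", iThread);
    }
  }
};

// Toy runKernel: the real framework launches this on the GPU with a grid
// chosen by GetGridAutoStep; here it is a plain host loop.
template <class Kernel, int iKernel>
void runKernelToy(int nThreads)
{
  for (int t = 0; t < nThreads; t++) {
    Kernel::template Thread<iKernel>(t);
  }
}

int main()
{
  runKernelToy<DecompressionKernelsToy, DecompressionKernelsToy::step0attached>(2);
  runKernelToy<DecompressionKernelsToy, DecompressionKernelsToy::step1unattached>(2);
  return 0;
}
```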