Skip to content

Commit

Permalink
Updated code for new kernel registration and fixed clang formats
Browse files Browse the repository at this point in the history
  • Loading branch information
cima22 committed Jan 30, 2024
1 parent 9a5597c commit c49f074
Show file tree
Hide file tree
Showing 17 changed files with 149 additions and 206 deletions.
13 changes: 12 additions & 1 deletion DataFormats/Detectors/TPC/include/DataFormatsTPC/ClusterNative.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ template <class T>
class ConstMCTruthContainer;
template <class T>
class ConstMCTruthContainerView;
}
} // namespace dataformats
} // namespace o2

namespace o2
Expand Down Expand Up @@ -156,6 +156,17 @@ struct ClusterNative {
return (this->getFlags() < rhs.getFlags());
}
}

// Equality: two clusters match only when every packed coordinate, both
// sigma fields, both charge values, and the flag bits are all identical.
GPUd() bool operator==(const ClusterNative& rhs) const
{
  // Compare the packed position first; unequal clusters usually differ here,
  // so the remaining checks are skipped in the common case.
  if (this->getTimePacked() != rhs.getTimePacked() || this->padPacked != rhs.padPacked) {
    return false;
  }
  if (this->sigmaTimePacked != rhs.sigmaTimePacked || this->sigmaPadPacked != rhs.sigmaPadPacked) {
    return false;
  }
  return this->qMax == rhs.qMax && this->qTot == rhs.qTot && this->getFlags() == rhs.getFlags();
}
};

// This is an index struct to access TPC clusters inside sectors and rows. It shall not own the data, but just point to
Expand Down
2 changes: 2 additions & 0 deletions GPU/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
# HDRS_CINT_O2: Headers for ROOT dictionary (only for O2) HDRS_INSTALL: Headers
# for installation only

#set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}} -fno-omit-frame-pointer") # to uncomment if needed, tired of typing this...

if(NOT DEFINED GPUCA_NO_FAST_MATH)
set(GPUCA_NO_FAST_MATH 0)
endif()
Expand Down
2 changes: 1 addition & 1 deletion GPU/GPUTracking/Base/GPUReconstructionIncludesDevice.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ using namespace GPUCA_NAMESPACE::gpu;
#include "GPUTPCCompressionKernels.cxx"
#include "GPUTPCCompressionTrackModel.cxx"

//Files for TPC Decompression
// Files for TPC Decompression
#include "GPUTPCDecompressionKernels.cxx"

// Files for TPC Cluster Finder
Expand Down
2 changes: 1 addition & 1 deletion GPU/GPUTracking/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

set(MODULE GPUTracking)

set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}} -fno-omit-frame-pointer") # to uncomment if needed, tired of typing this...
# set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}} -fno-omit-frame-pointer") # to uncomment if needed, tired of typing this...

include(cmake/helpers.cmake)

Expand Down
46 changes: 18 additions & 28 deletions GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -65,45 +65,35 @@ void GPUTPCDecompression::SetPointersCompressedClusters(void*& mem, T& c, unsign
computePointerWithAlignment(mem, c.nTrackClusters, nTr);
}

void* GPUTPCDecompression::SetPointersTmpNativeBuffersGPU(void* mem){
computePointerWithAlignment(mem,mTmpNativeClusters,NSLICES * GPUCA_ROW_COUNT * mMaxNativeClustersPerBuffer);
//computePointerWithAlignment(mem,mClusterNativeAccess);
void* GPUTPCDecompression::SetPointersTmpNativeBuffersGPU(void* mem)
{
computePointerWithAlignment(mem, mTmpNativeClusters, NSLICES * GPUCA_ROW_COUNT * mMaxNativeClustersPerBuffer);
return mem;
}

void* GPUTPCDecompression::SetPointersTmpNativeBuffersOutput(void* mem){
computePointerWithAlignment(mem,mNativeClustersIndex,NSLICES * GPUCA_ROW_COUNT);
void* GPUTPCDecompression::SetPointersTmpNativeBuffersOutput(void* mem)
{
computePointerWithAlignment(mem, mNativeClustersIndex, NSLICES * GPUCA_ROW_COUNT);
return mem;
}

void* GPUTPCDecompression::SetPointersTmpNativeBuffersInput(void* mem){
computePointerWithAlignment(mem,mUnattachedClustersOffsets,NSLICES * GPUCA_ROW_COUNT);
void* GPUTPCDecompression::SetPointersTmpNativeBuffersInput(void* mem)
{
computePointerWithAlignment(mem, mUnattachedClustersOffsets, NSLICES * GPUCA_ROW_COUNT);
return mem;
}

void GPUTPCDecompression::RegisterMemoryAllocation() {
void GPUTPCDecompression::RegisterMemoryAllocation()
{
AllocateAndInitializeLate();
mMemoryResInputGPU = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputGPU, GPUMemoryResource::MEMORY_INPUT_FLAG | GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_CUSTOM, "TPCDecompressionInput");
mRec->RegisterMemoryAllocation(this,&GPUTPCDecompression::SetPointersTmpNativeBuffersGPU,GPUMemoryResource::MEMORY_GPU,"TPCDecompressionTmpBuffersGPU");
mResourceTmpIndexes = mRec->RegisterMemoryAllocation(this,&GPUTPCDecompression::SetPointersTmpNativeBuffersOutput,GPUMemoryResource::MEMORY_OUTPUT,"TPCDecompressionTmpBuffersOutput");
mResourceTmpClustersOffsets = mRec->RegisterMemoryAllocation(this,&GPUTPCDecompression::SetPointersTmpNativeBuffersInput,GPUMemoryResource::MEMORY_INPUT,"TPCDecompressionTmpBuffersInput");
mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersGPU, GPUMemoryResource::MEMORY_GPU, "TPCDecompressionTmpBuffersGPU");
mResourceTmpIndexes = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersOutput, GPUMemoryResource::MEMORY_OUTPUT, "TPCDecompressionTmpBuffersOutput");
mResourceTmpClustersOffsets = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersInput, GPUMemoryResource::MEMORY_INPUT, "TPCDecompressionTmpBuffersInput");
}

void GPUTPCDecompression::SetMaxData(const GPUTrackingInOutPointers& io){
//mMaxNativeClustersPerBuffer = 81760;
mMaxNativeClustersPerBuffer = 12000;
}
/*
GPUTPCDecompression::ConcurrentClusterNativeBuffer::ConcurrentClusterNativeBuffer():
mCmprClsBuffer{new o2::tpc::ClusterNative[mCapacity]},mIndex{0}
{}
void GPUTPCDecompression::ConcurrentClusterNativeBuffer::push_back(tpc::ClusterNative cluster)
void GPUTPCDecompression::SetMaxData(const GPUTrackingInOutPointers& io)
{
if(mIndex == mCapacity){
//reallocate?
return;
}
unsigned int current = CAMath::AtomicAdd(mIndex, 1u);
mTmpNativeClusters[current] = cluster;
}*/
// mMaxNativeClustersPerBuffer = 81760;
mMaxNativeClustersPerBuffer = 12000;
}
10 changes: 0 additions & 10 deletions GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,23 +66,13 @@ class GPUTPCDecompression : public GPUProcessor
o2::tpc::ClusterNative* mTmpNativeClusters;
o2::tpc::ClusterNativeAccess* mClusterNativeAccess;
o2::tpc::ClusterNative* mNativeClustersBuffer;
/* class ConcurrentClusterNativeBuffer{
size_t mIndex;
size_t mCapacity = 10;
o2::tpc::ClusterNative* mCmprClsBuffer;
public:
ConcurrentClusterNativeBuffer();
void push_back(ClusterNative cluster);
};*/
// ConcurrentClusterNativeBuffer* tmpBuffer;

template <class T>
void SetPointersCompressedClusters(void*& mem, T& c, unsigned int nClA, unsigned int nTr, unsigned int nClU, bool reducedClA);

short mMemoryResInputGPU = -1;
short mResourceTmpIndexes = -1;
short mResourceTmpClustersOffsets = -1;

};
} // namespace GPUCA_NAMESPACE::gpu
#endif // GPUTPCDECOMPRESSION_H
69 changes: 38 additions & 31 deletions GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,14 @@
#include "GPUConstantMem.h"
#include "GPUTPCCompressionTrackModel.h"
#include "GPUCommonAlgorithm.h"
#include <string.h>

using namespace GPUCA_NAMESPACE::gpu;
using namespace o2::tpc;

template <>
GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::step0attached>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors){
GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::step0attached>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors)
{
GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor;
CompressedClusters& GPUrestrict() cmprClusters = decompressor.mInputGPU;
const GPUParam& GPUrestrict() param = processors.param;
Expand All @@ -39,46 +41,47 @@ GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::ste
}
}

GPUdii() void GPUTPCDecompressionKernels::decompressTrack(CompressedClusters& cmprClusters, const GPUParam& param, const unsigned int maxTime, const unsigned int trackIndex, unsigned int& clusterOffset, GPUTPCDecompression& decompressor){
GPUdii() void GPUTPCDecompressionKernels::decompressTrack(CompressedClusters& cmprClusters, const GPUParam& param, const unsigned int maxTime, const unsigned int trackIndex, unsigned int& clusterOffset, GPUTPCDecompression& decompressor)
{
float zOffset = 0;
unsigned int slice = cmprClusters.sliceA[trackIndex];
unsigned int row = cmprClusters.rowA[trackIndex];
GPUTPCCompressionTrackModel track;
unsigned int clusterIndex;
for(clusterIndex = 0; clusterIndex < cmprClusters.nTrackClusters[trackIndex]; clusterIndex++){
for (clusterIndex = 0; clusterIndex < cmprClusters.nTrackClusters[trackIndex]; clusterIndex++) {
unsigned int pad = 0, time = 0;
if(clusterIndex != 0){
unsigned char tmpSlice = cmprClusters.sliceLegDiffA[clusterOffset - trackIndex -1];
if (clusterIndex != 0) {
unsigned char tmpSlice = cmprClusters.sliceLegDiffA[clusterOffset - trackIndex - 1];
bool changeLeg = (tmpSlice >= GPUCA_NSLICES);
if(changeLeg){
if (changeLeg) {
tmpSlice -= GPUCA_NSLICES;
}
if(cmprClusters.nComppressionModes & GPUSettings::CompressionDifferences){
if (cmprClusters.nComppressionModes & GPUSettings::CompressionDifferences) {
slice += tmpSlice;
if(slice >= GPUCA_NSLICES){
if (slice >= GPUCA_NSLICES) {
slice -= GPUCA_NSLICES;
}
row += cmprClusters.rowDiffA[clusterOffset -trackIndex -1];
if(row >= GPUCA_ROW_COUNT){
row += cmprClusters.rowDiffA[clusterOffset - trackIndex - 1];
if (row >= GPUCA_ROW_COUNT) {
row -= GPUCA_ROW_COUNT;
}
} else {
slice = tmpSlice;
row = cmprClusters.rowDiffA[clusterOffset -trackIndex -1];
row = cmprClusters.rowDiffA[clusterOffset - trackIndex - 1];
}
if (changeLeg && track.Mirror()) {
break;
}
if (track.Propagate(param.tpcGeometry.Row2X(row),param.SliceParam[slice].Alpha)){
if (track.Propagate(param.tpcGeometry.Row2X(row), param.SliceParam[slice].Alpha)) {
break;
}
unsigned int timeTmp = cmprClusters.timeResA[clusterOffset -trackIndex -1];
unsigned int timeTmp = cmprClusters.timeResA[clusterOffset - trackIndex - 1];
if (timeTmp & 800000) {
timeTmp |= 0xFF000000;
}
time = timeTmp + ClusterNative::packTime(CAMath::Max(0.f,param.tpcGeometry.LinearZ2Time(slice,track.Z() + zOffset)));
time = timeTmp + ClusterNative::packTime(CAMath::Max(0.f, param.tpcGeometry.LinearZ2Time(slice, track.Z() + zOffset)));
float tmpPad = CAMath::Max(0.f, CAMath::Min((float)param.tpcGeometry.NPads(GPUCA_ROW_COUNT - 1), param.tpcGeometry.LinearY2Pad(slice, row, track.Y())));
pad = cmprClusters.padResA[clusterOffset -trackIndex - 1] + ClusterNative::packPad(tmpPad);
pad = cmprClusters.padResA[clusterOffset - trackIndex - 1] + ClusterNative::packPad(tmpPad);
time = time & 0xFFFFFF;
pad = (unsigned short)pad;
if (pad >= param.tpcGeometry.NPads(row) * ClusterNative::scalePadPacked) {
Expand All @@ -103,48 +106,52 @@ GPUdii() void GPUTPCDecompressionKernels::decompressTrack(CompressedClusters& cm
const auto cluster = decompressTrackStore(cmprClusters, clusterOffset, slice, row, pad, time, decompressor, stored);
float y = param.tpcGeometry.LinearPad2Y(slice, row, cluster.getPad());
float z = param.tpcGeometry.LinearTime2Z(slice, cluster.getTime());
if(clusterIndex == 0){
if (clusterIndex == 0) {
zOffset = z;
track.Init(param.tpcGeometry.Row2X(row), y, z - zOffset, param.SliceParam[slice].Alpha, cmprClusters.qPtA[trackIndex],param);
track.Init(param.tpcGeometry.Row2X(row), y, z - zOffset, param.SliceParam[slice].Alpha, cmprClusters.qPtA[trackIndex], param);
}
if(clusterIndex + 1 < cmprClusters.nTrackClusters[trackIndex] && track.Filter(y,z-zOffset,row)){
if (clusterIndex + 1 < cmprClusters.nTrackClusters[trackIndex] && track.Filter(y, z - zOffset, row)) {
break;
}
clusterOffset++;
}
clusterOffset += cmprClusters.nTrackClusters[trackIndex] - clusterIndex;
}

GPUdii() ClusterNative GPUTPCDecompressionKernels::decompressTrackStore(const o2::tpc::CompressedClusters& cmprClusters, const unsigned int clusterOffset, unsigned int slice, unsigned int row, unsigned int pad, unsigned int time, GPUTPCDecompression& decompressor, bool& stored){
unsigned int tmpBufferIndex = computeLinearTmpBufferIndex(slice,row,decompressor.mMaxNativeClustersPerBuffer);
unsigned int currentClusterIndex = CAMath::AtomicAdd(decompressor.mNativeClustersIndex + (slice * GPUCA_ROW_COUNT + row),1u);
GPUdii() ClusterNative GPUTPCDecompressionKernels::decompressTrackStore(const o2::tpc::CompressedClusters& cmprClusters, const unsigned int clusterOffset, unsigned int slice, unsigned int row, unsigned int pad, unsigned int time, GPUTPCDecompression& decompressor, bool& stored)
{
unsigned int tmpBufferIndex = computeLinearTmpBufferIndex(slice, row, decompressor.mMaxNativeClustersPerBuffer);
unsigned int currentClusterIndex = CAMath::AtomicAdd(decompressor.mNativeClustersIndex + (slice * GPUCA_ROW_COUNT + row), 1u);
const ClusterNative c(time, cmprClusters.flagsA[clusterOffset], pad, cmprClusters.sigmaTimeA[clusterOffset], cmprClusters.sigmaPadA[clusterOffset], cmprClusters.qMaxA[clusterOffset], cmprClusters.qTotA[clusterOffset]);
stored = currentClusterIndex < decompressor.mMaxNativeClustersPerBuffer;
if(stored){
if (stored) {
decompressor.mTmpNativeClusters[tmpBufferIndex + currentClusterIndex] = c;
}
return c;
}

template <>
GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::step1unattached>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors){
GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::step1unattached>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors)
{
GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor;
CompressedClusters& GPUrestrict() cmprClusters = decompressor.mInputGPU;
ClusterNative* GPUrestrict() clusterBuffer = decompressor.mNativeClustersBuffer;
const ClusterNativeAccess* outputAccess = processors.ioPtrs.clustersNative;

unsigned int* offsets = decompressor.mUnattachedClustersOffsets;
for (unsigned int i = get_global_id(0); i < GPUCA_NSLICES * GPUCA_ROW_COUNT; i += get_global_size(0)){
for (unsigned int i = get_global_id(0); i < GPUCA_NSLICES * GPUCA_ROW_COUNT; i += get_global_size(0)) {
unsigned int slice = i / GPUCA_ROW_COUNT;
unsigned int row = i % GPUCA_ROW_COUNT;
unsigned int tmpBufferIndex = computeLinearTmpBufferIndex(slice,row,decompressor.mMaxNativeClustersPerBuffer);
ClusterNative* buffer = clusterBuffer + processors.ioPtrs.clustersNative->clusterOffset[slice][row];
unsigned int tmpBufferIndex = computeLinearTmpBufferIndex(slice, row, decompressor.mMaxNativeClustersPerBuffer);
ClusterNative* buffer = clusterBuffer + outputAccess->clusterOffset[slice][row];
if (decompressor.mNativeClustersIndex[i] != 0) {
memcpy((void*)buffer, (const void*)(decompressor.mTmpNativeClusters + tmpBufferIndex), decompressor.mNativeClustersIndex[i] * sizeof(clusterBuffer[0]));
}
ClusterNative* clout = buffer + decompressor.mNativeClustersIndex[i];
unsigned int end = offsets[i] + ((i >= decompressor.mInputGPU.nSliceRows) ? 0 : decompressor.mInputGPU.nSliceRowClusters[i]);
decompressHits(cmprClusters, offsets[i], end, clout);
if (processors.param.rec.tpc.clustersShiftTimebins != 0.f) {
for (unsigned int k = 0; k < processors.ioPtrs.clustersNative->nClusters[slice][row]; k++) {
for (unsigned int k = 0; k < outputAccess->nClusters[slice][row]; k++) {
auto& cl = buffer[k];
float t = cl.getTime() + processors.param.rec.tpc.clustersShiftTimebins;
if (t < 0) {
Expand All @@ -156,12 +163,12 @@ GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::ste
cl.setTime(t);
}
}
GPUCommonAlgorithm::sort(buffer, buffer + processors.ioPtrs.clustersNative->nClusters[slice][row]);
GPUCommonAlgorithm::sort(buffer, buffer + outputAccess->nClusters[slice][row]);
}

}

GPUdii() void GPUTPCDecompressionKernels::decompressHits(const o2::tpc::CompressedClusters& cmprClusters, const unsigned int start, const unsigned int end, ClusterNative* clusterNativeBuffer){
GPUdii() void GPUTPCDecompressionKernels::decompressHits(const o2::tpc::CompressedClusters& cmprClusters, const unsigned int start, const unsigned int end, ClusterNative* clusterNativeBuffer)
{
unsigned int time = 0;
unsigned short pad = 0;
for (unsigned int k = start; k < end; k++) {
Expand All @@ -176,7 +183,7 @@ GPUdii() void GPUTPCDecompressionKernels::decompressHits(const o2::tpc::Compress
time = cmprClusters.timeDiffU[k];
pad = cmprClusters.padDiffU[k];
}
*(clusterNativeBuffer++) = ClusterNative(time, cmprClusters.flagsU[k], pad, cmprClusters.sigmaTimeU[k], cmprClusters.sigmaPadU[k], cmprClusters.qMaxU[k], cmprClusters.qTotU[k]);
*(clusterNativeBuffer++) = ClusterNative(time, cmprClusters.flagsU[k], pad, cmprClusters.sigmaTimeU[k], cmprClusters.sigmaPadU[k], cmprClusters.qMaxU[k], cmprClusters.qTotU[k]);
}
}

Expand Down
7 changes: 4 additions & 3 deletions GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class GPUTPCDecompressionKernels : public GPUKernelTemplate
enum K : int {
step0attached = 0,
step1unattached = 1,
prepareAccess = 2
// prepareAccess = 2
};

template <int iKernel = defaultKernel>
Expand All @@ -50,10 +50,11 @@ class GPUTPCDecompressionKernels : public GPUKernelTemplate
GPUdi() static o2::tpc::ClusterNative decompressTrackStore(const o2::tpc::CompressedClusters& cmprClusters, const unsigned int clusterOffset, unsigned int slice, unsigned int row, unsigned int pad, unsigned int time, GPUTPCDecompression& decompressor, bool& stored);
GPUdi() static void decompressHits(const o2::tpc::CompressedClusters& cmprClusters, const unsigned int start, const unsigned int end, o2::tpc::ClusterNative* clusterNativeBuffer);

GPUd() static unsigned int computeLinearTmpBufferIndex(unsigned int slice, unsigned int row, unsigned int maxClustersPerBuffer){
GPUd() static unsigned int computeLinearTmpBufferIndex(unsigned int slice, unsigned int row, unsigned int maxClustersPerBuffer)
{
return slice * (GPUCA_ROW_COUNT * maxClustersPerBuffer) + row * maxClustersPerBuffer;
}
};

}
} // namespace GPUCA_NAMESPACE::gpu
#endif // GPUTPCDECOMPRESSIONKERNELS_H
3 changes: 3 additions & 0 deletions GPU/GPUTracking/DataTypes/GPUO2FakeClasses.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,9 @@ class GPUTPCCompression
public:
GPUFakeEmpty* mOutput;
};
// Empty stand-in for the real GPUTPCDecompression processor — presumably used,
// like the surrounding fake classes in this header (GPUTPCCompression,
// GPUTPCClusterFinder), when the full GPU tracking code is not compiled in.
// TODO(review): confirm against the build configuration that selects this header.
class GPUTPCDecompression
{
};
class GPUTPCClusterFinder
{
};
Expand Down
Loading

0 comments on commit c49f074

Please sign in to comment.