From 8e4adc8520a545a6aab5a8e7dbca5ad5020ffbdb Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Fri, 5 Jan 2024 16:44:55 +0100 Subject: [PATCH] wip: adding new multi-gpu bGrid. --- libNeonDomain/include/Neon/domain/Grids.h | 1 + .../include/Neon/domain/bGridMgpuDisg.h | 6 + .../domain/details/bGridDisgMgpu/BlockView.h | 29 + .../bGridDisgMgpu/BlockView/BlockViewGrid.h | 97 ++++ .../BlockView/BlockViewPartition.h | 42 ++ .../BlockView/BlockViewPartition_imp.h | 184 +++++++ .../details/bGridDisgMgpu/StaticBlock.h | 106 ++++ .../domain/details/bGridDisgMgpu/bField.h | 121 +++++ .../domain/details/bGridDisgMgpu/bField_imp.h | 349 ++++++++++++ .../Neon/domain/details/bGridDisgMgpu/bGrid.h | 239 ++++++++ .../domain/details/bGridDisgMgpu/bGrid_imp.h | 470 ++++++++++++++++ .../domain/details/bGridDisgMgpu/bIndex.h | 142 +++++ .../domain/details/bGridDisgMgpu/bIndex_imp.h | 67 +++ .../domain/details/bGridDisgMgpu/bPartition.h | 215 ++++++++ .../details/bGridDisgMgpu/bPartition_imp.h | 510 ++++++++++++++++++ .../Neon/domain/details/bGridDisgMgpu/bSpan.h | 55 ++ .../domain/details/bGridDisgMgpu/bSpan_imp.h | 51 ++ .../bGridDisgMgpu/bFieldReduceKernels.cu | 0 .../domain/details/bGridDisgMgpu/bGrid.cpp | 7 + .../tests/domain-map-disg/src/gtests.cpp | 4 +- libNeonDomain/tests/domain-map/src/gtests.cpp | 12 +- libNeonDomain/tests/domain-map/src/map.cu | 10 +- libNeonDomain/tests/domain-map/src/map.h | 2 + .../tests/domain-map/src/runHelper.h | 2 +- 24 files changed, 2713 insertions(+), 8 deletions(-) create mode 100644 libNeonDomain/include/Neon/domain/bGridMgpuDisg.h create mode 100644 libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView.h create mode 100644 libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewGrid.h create mode 100644 libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewPartition.h create mode 100644 libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewPartition_imp.h create 
mode 100644 libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/StaticBlock.h create mode 100644 libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bField.h create mode 100644 libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bField_imp.h create mode 100644 libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bGrid.h create mode 100644 libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bGrid_imp.h create mode 100644 libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bIndex.h create mode 100644 libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bIndex_imp.h create mode 100644 libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bPartition.h create mode 100644 libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bPartition_imp.h create mode 100644 libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bSpan.h create mode 100644 libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bSpan_imp.h create mode 100644 libNeonDomain/src/domain/details/bGridDisgMgpu/bFieldReduceKernels.cu create mode 100644 libNeonDomain/src/domain/details/bGridDisgMgpu/bGrid.cpp diff --git a/libNeonDomain/include/Neon/domain/Grids.h b/libNeonDomain/include/Neon/domain/Grids.h index a7c9a878..6e48f584 100644 --- a/libNeonDomain/include/Neon/domain/Grids.h +++ b/libNeonDomain/include/Neon/domain/Grids.h @@ -5,3 +5,4 @@ #include "Neon/domain/bGrid.h" #include "Neon/domain/dGridSoA.h" #include "Neon/domain/bGridDisg.h" +#include "Neon/domain/bGridMgpuDisg.h" diff --git a/libNeonDomain/include/Neon/domain/bGridMgpuDisg.h b/libNeonDomain/include/Neon/domain/bGridMgpuDisg.h new file mode 100644 index 00000000..2d2a1c9f --- /dev/null +++ b/libNeonDomain/include/Neon/domain/bGridMgpuDisg.h @@ -0,0 +1,6 @@ +#pragma once +#include "Neon/domain/details/bGridDisgMgpu//bGrid.h" + +namespace Neon { +using bGridMgpu = Neon::domain::details::bGridMgpu::bGrid>; +} \ No newline at end of file diff --git 
a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView.h new file mode 100644 index 00000000..5ef7df3b --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView.h @@ -0,0 +1,29 @@ +#include "Neon/domain/details/bGridDisgMgpu//BlockView/BlockViewGrid.h" +#include "Neon/domain/tools/GridTransformer.h" + +namespace Neon::domain::details::bGridMgpu { + +struct BlockView +{ + public: + using Grid = Neon::domain::tool::GridTransformer::Grid; + template + using Field = Grid::template Field; + using index_3d = Neon::index_3d; + + template + static auto helpGetReference(T* mem, const int idx, const int card) -> std::enable_if_t + { + return mem[idx * card]; + } + + template + static auto helpGetReference(T* mem, const int idx, const int card) -> std::enable_if_t + { + return mem[idx * C]; + } + + static constexpr Neon::MemoryLayout layout = Neon::MemoryLayout::arrayOfStructs; +}; + +} // namespace Neon::domain::details::bGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewGrid.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewGrid.h new file mode 100644 index 00000000..6047bb88 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewGrid.h @@ -0,0 +1,97 @@ +#pragma once +#include + +#include "Neon/core/core.h" +#include "Neon/core/types/DataUse.h" +#include "Neon/core/types/Macros.h" + +#include "Neon/set/BlockConfig.h" +#include "Neon/set/Containter.h" +#include "Neon/set/DevSet.h" +#include "Neon/set/MemoryOptions.h" + +#include "Neon/sys/memory/MemDevice.h" + +#include "Neon/domain/aGrid.h" + +#include "Neon/domain/interface/GridBaseTemplate.h" +#include "Neon/domain/interface/GridConcept.h" +#include "Neon/domain/interface/KernelConfig.h" +#include "Neon/domain/interface/LaunchConfig.h" +#include 
"Neon/domain/interface/Stencil.h" +#include "Neon/domain/interface/common.h" + +#include "Neon/domain/tools/GridTransformer.h" +#include "Neon/domain/tools/SpanTable.h" + +#include "Neon/domain/details/eGrid/eGrid.h" +#include "Neon/domain/patterns/PatternScalar.h" + +#include "BlockViewPartition.h" + +namespace Neon::domain::details::bGridMgpu { + +namespace details { +struct GridTransformation +{ + template + using Partition = BlockViewPartition; + using Span = Neon::domain::details::eGrid::eSpan; + static constexpr Neon::set::internal::ContainerAPI::DataViewSupport dataViewSupport = Neon::set::internal::ContainerAPI::DataViewSupport::on; + + using FoundationGrid = Neon::domain::details::eGrid::eGrid; + static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = FoundationGrid::executionThreadSpan; + using ExecutionThreadSpanIndexType = int32_t; + using Idx = FoundationGrid::Idx; + + static auto getDefaultBlock(FoundationGrid& foundationGrid) -> Neon::index_3d const& + { + return foundationGrid.getDefaultBlock(); + } + + static auto initSpan(FoundationGrid& foundationGrid, Neon::domain::tool::SpanTable& spanTable) -> void + { + spanTable.forEachConfiguration([&](Neon::Execution execution, + Neon::SetIdx setIdx, + Neon::DataView dw, + Span& span) { + span = foundationGrid.getSpan(execution, setIdx, dw); + }); + } + + static auto initLaunchParameters(FoundationGrid& foundationGrid, + Neon::DataView dataView, + const Neon::index_3d& blockSize, + const size_t& shareMem) -> Neon::set::LaunchParameters + { + return foundationGrid.getLaunchParameters(dataView, blockSize, shareMem); + } + + static auto helpGetGridIdx(FoundationGrid&, + Neon::SetIdx const&, + FoundationGrid::Idx const& fgIdx) + -> GridTransformation::Idx + { + GridTransformation::Idx tgIdx = fgIdx; + return tgIdx; + } + + template + static auto initFieldPartition(FoundationGrid::Field& foundationField, + Neon::domain::tool::PartitionTable>& partitionTable) -> void + { + 
partitionTable.forEachConfiguration( + [&](Neon::Execution execution, + Neon::SetIdx setIdx, + Neon::DataView dw, + Partition& partition) { + auto& foundationPartition = foundationField.getPartition(execution, setIdx, dw); + partition = Partition(foundationPartition); + }); + } +}; +using BlockViewGrid = Neon::domain::tool::GridTransformer::Grid; + +} // namespace details + +} // namespace Neon::domain::details::bGrid diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewPartition.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewPartition.h new file mode 100644 index 00000000..e1d7be2b --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewPartition.h @@ -0,0 +1,42 @@ +#pragma once +#include +#include "Neon/core/core.h" +#include "Neon/core/types/Macros.h" +#include "Neon/domain/details/eGrid/eGrid.h" +#include "Neon/domain/details/eGrid/eIndex.h" +#include "Neon/domain/interface/NghData.h" +#include "Neon/set/DevSet.h" +#include "Neon/sys/memory/CudaIntrinsics.h" +#include "cuda_fp16.h" + +namespace Neon::domain::details::bGridMgpu { + +template +class BlockViewPartition : public Neon::domain::details::eGrid::ePartition +{ + public: + BlockViewPartition() + { + } + BlockViewPartition(Neon::domain::details::eGrid::ePartition ePartition) + : Neon::domain::details::eGrid::ePartition(ePartition) + { + } + + template + static auto getInBlockIdx(typename Neon::domain::details::eGrid::ePartition::Idx const& idx, + uint8_3d const& inBlockLocation) -> BlockIdexType + { + BlockIdexType blockIdx(idx.helpGet(), inBlockLocation); + return inBlockLocation; + } + + auto getCountAllocated() const -> int32_t; +}; +template +auto BlockViewPartition::getCountAllocated() const -> int32_t +{ + return this->mCountAllocated; +} +} // namespace Neon::domain::details::bGrid diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewPartition_imp.h 
b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewPartition_imp.h new file mode 100644 index 00000000..4464f686 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewPartition_imp.h @@ -0,0 +1,184 @@ +#pragma once + +#include "Neon/domain/details//eGrid/ePartition.h" + +namespace Neon::domain::details::bGridMgpu { + + +template +NEON_CUDA_HOST_DEVICE auto +ePartition::prtID() const + -> int +{ + return mPrtID; +} + +template +template +inline NEON_CUDA_HOST_DEVICE auto +ePartition::cardinality() const + -> std::enable_if_t +{ + return mCardinality; +} + +template +template +constexpr inline NEON_CUDA_HOST_DEVICE auto +ePartition::cardinality() const + -> std::enable_if_t +{ + return C; +} + +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::operator()(eIndex eId, int cardinalityIdx) const + -> T +{ + Offset jump = getOffset(eId, cardinalityIdx); + return mMem[jump]; +} + +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::operator()(eIndex eId, int cardinalityIdx) -> T& +{ + Offset jump = getOffset(eId, cardinalityIdx); + return mMem[jump]; +} + +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::getNghData(eIndex eId, + NghIdx nghIdx, + int card, + const Type& alternativeVal) + const -> NghData +{ + eIndex eIdxNgh; + const bool isValidNeighbour = isValidNgh(eId, nghIdx, eIdxNgh); + T val = (isValidNeighbour) ? 
this->operator()(eIdxNgh, card) : alternativeVal; + // printf("(prtId %d)getNghData id %d card %d eIdxNgh %d val %d\n", + // mPrtID, eId.mIdx, card, eIdxNgh.mIdx, int(val)); + return NghData(val, isValidNeighbour); +} + +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::getNghData(eIndex eId, + const Neon::int8_3d& ngh3dIdx, + int card, + const Type& alternativeVal) + const -> NghData +{ + int tablePithc = (ngh3dIdx.x + mStencilRadius) + + (ngh3dIdx.y + mStencilRadius) * mStencilTableYPitch + + (ngh3dIdx.z + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch; + NghIdx nghIdx = mStencil3dTo1dOffset[tablePithc]; + NghData res = getNghData(eId, nghIdx, card, alternativeVal); + + return res; +} + +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::isValidNgh(eIndex eId, + NghIdx nghIdx, + eIndex& neighbourIdx) const + -> bool +{ + const eIndex::Offset connectivityJumo = mCountAllocated * nghIdx + eId.get(); + neighbourIdx.set() = NEON_CUDA_CONST_LOAD((mConnectivity + connectivityJumo)); + const bool isValidNeighbour = (neighbourIdx.mIdx > -1); +// printf("(prtId %d) getNghData id %d eIdxNgh %d connectivityJumo %d\n", +// mPrtID, +// eId.mIdx, neighbourIdx.mIdx, connectivityJumo); + return isValidNeighbour; +} + +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::getGlobalIndex(eIndex eIndex) const + -> Neon::index_3d +{ + Neon::index_3d loc; + const auto baseAddr = mOrigins + eIndex.get(); + loc = mOrigins[eIndex.get()]; + return loc; +} + +template +ePartition::ePartition(int prtId, + T* mem, + ePitch pitch, + int32_t cardinality, + int32_t countAllocated, + Offset* connRaw, + Neon::index_3d* toGlobal, + int8_t* stencil3dTo1dOffset, + int32_t stencilRadius) +{ + mPrtID = prtId; + mMem = mem; + mPitch = pitch; + mCardinality = cardinality; + mCountAllocated = countAllocated; + + mConnectivity = connRaw; + mOrigins = toGlobal; + + mStencil3dTo1dOffset = stencil3dTo1dOffset; + mStencilTableYPitch = 2 * stencilRadius + 1; + + mStencilRadius = 
stencilRadius; +} + +template +NEON_CUDA_HOST_DEVICE auto +ePartition::pointer(eIndex eId, int cardinalityIdx) const + -> const Type* +{ + Offset jump = getOffset(eId, cardinalityIdx); + return mMem + jump; +} + +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::getOffset(eIndex eId, int cardinalityIdx) const + -> Offset +{ + return Offset(eId.get() * mPitch.x + cardinalityIdx * mPitch.y); +} + +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::mem() + -> T* +{ + return mMem; +} + +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::mem() const + -> const T* +{ + return mMem; +} + +} // namespace Neon::domain::details::eGrid diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/StaticBlock.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/StaticBlock.h new file mode 100644 index 00000000..7fdd4577 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/StaticBlock.h @@ -0,0 +1,106 @@ +#pragma once + +#include "Neon/domain/details/bGridDisgMgpu/bSpan.h" + +namespace Neon::domain::details::bGridMgpu { + +template +struct StaticBlock +{ + public: + constexpr static uint32_t memBlockSizeX = memBlockSizeX_; + constexpr static uint32_t memBlockSizeY = memBlockSizeY_; + constexpr static uint32_t memBlockSizeZ = memBlockSizeZ_; + constexpr static Neon::uint32_3d memBlockSize3D = Neon::uint32_3d(memBlockSizeX, memBlockSizeY, memBlockSizeZ); + + constexpr static uint32_t userBlockSizeX = userBlockSizeX_; + constexpr static uint32_t userBlockSizeY = userBlockSizeY_; + constexpr static uint32_t userBlockSizeZ = userBlockSizeZ_; + constexpr static Neon::uint32_3d userBlockSize3D = Neon::uint32_3d(userBlockSizeX, userBlockSizeY, userBlockSizeZ); + + constexpr static uint32_t blockRatioX = memBlockSizeX / userBlockSizeX; + constexpr static uint32_t blockRatioY = memBlockSizeY / userBlockSizeY; + constexpr static uint32_t blockRatioZ = memBlockSizeZ / userBlockSizeZ; + + constexpr static uint32_t memBlockPitchX = 1; + 
constexpr static uint32_t memBlockPitchY = memBlockSizeX; + constexpr static uint32_t memBlockPitchZ = memBlockSizeX * memBlockSizeY; + + constexpr static bool isMultiResMode = isMultiResMode_; + + constexpr static uint32_t memBlockCountElements = memBlockSizeX * memBlockSizeY * memBlockSizeZ; + + static_assert(memBlockSizeX >= userBlockSizeX); + static_assert(memBlockSizeY >= userBlockSizeY); + static_assert(memBlockSizeZ >= userBlockSizeZ); + + static_assert(memBlockSizeX % userBlockSizeX == 0); + static_assert(memBlockSizeY % userBlockSizeY == 0); + static_assert(memBlockSizeZ % userBlockSizeZ == 0); + + struct BitMask + { + using BitMaskWordType = uint32_t; + auto reset() -> void + { + for (BitMaskWordType i = 0; i < nWords; ++i) { + bits[i] = 0; + } + } + + auto setActive(int threadX, + int threadY, + int threadZ) -> void + { + BitMaskWordType mask; + uint32_t wordIdx; + getMaskAndWordI(threadX, threadY, threadZ, mask, wordIdx); + auto& word = bits[wordIdx]; + word = word | mask; + } + + inline auto NEON_CUDA_HOST_DEVICE isActive(int threadX, + int threadY, + int threadZ) const -> bool + { + BitMaskWordType mask; + uint32_t wordIdx; + getMaskAndWordI(threadX, threadY, threadZ, mask, wordIdx); + auto& word = bits[wordIdx]; + return (word & mask) != 0; + } + + static inline auto NEON_CUDA_HOST_DEVICE getMaskAndWordI(int threadX, + int threadY, + int threadZ, + NEON_OUT BitMaskWordType& mask, + NEON_OUT uint32_t& wordIdx) -> void + { + const uint32_t threadPitch = threadX * memBlockPitchX + + threadY * memBlockPitchY + + threadZ * memBlockPitchZ; + + // threadPitch >> log2_of_bitPerWord + // the same as: threadPitch / 2^{log2_of_bitPerWord} + wordIdx = threadPitch >> log2_of_bitPerWord; + // threadPitch & ((bitMaskWordType(bitMaskStorageBitWidth)) - 1); + // same as threadPitch % 2^{log2OfbitMaskWordSize} + const uint32_t offsetInWord = threadPitch & ((BitMaskWordType(bitPerWord)) - 1); + mask = BitMaskWordType(1) << offsetInWord; + } + + constexpr static 
BitMaskWordType nWords = (memBlockCountElements + 31) / 32; + static constexpr uint32_t log2_of_bitPerWord = 5; + static constexpr uint32_t bitPerWord = 32; + + BitMaskWordType bits[nWords]; + }; +}; + +} // namespace Neon::domain::details::bGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bField.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bField.h new file mode 100644 index 00000000..251fdbab --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bField.h @@ -0,0 +1,121 @@ +#pragma once +#include "Neon/domain/details/bGridDisgMgpu/bPartition.h" +#include "Neon/domain/interface/FieldBaseTemplate.h" +#include "Neon/set/patterns/BlasSet.h" + +#include "Neon/core/core.h" +#include "Neon/core/types/Macros.h" + +#include "Neon/set/DevSet.h" +#include "Neon/set/HuOptions.h" +#include "Neon/set/MemoryTransfer.h" + +#include "Neon/domain/interface/FieldBaseTemplate.h" +#include "Neon/domain/tools/HaloUpdateTable1DPartitioning.h" +#include "Neon/domain/tools/PartitionTable.h" +#include "bPartition.h" + +namespace Neon::domain::details::bGridMgpu { + + +template +class bField : public Neon::domain::interface::FieldBaseTemplate, + bPartition, + int> +{ + friend bGrid; + + public: + using Type = T; + using Grid = bGrid; + using Field = bField; + using Partition = bPartition; + using Idx = bIndex; + using BlockViewGrid = Neon::domain::tool::GridTransformer::Grid; + template + using BlockViewField = BlockViewGrid::template Field; + + using NghIdx = typename Partition::NghIdx; + using NghData = typename Partition::NghData; + + bField(const std::string& fieldUserName, + Neon::DataUse dataUse, + Neon::MemoryOptions memoryOptions, + const Grid& grid, + int cardinality, + T inactiveValue); + + bField(); + + virtual ~bField() = default; + + auto getPartition(Neon::Execution, + Neon::SetIdx, + const Neon::DataView& dataView) const -> const Partition& final; + + auto getPartition(Neon::Execution, + 
Neon::SetIdx, + const Neon::DataView& dataView) -> Partition& final; + + auto isInsideDomain(const Neon::index_3d& idx) const -> bool; + + + auto operator()(const Neon::index_3d& idx, + const int& cardinality) const -> T final; + + auto getReference(const Neon::index_3d& idx, + const int& cardinality) -> T& final; + + auto updateHostData(int streamId = 0) -> void final; + + auto updateDeviceData(int streamId = 0) -> void final; + + auto newHaloUpdate(Neon::set::StencilSemantic semantic, + Neon::set::TransferMode transferMode, + Neon::Execution execution) + const -> Neon::set::Container; + + auto getMemoryField() -> BlockViewGrid::Field&; + + + private: + auto getRef(const Neon::index_3d& idx, const int& cardinality) const -> T&; + + auto initHaloUpdateTable() -> void; + + + struct Data + { + Data() = default; + Data(Neon::Backend const& bk) + { + partitionTable.init(bk); + } + + enum EndPoints + { + src = 1, + dst = 0 + }; + + struct EndPointsUtils + { + static constexpr int nConfigs = 2; + }; + + std::shared_ptr grid; + BlockViewField memoryField; + int cardinality; + + // Neon::domain::tool::HaloTable1DPartitioning latticeHaloUpdateTable; + Neon::domain::tool::HaloTable1DPartitioning soaHaloUpdateTable; + // Neon::domain::tool::HaloTable1DPartitioning aosHaloUpdateTable; + Neon::domain::tool::PartitionTable partitionTable; + }; + std::shared_ptr mData; +}; + + +} // namespace Neon::domain::details::bGrid diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bField_imp.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bField_imp.h new file mode 100644 index 00000000..ead142d1 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bField_imp.h @@ -0,0 +1,349 @@ +#pragma once + +#include "Neon/domain/details/bGridDisgMgpu/bField.h" + +namespace Neon::domain::details::bGridMgpu { + +template +bField::bField() +{ + mData = std::make_shared(); +} + +template +bField::bField(const std::string& fieldUserName, + Neon::DataUse 
dataUse, + Neon::MemoryOptions memoryOptions, + const Grid& grid, + int cardinality, + T inactiveValue) + : Neon::domain::interface::FieldBaseTemplate(&grid, + fieldUserName, + "bField", + cardinality, + inactiveValue, + dataUse, + memoryOptions, + Neon::domain::haloStatus_et::e::ON) +{ + mData = std::make_shared(grid.getBackend()); + mData->grid = std::make_shared(grid); + + if (memoryOptions.getOrder() == Neon::MemoryLayout::arrayOfStructs) { + NEON_WARNING("bField does not support MemoryLayout::arrayOfStructs, enforcing MemoryLayout::structOfArrays"); + memoryOptions.setOrder(Neon::MemoryLayout::structOfArrays); + } + // the allocation size is the number of blocks x block size x cardinality + mData->memoryField = mData->grid->getBlockViewGrid().template newField( + "BitMask", + [&] { + int elPerBlock = SBlock::memBlockCountElements * cardinality; + return elPerBlock; + }(), + inactiveValue, + dataUse, + mData->grid->getBackend().getMemoryOptions(bSpan::activeMaskMemoryLayout)); + + + { // Setting up partitionTable + // const int setCardinality = mData->grid->getBackend().getDeviceCount(); + mData->partitionTable.forEachConfiguration( + [&](Neon::Execution execution, + Neon::SetIdx setIdx, + Neon::DataView, + Partition& partition) { + auto& partitioner = mData->grid->helpGetPartitioner1D(); + auto firstBup = partitioner.getSpanLayout().getBoundsBoundary(setIdx, Neon::domain::tool::partitioning::ByDirection::up).first; + auto firstBdw = partitioner.getSpanLayout().getBoundsBoundary(setIdx, Neon::domain::tool::partitioning::ByDirection::down).first; + auto firstGup = partitioner.getSpanLayout().getGhostBoundary(setIdx, Neon::domain::tool::partitioning::ByDirection::up).first; + auto firstGdw = partitioner.getSpanLayout().getGhostBoundary(setIdx, Neon::domain::tool::partitioning::ByDirection::down).first; + auto lastGdw = firstGdw + partitioner.getSpanLayout().getGhostBoundary(setIdx, Neon::domain::tool::partitioning::ByDirection::down).count; + + auto& 
memoryFieldPartition = mData->memoryField.getPartition(execution, setIdx, Neon::DataView::STANDARD); + auto& blockConnectivity = mData->grid->helpGetBlockConnectivity().getPartition(execution, setIdx, Neon::DataView::STANDARD); + auto& bitmask = mData->grid->getActiveBitMask().getPartition(execution, setIdx, Neon::DataView::STANDARD); + auto& dataBlockOrigins = mData->grid->helpGetDataBlockOriginField().getPartition(execution, setIdx, Neon::DataView::STANDARD); + + partition = bPartition(setIdx, + cardinality, + memoryFieldPartition.mem(), + blockConnectivity.mem(), + bitmask.mem(), + dataBlockOrigins.mem(), + mData->grid->helpGetStencilIdTo3dOffset().rawMem(execution, setIdx), + mData->grid->getDimension(), + firstBup, + firstBdw, + firstGup, + firstGdw, + lastGdw); + }); + } + + initHaloUpdateTable(); +} + +template +auto bField::getMemoryField() -> BlockViewGrid::Field& +{ + return mData->memoryField; +} + +template +auto bField::isInsideDomain(const Neon::index_3d& idx) const -> bool +{ + return mData->grid->isInsideDomain(idx); +} + +template +auto bField::getReference(const Neon::index_3d& cartesianIdx, + const int& cardinality) -> T& +{ + if constexpr (SBlock::isMultiResMode) { + auto& grid = this->getGrid(); + auto uniformCartesianIdx = cartesianIdx / grid.helGetMultiResDiscreteIdxSpacing(); + + if (cartesianIdx.x % grid.helGetMultiResDiscreteIdxSpacing() != 0 || + cartesianIdx.y % grid.helGetMultiResDiscreteIdxSpacing() != 0 || + cartesianIdx.z % grid.helGetMultiResDiscreteIdxSpacing() != 0) { + NeonException exp("bField::getReference"); + exp << "Input index is not multiple of the grid resolution"; + exp << "Index = " << cartesianIdx; + NEON_THROW(exp); + } + auto [setIdx, bIdx] = grid.helpGetSetIdxAndGridIdx(uniformCartesianIdx); + auto& partition = getPartition(Neon::Execution::host, setIdx, Neon::DataView::STANDARD); + auto& result = partition(bIdx, cardinality); + return result; + } else { + auto& grid = this->getGrid(); + auto [setIdx, bIdx] = 
grid.helpGetSetIdxAndGridIdx(cartesianIdx); + auto& partition = getPartition(Neon::Execution::host, setIdx, Neon::DataView::STANDARD); + auto& result = partition(bIdx, cardinality); + return result; + } +} + +template +auto bField::operator()(const Neon::index_3d& cartesianIdx, + const int& cardinality) const -> T +{ + auto& grid = this->getGrid(); + auto [setIdx, bIdx] = grid.helpGetSetIdxAndGridIdx(cartesianIdx); + if (setIdx.idx() == -1) { + return this->getOutsideValue(); + } + auto& partition = getPartition(Neon::Execution::host, setIdx, Neon::DataView::STANDARD); + auto& result = partition(bIdx, cardinality); + return result; +} + +template +auto bField::updateHostData(int streamId) -> void +{ + mData->memoryField.updateHostData(streamId); +} + +template +auto bField::updateDeviceData(int streamId) -> void +{ + mData->memoryField.updateDeviceData(streamId); +} + +template +auto bField::getPartition(Neon::Execution execution, + Neon::SetIdx setIdx, + const Neon::DataView& dataView) const -> const Partition& +{ + const Neon::DataUse dataUse = this->getDataUse(); + bool isOk = Neon::ExecutionUtils::checkCompatibility(dataUse, execution); + if (isOk) { + Partition const& result = mData->partitionTable.getPartition(execution, setIdx, dataView); + return result; + } + std::stringstream message; + message << "The requested execution mode ( " << execution << " ) is not compatible with the field DataUse (" << dataUse << ")"; + NEON_THROW_UNSUPPORTED_OPERATION(message.str()); +} + +template +auto bField::getPartition(Neon::Execution execution, + Neon::SetIdx setIdx, + const Neon::DataView& dataView) -> Partition& +{ + const Neon::DataUse dataUse = this->getDataUse(); + bool isOk = Neon::ExecutionUtils::checkCompatibility(dataUse, execution); + if (isOk) { + Partition& result = mData->partitionTable.getPartition(execution, setIdx, dataView); + return result; + } + std::stringstream message; + message << "The requested execution mode ( " << execution << " ) is not 
compatible with the field DataUse (" << dataUse << ")"; + NEON_THROW_UNSUPPORTED_OPERATION(message.str()); +} + +template +auto bField::newHaloUpdate(Neon::set::StencilSemantic stencilSemantic, + Neon::set::TransferMode transferMode, + Neon::Execution execution) const -> Neon::set::Container +{ + + + // We need to define a graph of Containers + // One for the actual memory transfer + // One for the synchronization + // The order depends on the transfer mode: put or get + Neon::set::Container dataTransferContainer; + auto const& bk = this->getGrid().getBackend(); + + if (stencilSemantic == Neon::set::StencilSemantic::standard) { + auto transfers = bk.template newDataSet>(); + + if (this->getMemoryOptions().getOrder() == Neon::MemoryLayout::structOfArrays) { + for (auto byDirection : {tool::partitioning::ByDirection::up, + tool::partitioning::ByDirection::down}) { + + auto const& tableEntryByDir = mData->soaHaloUpdateTable.get(transferMode, + execution, + byDirection); + + tableEntryByDir.forEachSeq([&](SetIdx setIdx, auto const& tableEntryByDirBySetIdx) { + transfers[setIdx].insert(std::end(transfers[setIdx]), + std::begin(tableEntryByDirBySetIdx), + std::end(tableEntryByDirBySetIdx)); + }); + } + dataTransferContainer = + Neon::set::Container::factoryDataTransfer( + *this, + transferMode, + stencilSemantic, + transfers, + execution); + + + } else { + NEON_THROW_UNSUPPORTED_OPERATION(""); + } + } else { + NEON_DEV_UNDER_CONSTRUCTION(""); + } + Neon::set::Container SyncContainer = + Neon::set::Container::factorySynchronization( + *this, + Neon::set::SynchronizationContainerType::hostOmpBarrier); + + Neon::set::container::Graph graph(this->getBackend()); + const auto& dataTransferNode = graph.addNode(dataTransferContainer); + const auto& syncNode = graph.addNode(SyncContainer); + + switch (transferMode) { + case Neon::set::TransferMode::put: + graph.addDependency(dataTransferNode, syncNode, Neon::GraphDependencyType::data); + break; + case 
Neon::set::TransferMode::get: + graph.addDependency(syncNode, dataTransferNode, Neon::GraphDependencyType::data); + break; + default: + NEON_THROW_UNSUPPORTED_OPTION(); + break; + } + + graph.removeRedundantDependencies(); + + Neon::set::Container output = + Neon::set::Container::factoryGraph("dGrid-Halo-Update", + graph, + [](Neon::SetIdx, Neon::set::Loader&) {}); + return output; +} + +template +auto bField::initHaloUpdateTable() -> void +{ + // NEON_THROW_UNSUPPORTED_OPERATION(""); + auto& grid = this->getGrid(); + auto bk = grid.getBackend(); + auto getNghSetIdx = [&](SetIdx setIdx, Neon::domain::tool::partitioning::ByDirection direction) { + int res; + if (direction == Neon::domain::tool::partitioning::ByDirection::up) { + res = (setIdx + 1) % bk.getDeviceCount(); + } else { + res = (setIdx + bk.getDeviceCount() - 1) % bk.getDeviceCount(); + } + return res; + }; + + mData->soaHaloUpdateTable.forEachPutConfiguration( + bk, [&](Neon::SetIdx setIdxSrc, + Execution execution, + Neon::domain::tool::partitioning::ByDirection byDirection, + std::vector& transfersVec) { + { + using namespace Neon::domain::tool::partitioning; + + Neon::SetIdx setIdxDst = getNghSetIdx(setIdxSrc, byDirection); + Neon::SetIdx setIdxVec[2]; + setIdxVec[Data::EndPoints::dst] = setIdxDst; + setIdxVec[Data::EndPoints::src] = setIdxSrc; + + std::array partitions; + std::array*, Data::EndPointsUtils::nConfigs> blockViewPartitions; + std::array, Data::EndPointsUtils::nConfigs> ghostZBeginIdx; + std::array, Data::EndPointsUtils::nConfigs> boundaryZBeginIdx; + std::array memPhyDim; + + partitions[Data::EndPoints::dst] = &this->getPartition(execution, setIdxDst, Neon::DataView::STANDARD); + partitions[Data::EndPoints::src] = &this->getPartition(execution, setIdxSrc, Neon::DataView::STANDARD); + blockViewPartitions[Data::EndPoints::dst] = &(mData->memoryField.getPartition(execution, setIdxDst, Neon::DataView::STANDARD)); + blockViewPartitions[Data::EndPoints::src] = 
&(mData->memoryField.getPartition(execution, setIdxSrc, Neon::DataView::STANDARD)); + + for (auto endPoint : {Data::EndPoints::dst, Data::EndPoints::src}) { + for (auto direction : {ByDirection::down, ByDirection::up}) { + auto ghostFirst = mData->grid->mData->partitioner1D.getSpanLayout().getGhostBoundary(setIdxVec[endPoint], direction).first; + auto boundaryFirst = mData->grid->mData->partitioner1D.getSpanLayout().getBoundsBoundary(setIdxVec[endPoint], direction).first; + ghostZBeginIdx[endPoint][static_cast(direction)] = ghostFirst; + boundaryZBeginIdx[endPoint][static_cast(direction)] = boundaryFirst; + } + + memPhyDim[endPoint] = Neon::size_4d( + SBlock::memBlockCountElements, + 1, + 1, + size_t(blockViewPartitions[endPoint]->getCountAllocated()) * SBlock::memBlockCountElements); + } + + if (ByDirection::up == byDirection && bk.isLastDevice(setIdxSrc)) { + return; + } + + if (ByDirection::down == byDirection && bk.isFirstDevice(setIdxSrc)) { + return; + } + + T* srcMem = blockViewPartitions[Data::EndPoints::src]->mem(); + T* dstMem = blockViewPartitions[Data::EndPoints::dst]->mem(); + + Neon::size_4d dstGhostBuff(ghostZBeginIdx[Data::EndPoints::dst][static_cast(ByDirectionUtils::invert(byDirection))], 0, 0, 0); + Neon::size_4d srcBoundaryBuff(boundaryZBeginIdx[Data::EndPoints::src][static_cast(byDirection)], 0, 0, 0); + + size_t transferDataBlockCount = mData->grid->mData->partitioner1D.getSpanLayout().getBoundsBoundary(setIdxVec[Data::EndPoints::src], byDirection).count; + + // std::cout << "To " << dstGhostBuff << " prt " << blockViewPartitions[Data::EndPoints::dst]->prtID() << " From " << srcBoundaryBuff << " prt " << blockViewPartitions[Data::EndPoints::src]->prtID() << std::endl; + // std::cout << "dst mem " << blockViewPartitions[Data::EndPoints::dst]->mem() << " " << std::endl; + // std::cout << "dst transferDataBlockCount " << transferDataBlockCount << " " << std::endl; + // std::cout << "dst pitch " << (dstGhostBuff * 
memPhyDim[Data::EndPoints::dst]).rSum() << " " << std::endl; + // std::cout << "dst dstGhostBuff " << dstGhostBuff << " " << std::endl; + // std::cout << "dst pitch all" << memPhyDim[Data::EndPoints::dst] << " " << std::endl; + + Neon::set::MemoryTransfer transfer({setIdxDst, dstMem + (dstGhostBuff * memPhyDim[Data::EndPoints::dst]).rSum(), dstGhostBuff}, + {setIdxSrc, srcMem + (srcBoundaryBuff * memPhyDim[Data::EndPoints::src]).rSum(), srcBoundaryBuff}, + sizeof(T) * SBlock::memBlockCountElements * transferDataBlockCount); + + transfersVec.push_back(transfer); + } + }); +} + + +} // namespace Neon::domain::details::bGridMgpu diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bGrid.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bGrid.h new file mode 100644 index 00000000..4af026c1 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bGrid.h @@ -0,0 +1,239 @@ +#pragma once +#include "Neon/core/core.h" + +#include "Neon/domain/aGrid.h" +#include "Neon/domain/details/bGridDisgMgpu/BlockView.h" +#include "Neon/domain/details/bGridDisgMgpu/StaticBlock.h" +#include "Neon/domain/details/bGridDisgMgpu/bField.h" +#include "Neon/domain/details/bGridDisgMgpu/bIndex.h" +#include "Neon/domain/details/bGridDisgMgpu/bPartition.h" +#include "Neon/domain/details/bGridDisgMgpu/bSpan.h" +#include "Neon/domain/interface/GridBaseTemplate.h" +#include "Neon/domain/patterns/PatternScalar.h" +#include "Neon/domain/tools/Partitioner1D.h" +#include "Neon/domain/tools/PointHashTable.h" +#include "Neon/domain/tools/SpanTable.h" +#include "Neon/set/Containter.h" +#include "Neon/set/LaunchParametersTable.h" +#include "Neon/set/memory/memSet.h" + +#include "bField.h" +#include "bPartition.h" +#include "bSpan.h" + +namespace Neon::domain::details::bGridMgpu { + + +template +class bField; + +template +class bGrid : public Neon::domain::interface::GridBaseTemplate, + bIndex> +{ + public: + using Grid = bGrid; + template + using Partition = 
bPartition; + template + using Field = Neon::domain::details::bGridMgpu::bField; + + using Span = bSpan; + using NghIdx = typename Partition::NghIdx; + using GridBaseTemplate = Neon::domain::interface::GridBaseTemplate>; + + using Idx = bIndex; + static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = Neon::set::details::ExecutionThreadSpan::d1b3; + using ExecutionThreadSpanIndexType = uint32_t; + + using BlockIdx = uint32_t; + + bGrid() = default; + virtual ~bGrid(); + + /** + * Constructor for the vanilla block data structure with depth of 1 + */ + template + bGrid(const Neon::Backend& backend, + const Neon::int32_3d& domainSize, + const ActiveCellLambda activeCellLambda, + const Neon::domain::Stencil& stencil, + const double_3d& spacingData = double_3d(1, 1, 1), + const double_3d& origin = double_3d(0, 0, 0), + Neon::domain::tool::spaceCurves::EncoderType encoderType = Neon::domain::tool::spaceCurves::EncoderType::sweep); + + + /** + * Constructor for bGrid. This constructor should be directly used only by mGrid + */ + template + bGrid(const Neon::Backend& backend /**< Neon backend for the computation */, + const Neon::int32_3d& domainSize /**< Size of the bounded Cartesian */, + const ActiveCellLambda activeCellLambda /**< Function that identify the user domain inside the boxed Cartesian discretization */, + const Neon::domain::Stencil& stencil /**< union of tall the stencil that will be used in the computation */, + const int multiResDiscreteIdxSpacing /**< Parameter for the multi-resolution. Index i and index (i+1) may be remapped as i*voxelSpacing and (i+1)* voxelSpacing. 
+ * For a uniform bGrid, i.e outside the context of multi-resolution this parameter is always 1 */, + const double_3d& spacingData /** Physical spacing between two consecutive data points in the Cartesian domain */, + const double_3d& origin /** Physical location in space of the origin of the Cartesian discretization */, + Neon::domain::tool::spaceCurves::EncoderType encoderType = Neon::domain::tool::spaceCurves::EncoderType::sweep); + + /** + * Returns some properties for a given cartesian in the Cartesian domain. + * The provide index my be inside or outside the user defined bounded Cartesian domain + */ + auto getProperties(const Neon::index_3d& idx) + const -> typename GridBaseTemplate::CellProperties final; + + /** + * Returns true if the query 3D point is inside the user domain + * @param idx + * @return + */ + auto isInsideDomain(const Neon::index_3d& idx) + const -> bool final; + + /** + * Retrieves the device index that contains the query point + * @param idx + * @return + */ + auto getSetIdx(const Neon::index_3d& idx) + const -> int32_t final; + + /** + * Allocates a new field on the grid + */ + template + auto newField(const std::string name, + int cardinality, + T inactiveValue, + Neon::DataUse dataUse = Neon::DataUse::HOST_DEVICE, + Neon::MemoryOptions memoryOptions = Neon::MemoryOptions()) const + -> Field; + + /** + * Allocates a new field on the block view grid + */ + template + auto newBlockViewField(const std::string name, + int cardinality, + T inactiveValue, + Neon::DataUse dataUse = Neon::DataUse::HOST_DEVICE, + Neon::MemoryOptions memoryOptions = Neon::MemoryOptions()) const + -> BlockView::Field; + + /** + * Allocates a new container to execute some computation in the grid + */ + template + auto newContainer(const std::string& name, + index_3d blockSize, + size_t sharedMem, + LoadingLambda lambda) const -> Neon::set::Container; + + /** + * Allocates a new container to execute some computation in the grid + */ + template + auto 
newContainer(const std::string& name, + LoadingLambda lambda) const -> Neon::set::Container; + + /** + * Defines a new set of parameter to launch a Container + */ + auto getLaunchParameters(Neon::DataView dataView, + const Neon::index_3d& blockSize, + const size_t& sharedMem) const -> Neon::set::LaunchParameters; + + /** + * Retrieve the span associated to the grid w.r.t. some user defined parameters. + */ + auto getSpan(Neon::Execution execution, + SetIdx setIdx, + Neon::DataView dataView) -> const Span&; + + /** + * Retrieve the block vew grid internally used. + * This grid can be leverage to allocate data at the block level. + */ + auto getBlockViewGrid() const -> BlockView::Grid&; + + + /** + * Retrieve the block vew grid internally used. + * This grid can be leverage to allocate data at the block level. + */ + auto getActiveBitMask() const -> BlockView::Field&; + + /** + * Helper function to retrieve the discrete index spacing used for the multi-resolution + */ + template + auto helGetMultiResDiscreteIdxSpacing() const -> std::enable_if_t; + + + /** + * Help function to retrieve the block connectivity as a BlockViewGrid field + */ + auto helpGetBlockConnectivity() const -> BlockView::Field&; + + /** + * Help function to retrieve the block origin as a BlockViewGrid field + */ + auto helpGetDataBlockOriginField() const -> Neon::aGrid::Field&; + + /** + * Help function to retrieve the map that converts a stencil point id to 3d offset + */ + auto helpGetStencilIdTo3dOffset() const -> Neon::set::MemSet&; + + auto helpGetPartitioner1D() -> Neon::domain::tool::Partitioner1D&; + + /** + * Help function retriev the device and the block index associated to a point in the BlockViewGrid grid + */ + auto helpGetSetIdxAndGridIdx(Neon::index_3d idx) const -> std::tuple; + + struct Data + { + auto init(const Neon::Backend& bk) + { + spanTable.init(bk); + launchParametersTable.init(bk); + } + + Neon::domain::tool::SpanTable spanTable /** Span for each data view configurations 
*/; + Neon::set::LaunchParametersTable launchParametersTable; + + Neon::domain::tool::Partitioner1D partitioner1D; + Stencil stencil; + Neon::sys::patterns::Engine reduceEngine; + + Neon::aGrid memoryGrid /** memory allocator for fields */; + Neon::aGrid::Field mDataBlockOriginField; + Neon::set::MemSet mStencil3dTo1dOffset; + + BlockView::Grid blockViewGrid; + BlockView::Field activeBitField; + BlockView::Field blockConnectivity; + Neon::set::MemSet stencilIdTo3dOffset; + + int mMultiResDiscreteIdxSpacing; + + // number of active voxels in each block + Neon::set::DataSet mNumActiveVoxel; + + + // Stencil neighbor indices + Neon::set::MemSet mStencilNghIndex; + }; + std::shared_ptr mData; +}; +extern template class bGrid>; +} // namespace Neon::domain::details::bGrid + +#include "bField_imp.h" +#include "bGrid_imp.h" diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bGrid_imp.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bGrid_imp.h new file mode 100644 index 00000000..89614532 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bGrid_imp.h @@ -0,0 +1,470 @@ +#include "Neon/domain/details/bGridDisgMgpu/bGrid.h" +#include "Neon/domain/tools/SpaceCurves.h" + +namespace Neon::domain::details::bGridMgpu { + +template +template +bGrid::bGrid(const Neon::Backend& backend, + const Neon::int32_3d& domainSize, + const ActiveCellLambda activeCellLambda, + const Neon::domain::Stencil& stencil, + const double_3d& spacingData, + const double_3d& origin, + Neon::domain::tool::spaceCurves::EncoderType encoderType) + : bGrid(backend, domainSize, activeCellLambda, stencil, 1, spacingData, origin, encoderType) +{ +} + +template +template +bGrid::bGrid(const Neon::Backend& backend, + const Neon::int32_3d& domainSize, + const ActiveCellLambda activeCellLambda, + const Neon::domain::Stencil& stencil, + const int multiResDiscreteIdxSpacing, + const double_3d& spacingData, + const double_3d& origin, + 
Neon::domain::tool::spaceCurves::EncoderType encoderType) +{ + + + mData = std::make_shared(); + mData->init(backend); + + mData->mMultiResDiscreteIdxSpacing = multiResDiscreteIdxSpacing; + mData->stencil = stencil; + const index_3d defaultKernelBlockSize(SBlock::memBlockSizeX, + SBlock::memBlockSizeY, + SBlock::memBlockSizeZ); + + std::stringstream gridName; + gridName << "bGrid_" << SBlock::memBlockSizeX << "_" + << SBlock::memBlockSizeY << "_" + << SBlock::memBlockSizeZ; + { + auto nElementsPerPartition = backend.devSet().template newDataSet(0); + // We do an initialization with nElementsPerPartition to zero, + // then we reset to the computed number. + + bGrid::GridBase::init(gridName.str(), + backend, + domainSize, + stencil, + nElementsPerPartition, + defaultKernelBlockSize, + multiResDiscreteIdxSpacing, + origin, + encoderType, + defaultKernelBlockSize); + } + + { // Initialization of the partitioner + + mData->partitioner1D = Neon::domain::tool::Partitioner1D( + backend, + activeCellLambda, + nullptr, + SBlock::memBlockSize3D.template newType(), + domainSize, + Neon::domain::Stencil::s27_t(false), + encoderType, + multiResDiscreteIdxSpacing); + + mData->mDataBlockOriginField = mData->partitioner1D.getGlobalMapping(); + mData->mStencil3dTo1dOffset = mData->partitioner1D.getStencil3dTo1dOffset(); + mData->memoryGrid = mData->partitioner1D.getMemoryGrid(); + } + + { // BlockViewGrid + Neon::domain::details::eGrid::eGrid egrid( + backend, + mData->partitioner1D.getBlockSpan(), + mData->partitioner1D, + Neon::domain::Stencil::s27_t(false), + spacingData * SBlock::memBlockSize3D, + origin); + + mData->blockViewGrid = BlockView::Grid(egrid); + } + + { // Active bitmask + mData->activeBitField = mData->blockViewGrid.template newField( + "BlockViewBitMask", + 1, + [] { + typename SBlock::BitMask outsideBitMask; + outsideBitMask.reset(); + return outsideBitMask; + }(), + Neon::DataUse::HOST_DEVICE, backend.getMemoryOptions(BlockView::layout)); + + 
mData->mNumActiveVoxel = backend.devSet().template newDataSet(); + + mData->activeBitField + .getGrid() + .template newContainer( + "activeBitMaskInit", + [&, this](Neon::set::Loader& loader) { + auto bitMaskPartition = loader.load(mData->activeBitField); + return [&, bitMaskPartition](const auto& bitMaskIdx) mutable { + auto prtIdx = bitMaskPartition.prtID(); + int countActive = 0; + auto const blockOrigin = bitMaskPartition.getGlobalIndex(bitMaskIdx); + typename SBlock::BitMask& bitMask = bitMaskPartition(bitMaskIdx, 0); + bitMask.reset(); + + for (int k = 0; k < SBlock::memBlockSize3D.template newType().z; k++) { + for (int j = 0; j < SBlock::memBlockSize3D.template newType().y; j++) { + for (int i = 0; i < SBlock::memBlockSize3D.template newType().x; i++) { + auto globalPosition = blockOrigin + Neon::int32_3d(i * this->mData->mMultiResDiscreteIdxSpacing, + j * this->mData->mMultiResDiscreteIdxSpacing, + k * this->mData->mMultiResDiscreteIdxSpacing); + bool const isInDomain = globalPosition < domainSize * this->mData->mMultiResDiscreteIdxSpacing; + bool const isActive = activeCellLambda(globalPosition); + if (isActive && isInDomain) { + countActive++; + bitMask.setActive(i, j, k); + } + } + } + } +#pragma omp critical + { + this->mData->mNumActiveVoxel[prtIdx] += countActive; + } + }; + }) + .run(Neon::Backend::mainStreamIdx); + + + mData->activeBitField.updateDeviceData(Neon::Backend::mainStreamIdx); + this->getBackend().sync(Neon::Backend::mainStreamIdx); + mData->activeBitField.newHaloUpdate(Neon::set::StencilSemantic::standard, + Neon::set::TransferMode::put, + Neon::Execution::device) + .run(Neon::Backend::mainStreamIdx); + } + + + { // Neighbor blocks + mData->blockConnectivity = mData->blockViewGrid.template newField("blockConnectivity", + 27, + Span::getInvalidBlockId(), + Neon::DataUse::HOST_DEVICE, + Neon::MemoryLayout::arrayOfStructs); + + mData->blockConnectivity.getGrid().template newContainer( + "blockConnectivityInit", + [&](Neon::set::Loader& 
loader) { + auto blockConnectivity = loader.load(mData->blockConnectivity); + return [&, blockConnectivity](auto const& idx) mutable { + for (int8_t k = 0; k < 3; k++) { + for (int8_t j = 0; j < 3; j++) { + for (int8_t i = 0; i < 3; i++) { + auto targetDirection = i + 3 * j + 3 * 3 * k; + BlockIdx blockNghIdx = Span::getInvalidBlockId(); + typename decltype(blockConnectivity)::Idx nghIdx; + Neon::int8_3d stencilPoint(i - int8_t(1), + j - int8_t(1), + k - int8_t(1)); + bool isValid = blockConnectivity.getNghIndex(idx, stencilPoint, nghIdx); + if (isValid) { + blockNghIdx = static_cast(nghIdx.helpGet()); + } + blockConnectivity(idx, targetDirection) = blockNghIdx; + } + } + } + }; + }) + .run(Neon::Backend::mainStreamIdx); + mData->blockConnectivity.updateDeviceData(Neon::Backend::mainStreamIdx); + } + + // Initialization of the SPAN table + mData->spanTable.forEachConfiguration([&](Neon::Execution execution, + Neon::SetIdx setIdx, + Neon::DataView dw, + Span& span) { + span.mDataView = dw; + switch (dw) { + case Neon::DataView::STANDARD: { + span.mFirstDataBlockOffset = 0; + span.mDataView = dw; + span.mActiveMask = mData->activeBitField.getPartition(execution, setIdx, dw).mem(); + break; + } + case Neon::DataView::BOUNDARY: { + span.mFirstDataBlockOffset = mData->partitioner1D.getSpanClassifier().countInternal(setIdx); + span.mDataView = dw; + span.mActiveMask = mData->activeBitField.getPartition(execution, setIdx, dw).mem(); + + break; + } + case Neon::DataView::INTERNAL: { + span.mFirstDataBlockOffset = 0; + span.mDataView = dw; + span.mActiveMask = mData->activeBitField.getPartition(execution, setIdx, dw).mem(); + break; + } + default: { + NeonException exc("dFieldDev"); + NEON_THROW(exc); + } + } + }); + + { // Stencil Idx to 3d offset + auto nPoints = backend.devSet().newDataSet(stencil.nNeighbours()); + mData->stencilIdTo3dOffset = backend.devSet().template newMemSet(Neon::DataUse::HOST_DEVICE, + 1, + backend.getMemoryOptions(), + nPoints); + for (int i = 0; 
i < stencil.nNeighbours(); ++i) { + for (int devIdx = 0; devIdx < backend.devSet().setCardinality(); devIdx++) { + index_3d pLong = stencil.neighbours()[i]; + Neon::int8_3d pShort = pLong.newType(); + mData->stencilIdTo3dOffset.eRef(devIdx, i) = pShort; + } + } + mData->stencilIdTo3dOffset.updateDeviceData(backend, Neon::Backend::mainStreamIdx); + } + // Init the base grid + bGrid::GridBase::init(gridName.str(), + backend, + domainSize, + Neon::domain::Stencil(), + mData->mNumActiveVoxel, + SBlock::memBlockSize3D.template newType(), + spacingData, + origin, + encoderType, + defaultKernelBlockSize); + { // setting launchParameters + mData->launchParametersTable.forEachSeq([&](Neon::DataView dw, + Neon::set::LaunchParameters& bLaunchParameters) { + auto defEGridBlock = mData->blockViewGrid.getDefaultBlock(); + auto eGridParams = mData->blockViewGrid.getLaunchParameters(dw, defEGridBlock, 0); + eGridParams.forEachSeq([&](Neon::SetIdx setIdx, Neon::sys::GpuLaunchInfo const& launchSingleDev) { + auto eDomainGridSize = launchSingleDev.domainGrid(); + assert(eDomainGridSize.y == 1); + assert(eDomainGridSize.z == 1); + int nBlocks = static_cast(eDomainGridSize.x); + bLaunchParameters.get(setIdx).set(Neon::sys::GpuLaunchInfo::mode_e::cudaGridMode, + nBlocks, SBlock::memBlockSize3D.template newType(), 0); + }); + }); + } +} + +template +template +auto bGrid::newField(const std::string name, + int cardinality, + T inactiveValue, + Neon::DataUse dataUse, + Neon::MemoryOptions memoryOptions) const -> Field +{ + memoryOptions = this->getDevSet().sanitizeMemoryOption(memoryOptions); + Field field(name, dataUse, memoryOptions, *this, cardinality, inactiveValue); + + return field; +} + +template +template +auto bGrid::newBlockViewField(const std::string name, + int cardinality, + T inactiveValue, + Neon::DataUse dataUse, + Neon::MemoryOptions memoryOptions) const -> BlockView::Field +{ + memoryOptions = this->getDevSet().sanitizeMemoryOption(memoryOptions); + BlockView::Field 
blockViewField = mData->blockViewGrid.template newField(name, cardinality, inactiveValue, dataUse, memoryOptions); + return blockViewField; +} + +template +template +auto bGrid::newContainer(const std::string& name, + index_3d blockSize, + size_t sharedMem, + LoadingLambda lambda) const -> Neon::set::Container +{ + Neon::set::Container kContainer = Neon::set::Container::factory(name, + Neon::set::internal::ContainerAPI::DataViewSupport::on, + *this, + lambda, + blockSize, + [sharedMem](const Neon::index_3d&) { return sharedMem; }); + return kContainer; +} + +template +template +auto bGrid::newContainer(const std::string& name, + LoadingLambda lambda) const -> Neon::set::Container +{ + const Neon::index_3d& defaultBlockSize = this->getDefaultBlock(); + Neon::set::Container kContainer = Neon::set::Container::factory(name, + Neon::set::internal::ContainerAPI::DataViewSupport::on, + *this, + lambda, + defaultBlockSize, + [](const Neon::index_3d&) { return 0; }); + return kContainer; +} + +template +auto bGrid:: + getBlockViewGrid() + const -> BlockView::Grid& +{ + return mData->blockViewGrid; +} + +template +auto bGrid:: + getActiveBitMask() + const -> BlockView::Field& +{ + return mData->activeBitField; +} + +/** + * Helper function to retrieve the discrete index spacing used for the multi-resolution + */ +template +template +auto bGrid::helGetMultiResDiscreteIdxSpacing() const + -> std::enable_if_t +{ + return mData->mMultiResDiscreteIdxSpacing; +} + +template +auto bGrid:: + helpGetBlockConnectivity() + const -> BlockView::Field& +{ + return mData->blockConnectivity; +} +template +auto bGrid:: + helpGetDataBlockOriginField() + const -> Neon::aGrid::Field& +{ + return mData->mDataBlockOriginField; +} +template +auto bGrid::getSpan(Neon::Execution execution, + SetIdx setIdx, + Neon::DataView dataView) -> const bGrid::Span& +{ + return mData->spanTable.getSpan(execution, setIdx, dataView); +} + +template +bGrid::~bGrid() +{ +} +template +auto bGrid::getSetIdx(const 
index_3d& idx) const -> int32_t +{ + typename GridBaseTemplate::CellProperties cellProperties; + + cellProperties.setIsInside(this->isInsideDomain(idx)); + if (!cellProperties.isInside()) { + return -1; + } + Neon::SetIdx setIdx = cellProperties.getSetIdx(); + return setIdx; +} +template +auto bGrid::getLaunchParameters(Neon::DataView dataView, + const index_3d&, + const size_t& sharedMem) const -> Neon::set::LaunchParameters +{ + auto res = mData->launchParametersTable.get(dataView); + res.forEachSeq([&](SetIdx const& /*setIdx*/, + Neon::set::LaunchParameters::launchInfo_e& launchParams) -> void { + launchParams.setShm(sharedMem); + }); + return res; +} + +template +auto bGrid:: + helpGetStencilIdTo3dOffset() + const -> Neon::set::MemSet& +{ + return mData->stencilIdTo3dOffset; +} + +template +auto bGrid::isInsideDomain(const index_3d& idx) const -> bool +{ + // 1. check if the block is active + const BlockView::index_3d blockIdx3d = idx / (SBlock::memBlockSize3D.template newType() * mData->mMultiResDiscreteIdxSpacing); + auto blockProperties = mData->blockViewGrid.getProperties(blockIdx3d); + + if (!blockProperties.isInside()) { + return false; + } + // 2. 
The block is active, check the element in the block + typename SBlock::BitMask const& bitMask = mData->activeBitField.getReference(blockIdx3d, 0); + + bool isActive = bitMask.isActive((idx.x / mData->mMultiResDiscreteIdxSpacing) % SBlock::memBlockSize3D.x, + (idx.y / mData->mMultiResDiscreteIdxSpacing) % SBlock::memBlockSize3D.y, + (idx.z / mData->mMultiResDiscreteIdxSpacing) % SBlock::memBlockSize3D.z); + return isActive; +} + +template +auto bGrid::getProperties(const index_3d& idx) + const -> typename GridBaseTemplate::CellProperties +{ + typename GridBaseTemplate::CellProperties cellProperties; + + cellProperties.setIsInside(this->isInsideDomain(idx)); + if (!cellProperties.isInside()) { + return cellProperties; + } + + if (this->getDevSet().setCardinality() == 1) { + cellProperties.init(0, DataView::INTERNAL); + } else { + const index_3d blockIdx3d = idx / SBlock::memBlockSize3D.template newType(); + auto blockViewProperty = mData->blockViewGrid.getProperties(blockIdx3d); + + cellProperties.init(blockViewProperty.getSetIdx(), + blockViewProperty.getDataView()); + } + return cellProperties; +} + +template +auto bGrid::helpGetSetIdxAndGridIdx(Neon::index_3d idx) + const -> std::tuple +{ + const index_3d blockIdx3d = idx / (SBlock::memBlockSize3D.template newType() * mData->mMultiResDiscreteIdxSpacing); + auto [setIdx, bvGridIdx] = mData->blockViewGrid.helpGetSetIdxAndGridIdx(blockIdx3d); + Idx bIdx; + bIdx.mDataBlockIdx = bvGridIdx.helpGet(); + bIdx.mInDataBlockIdx.x = static_cast((idx.x / mData->mMultiResDiscreteIdxSpacing) % SBlock::memBlockSize3D.x); + bIdx.mInDataBlockIdx.y = static_cast((idx.y / mData->mMultiResDiscreteIdxSpacing) % SBlock::memBlockSize3D.y); + bIdx.mInDataBlockIdx.z = static_cast((idx.z / mData->mMultiResDiscreteIdxSpacing) % SBlock::memBlockSize3D.z); + + return {setIdx, bIdx}; +} + +template +auto bGrid::helpGetPartitioner1D() -> Neon::domain::tool::Partitioner1D& +{ + return mData->partitioner1D; +} + +} // namespace 
Neon::domain::details::bGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bIndex.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bIndex.h new file mode 100644 index 00000000..0452cfc9 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bIndex.h @@ -0,0 +1,142 @@ +#pragma once + +#include "Neon/core/core.h" + + +namespace Neon::domain::details::bGridMgpu { + +// Common forward declarations +template +class bGrid; +template +class bSpan; +template +class bPartition; + +class MicroIndex +{ + public: + using TrayIdx = int32_t; + using InTrayIdx = int8_3d; + + NEON_CUDA_HOST_DEVICE inline explicit MicroIndex() + : MicroIndex(0, 0, 0, 0) + { + } + + NEON_CUDA_HOST_DEVICE inline explicit MicroIndex(const TrayIdx& blockIdx, + const InTrayIdx::Integer& x, + const InTrayIdx::Integer& y, + const InTrayIdx::Integer& z) + { + mTrayBlockIdx = blockIdx; + mInTrayBlockIdx.x = x; + mInTrayBlockIdx.y = y; + mInTrayBlockIdx.z = z; + } + + NEON_CUDA_HOST_DEVICE inline auto getInTrayBlockIdx() const -> InTrayIdx const& + { + return mInTrayBlockIdx; + } + + NEON_CUDA_HOST_DEVICE inline auto getTrayBlockIdx() const -> TrayIdx const& + { + return mTrayBlockIdx; + } + + NEON_CUDA_HOST_DEVICE inline auto setInTrayBlockIdx(InTrayIdx const& inTrayIdx) -> void + { + mInTrayBlockIdx = inTrayIdx; + } + + NEON_CUDA_HOST_DEVICE inline auto setTrayBlockIdx(TrayIdx const& trayIdx) -> void + { + mTrayBlockIdx = trayIdx; + } + + InTrayIdx mInTrayBlockIdx; + TrayIdx mTrayBlockIdx{}; +}; + +template +class bIndex +{ + public: + template + friend class bSpan; + using OuterIdx = bIndex; + + using NghIdx = int8_3d; + template + friend class bPartition; + + template + friend class bField; + + template + friend class bSpan; + template + friend class bGrid; + + + using TrayIdx = MicroIndex::TrayIdx; + using InTrayIdx = MicroIndex::InTrayIdx; + + using DataBlockCount = std::make_unsigned_t; + using DataBlockIdx = 
std::make_unsigned_t; + using InDataBlockIdx = InTrayIdx; + + bIndex() = default; + ~bIndex() = default; + + NEON_CUDA_HOST_DEVICE inline explicit bIndex(const DataBlockIdx& blockIdx, + const InDataBlockIdx::Integer& x, + const InDataBlockIdx::Integer& y, + const InDataBlockIdx::Integer& z); + + NEON_CUDA_HOST_DEVICE inline auto getMicroIndex() -> MicroIndex; + NEON_CUDA_HOST_DEVICE inline auto init(MicroIndex const&) -> void; + + NEON_CUDA_HOST_DEVICE inline auto getInDataBlockIdx() const -> InDataBlockIdx const&; + NEON_CUDA_HOST_DEVICE inline auto getDataBlockIdx() const -> DataBlockIdx const&; + NEON_CUDA_HOST_DEVICE inline auto setInDataBlockIdx(InDataBlockIdx const&) -> void; + NEON_CUDA_HOST_DEVICE inline auto setDataBlockIdx(DataBlockIdx const&) -> void; + NEON_CUDA_HOST_DEVICE inline auto isActive() const -> bool; + // the local index within the block + InDataBlockIdx mInDataBlockIdx; + DataBlockIdx mDataBlockIdx{}; +}; + +template +NEON_CUDA_HOST_DEVICE auto bIndex::setDataBlockIdx(const bIndex::DataBlockIdx& dataBlockIdx) -> void +{ + mDataBlockIdx = dataBlockIdx; +} + +template +NEON_CUDA_HOST_DEVICE auto bIndex::setInDataBlockIdx(const bIndex::InDataBlockIdx& inDataBlockIdx) -> void +{ + mInDataBlockIdx = inDataBlockIdx; +} + +template +NEON_CUDA_HOST_DEVICE auto bIndex::getDataBlockIdx() const -> const bIndex::DataBlockIdx& +{ + return mDataBlockIdx; +} +template +NEON_CUDA_HOST_DEVICE auto bIndex::getInDataBlockIdx() const -> const bIndex::InDataBlockIdx& +{ + return mInDataBlockIdx; +} + +template +NEON_CUDA_HOST_DEVICE auto bIndex::isActive() const -> bool +{ + return mDataBlockIdx != std::numeric_limits::max(); +} + +} // namespace Neon::domain::details::bGrid + +#include "Neon/domain/details/bGrid/bIndex_imp.h" diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bIndex_imp.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bIndex_imp.h new file mode 100644 index 00000000..9da43b62 --- /dev/null +++ 
b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bIndex_imp.h @@ -0,0 +1,67 @@ +#pragma once +#include "Neon/domain/details/bGridDisgMgpu/bIndex.h" + +namespace Neon::domain::details::bGridMgpu { + +template +NEON_CUDA_HOST_DEVICE inline bIndex:: + bIndex(const DataBlockIdx& blockIdx, + const InDataBlockIdx::Integer& x, + const InDataBlockIdx::Integer& y, + const InDataBlockIdx::Integer& z) +{ + mDataBlockIdx = blockIdx; + mInDataBlockIdx.x = x; + mInDataBlockIdx.y = y; + mInDataBlockIdx.z = z; +} + + +template +NEON_CUDA_HOST_DEVICE inline auto bIndex::getMicroIndex() -> MicroIndex +{ + + + TrayIdx const exBlockOffset = mDataBlockIdx * (SBlock::blockRatioX * SBlock::blockRatioY * SBlock::blockRatioZ); + TrayIdx const exTrayOffset = [&] { + TrayIdx const trayBlockIdxX = mInDataBlockIdx.x / SBlock::userBlockSizeX; + TrayIdx const trayBlockIdxY = mInDataBlockIdx.y / SBlock::userBlockSizeY; + TrayIdx const trayBlockIdxZ = mInDataBlockIdx.z / SBlock::userBlockSizeZ; + + TrayIdx const res = trayBlockIdxX + trayBlockIdxY * SBlock::blockRatioX + + trayBlockIdxZ * (SBlock::blockRatioX * SBlock::blockRatioY); + return res; + }(); + MicroIndex res; + res.setTrayBlockIdx(exBlockOffset + exTrayOffset); + res.setInTrayBlockIdx({static_cast(mInDataBlockIdx.x % SBlock::userBlockSizeX), + static_cast(mInDataBlockIdx.y % SBlock::userBlockSizeY), + static_cast(mInDataBlockIdx.z % SBlock::userBlockSizeZ)}); + return res; +} + + +template +NEON_CUDA_HOST_DEVICE inline auto bIndex::init(MicroIndex const& microIndex) -> void +{ + constexpr uint32_t memBlockSize = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ; + constexpr uint32_t userBlockSize = SBlock::userBlockSizeX * SBlock::userBlockSizeY * SBlock::userBlockSizeZ; + constexpr uint32_t blockRatioSize = memBlockSize / userBlockSize; + + constexpr uint32_t blockRatioX = SBlock::memBlockSizeX / SBlock::userBlockSizeX; + constexpr uint32_t blockRatioY = SBlock::memBlockSizeY / SBlock::userBlockSizeY; + + 
mDataBlockIdx = microIndex.getTrayBlockIdx() / (blockRatioSize); + + uint32_t reminder = microIndex.getTrayBlockIdx() % (blockRatioSize); + + const uint32_t reminderInZ = reminder / (blockRatioX * blockRatioY); + mInDataBlockIdx.z = static_cast(microIndex.getInTrayBlockIdx().z + reminderInZ * SBlock::userBlockSizeZ); + reminder = reminder % (blockRatioX * blockRatioY); + const uint32_t reminderInY = reminder / (blockRatioX); + mInDataBlockIdx.y = static_cast(microIndex.getInTrayBlockIdx().y + reminderInY * SBlock::userBlockSizeY); + const uint32_t reminderInX = reminder % blockRatioX; + mInDataBlockIdx.x = static_cast(microIndex.getInTrayBlockIdx().x + reminderInX * SBlock::userBlockSizeX); +} + +} // namespace Neon::domain::details::bGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bPartition.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bPartition.h new file mode 100644 index 00000000..647a12d7 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bPartition.h @@ -0,0 +1,215 @@ +#pragma once + +#include "Neon/domain/details/bGridDisgMgpu//bIndex.h" +#include "Neon/domain/details/bGridDisgMgpu//bSpan.h" + +#include "Neon/domain/interface/NghData.h" + +#include "Neon/sys/memory/CUDASharedMemoryUtil.h" + +namespace Neon::domain::details::bGridMgpu { + +template +class bSpan; + +template +class bPartition +{ + enum Sectors + { + bUp = 0, + bDw = 1, + gUp = 2, + gDw = 3, + after = 4, + first = bUp + }; + + public: + using Span = bSpan; + using Idx = bIndex; + using NghIdx = typename Idx::NghIdx; + using Type = T; + using NghData = Neon::domain::NghData; + + using BlockViewGrid = Neon::domain::tool::GridTransformer::Grid; + using BlockViewGridIdx = BlockViewGrid::Idx; + + public: + bPartition(); + + ~bPartition() = default; + + explicit bPartition(int setIdx, + int mCardinality, + T* mMem, + typename Idx::DataBlockIdx* mBlockConnectivity, + typename SBlock::BitMask const* 
NEON_RESTRICT mMask, + Neon::int32_3d* mOrigin, + NghIdx* mStencilNghIndex, + Neon::int32_3d mDomainSize, + typename Idx::DataBlockCount mFirstDataBUP, + typename Idx::DataBlockCount mFirstDataBDW, + typename Idx::DataBlockCount mFirstDataGUP, + typename Idx::DataBlockCount mFirstDataGDW, + typename Idx::DataBlockCount mLastDataGDW); + + /** + * Retrieve the cardinality of the field. + */ + inline NEON_CUDA_HOST_DEVICE auto + cardinality() + const -> int; + + /** + * Gets the field metadata at a cartesian point. + */ + inline NEON_CUDA_HOST_DEVICE auto + operator()(const Idx& cell, + int card) + -> T&; + + /** + * Gets the field metadata at a cartesian point. + */ + inline NEON_CUDA_HOST_DEVICE auto + operator()(const Idx& cell, + int card) + const -> const T&; + + /** + * Gets the field metadata at a neighbour cartesian point. + */ + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& cell, + const NghIdx& offset, + const int card) + const -> NghData; + + /** + * Gets the field metadata at a neighbour cartesian point. + */ + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& eId, + uint8_t nghID, + int card) + const -> NghData; + + /** + * Gets the field metadata at a neighbour cartesian point. + */ + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& eId, + int card) + const -> NghData; + + /** + * Gets the field metadata at a neighbour cartesian point. + */ + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& eId, + int card, + T defaultValue) + const -> NghData; + + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& gidx, + int card, + LambdaVALID funIfValid, + LambdaNOTValid funIfNOTValid = nullptr) + const -> std::enable_if_t && (std::is_invocable_v || std::is_same_v), void>; + + template + NEON_CUDA_HOST_DEVICE inline auto + writeNghData(const Idx& gidx, + int card, + T value) + -> bool; + + /** + * Gets the global coordinates of the cartesian point. 
+ */ + NEON_CUDA_HOST_DEVICE inline auto + getGlobalIndex(const Idx& cell) + const -> Neon::index_3d; + + NEON_CUDA_HOST_DEVICE inline auto + isActive(const Idx& cell, + const typename SBlock::BitMask* mask = nullptr) const -> bool; + + + NEON_CUDA_HOST_DEVICE inline auto + getDomainSize() + const -> Neon::index_3d; + + NEON_CUDA_HOST_DEVICE + auto mem() const -> T const*; + + /** + * Gets the Idx for in the block view space. + */ + NEON_CUDA_HOST_DEVICE inline auto + getBlockViewIdx(const Idx& cell) + const -> BlockViewGridIdx; + + + NEON_CUDA_HOST_DEVICE inline auto + helpGetPitch(const Idx& cell, int card) + const -> uint32_t; + + NEON_CUDA_HOST_DEVICE inline auto + helpGetValidIdxPitchExplicit(const Idx& idx, int card) + const -> uint32_t; + + NEON_CUDA_HOST_DEVICE inline auto + helpNghPitch(const Idx& nghIdx, int card) + const -> std::tuple; + + NEON_CUDA_HOST_DEVICE inline auto + helpGetNghIdx(const Idx& idx, const NghIdx& offset) + const -> Idx; + + template + NEON_CUDA_HOST_DEVICE inline auto + helpGetNghIdx(const Idx& idx) + const -> Idx; + + NEON_CUDA_HOST_DEVICE inline auto + helpGetNghIdx(const Idx& idx, const NghIdx& offset, const typename Idx::DataBlockIdx* blockConnectivity) + const -> Idx; + + template + NEON_CUDA_HOST_DEVICE inline auto + helpGetNghIdx(const Idx& idx, const typename Idx::DataBlockIdx* blockConnectivity) + const -> Idx; + + auto NEON_CUDA_HOST_DEVICE helpGetSectorFirstBlock(Sectors sector) const + -> typename Idx::DataBlockCount; + + int mCardinality; + T* mMem; + NghIdx const* NEON_RESTRICT mStencilNghIndex; + typename Idx::DataBlockIdx const* NEON_RESTRICT mBlockConnectivity; + typename SBlock::BitMask const* NEON_RESTRICT mMask; + Neon::int32_3d const* NEON_RESTRICT mOrigin; + int mSetIdx; + int mMultiResDiscreteIdxSpacing = 1; + Neon::int32_3d mDomainSize; + + + + typename Idx::DataBlockCount mSectorFirstBlockIdx[Sectors::after + 1]; +}; + +} // namespace Neon::domain::details::bGridMgpu + +#include 
"Neon/domain/details/bGridDisgMgpu//bPartition_imp.h" \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bPartition_imp.h new file mode 100644 index 00000000..0c1279ad --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bPartition_imp.h @@ -0,0 +1,510 @@ +#pragma once + +#include "Neon/domain/details/bGridDisgMgpu//bGrid.h" +#include "Neon/domain/details/bGridDisgMgpu/bSpan.h" + +namespace Neon::domain::details::bGridMgpu { + +template +bPartition::bPartition() + : mCardinality(0), + mMem(nullptr), + mStencilNghIndex(), + mBlockConnectivity(nullptr), + mMask(nullptr), + mOrigin(0), + mSetIdx(0) +{ +} + +template +bPartition:: + bPartition(int setIdx, + int cardinality, + T* mem, + typename Idx::DataBlockIdx* blockConnectivity, + typename SBlock::BitMask const* NEON_RESTRICT mask, + Neon::int32_3d* origin, + NghIdx* stencilNghIndex, + Neon::int32_3d mDomainSize, + typename Idx::DataBlockCount mFirstDataBUP, + typename Idx::DataBlockCount mFirstDataBDW, + typename Idx::DataBlockCount mFirstDataGUP, + typename Idx::DataBlockCount mFirstDataGDW, + typename Idx::DataBlockCount mLastDataGDW) + : mCardinality(cardinality), + mMem(mem), + mStencilNghIndex(stencilNghIndex), + mBlockConnectivity(blockConnectivity), + mMask(mask), + mOrigin(origin), + mSetIdx(setIdx), + mDomainSize(mDomainSize) +{ + mSectorFirstBlockIdx[Sectors::bUp] = mFirstDataBUP; + mSectorFirstBlockIdx[Sectors::bDw] = mFirstDataBDW; + mSectorFirstBlockIdx[Sectors::gUp] = mFirstDataGUP; + mSectorFirstBlockIdx[Sectors::gDw] = mFirstDataGDW; + mSectorFirstBlockIdx[Sectors::after] = mLastDataGDW; +} + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getGlobalIndex(const Idx& gidx) + const -> Neon::index_3d +{ + auto location = mOrigin[gidx.mDataBlockIdx]; + location.x += gidx.mInDataBlockIdx.x; + location.y += gidx.mInDataBlockIdx.y; + location.z += 
gidx.mInDataBlockIdx.z; + if constexpr (SBlock::isMultiResMode) { + return location * mMultiResDiscreteIdxSpacing; + } + return location; +} + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getDomainSize() + const -> Neon::index_3d +{ + return mDomainSize; +} + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getBlockViewIdx(const Idx& gidx) + const -> BlockViewGridIdx +{ + BlockViewGridIdx res; + res.manualSet(gidx.getDataBlockIdx()); + return res; +} + +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: + cardinality() + const -> int +{ + return mCardinality; +} + +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: +operator()(const Idx& cell, + int card) -> T& +{ + return mMem[helpGetPitch(cell, card)]; +} + +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: +operator()(const Idx& cell, + int card) const -> const T& +{ + return mMem[helpGetPitch(cell, card)]; +} + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + mem() const -> T const* +{ + return mMem; +} + +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: + helpGetPitch(const Idx& idx, int card) + const -> uint32_t +{ + uint32_t const pitch = helpGetValidIdxPitchExplicit(idx, card); + return pitch; +} + +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: + helpGetValidIdxPitchExplicit(const Idx& idx, int card) + const -> uint32_t +{ + // we are in the internal sector, so we have the standard AoSoA + if (idx.getDataBlockIdx() < mSectorFirstBlockIdx[Sectors::first]) { + + uint32_t constexpr blockPitchByCard = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ; + uint32_t const inBlockInCardPitch = idx.mInDataBlockIdx.x + + SBlock::memBlockSizeX * idx.mInDataBlockIdx.y + + (SBlock::memBlockSizeX * SBlock::memBlockSizeY) * idx.mInDataBlockIdx.z; + uint32_t const blockAdnCardPitch = (idx.mDataBlockIdx * mCardinality + card) * blockPitchByCard; + uint32_t const pitch = blockAdnCardPitch + inBlockInCardPitch; + return pitch; + } + 
+ // We switch to the other sector where we have a SoA + int sectorLenght = 0; + int myFirstBlock = 0; + + auto const myBlockIdx = idx.getDataBlockIdx(); + for (int mySector = 3; mySector >= 0; mySector--) { + if (myBlockIdx >= mSectorFirstBlockIdx[mySector]) { + sectorLenght = mSectorFirstBlockIdx[mySector + 1] - mSectorFirstBlockIdx[mySector]; + myFirstBlock = mSectorFirstBlockIdx[mySector]; + break; + } + } + int const denseX = (myBlockIdx - myFirstBlock) * SBlock::memBlockSizeX + idx.getInDataBlockIdx().x; + int const denseY = idx.mInDataBlockIdx.y; + int const denseZ = idx.mInDataBlockIdx.z; + + int const xStride = 1; + int const yStride = SBlock::memBlockSizeX * sectorLenght; + int const zStride = SBlock::memBlockSizeY * yStride; + + int const pitch = (SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ) * + card * + sectorLenght + + denseX * xStride + + denseY * yStride + + denseZ * zStride; + + int const fullPitch = myFirstBlock * + SBlock::memBlockSizeX * + SBlock::memBlockSizeY * + SBlock::memBlockSizeZ * + cardinality() + + pitch; + return fullPitch; +} + +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: + helpNghPitch(const Idx& nghIdx, int card) + const -> std::tuple +{ + if (nghIdx.mDataBlockIdx == Span::getInvalidBlockId()) { + return {false, 0}; + } + + const bool isActive = mMask[nghIdx.mDataBlockIdx].isActive(nghIdx.mInDataBlockIdx.x, nghIdx.mInDataBlockIdx.y, nghIdx.mInDataBlockIdx.z); + if (!isActive) { + return {false, 0}; + } + auto const offset = helpGetValidIdxPitchExplicit(nghIdx, card); + return {true, offset}; +} + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + helpGetNghIdx(const Idx& idx, + const NghIdx& offset) + const -> Idx +{ + return this->helpGetNghIdx(idx, offset, mBlockConnectivity); +} + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + helpGetNghIdx(const Idx& idx, + const NghIdx& offset, + const typename Idx::DataBlockIdx* blockConnectivity) + const -> Idx +{ + + typename 
Idx::InDataBlockIdx ngh(idx.mInDataBlockIdx.x + offset.x, + idx.mInDataBlockIdx.y + offset.y, + idx.mInDataBlockIdx.z + offset.z); + + /** + * 0 if no offset on the direction + * 1 positive offset + * -1 negative offset + */ + const int xFlag = ngh.x < 0 ? -1 : (ngh.x >= SBlock::memBlockSizeX ? +1 : 0); + const int yFlag = ngh.y < 0 ? -1 : (ngh.y >= SBlock::memBlockSizeX ? +1 : 0); + const int zFlag = ngh.z < 0 ? -1 : (ngh.z >= SBlock::memBlockSizeX ? +1 : 0); + + const bool isLocal = (xFlag | yFlag | zFlag) == 0; + if (!(isLocal)) { + typename Idx::InDataBlockIdx remoteInBlockOffset; + /** + * Example + * - 8 block (1D case) + * Case 1: + * |0,1,2,3|0,1,2,3|0,1,2,3| + * ^ ^ + * -3 starting point + * + * - idx.inBlock = 2 + * - offset = -1 + * - remote.x = (2-3) - ((-1) * 4) = -1 + 4 = 3 + * Case 2: + * |0,1,2,3|0,1,2,3|0,1,2,3| + * ^ ^ + * starting point +3 from 3 + * + * - idx.inBlock = 3 + * - offset = (+3,0) + * - remote.x = (7+3) - ((+1) * 8) = 10 - 8 = 2 + * + * |0,1,2,3|0,1,2,3|0,1,2,3| + * ^ ^ + * -3 from 0 +3 from 3 + * + * NOTE: if in one direction the neighbour offet is zero, xFalg is 0; + * */ + + Idx remoteNghIdx; + remoteNghIdx.mInDataBlockIdx.x = ngh.x - xFlag * SBlock::memBlockSizeX; + remoteNghIdx.mInDataBlockIdx.y = ngh.y - yFlag * SBlock::memBlockSizeX; + remoteNghIdx.mInDataBlockIdx.z = ngh.z - zFlag * SBlock::memBlockSizeX; + + int connectivityJump = idx.mDataBlockIdx * 27 + + (xFlag + 1) + + (yFlag + 1) * 3 + + (zFlag + 1) * 9; + remoteNghIdx.mDataBlockIdx = blockConnectivity[connectivityJump]; + + return remoteNghIdx; + } else { + Idx localNghIdx; + localNghIdx.mDataBlockIdx = idx.mDataBlockIdx; + localNghIdx.mInDataBlockIdx = ngh; + return localNghIdx; + } +} + +template +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + helpGetNghIdx(const Idx& idx) + const -> Idx +{ + return this->helpGetNghIdx(idx, mBlockConnectivity); +} + +template +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + helpGetNghIdx(const Idx& idx, const 
typename Idx::DataBlockIdx* blockConnectivity) + const -> Idx +{ + + typename Idx::InDataBlockIdx ngh(idx.mInDataBlockIdx.x + xOff, + idx.mInDataBlockIdx.y + yOff, + idx.mInDataBlockIdx.z + zOff); + + /** + * 0 if no offset on the direction + * 1 positive offset + * -1 negative offset + */ + const int xFlag = [&] { + if constexpr (xOff == 0) { + return 0; + } else { + return ngh.x < 0 ? -1 : (ngh.x >= SBlock::memBlockSizeX ? +1 : 0); + } + }(); + + + const int yFlag = [&] { + if constexpr (yOff == 0) { + return 0; + } else { + return ngh.y < 0 ? -1 : (ngh.y >= SBlock::memBlockSizeX ? +1 : 0); + } + }(); + const int zFlag = [&] { + if constexpr (zOff == 0) { + return 0; + } else { + return ngh.z < 0 ? -1 : (ngh.z >= SBlock::memBlockSizeX ? +1 : 0); + } + }(); + + const bool isLocal = (xFlag | yFlag | zFlag) == 0; + if (!(isLocal)) { + typename Idx::InDataBlockIdx remoteInBlockOffset; + /** + * Example + * - 8 block (1D case) + * Case 1: + * |0,1,2,3|0,1,2,3|0,1,2,3| + * ^ ^ + * -3 starting point + * + * - idx.inBlock = 2 + * - offset = -1 + * - remote.x = (2-3) - ((-1) * 4) = -1 + 4 = 3 + * Case 2: + * |0,1,2,3|0,1,2,3|0,1,2,3| + * ^ ^ + * starting point +3 from 3 + * + * - idx.inBlock = 3 + * - offset = (+3,0) + * - remote.x = (7+3) - ((+1) * 8) = 10 - 8 = 2 + * + * |0,1,2,3|0,1,2,3|0,1,2,3| + * ^ ^ + * -3 from 0 +3 from 3 + * + * NOTE: if in one direction the neighbour offet is zero, xFalg is 0; + * */ + + Idx remoteNghIdx; + remoteNghIdx.mInDataBlockIdx.x = ngh.x - xFlag * SBlock::memBlockSizeX; + remoteNghIdx.mInDataBlockIdx.y = ngh.y - yFlag * SBlock::memBlockSizeX; + remoteNghIdx.mInDataBlockIdx.z = ngh.z - zFlag * SBlock::memBlockSizeX; + + int connectivityJump = idx.mDataBlockIdx * 27 + + (xFlag + 1) + + (yFlag + 1) * 3 + + (zFlag + 1) * 9; + remoteNghIdx.mDataBlockIdx = blockConnectivity[connectivityJump]; + + return remoteNghIdx; + } else { + Idx localNghIdx; + localNghIdx.mDataBlockIdx = idx.mDataBlockIdx; + localNghIdx.mInDataBlockIdx = ngh; + return 
localNghIdx; + } +} + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getNghData(const Idx& eId, + uint8_t nghID, + int card) + const -> NghData +{ + NghIdx nghOffset = mStencilNghIndex[nghID]; + return getNghData(eId, nghOffset, card); +} + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getNghData(const Idx& idx, + const NghIdx& offset, + const int card) + const -> NghData +{ + NghData result; + bIndex nghIdx = helpGetNghIdx(idx, offset); + auto [isValid, pitch] = helpNghPitch(nghIdx, card); + if (!isValid) { + result.invalidate(); + return result; + } + auto const value = mMem[pitch]; + result.set(value, true); + return result; +} + +template +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getNghData(const Idx& idx, + int card) + const -> NghData +{ + NghData result; + bIndex nghIdx = helpGetNghIdx(idx); + auto [isValid, pitch] = helpNghPitch(nghIdx, card); + if (!isValid) { + result.invalidate(); + return result; + } + auto const value = mMem[pitch]; + result.set(value, true); + return result; +} + +template +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getNghData(const Idx& idx, + int card, + T defaultValue) + const -> NghData +{ + NghData result; + bIndex nghIdx = helpGetNghIdx(idx); + auto [isValid, pitch] = helpNghPitch(nghIdx, card); + if (!isValid) { + result.set(defaultValue, false); + return result; + } + auto const value = mMem[pitch]; + result.set(value, true); + return result; +} + +template + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getNghData(const Idx& gidx, + int card, + LambdaVALID funIfValid, + LambdaNOTValid funIfNOTValid) + const -> std::enable_if_t && (std::is_invocable_v || std::is_same_v), void> +{ + NghData result; + bIndex nghIdx = helpGetNghIdx(gidx); + auto [isValid, pitch] = helpNghPitch(nghIdx, card); + + if (isValid) { + auto const& value = mMem[pitch]; + funIfValid(value); + return; + } + + if constexpr (!std::is_same_v) { + funIfNOTValid(); + } + return; +} + 
+template +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + writeNghData(const Idx& gidx, + int card, + T value) + -> bool +{ + NghData result; + bIndex nghIdx = helpGetNghIdx(gidx); + auto [isValid, pitch] = helpNghPitch(nghIdx, card); + if (!isValid) { + return false; + } + mMem[pitch] = value; + return true; +} + +template +NEON_CUDA_HOST_DEVICE inline auto +bPartition::isActive(const Idx& cell, + const typename SBlock::BitMask* mask) const -> bool +{ + if (!mask) { + return mMask[cell.mDataBlockIdx].isActive(cell.mInDataBlockIdx.x, cell.mInDataBlockIdx.y, cell.mInDataBlockIdx.z); + } else { + return mask[cell.mDataBlockIdx].isActive(cell.mInDataBlockIdx.x, cell.mInDataBlockIdx.y, cell.mInDataBlockIdx.z); + } +} + +template +NEON_CUDA_HOST_DEVICE inline auto +bPartition::helpGetSectorFirstBlock(Sectors sector) const + -> typename Idx::DataBlockCount +{ + return mSectorFirstBlockIdx[sector]; +} + +} // namespace Neon::domain::details::bGridMgpu \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bSpan.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bSpan.h new file mode 100644 index 00000000..397133c8 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bSpan.h @@ -0,0 +1,55 @@ +#pragma once + +#include "Neon/domain/details/bGridDisgMgpu/bIndex.h" + +namespace Neon::domain::details::bGridMgpu { + +template +class bSpan +{ + public: + // bit mask information + using BitMaskWordType = uint64_t; + + static constexpr uint32_t bitMaskStorageBitWidth = 64; + static constexpr Neon::MemoryLayout activeMaskMemoryLayout = Neon::MemoryLayout::arrayOfStructs; + static constexpr uint32_t log2OfbitMaskWordSize = 6; + + using Idx = bIndex; + friend class bGrid; + + static constexpr int SpaceDim = 3; + + bSpan() = default; + virtual ~bSpan() = default; + + NEON_CUDA_HOST_DEVICE inline static auto getInvalidBlockId() + -> typename Idx::DataBlockIdx + { + return std::numeric_limits::max(); + 
} + + inline bSpan( + typename Idx::DataBlockCount mFirstDataBlockOffset, + typename SBlock::BitMask const* NEON_RESTRICT mActiveMask, + Neon::DataView mDataView); + + NEON_CUDA_HOST_DEVICE inline auto setAndValidateCPUDevice( + Idx& bidx, + uint32_t const& threadIdx, + uint32_t const& x, + uint32_t const& y, + uint32_t const& z) const -> bool; + + NEON_CUDA_HOST_DEVICE inline auto setAndValidateGPUDevice( + Idx& bidx) const -> bool; + + + // We don't need to have a count on active blocks + typename Idx::DataBlockCount mFirstDataBlockOffset; + typename SBlock::BitMask const* NEON_RESTRICT mActiveMask; + Neon::DataView mDataView; +}; +} // namespace Neon::domain::details::bGridMgpu + +#include "Neon/domain/details/bGridDisgMgpu/bSpan_imp.h" \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bSpan_imp.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bSpan_imp.h new file mode 100644 index 00000000..45e40eeb --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bSpan_imp.h @@ -0,0 +1,51 @@ +#include "Neon/domain/details/bGridDisgMgpu/bSpan.h" + +namespace Neon::domain::details::bGridMgpu { + +template +NEON_CUDA_HOST_DEVICE inline auto +bSpan::setAndValidateGPUDevice([[maybe_unused]] Idx& bidx) const -> bool +{ +#ifdef NEON_PLACE_CUDA_DEVICE + bidx.mDataBlockIdx = blockIdx.x + mFirstDataBlockOffset; + bidx.mInDataBlockIdx.x = threadIdx.x; + bidx.mInDataBlockIdx.y = threadIdx.y; + bidx.mInDataBlockIdx.z = threadIdx.z; + + const bool isActive = mActiveMask[bidx.mDataBlockIdx].isActive(bidx.mInDataBlockIdx.x, bidx.mInDataBlockIdx.y, bidx.mInDataBlockIdx.z); + + return isActive; +#else + NEON_THROW_UNSUPPORTED_OPERATION("Operation supported only on GPU"); +#endif +} + +template +NEON_CUDA_HOST_DEVICE inline auto +bSpan::setAndValidateCPUDevice(Idx& bidx, + uint32_t const& dataBlockIdx, + uint32_t const& x, + uint32_t const& y, + uint32_t const& z) const -> bool +{ + + bidx.mDataBlockIdx = 
dataBlockIdx + mFirstDataBlockOffset; + bidx.mInDataBlockIdx.x = static_cast(x); + bidx.mInDataBlockIdx.y = static_cast(y); + bidx.mInDataBlockIdx.z = static_cast(z); + const bool isActive = mActiveMask[dataBlockIdx].isActive(bidx.mInDataBlockIdx.x, bidx.mInDataBlockIdx.y, bidx.mInDataBlockIdx.z); + return isActive; +} + +template +bSpan::bSpan(typename Idx::DataBlockCount firstDataBlockOffset, + typename SBlock::BitMask const* NEON_RESTRICT activeMask, + Neon::DataView dataView) + : mFirstDataBlockOffset(firstDataBlockOffset), + mActiveMask(activeMask), + mDataView(dataView) +{ +} + + +} // namespace Neon::domain::details::bGridMgpu \ No newline at end of file diff --git a/libNeonDomain/src/domain/details/bGridDisgMgpu/bFieldReduceKernels.cu b/libNeonDomain/src/domain/details/bGridDisgMgpu/bFieldReduceKernels.cu new file mode 100644 index 00000000..e69de29b diff --git a/libNeonDomain/src/domain/details/bGridDisgMgpu/bGrid.cpp b/libNeonDomain/src/domain/details/bGridDisgMgpu/bGrid.cpp new file mode 100644 index 00000000..d207379b --- /dev/null +++ b/libNeonDomain/src/domain/details/bGridDisgMgpu/bGrid.cpp @@ -0,0 +1,7 @@ +#include "Neon/domain/details/bGridDisgMgpu/bGrid.h" + +namespace Neon::domain::details::bGridMgpu { + +template class bGrid>; + +} // namespace Neon::domain::details::bGridMgpu \ No newline at end of file diff --git a/libNeonDomain/tests/domain-map-disg/src/gtests.cpp b/libNeonDomain/tests/domain-map-disg/src/gtests.cpp index b5b77c56..3664f646 100644 --- a/libNeonDomain/tests/domain-map-disg/src/gtests.cpp +++ b/libNeonDomain/tests/domain-map-disg/src/gtests.cpp @@ -4,7 +4,7 @@ #include "map.h" #include "runHelper.h" -TEST(domain_map, bGridDisg) +TEST(domain_map_disg, bGridDisg) { int nGpus = 1; using Type = int64_t; @@ -13,7 +13,7 @@ TEST(domain_map, bGridDisg) 1); } -TEST(domain_map_dataView, bGridDisg) +TEST(domain_map_disg_dataView, bGridDisg) { int nGpus = 1; using Type = int64_t; diff --git a/libNeonDomain/tests/domain-map/src/gtests.cpp 
b/libNeonDomain/tests/domain-map/src/gtests.cpp index 6240ad66..f6977444 100644 --- a/libNeonDomain/tests/domain-map/src/gtests.cpp +++ b/libNeonDomain/tests/domain-map/src/gtests.cpp @@ -42,7 +42,7 @@ TEST(domain_map, bGrid) TEST(domain_map, dGridDisg) { - int nGpus = 1; + int nGpus = 3; using Type = int64_t; runAllTestConfiguration(std::function(map::run), nGpus, @@ -58,6 +58,16 @@ TEST(domain_map, dGridSoA) 1); } +TEST(domain_map, bGridMgpu) +{ + int nGpus = 3; + using Type = int64_t; + // extern template auto run(TestData&) -> void; + runAllTestConfiguration(std::function(map::run), + nGpus, + 1); +} + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/libNeonDomain/tests/domain-map/src/map.cu b/libNeonDomain/tests/domain-map/src/map.cu index 810cb067..d2dc5e5f 100644 --- a/libNeonDomain/tests/domain-map/src/map.cu +++ b/libNeonDomain/tests/domain-map/src/map.cu @@ -1,7 +1,7 @@ #include #include "Neon/domain/Grids.h" -#include "Neon/domain/details/dGridDisg/dGrid.h" #include "Neon/domain/details/bGridDisg/bGrid.h" +#include "Neon/domain/details/dGridDisg/dGrid.h" #include "Neon/domain/details/dGridSoA/dGridSoA.h" #include "Neon/domain/tools/TestData.h" @@ -36,15 +36,15 @@ auto mapContainer_axpy(int streamIdx, template auto mapContainer_add(int streamIdx, - typename Field::Type& val, - Field& fieldB) + typename Field::Type& val, + Field& fieldB) -> Neon::set::Container { const auto& grid = fieldB.getGrid(); return grid.newContainer( "mapContainer_axpy", [&, val](Neon::set::Loader& loader) { - auto b = loader.load(fieldB); + auto b = loader.load(fieldB); return [=] NEON_CUDA_HOST_DEVICE(const typename Field::Idx& e) mutable { for (int i = 0; i < b.cardinality(); i++) { @@ -101,6 +101,7 @@ template auto run(TestData&) - template auto run(TestData&) -> void; template auto run(TestData&) -> void; template auto run(TestData&) -> void; +template auto run(TestData&) -> void; namespace dataView { template @@ -151,6 +152,7 @@ template 
auto run(TestData&) - template auto run(TestData&) -> void; template auto run(TestData&) -> void; template auto run(TestData&) -> void; +template auto run(TestData&) -> void; } // namespace dataView } // namespace map \ No newline at end of file diff --git a/libNeonDomain/tests/domain-map/src/map.h b/libNeonDomain/tests/domain-map/src/map.h index f6f8bc7c..73038353 100644 --- a/libNeonDomain/tests/domain-map/src/map.h +++ b/libNeonDomain/tests/domain-map/src/map.h @@ -19,6 +19,7 @@ extern template auto run(TestData(TestData&) -> void; extern template auto run(TestData&) -> void; extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; namespace dataView { @@ -30,6 +31,7 @@ extern template auto run(TestData(TestData&) -> void; extern template auto run(TestData&) -> void; extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; } // namespace dataView diff --git a/libNeonDomain/tests/domain-map/src/runHelper.h b/libNeonDomain/tests/domain-map/src/runHelper.h index 593e31c2..4c622f82 100644 --- a/libNeonDomain/tests/domain-map/src/runHelper.h +++ b/libNeonDomain/tests/domain-map/src/runHelper.h @@ -73,7 +73,7 @@ void runAllTestConfiguration( Neon::Backend backend(ids, runtime); Neon::MemoryOptions memoryOptions = backend.getMemoryOptions(); - if constexpr (std::is_same_v) { + if constexpr (G::executionThreadSpan == Neon::set::details::ExecutionThreadSpan::d1b3) { if (dim.z < 8 * ngpu * 3) { dim.z = ngpu * 3 * 8; }