diff --git a/benchmarks/lbm/lbm.py b/benchmarks/lbm/lbm.py index 8e56dd9f..8606e81b 100644 --- a/benchmarks/lbm/lbm.py +++ b/benchmarks/lbm/lbm.py @@ -1,6 +1,6 @@ -deviceType_LIST = 'cpu gpu'.split() +deviceType_LIST = 'gpu'.split() deviceIds_LIST = "0 1 2 3 4 5 6 7".split() -grid_LIST = "dGrid bGrid_4_4_4".split() +grid_LIST = "bGrid_4_4_4 bGridMgpu_4_4_4".split() domainSize_LIST = "64 80 96 112 128 144 160 176 192 208 224 240 256 272 288 304 320 336 352 368 384 400 416 432 448 464 480 496 512".split() computeFP_LIST = "double float".split() storageFP_LIST = "double float".split() @@ -35,7 +35,7 @@ def getDeviceConfigurations(DEVICE_TYPE, deviceIds_LIST): return [deviceIds_LIST[0]] if goal_is_efficiency_max_num_devices: - return [deviceIds_LIST[0], deviceIds_LIST] + return [deviceIds_LIST[0], ' '.join(deviceIds_LIST)] def printProgressBar(value, label): diff --git a/benchmarks/lbm/src/RunCavityTwoPop.cu b/benchmarks/lbm/src/RunCavityTwoPop.cu index ea6ae23f..c3cfc561 100644 --- a/benchmarks/lbm/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm/src/RunCavityTwoPop.cu @@ -3,11 +3,9 @@ #include "D3Q19.h" #include "D3Q27.h" -#include "Neon/domain/bGrid.h" -#include "Neon/domain/dGrid.h" +#include "Neon/domain/Grids.h" #include "Neon/domain/details/dGridDisg/dGrid.h" #include "Neon/domain/details/dGridSoA/dGridSoA.h" -#include "Neon/domain/eGrid.h" #include "./Lbm.h" #include "CellType.h" @@ -135,18 +133,18 @@ auto runFilterCollision(Config& config, testCode << "_bgk"; return runFilterMethod(config, report, testCode); } - // if (config.collisionCli.getOption() == Collision::kbc) { - // if (config.lattice != "d3q27" && config.lattice != "D3Q27") { - // Neon::NeonException e("runFilterCollision"); - // e << "LBM kbc collision model only supports d3q27 lattice"; - // NEON_THROW(e); - // } - // testCode << "_kbc"; - // using L = D3Q27>; - // if constexpr (std::is_same_v) { - // return runFilterMethod(config, report, testCode); - // } - // } + if (config.collisionCli.getOption() 
== Collision::kbc) { + if (config.lattice != "d3q27" && config.lattice != "D3Q27") { + Neon::NeonException e("runFilterCollision"); + e << "LBM kbc collision model only supports d3q27 lattice"; + NEON_THROW(e); + } + testCode << "_kbc"; + using L = D3Q27>; + if constexpr (std::is_same_v) { + return runFilterMethod(config, report, testCode); + } + } NEON_DEV_UNDER_CONSTRUCTION(""); } @@ -218,13 +216,25 @@ auto run(Config& config, testCode << "___" << config.N << "_"; testCode << "_numDevs_" << config.devices.size(); - if (config.gridType == "dGrid") { - testCode << "_dGrid"; - return details::runFilterStoreType(config, report, testCode); + // if (config.gridType == "dGrid") { + // testCode << "_dGrid"; + // return details::runFilterStoreType(config, report, testCode); + // } + // if (config.gridType == "dGridDisg") { + // testCode << "_dGridDisg"; + // return details::runFilterStoreType(config, report, testCode); + // } + if (config.gridType == "bGrid_4_4_4") { + testCode << "_bGrid_4_4_4"; + using Block = Neon::domain::details::bGrid::StaticBlock<4, 4, 4>; + using Grid = Neon::domain::details::bGrid::bGrid; + return details::runFilterStoreType(config, report, testCode); } - if (config.gridType == "dGridDisg") { - testCode << "_dGridDisg"; - return details::runFilterStoreType(config, report, testCode); + if (config.gridType == "bGridMgpu_4_4_4") { + testCode << "_bGridMgpu_4_4_4"; + using Block = Neon::domain::details::bGridMgpu::StaticBlock<4, 4, 4>; + using Grid = Neon::domain::details::bGridMgpu::bGrid; + return details::runFilterStoreType(config, report, testCode); } // if (config.gridType == "eGrid") { // if constexpr (!skipTest) { diff --git a/libNeonDomain/include/Neon/domain/Grids.h b/libNeonDomain/include/Neon/domain/Grids.h index a7c9a878..6e48f584 100644 --- a/libNeonDomain/include/Neon/domain/Grids.h +++ b/libNeonDomain/include/Neon/domain/Grids.h @@ -5,3 +5,4 @@ #include "Neon/domain/bGrid.h" #include "Neon/domain/dGridSoA.h" #include 
"Neon/domain/bGridDisg.h" +#include "Neon/domain/bGridMgpuDisg.h" diff --git a/libNeonDomain/include/Neon/domain/bGridMgpuDisg.h b/libNeonDomain/include/Neon/domain/bGridMgpuDisg.h new file mode 100644 index 00000000..25d68e88 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/bGridMgpuDisg.h @@ -0,0 +1,6 @@ +#pragma once +#include "Neon/domain/details/bGridDisgMgpu//bGrid.h" + +namespace Neon { +using bGridMgpu = Neon::domain::details::bGridMgpu::bGridMgpuDefault; +} \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h index 8ce0fea8..a5a79ae2 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h @@ -139,6 +139,7 @@ class bPartition getDomainSize() const -> Neon::index_3d; + NEON_CUDA_HOST_DEVICE auto mem() const -> T const *; /** diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisg/bField.h b/libNeonDomain/include/Neon/domain/details/bGridDisg/bField.h index 7b9532de..86a45575 100644 --- a/libNeonDomain/include/Neon/domain/details/bGridDisg/bField.h +++ b/libNeonDomain/include/Neon/domain/details/bGridDisg/bField.h @@ -109,9 +109,7 @@ class bField : public Neon::domain::interface::FieldBaseTemplate memoryField; int cardinality; - // Neon::domain::tool::HaloTable1DPartitioning latticeHaloUpdateTable; - Neon::domain::tool::HaloTable1DPartitioning soaHaloUpdateTable; - // Neon::domain::tool::HaloTable1DPartitioning aosHaloUpdateTable; + Neon::domain::tool::HaloTable1DPartitioning mStandardHaloUpdateTable; Neon::domain::tool::PartitionTable partitionTable; }; std::shared_ptr mData; diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisg/bField_imp.h b/libNeonDomain/include/Neon/domain/details/bGridDisg/bField_imp.h index ab122f8b..e7acbe8c 100644 --- a/libNeonDomain/include/Neon/domain/details/bGridDisg/bField_imp.h +++ 
b/libNeonDomain/include/Neon/domain/details/bGridDisg/bField_imp.h @@ -191,7 +191,7 @@ auto bField::newHaloUpdate(Neon::set::StencilSemantic stencilSeman for (auto byDirection : {tool::partitioning::ByDirection::up, tool::partitioning::ByDirection::down}) { - auto const& tableEntryByDir = mData->soaHaloUpdateTable.get(transferMode, + auto const& tableEntryByDir = mData->mStandardHaloUpdateTable.get(transferMode, execution, byDirection); @@ -262,7 +262,7 @@ auto bField::initHaloUpdateTable() -> void return res; }; - mData->soaHaloUpdateTable.forEachPutConfiguration( + mData->mStandardHaloUpdateTable.forEachPutConfiguration( bk, [&](Neon::SetIdx setIdxSrc, Execution execution, Neon::domain::tool::partitioning::ByDirection byDirection, diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisg/bGrid.h b/libNeonDomain/include/Neon/domain/details/bGridDisg/bGrid.h index 5976b2c8..72e39ee9 100644 --- a/libNeonDomain/include/Neon/domain/details/bGridDisg/bGrid.h +++ b/libNeonDomain/include/Neon/domain/details/bGridDisg/bGrid.h @@ -223,6 +223,10 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate, */ auto helpGetSetIdxAndGridIdx(Neon::index_3d idx) const -> std::tuple; + template + auto init_mask_field(ActiveCellLambda activeCellLambda) -> void; + auto helpGetClassField() -> Field&; + struct Data { auto init(const Neon::Backend& bk) @@ -258,6 +262,9 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate, AlphaGrid alphaGrid; BetaGrid betaGrid; + + Field maskClassField; + }; std::shared_ptr mData; }; diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisg/bGrid_imp.h b/libNeonDomain/include/Neon/domain/details/bGridDisg/bGrid_imp.h index 4af56db8..e5724510 100644 --- a/libNeonDomain/include/Neon/domain/details/bGridDisg/bGrid_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGridDisg/bGrid_imp.h @@ -279,6 +279,7 @@ bGrid::bGrid(const Neon::Backend& backend, mData->alphaGrid = details::cGrid::cGrid(*this); mData->betaGrid = 
details::cGrid::cGrid(*this); + init_mask_field(activeCellLambda); } template @@ -332,11 +333,11 @@ auto bGrid::newContainer(const std::string& name, { const Neon::index_3d& defaultBlockSize = this->getDefaultBlock(); Neon::set::Container kContainer = Neon::set::Container::factory(name, - Neon::set::internal::ContainerAPI::DataViewSupport::on, - *this, - lambda, - defaultBlockSize, - [](const Neon::index_3d&) { return 0; }); + Neon::set::internal::ContainerAPI::DataViewSupport::on, + *this, + lambda, + defaultBlockSize, + [](const Neon::index_3d&) { return 0; }); return kContainer; } @@ -534,4 +535,40 @@ auto bGrid::helpGetPartitioner1D() -> Neon::domain::tool::Partitioner1D& return mData->partitioner1D; } +template +template +auto bGrid::init_mask_field([[maybe_unused]] ActiveCellLambda activeCellLambda) -> void +{ + using returTypeOfLambda = typename std::invoke_result::type; + if constexpr (std::is_same_v) { + + + auto maskField = this->newField("maskField", 1, 0, Neon::DataUse::HOST_DEVICE); + maskField.getGrid().template newContainer( + "maskFieldInit", + [&](Neon::set::Loader& loader) { + auto maskFieldPartition = loader.load(maskField); + return [activeCellLambda, maskFieldPartition] (const auto& gIdx) mutable { + auto globalPosition = maskFieldPartition.getGlobalIndex(gIdx); + details::cGrid::ClassSelector voxelClass = activeCellLambda(globalPosition); + maskFieldPartition(gIdx, 0) = static_cast(voxelClass); +// maskFieldPartition(gIdx, 0) = 33; + }; + }) + .run(Neon::Backend::mainStreamIdx); + this->getBackend().sync(Neon::Backend::mainStreamIdx); + maskField.updateDeviceData(Neon::Backend::mainStreamIdx); + this->getBackend().sync(Neon::Backend::mainStreamIdx); + maskField.template ioToVtk("maskField", "maskField"); + this->mData->maskClassField = maskField; + return; + } +} + +template +auto bGrid::helpGetClassField() -> Field& +{ + return mData->maskClassField; +} + } // namespace Neon::domain::details::disaggregated::bGrid \ No newline at end of file 
diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisg/bPartition.h b/libNeonDomain/include/Neon/domain/details/bGridDisg/bPartition.h index 7af86346..23e8b4bb 100644 --- a/libNeonDomain/include/Neon/domain/details/bGridDisg/bPartition.h +++ b/libNeonDomain/include/Neon/domain/details/bGridDisg/bPartition.h @@ -136,6 +136,7 @@ namespace Neon::domain::details::disaggregated::bGrid { getDomainSize() const -> Neon::index_3d; + NEON_CUDA_HOST_DEVICE auto mem() const -> T const *; /** diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMask/BlockView.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/BlockView.h new file mode 100644 index 00000000..6b5cfffc --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/BlockView.h @@ -0,0 +1,29 @@ +#include "Neon/domain/details/bGridDisgMask/BlockViewGrid//BlockViewGrid.h" +#include "Neon/domain/tools/GridTransformer.h" + +namespace Neon::domain::details::disaggregated::bGridMask { + +struct BlockView +{ + public: + using Grid = Neon::domain::tool::GridTransformer::Grid; + template + using Field = Grid::template Field; + using index_3d = Neon::index_3d; + + template + static auto helpGetReference(T* mem, const int idx, const int card) -> std::enable_if_t + { + return mem[idx * card]; + } + + template + static auto helpGetReference(T* mem, const int idx, const int card) -> std::enable_if_t + { + return mem[idx * C]; + } + + static constexpr Neon::MemoryLayout layout = Neon::MemoryLayout::arrayOfStructs; +}; + +} // namespace Neon::domain::details::disaggregated::bGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMask/BlockViewGrid/BlockViewGrid.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/BlockViewGrid/BlockViewGrid.h new file mode 100644 index 00000000..51eb9717 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/BlockViewGrid/BlockViewGrid.h @@ -0,0 +1,97 @@ +#pragma once +#include + 
+#include "Neon/core/core.h" +#include "Neon/core/types/DataUse.h" +#include "Neon/core/types/Macros.h" + +#include "Neon/set/BlockConfig.h" +#include "Neon/set/Containter.h" +#include "Neon/set/DevSet.h" +#include "Neon/set/MemoryOptions.h" + +#include "Neon/sys/memory/MemDevice.h" + +#include "Neon/domain/aGrid.h" + +#include "Neon/domain/interface/GridBaseTemplate.h" +#include "Neon/domain/interface/GridConcept.h" +#include "Neon/domain/interface/KernelConfig.h" +#include "Neon/domain/interface/LaunchConfig.h" +#include "Neon/domain/interface/Stencil.h" +#include "Neon/domain/interface/common.h" + +#include "Neon/domain/tools/GridTransformer.h" +#include "Neon/domain/tools/SpanTable.h" + +#include "Neon/domain/details/eGrid/eGrid.h" +#include "Neon/domain/patterns/PatternScalar.h" + +#include "BlockViewPartition.h" + +namespace Neon::domain::details::disaggregated::bGridMask { + +namespace details { +struct GridTransformation +{ + template + using Partition = BlockViewPartition; + using Span = Neon::domain::details::eGrid::eSpan; + static constexpr Neon::set::internal::ContainerAPI::DataViewSupport dataViewSupport = Neon::set::internal::ContainerAPI::DataViewSupport::on; + + using FoundationGrid = Neon::domain::details::eGrid::eGrid; + static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = FoundationGrid::executionThreadSpan; + using ExecutionThreadSpanIndexType = int32_t; + using Idx = FoundationGrid::Idx; + + static auto getDefaultBlock(FoundationGrid& foundationGrid) -> Neon::index_3d const& + { + return foundationGrid.getDefaultBlock(); + } + + static auto initSpan(FoundationGrid& foundationGrid, Neon::domain::tool::SpanTable& spanTable) -> void + { + spanTable.forEachConfiguration([&](Neon::Execution execution, + Neon::SetIdx setIdx, + Neon::DataView dw, + Span& span) { + span = foundationGrid.getSpan(execution, setIdx, dw); + }); + } + + static auto initLaunchParameters(FoundationGrid& foundationGrid, + Neon::DataView dataView, + 
const Neon::index_3d& blockSize, + const size_t& shareMem) -> Neon::set::LaunchParameters + { + return foundationGrid.getLaunchParameters(dataView, blockSize, shareMem); + } + + static auto helpGetGridIdx(FoundationGrid&, + Neon::SetIdx const&, + FoundationGrid::Idx const& fgIdx) + -> GridTransformation::Idx + { + GridTransformation::Idx tgIdx = fgIdx; + return tgIdx; + } + + template + static auto initFieldPartition(FoundationGrid::Field& foundationField, + Neon::domain::tool::PartitionTable>& partitionTable) -> void + { + partitionTable.forEachConfiguration( + [&](Neon::Execution execution, + Neon::SetIdx setIdx, + Neon::DataView dw, + Partition& partition) { + auto& foundationPartition = foundationField.getPartition(execution, setIdx, dw); + partition = Partition(foundationPartition); + }); + } +}; +using BlockViewGrid = Neon::domain::tool::GridTransformer::Grid; + +} // namespace details + +} // namespace Neon::domain::details::bGrid diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMask/BlockViewGrid/BlockViewPartition.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/BlockViewGrid/BlockViewPartition.h new file mode 100644 index 00000000..3a860baa --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/BlockViewGrid/BlockViewPartition.h @@ -0,0 +1,43 @@ +#pragma once +#include +#include "Neon/core/core.h" +#include "Neon/core/types/Macros.h" +#include "Neon/domain/details/eGrid/eGrid.h" +#include "Neon/domain/details/eGrid/eIndex.h" +#include "Neon/domain/interface/NghData.h" +#include "Neon/set/DevSet.h" +#include "Neon/sys/memory/CudaIntrinsics.h" +#include "cuda_fp16.h" + +namespace Neon::domain::details::disaggregated::bGridMask { + +template +class BlockViewPartition : public Neon::domain::details::eGrid::ePartition +{ + public: + BlockViewPartition() + { + } + BlockViewPartition(Neon::domain::details::eGrid::ePartition ePartition) + : Neon::domain::details::eGrid::ePartition(ePartition) + { + } + + template + 
static auto getInBlockIdx(typename Neon::domain::details::eGrid::ePartition::Idx const& idx, + uint8_3d const& inBlockLocation) -> BlockIdexType + { + BlockIdexType blockIdx(idx.helpGet(), inBlockLocation); + return inBlockLocation; + } + + auto getCountAllocated() const -> int32_t; +}; + +template +auto BlockViewPartition::getCountAllocated() const -> int32_t +{ + return this->mCountAllocated; +} +} // namespace Neon::domain::details::bGrid diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMask/ClassificationGrid/ClassSelector.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/ClassificationGrid/ClassSelector.h new file mode 100644 index 00000000..9b2f441b --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/ClassificationGrid/ClassSelector.h @@ -0,0 +1,10 @@ +#pragma once + +namespace Neon::domain::details::disaggregated::bGridMask::details::cGrid { +enum ClassSelector +{ + alpha = 1, + beta = 2, + outside = 0, +}; +} \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMask/ClassificationGrid/cGrid.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/ClassificationGrid/cGrid.h new file mode 100644 index 00000000..a6c20f48 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/ClassificationGrid/cGrid.h @@ -0,0 +1,145 @@ +#pragma once +#include + +#include "Neon/core/core.h" +#include "Neon/core/types/DataUse.h" +#include "Neon/core/types/Macros.h" + +#include "Neon/set/BlockConfig.h" +#include "Neon/set/Containter.h" +#include "Neon/set/DevSet.h" +#include "Neon/set/MemoryOptions.h" + +#include "Neon/sys/memory/MemDevice.h" + +#include "Neon/domain/interface/GridBaseTemplate.h" +#include "Neon/domain/interface/GridConcept.h" +#include "Neon/domain/interface/KernelConfig.h" +#include "Neon/domain/interface/LaunchConfig.h" +#include "Neon/domain/interface/Stencil.h" +#include "Neon/domain/interface/common.h" + +#include 
"Neon/domain/patterns/PatternScalar.h" +#include "Neon/domain/tools/GridTransformer.h" +#include "Neon/domain/tools/SpanTable.h" + +#include "../bGrid.h" +#include "./ClassSelector.h" +#include "./cSpan.h" + +namespace Neon::domain::details::disaggregated::bGridMask { + +namespace details::cGrid { + +template +struct GridTransformation_cGrid +{ + using FoundationGrid = Neon::domain::details::disaggregated::bGridMask::bGrid; + + template + using Partition = Neon::domain::details::disaggregated::bGridMask::bPartition; + using Span = cSpan; + static constexpr Neon::set::internal::ContainerAPI::DataViewSupport dataViewSupport = Neon::set::internal::ContainerAPI::DataViewSupport::on; + + static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = FoundationGrid::executionThreadSpan; + using ExecutionThreadSpanIndexType = typename FoundationGrid::ExecutionThreadSpanIndexType; + using Idx = typename FoundationGrid::Idx; + + static auto getDefaultBlock(FoundationGrid& foundationGrid) -> Neon::index_3d const& + { + return foundationGrid.getDefaultBlock(); + } + + static auto initSpan(FoundationGrid& foundationGrid, + Neon::domain::tool::SpanTable& spanTable) -> void + { + spanTable.forEachConfiguration([&](Neon::Execution execution, + Neon::SetIdx setIdx, + Neon::DataView dataViewOfTheTableEntry, + Span& NEON_OUT span) { + typename FoundationGrid::Span const& foundationSpan = foundationGrid.getSpan(execution, setIdx, dataViewOfTheTableEntry); + span = cSpan(foundationSpan.mFirstDataBlockOffset, + foundationSpan.mActiveMask, + foundationSpan.mDataView, + foundationGrid.helpGetClassField().getPartition(execution, setIdx, dataViewOfTheTableEntry)); + }); + } + + static auto initLaunchParameters(FoundationGrid& foundationGrid, + Neon::DataView dataView, + const Neon::index_3d& blockSize, + const size_t& shareMem) -> Neon::set::LaunchParameters + { + Neon::set::LaunchParameters launchParameters = foundationGrid.getLaunchParameters(dataView, blockSize, 
shareMem); + + launchParameters.forEachSeq([&](Neon::SetIdx setIdx, Neon::sys::GpuLaunchInfo& launchParameter) { + Neon::domain::tool::Partitioner1D const& foundationPartitioner1D = foundationGrid.helpGetPartitioner1D(); + auto const& spanLayout = foundationPartitioner1D.getSpanLayout(); + int nBlocks; + + Neon::domain::tool::partitioning::ByDomain byDomain = classSelector == cGrid::ClassSelector::alpha + ? Neon::domain::tool::partitioning::ByDomain::bulk + : Neon::domain::tool::partitioning::ByDomain::bc; + + int countInternal = spanLayout.getBoundsInternal(setIdx, byDomain).count; + int countBcUp = spanLayout.getBoundsBoundary(setIdx, + Neon::domain::tool::partitioning::ByDirection::up, + byDomain) + .count; + int countBcDw = spanLayout.getBoundsBoundary(setIdx, + Neon::domain::tool::partitioning::ByDirection::down, + byDomain) + .count; + + switch (dataView) { + case Neon::DataView::INTERNAL: + nBlocks = countInternal; + break; + case Neon::DataView::BOUNDARY: + nBlocks = countBcUp + countBcDw; + break; + case Neon::DataView::STANDARD: + nBlocks = countInternal + + countBcUp + + countBcDw; + break; + default: + throw Neon::NeonException("Unknown data view"); + } + + launchParameter.set(Neon::sys::GpuLaunchInfo::mode_e::cudaGridMode, + nBlocks, + SBlock::memBlockSize3D.template newType(), shareMem); + }); + return launchParameters; + } + + static auto helpGetGridIdx(FoundationGrid&, + Neon::SetIdx const&, + typename FoundationGrid::Idx const& fgIdx) + -> GridTransformation::Idx + { + GridTransformation::Idx tgIdx = fgIdx; + return tgIdx; + } + + template + static auto initFieldPartition(typename FoundationGrid::template Field& foundationField, + Neon::domain::tool::PartitionTable>& partitionTable) -> void + { + partitionTable.forEachConfiguration( + [&](Neon::Execution execution, + Neon::SetIdx setIdx, + Neon::DataView dw, + Partition& partition) { + auto& foundationPartition = foundationField.getPartition(execution, setIdx, dw); + partition = 
foundationPartition; + }); + } +}; + +template +using cGrid = typename Neon::domain::tool::GridTransformer>::Grid; + +} // namespace details::cGrid +} // namespace Neon::domain::details::disaggregated::bGridMask diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMask/ClassificationGrid/cSpan.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/ClassificationGrid/cSpan.h new file mode 100644 index 00000000..e38180ee --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/ClassificationGrid/cSpan.h @@ -0,0 +1,58 @@ +#pragma once + +#include "Neon/domain/details/bGridDisgMask/bIndex.h" + +namespace Neon::domain::details::disaggregated::bGridMask { +namespace details::cGrid { + +template +class cSpan +{ + public: + // bit mask information + using BitMaskWordType = uint64_t; + + static constexpr uint32_t bitMaskStorageBitWidth = 64; + static constexpr Neon::MemoryLayout activeMaskMemoryLayout = Neon::MemoryLayout::arrayOfStructs; + static constexpr uint32_t log2OfbitMaskWordSize = 6; + + using Idx = bIndex; + friend class bGrid; + + static constexpr int SpaceDim = 3; + + cSpan() = default; + virtual ~cSpan() = default; + + NEON_CUDA_HOST_DEVICE inline static auto getInvalidBlockId() + -> typename Idx::DataBlockIdx + { + return std::numeric_limits::max(); + } + + inline cSpan( + typename Idx::DataBlockCount mFirstDataBlockOffset, + typename SBlock::BitMask const* NEON_RESTRICT mActiveMask, + Neon::DataView mDataView, + uint8_t const* NEON_RESTRICT mClassMask); + + NEON_CUDA_HOST_DEVICE inline auto setAndValidateCPUDevice( + Idx& bidx, + uint32_t const& threadIdx, + uint32_t const& x, + uint32_t const& y, + uint32_t const& z) const -> bool; + + NEON_CUDA_HOST_DEVICE inline auto setAndValidateGPUDevice( + Idx& bidx) const -> bool; + + + // We don't need to have a count on active blocks + typename Idx::DataBlockCount mFirstDataBlockOffset; + typename SBlock::BitMask const* NEON_RESTRICT mActiveMask; + Neon::DataView mDataView; + 
uint8_t const* NEON_RESTRICT mClassMask; +}; +} // namespace Neon::domain::details::disaggregated::bGridMask +} +#include "Neon/domain/details/bGridDisgMask/ClassificationGrid/cSpan_imp.h" \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMask/ClassificationGrid/cSpan_imp.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/ClassificationGrid/cSpan_imp.h new file mode 100644 index 00000000..f8783a2e --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/ClassificationGrid/cSpan_imp.h @@ -0,0 +1,60 @@ +#include +#include "Neon/domain/details/bGridDisgMask/ClassificationGrid/cSpan.h" +namespace Neon::domain::details::disaggregated::bGridMask { +namespace details::cGrid { + +template +NEON_CUDA_HOST_DEVICE inline auto +cSpan::setAndValidateGPUDevice([[maybe_unused]] Idx& bidx) const -> bool +{ +#ifdef NEON_PLACE_CUDA_DEVICE + bidx.mDataBlockIdx = blockIdx.x + mFirstDataBlockOffset; + bidx.mInDataBlockIdx.x = threadIdx.x; + bidx.mInDataBlockIdx.y = threadIdx.y; + bidx.mInDataBlockIdx.z = threadIdx.z; + + const bool isActive = mActiveMask[bidx.mDataBlockIdx].isActive(bidx.mInDataBlockIdx.x, bidx.mInDataBlockIdx.y, bidx.mInDataBlockIdx.z); + const bool isClass = mClassMask[bidx.mDataBlockIdx * SB] == classSelector; + return isActive; +#else + NEON_THROW_UNSUPPORTED_OPERATION("Operation supported only on GPU"); +#endif +} + +template +NEON_CUDA_HOST_DEVICE inline auto +cSpan::setAndValidateCPUDevice(Idx& bidx, + uint32_t const& dataBlockIdx, + uint32_t const& x, + uint32_t const& y, + uint32_t const& z) const -> bool +{ + + bidx.mDataBlockIdx = dataBlockIdx + mFirstDataBlockOffset; + ; + bidx.mInDataBlockIdx.x = static_cast(x); + bidx.mInDataBlockIdx.y = static_cast(y); + bidx.mInDataBlockIdx.z = static_cast(z); + const bool isActive = mActiveMask[dataBlockIdx].isActive(bidx.mInDataBlockIdx.x, bidx.mInDataBlockIdx.y, bidx.mInDataBlockIdx.z); + const bool isClass = mClassMask[bidx.mDataBlockIdx * 
SBlock::memBlockCountElements + + bidx.mInDataBlockIdx.x + + bidx.mInDataBlockIdx.y * SBlock::memBlockSizeX + + bidx.mInDataBlockIdx.z * SBlock::memBlockSizeX * SBlock::memBlockSizeY] == classSelector; + + return isActive && isClass; +} + +template +cSpan::cSpan(typename Idx::DataBlockCount firstDataBlockOffset, + typename SBlock::BitMask const* NEON_RESTRICT activeMask, + Neon::DataView dataView, + uint8_t const* NEON_RESTRICT ClassMask) + : mFirstDataBlockOffset(firstDataBlockOffset), + mActiveMask(activeMask), + mDataView(dataView), + mClassMask(ClassMask) +{ +} + +} +} // namespace Neon::domain::details::disaggregated::bGridMask \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMask/StaticBlock.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/StaticBlock.h new file mode 100644 index 00000000..8ef687ce --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/StaticBlock.h @@ -0,0 +1,106 @@ +#pragma once + +#include "Neon/domain/details/bGridDisgMask/bSpan.h" + +namespace Neon::domain::details::disaggregated::bGridMask { + +template +struct StaticBlock +{ + public: + constexpr static uint32_t memBlockSizeX = memBlockSizeX_; + constexpr static uint32_t memBlockSizeY = memBlockSizeY_; + constexpr static uint32_t memBlockSizeZ = memBlockSizeZ_; + constexpr static Neon::uint32_3d memBlockSize3D = Neon::uint32_3d(memBlockSizeX, memBlockSizeY, memBlockSizeZ); + + constexpr static uint32_t userBlockSizeX = userBlockSizeX_; + constexpr static uint32_t userBlockSizeY = userBlockSizeY_; + constexpr static uint32_t userBlockSizeZ = userBlockSizeZ_; + constexpr static Neon::uint32_3d userBlockSize3D = Neon::uint32_3d(userBlockSizeX, userBlockSizeY, userBlockSizeZ); + + constexpr static uint32_t blockRatioX = memBlockSizeX / userBlockSizeX; + constexpr static uint32_t blockRatioY = memBlockSizeY / userBlockSizeY; + constexpr static uint32_t blockRatioZ = memBlockSizeZ / userBlockSizeZ; + + constexpr 
static uint32_t memBlockPitchX = 1; + constexpr static uint32_t memBlockPitchY = memBlockSizeX; + constexpr static uint32_t memBlockPitchZ = memBlockSizeX * memBlockSizeY; + + constexpr static bool isMultiResMode = isMultiResMode_; + + constexpr static uint32_t memBlockCountElements = memBlockSizeX * memBlockSizeY * memBlockSizeZ; + + static_assert(memBlockSizeX >= userBlockSizeX); + static_assert(memBlockSizeY >= userBlockSizeY); + static_assert(memBlockSizeZ >= userBlockSizeZ); + + static_assert(memBlockSizeX % userBlockSizeX == 0); + static_assert(memBlockSizeY % userBlockSizeY == 0); + static_assert(memBlockSizeZ % userBlockSizeZ == 0); + + struct BitMask + { + using BitMaskWordType = uint32_t; + auto reset() -> void + { + for (BitMaskWordType i = 0; i < nWords; ++i) { + bits[i] = 0; + } + } + + auto setActive(int threadX, + int threadY, + int threadZ) -> void + { + BitMaskWordType mask; + uint32_t wordIdx; + getMaskAndWordI(threadX, threadY, threadZ, mask, wordIdx); + auto& word = bits[wordIdx]; + word = word | mask; + } + + inline auto NEON_CUDA_HOST_DEVICE isActive(int threadX, + int threadY, + int threadZ) const -> bool + { + BitMaskWordType mask; + uint32_t wordIdx; + getMaskAndWordI(threadX, threadY, threadZ, mask, wordIdx); + auto& word = bits[wordIdx]; + return (word & mask) != 0; + } + + static inline auto NEON_CUDA_HOST_DEVICE getMaskAndWordI(int threadX, + int threadY, + int threadZ, + NEON_OUT BitMaskWordType& mask, + NEON_OUT uint32_t& wordIdx) -> void + { + const uint32_t threadPitch = threadX * memBlockPitchX + + threadY * memBlockPitchY + + threadZ * memBlockPitchZ; + + // threadPitch >> log2_of_bitPerWord + // the same as: threadPitch / 2^{log2_of_bitPerWord} + wordIdx = threadPitch >> log2_of_bitPerWord; + // threadPitch & ((bitMaskWordType(bitMaskStorageBitWidth)) - 1); + // same as threadPitch % 2^{log2OfbitMaskWordSize} + const uint32_t offsetInWord = threadPitch & ((BitMaskWordType(bitPerWord)) - 1); + mask = BitMaskWordType(1) << 
offsetInWord; + } + + constexpr static BitMaskWordType nWords = (memBlockCountElements + 31) / 32; + static constexpr uint32_t log2_of_bitPerWord = 5; + static constexpr uint32_t bitPerWord = 32; + + BitMaskWordType bits[nWords]; + }; +}; + +} // namespace Neon::domain::details::disaggregated::bGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMask/bField.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/bField.h new file mode 100644 index 00000000..60235a8c --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/bField.h @@ -0,0 +1,119 @@ +#pragma once +#include "Neon/domain/details/bGridDisgMask/bPartition.h" +#include "Neon/domain/interface/FieldBaseTemplate.h" +#include "Neon/set/patterns/BlasSet.h" + +#include "Neon/core/core.h" +#include "Neon/core/types/Macros.h" + +#include "Neon/set/DevSet.h" +#include "Neon/set/HuOptions.h" +#include "Neon/set/MemoryTransfer.h" + +#include "Neon/domain/interface/FieldBaseTemplate.h" +#include "Neon/domain/tools/HaloUpdateTable1DPartitioning.h" +#include "Neon/domain/tools/PartitionTable.h" +#include "bPartition.h" + +namespace Neon::domain::details::disaggregated::bGridMask { + + +template +class bField : public Neon::domain::interface::FieldBaseTemplate, + bPartition, + int> +{ + friend bGrid; + + public: + using Type = T; + using Grid = bGrid; + using Field = bField; + using Partition = bPartition; + using Idx = bIndex; + using BlockViewGrid = Neon::domain::tool::GridTransformer::Grid; + template + using BlockViewField = BlockViewGrid::template Field; + + using NghIdx = typename Partition::NghIdx; + using NghData = typename Partition::NghData; + + bField(const std::string& fieldUserName, + Neon::DataUse dataUse, + Neon::MemoryOptions memoryOptions, + const Grid& grid, + int cardinality, + T inactiveValue); + + bField(); + + virtual ~bField() = default; + + auto getPartition(Neon::Execution, + Neon::SetIdx, + const Neon::DataView& dataView) 
const -> const Partition& final; + + auto getPartition(Neon::Execution, + Neon::SetIdx, + const Neon::DataView& dataView) -> Partition& final; + + auto isInsideDomain(const Neon::index_3d& idx) const -> bool; + + + auto operator()(const Neon::index_3d& idx, + const int& cardinality) const -> T final; + + auto getReference(const Neon::index_3d& idx, + const int& cardinality) -> T& final; + + auto updateHostData(int streamId = 0) -> void final; + + auto updateDeviceData(int streamId = 0) -> void final; + + auto newHaloUpdate(Neon::set::StencilSemantic semantic, + Neon::set::TransferMode transferMode, + Neon::Execution execution) + const -> Neon::set::Container; + + auto getMemoryField() -> BlockViewGrid::Field&; + + + private: + auto getRef(const Neon::index_3d& idx, const int& cardinality) const -> T&; + + auto initHaloUpdateTable() -> void; + + + struct Data + { + Data() = default; + Data(Neon::Backend const& bk) + { + partitionTable.init(bk); + } + + enum EndPoints + { + src = 1, + dst = 0 + }; + + struct EndPointsUtils + { + static constexpr int nConfigs = 2; + }; + + std::shared_ptr grid; + BlockViewField memoryField; + int cardinality; + + Neon::domain::tool::HaloTable1DPartitioning mStandardHaloUpdateTable; + Neon::domain::tool::PartitionTable partitionTable; + }; + std::shared_ptr mData; +}; + + +} // namespace Neon::domain::details::disaggregated::bGrid diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMask/bField_imp.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/bField_imp.h new file mode 100644 index 00000000..006a9d3f --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/bField_imp.h @@ -0,0 +1,337 @@ +#pragma once + +#include "Neon/domain/details/bGridDisgMask/bField.h" + +namespace Neon::domain::details::disaggregated::bGridMask { + +template +bField::bField() +{ + mData = std::make_shared(); +} + +template +bField::bField(const std::string& fieldUserName, + Neon::DataUse dataUse, + Neon::MemoryOptions 
memoryOptions, + const Grid& grid, + int cardinality, + T inactiveValue) + : Neon::domain::interface::FieldBaseTemplate(&grid, + fieldUserName, + "bField", + cardinality, + inactiveValue, + dataUse, + memoryOptions, + Neon::domain::haloStatus_et::e::ON) +{ + mData = std::make_shared(grid.getBackend()); + mData->grid = std::make_shared(grid); + + if (memoryOptions.getOrder() == Neon::MemoryLayout::arrayOfStructs) { + NEON_WARNING("bField does not support MemoryLayout::arrayOfStructs, enforcing MemoryLayout::structOfArrays"); + memoryOptions.setOrder(Neon::MemoryLayout::structOfArrays); + } + // the allocation size is the number of blocks x block size x cardinality + mData->memoryField = mData->grid->getBlockViewGrid().template newField( + "BitMask", + [&] { + int elPerBlock = SBlock::memBlockCountElements * cardinality; + return elPerBlock; + }(), + inactiveValue, + dataUse, + mData->grid->getBackend().getMemoryOptions(bSpan::activeMaskMemoryLayout)); + + + { // Setting up partitionTable + // const int setCardinality = mData->grid->getBackend().getDeviceCount(); + mData->partitionTable.forEachConfiguration( + [&](Neon::Execution execution, + Neon::SetIdx setIdx, + Neon::DataView, + Partition& partition) { + auto& memoryFieldPartition = mData->memoryField.getPartition(execution, setIdx, Neon::DataView::STANDARD); + auto& blockConnectivity = mData->grid->helpGetBlockConnectivity().getPartition(execution, setIdx, Neon::DataView::STANDARD); + auto& bitmask = mData->grid->getActiveBitMask().getPartition(execution, setIdx, Neon::DataView::STANDARD); + auto& dataBlockOrigins = mData->grid->helpGetDataBlockOriginField().getPartition(execution, setIdx, Neon::DataView::STANDARD); + + partition = bPartition(setIdx, + cardinality, + memoryFieldPartition.mem(), + blockConnectivity.mem(), + bitmask.mem(), + dataBlockOrigins.mem(), + mData->grid->helpGetStencilIdTo3dOffset().rawMem(execution, setIdx), + mData->grid->getDimension()); + }); + } + + initHaloUpdateTable(); +} + 
+template +auto bField::getMemoryField() -> BlockViewGrid::Field& +{ + return mData->memoryField; +} + +template +auto bField::isInsideDomain(const Neon::index_3d& idx) const -> bool +{ + return mData->grid->isInsideDomain(idx); +} + +template +auto bField::getReference(const Neon::index_3d& cartesianIdx, + const int& cardinality) -> T& +{ + if constexpr (SBlock::isMultiResMode) { + auto& grid = this->getGrid(); + auto uniformCartesianIdx = cartesianIdx / grid.helGetMultiResDiscreteIdxSpacing(); + + if (cartesianIdx.x % grid.helGetMultiResDiscreteIdxSpacing() != 0 || + cartesianIdx.y % grid.helGetMultiResDiscreteIdxSpacing() != 0 || + cartesianIdx.z % grid.helGetMultiResDiscreteIdxSpacing() != 0) { + NeonException exp("bField::getReference"); + exp << "Input index is not multiple of the grid resolution"; + exp << "Index = " << cartesianIdx; + NEON_THROW(exp); + } + auto [setIdx, bIdx] = grid.helpGetSetIdxAndGridIdx(uniformCartesianIdx); + auto& partition = getPartition(Neon::Execution::host, setIdx, Neon::DataView::STANDARD); + auto& result = partition(bIdx, cardinality); + return result; + } else { + auto& grid = this->getGrid(); + auto [setIdx, bIdx] = grid.helpGetSetIdxAndGridIdx(cartesianIdx); + auto& partition = getPartition(Neon::Execution::host, setIdx, Neon::DataView::STANDARD); + auto& result = partition(bIdx, cardinality); + return result; + } +} + +template +auto bField::operator()(const Neon::index_3d& cartesianIdx, + const int& cardinality) const -> T +{ + auto& grid = this->getGrid(); + auto [setIdx, bIdx] = grid.helpGetSetIdxAndGridIdx(cartesianIdx); + if (setIdx.idx() == -1) { + return this->getOutsideValue(); + } + auto& partition = getPartition(Neon::Execution::host, setIdx, Neon::DataView::STANDARD); + auto& result = partition(bIdx, cardinality); + return result; +} + +template +auto bField::updateHostData(int streamId) -> void +{ + mData->memoryField.updateHostData(streamId); +} + +template +auto bField::updateDeviceData(int streamId) -> void +{ 
+ mData->memoryField.updateDeviceData(streamId); +} + +template +auto bField::getPartition(Neon::Execution execution, + Neon::SetIdx setIdx, + const Neon::DataView& dataView) const -> const Partition& +{ + const Neon::DataUse dataUse = this->getDataUse(); + bool isOk = Neon::ExecutionUtils::checkCompatibility(dataUse, execution); + if (isOk) { + Partition const& result = mData->partitionTable.getPartition(execution, setIdx, dataView); + return result; + } + std::stringstream message; + message << "The requested execution mode ( " << execution << " ) is not compatible with the field DataUse (" << dataUse << ")"; + NEON_THROW_UNSUPPORTED_OPERATION(message.str()); +} + +template +auto bField::getPartition(Neon::Execution execution, + Neon::SetIdx setIdx, + const Neon::DataView& dataView) -> Partition& +{ + const Neon::DataUse dataUse = this->getDataUse(); + bool isOk = Neon::ExecutionUtils::checkCompatibility(dataUse, execution); + if (isOk) { + Partition& result = mData->partitionTable.getPartition(execution, setIdx, dataView); + return result; + } + std::stringstream message; + message << "The requested execution mode ( " << execution << " ) is not compatible with the field DataUse (" << dataUse << ")"; + NEON_THROW_UNSUPPORTED_OPERATION(message.str()); +} + +template +auto bField::newHaloUpdate(Neon::set::StencilSemantic stencilSemantic, + Neon::set::TransferMode transferMode, + Neon::Execution execution) const -> Neon::set::Container +{ + + + // We need to define a graph of Containers + // One for the actual memory transfer + // One for the synchronization + // The order depends on the transfer mode: put or get + Neon::set::Container dataTransferContainer; + auto const& bk = this->getGrid().getBackend(); + + if (stencilSemantic == Neon::set::StencilSemantic::standard) { + auto transfers = bk.template newDataSet>(); + + if (this->getMemoryOptions().getOrder() == Neon::MemoryLayout::structOfArrays) { + for (auto byDirection : {tool::partitioning::ByDirection::up, + 
tool::partitioning::ByDirection::down}) { + + auto const& tableEntryByDir = mData->mStandardHaloUpdateTable.get(transferMode, + execution, + byDirection); + + tableEntryByDir.forEachSeq([&](SetIdx setIdx, auto const& tableEntryByDirBySetIdx) { + transfers[setIdx].insert(std::end(transfers[setIdx]), + std::begin(tableEntryByDirBySetIdx), + std::end(tableEntryByDirBySetIdx)); + }); + } + dataTransferContainer = + Neon::set::Container::factoryDataTransfer( + *this, + transferMode, + stencilSemantic, + transfers, + execution); + + + } else { + NEON_THROW_UNSUPPORTED_OPERATION(""); + } + } else { + NEON_DEV_UNDER_CONSTRUCTION(""); + } + Neon::set::Container SyncContainer = + Neon::set::Container::factorySynchronization( + *this, + Neon::set::SynchronizationContainerType::hostOmpBarrier); + + Neon::set::container::Graph graph(this->getBackend()); + const auto& dataTransferNode = graph.addNode(dataTransferContainer); + const auto& syncNode = graph.addNode(SyncContainer); + + switch (transferMode) { + case Neon::set::TransferMode::put: + graph.addDependency(dataTransferNode, syncNode, Neon::GraphDependencyType::data); + break; + case Neon::set::TransferMode::get: + graph.addDependency(syncNode, dataTransferNode, Neon::GraphDependencyType::data); + break; + default: + NEON_THROW_UNSUPPORTED_OPTION(); + break; + } + + graph.removeRedundantDependencies(); + + Neon::set::Container output = + Neon::set::Container::factoryGraph("dGrid-Halo-Update", + graph, + [](Neon::SetIdx, Neon::set::Loader&) {}); + return output; +} + +template +auto bField::initHaloUpdateTable() -> void +{ + // NEON_THROW_UNSUPPORTED_OPERATION(""); + auto& grid = this->getGrid(); + auto bk = grid.getBackend(); + auto getNghSetIdx = [&](SetIdx setIdx, Neon::domain::tool::partitioning::ByDirection direction) { + int res; + if (direction == Neon::domain::tool::partitioning::ByDirection::up) { + res = (setIdx + 1) % bk.getDeviceCount(); + } else { + res = (setIdx + bk.getDeviceCount() - 1) % 
bk.getDeviceCount(); + } + return res; + }; + + mData->mStandardHaloUpdateTable.forEachPutConfiguration( + bk, [&](Neon::SetIdx setIdxSrc, + Execution execution, + Neon::domain::tool::partitioning::ByDirection byDirection, + std::vector& transfersVec) { + { + using namespace Neon::domain::tool::partitioning; + + Neon::SetIdx setIdxDst = getNghSetIdx(setIdxSrc, byDirection); + Neon::SetIdx setIdxVec[2]; + setIdxVec[Data::EndPoints::dst] = setIdxDst; + setIdxVec[Data::EndPoints::src] = setIdxSrc; + + std::array partitions; + std::array*, Data::EndPointsUtils::nConfigs> blockViewPartitions; + std::array, Data::EndPointsUtils::nConfigs> ghostZBeginIdx; + std::array, Data::EndPointsUtils::nConfigs> boundaryZBeginIdx; + std::array memPhyDim; + + partitions[Data::EndPoints::dst] = &this->getPartition(execution, setIdxDst, Neon::DataView::STANDARD); + partitions[Data::EndPoints::src] = &this->getPartition(execution, setIdxSrc, Neon::DataView::STANDARD); + blockViewPartitions[Data::EndPoints::dst] = &(mData->memoryField.getPartition(execution, setIdxDst, Neon::DataView::STANDARD)); + blockViewPartitions[Data::EndPoints::src] = &(mData->memoryField.getPartition(execution, setIdxSrc, Neon::DataView::STANDARD)); + + for (auto endPoint : {Data::EndPoints::dst, Data::EndPoints::src}) { + for (auto direction : {ByDirection::down, ByDirection::up}) { + auto ghostFirst = mData->grid->mData->partitioner1D.getSpanLayout().getGhostBoundary(setIdxVec[endPoint], direction).first; + auto boundaryFirst = mData->grid->mData->partitioner1D.getSpanLayout().getBoundsBoundary(setIdxVec[endPoint], direction).first; + ghostZBeginIdx[endPoint][static_cast(direction)] = ghostFirst; + boundaryZBeginIdx[endPoint][static_cast(direction)] = boundaryFirst; + } + + memPhyDim[endPoint] = Neon::size_4d( + SBlock::memBlockCountElements, + 1, + 1, + size_t(blockViewPartitions[endPoint]->getCountAllocated()) * SBlock::memBlockCountElements); + } + + if (ByDirection::up == byDirection && 
bk.isLastDevice(setIdxSrc)) { + return; + } + + if (ByDirection::down == byDirection && bk.isFirstDevice(setIdxSrc)) { + return; + } + + T* srcMem = blockViewPartitions[Data::EndPoints::src]->mem(); + T* dstMem = blockViewPartitions[Data::EndPoints::dst]->mem(); + + Neon::size_4d dstGhostBuff(ghostZBeginIdx[Data::EndPoints::dst][static_cast(ByDirectionUtils::invert(byDirection))], 0, 0, 0); + Neon::size_4d srcBoundaryBuff(boundaryZBeginIdx[Data::EndPoints::src][static_cast(byDirection)], 0, 0, 0); + + size_t transferDataBlockCount = mData->grid->mData->partitioner1D.getSpanLayout().getBoundsBoundary(setIdxVec[Data::EndPoints::src], byDirection).count; + + // std::cout << "To " << dstGhostBuff << " prt " << blockViewPartitions[Data::EndPoints::dst]->prtID() << " From " << srcBoundaryBuff << " prt " << blockViewPartitions[Data::EndPoints::src]->prtID() << std::endl; + // std::cout << "dst mem " << blockViewPartitions[Data::EndPoints::dst]->mem() << " " << std::endl; + // std::cout << "dst transferDataBlockCount " << transferDataBlockCount << " " << std::endl; + // std::cout << "dst pitch " << (dstGhostBuff * memPhyDim[Data::EndPoints::dst]).rSum() << " " << std::endl; + // std::cout << "dst dstGhostBuff " << dstGhostBuff << " " << std::endl; + // std::cout << "dst pitch all" << memPhyDim[Data::EndPoints::dst] << " " << std::endl; + + Neon::set::MemoryTransfer transfer({setIdxDst, dstMem + (dstGhostBuff * memPhyDim[Data::EndPoints::dst]).rSum(), dstGhostBuff}, + {setIdxSrc, srcMem + (srcBoundaryBuff * memPhyDim[Data::EndPoints::src]).rSum(), srcBoundaryBuff}, + sizeof(T) * SBlock::memBlockCountElements * transferDataBlockCount); + + transfersVec.push_back(transfer); + } + }); +} + + +} // namespace Neon::domain::details::disaggregated::bGrid diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMask/bGrid.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/bGrid.h new file mode 100644 index 00000000..de0fec98 --- /dev/null +++ 
b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/bGrid.h @@ -0,0 +1,274 @@ +#pragma once +#include "Neon/core/core.h" + +#include "Neon/domain/aGrid.h" +#include "Neon/domain/details/bGridDisgMask/BlockView.h" +#include "Neon/domain/details/bGridDisgMask/StaticBlock.h" +#include "Neon/domain/details/bGridDisgMask/bField.h" +#include "Neon/domain/details/bGridDisgMask/bIndex.h" +#include "Neon/domain/details/bGridDisgMask/bPartition.h" +#include "Neon/domain/details/bGridDisgMask/bSpan.h" +#include "Neon/domain/interface/GridBaseTemplate.h" +#include "Neon/domain/patterns/PatternScalar.h" +#include "Neon/domain/tools/Partitioner1D.h" +#include "Neon/domain/tools/PointHashTable.h" +#include "Neon/domain/tools/SpanTable.h" +#include "Neon/set/Containter.h" +#include "Neon/set/LaunchParametersTable.h" +#include "Neon/set/memory/memSet.h" + +#include "Neon/domain/details/bGridDisgMask/ClassificationGrid/cGrid.h" + +#include "ClassificationGrid/cGrid.h" +#include "bField.h" +#include "bPartition.h" +#include "bSpan.h" + +namespace Neon::domain::details::disaggregated::bGridMask { + + +template +class bField; + +template +class bGrid : public Neon::domain::interface::GridBaseTemplate, + bIndex> +{ + public: + using Grid = bGrid; + template + using Partition = bPartition; + template + using Field = Neon::domain::details::disaggregated::bGridMask::bField; + + using Span = bSpan; + using NghIdx = typename Partition::NghIdx; + using GridBaseTemplate = Neon::domain::interface::GridBaseTemplate>; + + using Idx = bIndex; + static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = Neon::set::details::ExecutionThreadSpan::d1b3; + using ExecutionThreadSpanIndexType = uint32_t; + + using BlockIdx = uint32_t; + + using AlphaGrid = typename Neon::domain::details::disaggregated::bGridMask::details::cGrid::cGrid; + using BetaGrid = typename Neon::domain::details::disaggregated::bGridMask::details::cGrid::cGrid; + + bGrid() = default; + virtual ~bGrid(); + + 
/** + * Constructor for the vanilla block data structure with depth of 1 + */ + template + bGrid(const Neon::Backend& backend, + const Neon::int32_3d& domainSize, + const ActiveCellLambda activeCellLambda, + const Neon::domain::Stencil& stencil, + const double_3d& spacingData = double_3d(1, 1, 1), + const double_3d& origin = double_3d(0, 0, 0), + Neon::domain::tool::spaceCurves::EncoderType encoderType = Neon::domain::tool::spaceCurves::EncoderType::sweep); + + + /** + * Constructor for bGrid. This constructor should be directly used only by mGrid + */ + template + bGrid(const Neon::Backend& backend /**< Neon backend for the computation */, + const Neon::int32_3d& domainSize /**< Size of the bounded Cartesian */, + const ActiveCellLambda activeCellLambda /**< Function that identify the user domain inside the boxed Cartesian discretization */, + const Neon::domain::Stencil& stencil /**< union of tall the stencil that will be used in the computation */, + const int multiResDiscreteIdxSpacing /**< Parameter for the multi-resolution. Index i and index (i+1) may be remapped as i*voxelSpacing and (i+1)* voxelSpacing. + * For a uniform bGrid, i.e outside the context of multi-resolution this parameter is always 1 */ + , + const double_3d& spacingData /** Physical spacing between two consecutive data points in the Cartesian domain */, + const double_3d& origin /** Physical location in space of the origin of the Cartesian discretization */, + Neon::domain::tool::spaceCurves::EncoderType encoderType = Neon::domain::tool::spaceCurves::EncoderType::sweep); + + /** + * Returns some properties for a given cartesian in the Cartesian domain. 
+ * The provide index my be inside or outside the user defined bounded Cartesian domain + */ + auto getProperties(const Neon::index_3d& idx) + const -> typename GridBaseTemplate::CellProperties final; + + /** + * Returns true if the query 3D point is inside the user domain + * @param idx + * @return + */ + auto isInsideDomain(const Neon::index_3d& idx) + const -> bool final; + + /** + * Retrieves the device index that contains the query point + * @param idx + * @return + */ + auto getSetIdx(const Neon::index_3d& idx) + const -> int32_t final; + + /** + * Allocates a new field on the grid + */ + template + auto newField(const std::string name, + int cardinality, + T inactiveValue, + Neon::DataUse dataUse = Neon::DataUse::HOST_DEVICE, + Neon::MemoryOptions memoryOptions = Neon::MemoryOptions()) const + -> Field; + + /** + * Allocates a new field on the block view grid + */ + template + auto newBlockViewField(const std::string name, + int cardinality, + T inactiveValue, + Neon::DataUse dataUse = Neon::DataUse::HOST_DEVICE, + Neon::MemoryOptions memoryOptions = Neon::MemoryOptions()) const + -> BlockView::Field; + + /** + * Allocates a new container to execute some computation in the grid + */ + template + auto newContainer(const std::string& name, + index_3d blockSize, + size_t sharedMem, + LoadingLambda lambda) const -> Neon::set::Container; + + /** + * Allocates a new container to execute some computation in the grid + */ + template + auto newContainer(const std::string& name, + LoadingLambda lambda) const -> Neon::set::Container; + + template + auto newAlphaContainer(const std::string& name, + LoadingLambda lambda) const -> Neon::set::Container; + + template + auto newBetaContainer(const std::string& name, + LoadingLambda lambda) const -> Neon::set::Container; + + template + auto newAlphaBetaContainer(const std::string& name, + LoadingLambdaAlpha lambdaAlpha, + LoadingLambdaBeta lambdaBeta) const -> Neon::set::Container; + + /** + * Defines a new set of parameter 
to launch a Container + */ + auto getLaunchParameters(Neon::DataView dataView, + const Neon::index_3d& blockSize, + const size_t& sharedMem) const -> Neon::set::LaunchParameters; + + /** + * Retrieve the span associated to the grid w.r.t. some user defined parameters. + */ + auto getSpan(Neon::Execution execution, + SetIdx setIdx, + Neon::DataView dataView) -> const Span&; + + /** + * Retrieve the block vew grid internally used. + * This grid can be leverage to allocate data at the block level. + */ + auto getBlockViewGrid() const -> BlockView::Grid&; + + + /** + * Retrieve the block vew grid internally used. + * This grid can be leverage to allocate data at the block level. + */ + auto getActiveBitMask() const -> BlockView::Field&; + + /** + * Helper function to retrieve the discrete index spacing used for the multi-resolution + */ + template + auto helGetMultiResDiscreteIdxSpacing() const -> std::enable_if_t; + + + /** + * Help function to retrieve the block connectivity as a BlockViewGrid field + */ + auto helpGetBlockConnectivity() const -> BlockView::Field&; + + /** + * Help function to retrieve the block origin as a BlockViewGrid field + */ + auto helpGetDataBlockOriginField() const -> Neon::aGrid::Field&; + + /** + * Help function to retrieve the map that converts a stencil point id to 3d offset + */ + auto helpGetStencilIdTo3dOffset() const -> Neon::set::MemSet&; + + auto helpGetPartitioner1D() -> Neon::domain::tool::Partitioner1D&; + + /** + * Help function retriev the device and the block index associated to a point in the BlockViewGrid grid + */ + auto helpGetSetIdxAndGridIdx(Neon::index_3d idx) const -> std::tuple; + + template + auto init_mask_field(ActiveCellLambda activeCellLambda) -> void; + auto helpGetClassField() -> Field&; + + + struct Data + { + auto init(const Neon::Backend& bk) + { + spanTable.init(bk); + launchParametersTable.init(bk); + } + + Neon::domain::tool::SpanTable spanTable /** Span for each data view configurations */; + 
Neon::set::LaunchParametersTable launchParametersTable; + + Neon::domain::tool::Partitioner1D partitioner1D; + Stencil stencil; + Neon::sys::patterns::Engine reduceEngine; + + Neon::aGrid memoryGrid /** memory allocator for fields */; + Neon::aGrid::Field mDataBlockOriginField; + Neon::set::MemSet mStencil3dTo1dOffset; + + BlockView::Grid blockViewGrid; + BlockView::Field activeBitField; + BlockView::Field blockConnectivity; + Neon::set::MemSet stencilIdTo3dOffset; + + int mMultiResDiscreteIdxSpacing; + + // number of active voxels in each block + Neon::set::DataSet mNumActiveVoxel; + + + // Stencil neighbor indices + Neon::set::MemSet mStencilNghIndex; + + AlphaGrid alphaGrid; + BetaGrid betaGrid; + Field maskClassField; + }; + std::shared_ptr mData; +}; +extern template class bGrid>; +} // namespace Neon::domain::details::disaggregated::bGridMask + +#include "bField_imp.h" +#include "bGrid_imp.h" diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMask/bGrid_imp.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/bGrid_imp.h new file mode 100644 index 00000000..95fde021 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/bGrid_imp.h @@ -0,0 +1,572 @@ +#include "Neon/domain/details/bGridDisgMask/bGrid.h" +#include "Neon/domain/tools/SpaceCurves.h" + +namespace Neon::domain::details::disaggregated::bGridMask { + +template +template +bGrid::bGrid(const Neon::Backend& backend, + const Neon::int32_3d& domainSize, + const ActiveCellLambda activeCellLambda, + const Neon::domain::Stencil& stencil, + const double_3d& spacingData, + const double_3d& origin, + Neon::domain::tool::spaceCurves::EncoderType encoderType) + : bGrid(backend, domainSize, activeCellLambda, stencil, 1, spacingData, origin, encoderType) +{ +} + +template +template +bGrid::bGrid(const Neon::Backend& backend, + const Neon::int32_3d& domainSize, + const ActiveCellLambda activeCellLambda, + const Neon::domain::Stencil& stencil, + const int 
multiResDiscreteIdxSpacing, + const double_3d& spacingData, + const double_3d& origin, + Neon::domain::tool::spaceCurves::EncoderType encoderType) +{ + + + mData = std::make_shared(); + mData->init(backend); + + mData->mMultiResDiscreteIdxSpacing = multiResDiscreteIdxSpacing; + mData->stencil = stencil; + const index_3d defaultKernelBlockSize(SBlock::memBlockSizeX, + SBlock::memBlockSizeY, + SBlock::memBlockSizeZ); + + std::stringstream gridName; + gridName << "bGrid_" << SBlock::memBlockSizeX << "_" + << SBlock::memBlockSizeY << "_" + << SBlock::memBlockSizeZ; + { + auto nElementsPerPartition = backend.devSet().template newDataSet(0); + // We do an initialization with nElementsPerPartition to zero, + // then we reset to the computed number. + + bGrid::GridBase::init(gridName.str(), + backend, + domainSize, + stencil, + nElementsPerPartition, + defaultKernelBlockSize, + multiResDiscreteIdxSpacing, + origin, + encoderType, + defaultKernelBlockSize); + } + + { // Initialization of the partitioner + using returTypeOfLambda = typename std::invoke_result::type; + if constexpr (std::is_same_v) { + mData->partitioner1D = Neon::domain::tool::Partitioner1D( + backend, + activeCellLambda, + nullptr, + SBlock::memBlockSize3D.template newType(), + domainSize, + Neon::domain::Stencil::s27_t(false), + encoderType, + multiResDiscreteIdxSpacing); + } else if constexpr (std::is_same_v) { + mData->partitioner1D = Neon::domain::tool::Partitioner1D( + backend, + [&](Neon::index_3d idx) { + return activeCellLambda(idx) != details::cGrid::ClassSelector::outside; + }, + // [&](Neon::index_3d idx) { + // return (activeCellLambda(idx) == details::cGrid::ClassSelector::beta) + // ? 
Neon::domain::tool::partitioning::ByDomain::bc + // : Neon::domain::tool::partitioning::ByDomain::bulk; + // }, + nullptr, + SBlock::memBlockSize3D.template newType(), + domainSize, + Neon::domain::Stencil::s27_t(false), + encoderType, + multiResDiscreteIdxSpacing); + } else { + NEON_THROW_UNSUPPORTED_OPERATION("The user defined lambda must return a bool or a ClassSelector"); + } + + mData->mDataBlockOriginField = mData->partitioner1D.getGlobalMapping(); + mData->mStencil3dTo1dOffset = mData->partitioner1D.getStencil3dTo1dOffset(); + mData->memoryGrid = mData->partitioner1D.getMemoryGrid(); + } + + { // BlockViewGrid + Neon::domain::details::eGrid::eGrid egrid( + backend, + mData->partitioner1D.getBlockSpan(), + mData->partitioner1D, + Neon::domain::Stencil::s27_t(false), + spacingData * SBlock::memBlockSize3D, + origin); + + mData->blockViewGrid = BlockView::Grid(egrid); + } + + { // Active bitmask + mData->activeBitField = mData->blockViewGrid.template newField( + "BlockViewBitMask", + 1, + [] { + typename SBlock::BitMask outsideBitMask; + outsideBitMask.reset(); + return outsideBitMask; + }(), + Neon::DataUse::HOST_DEVICE, backend.getMemoryOptions(BlockView::layout)); + + mData->mNumActiveVoxel = backend.devSet().template newDataSet(); + + mData->activeBitField + .getGrid() + .template newContainer( + "activeBitMaskInit", + [&, this](Neon::set::Loader& loader) { + auto bitMaskPartition = loader.load(mData->activeBitField); + return [&, bitMaskPartition](const auto& bitMaskIdx) mutable { + auto prtIdx = bitMaskPartition.prtID(); + int countActive = 0; + auto const blockOrigin = bitMaskPartition.getGlobalIndex(bitMaskIdx); + typename SBlock::BitMask& bitMask = bitMaskPartition(bitMaskIdx, 0); + bitMask.reset(); + + for (int k = 0; k < SBlock::memBlockSize3D.template newType().z; k++) { + for (int j = 0; j < SBlock::memBlockSize3D.template newType().y; j++) { + for (int i = 0; i < SBlock::memBlockSize3D.template newType().x; i++) { + auto globalPosition = 
blockOrigin + Neon::int32_3d(i * this->mData->mMultiResDiscreteIdxSpacing, + j * this->mData->mMultiResDiscreteIdxSpacing, + k * this->mData->mMultiResDiscreteIdxSpacing); + bool const isInDomain = globalPosition < domainSize * this->mData->mMultiResDiscreteIdxSpacing; + bool const isActive = activeCellLambda(globalPosition); + if (isActive && isInDomain) { + countActive++; + bitMask.setActive(i, j, k); + } + } + } + } +#pragma omp critical + { + this->mData->mNumActiveVoxel[prtIdx] += countActive; + } + }; + }) + .run(Neon::Backend::mainStreamIdx); + + + mData->activeBitField.updateDeviceData(Neon::Backend::mainStreamIdx); + this->getBackend().sync(Neon::Backend::mainStreamIdx); + mData->activeBitField.newHaloUpdate(Neon::set::StencilSemantic::standard, + Neon::set::TransferMode::put, + Neon::Execution::device) + .run(Neon::Backend::mainStreamIdx); + } + + + { // Neighbor blocks + mData->blockConnectivity = mData->blockViewGrid.template newField("blockConnectivity", + 27, + Span::getInvalidBlockId(), + Neon::DataUse::HOST_DEVICE, + Neon::MemoryLayout::arrayOfStructs); + + mData->blockConnectivity.getGrid().template newContainer( + "blockConnectivityInit", + [&](Neon::set::Loader& loader) { + auto blockConnectivity = loader.load(mData->blockConnectivity); + return [&, blockConnectivity](auto const& idx) mutable { + for (int8_t k = 0; k < 3; k++) { + for (int8_t j = 0; j < 3; j++) { + for (int8_t i = 0; i < 3; i++) { + auto targetDirection = i + 3 * j + 3 * 3 * k; + BlockIdx blockNghIdx = Span::getInvalidBlockId(); + typename decltype(blockConnectivity)::Idx nghIdx; + Neon::int8_3d stencilPoint(i - int8_t(1), + j - int8_t(1), + k - int8_t(1)); + bool isValid = blockConnectivity.getNghIndex(idx, stencilPoint, nghIdx); + if (isValid) { + blockNghIdx = static_cast(nghIdx.helpGet()); + } + blockConnectivity(idx, targetDirection) = blockNghIdx; + } + } + } + }; + }) + .run(Neon::Backend::mainStreamIdx); + 
mData->blockConnectivity.updateDeviceData(Neon::Backend::mainStreamIdx); + } + + // Initialization of the SPAN table + mData->spanTable.forEachConfiguration([&](Neon::Execution execution, + Neon::SetIdx setIdx, + Neon::DataView dw, + Span& span) { + span.mDataView = dw; + switch (dw) { + case Neon::DataView::STANDARD: { + span.mFirstDataBlockOffset = 0; + span.mDataView = dw; + span.mActiveMask = mData->activeBitField.getPartition(execution, setIdx, dw).mem(); + break; + } + case Neon::DataView::BOUNDARY: { + span.mFirstDataBlockOffset = mData->partitioner1D.getSpanClassifier().countInternal(setIdx); + span.mDataView = dw; + span.mActiveMask = mData->activeBitField.getPartition(execution, setIdx, dw).mem(); + + break; + } + case Neon::DataView::INTERNAL: { + span.mFirstDataBlockOffset = 0; + span.mDataView = dw; + span.mActiveMask = mData->activeBitField.getPartition(execution, setIdx, dw).mem(); + break; + } + default: { + NeonException exc("dFieldDev"); + NEON_THROW(exc); + } + } + }); + + { // Stencil Idx to 3d offset + auto nPoints = backend.devSet().newDataSet(stencil.nNeighbours()); + mData->stencilIdTo3dOffset = backend.devSet().template newMemSet(Neon::DataUse::HOST_DEVICE, + 1, + backend.getMemoryOptions(), + nPoints); + for (int i = 0; i < stencil.nNeighbours(); ++i) { + for (int devIdx = 0; devIdx < backend.devSet().setCardinality(); devIdx++) { + index_3d pLong = stencil.neighbours()[i]; + Neon::int8_3d pShort = pLong.newType(); + mData->stencilIdTo3dOffset.eRef(devIdx, i) = pShort; + } + } + mData->stencilIdTo3dOffset.updateDeviceData(backend, Neon::Backend::mainStreamIdx); + } + // Init the base grid + bGrid::GridBase::init(gridName.str(), + backend, + domainSize, + Neon::domain::Stencil(), + mData->mNumActiveVoxel, + SBlock::memBlockSize3D.template newType(), + spacingData, + origin, + encoderType, + defaultKernelBlockSize); + { // setting launchParameters + mData->launchParametersTable.forEachSeq([&](Neon::DataView dw, + Neon::set::LaunchParameters& 
bLaunchParameters) { + auto defEGridBlock = mData->blockViewGrid.getDefaultBlock(); + auto eGridParams = mData->blockViewGrid.getLaunchParameters(dw, defEGridBlock, 0); + eGridParams.forEachSeq([&](Neon::SetIdx setIdx, Neon::sys::GpuLaunchInfo const& launchSingleDev) { + auto eDomainGridSize = launchSingleDev.domainGrid(); + assert(eDomainGridSize.y == 1); + assert(eDomainGridSize.z == 1); + int nBlocks = static_cast(eDomainGridSize.x); + bLaunchParameters.get(setIdx).set(Neon::sys::GpuLaunchInfo::mode_e::cudaGridMode, + nBlocks, SBlock::memBlockSize3D.template newType(), 0); + }); + }); + } + + this->init_mask_field(); + mData->alphaGrid = details::cGrid::cGrid(*this); + mData->betaGrid = details::cGrid::cGrid(*this); +} + +template +template +auto bGrid::newField(const std::string name, + int cardinality, + T inactiveValue, + Neon::DataUse dataUse, + Neon::MemoryOptions memoryOptions) const -> Field +{ + memoryOptions = this->getDevSet().sanitizeMemoryOption(memoryOptions); + Field field(name, dataUse, memoryOptions, *this, cardinality, inactiveValue); + return field; +} + +template +template +auto bGrid::newBlockViewField(const std::string name, + int cardinality, + T inactiveValue, + Neon::DataUse dataUse, + Neon::MemoryOptions memoryOptions) const -> BlockView::Field +{ + memoryOptions = this->getDevSet().sanitizeMemoryOption(memoryOptions); + BlockView::Field blockViewField = mData->blockViewGrid.template newField(name, cardinality, inactiveValue, dataUse, memoryOptions); + return blockViewField; +} + +template +template +auto bGrid::newContainer(const std::string& name, + index_3d blockSize, + size_t sharedMem, + LoadingLambda lambda) const -> Neon::set::Container +{ + Neon::set::Container kContainer = Neon::set::Container::factory(name, + Neon::set::internal::ContainerAPI::DataViewSupport::on, + *this, + lambda, + blockSize, + [sharedMem](const Neon::index_3d&) { return sharedMem; }); + return kContainer; +} + +template +template +auto 
bGrid::newContainer(const std::string& name, + LoadingLambda lambda) const -> Neon::set::Container +{ + const Neon::index_3d& defaultBlockSize = this->getDefaultBlock(); + Neon::set::Container kContainer = Neon::set::Container::factory(name, + Neon::set::internal::ContainerAPI::DataViewSupport::on, + *this, + lambda, + defaultBlockSize, + [](const Neon::index_3d&) { return 0; }); + return kContainer; +} + +template +template +auto bGrid::newAlphaContainer(const std::string& name, + LoadingLambda lambda) const -> Neon::set::Container +{ + + auto kContainer = mData->alphaGrid.newContainer(name, + lambda); + return kContainer; +} + +template +template +auto bGrid::newBetaContainer(const std::string& name, + LoadingLambda lambda) const -> Neon::set::Container +{ + + auto kContainer = mData->betaGrid.newContainer(name, + lambda); + return kContainer; +} +template + +template +auto bGrid::newAlphaBetaContainer(const std::string& name, + LoadingLambdaAlpha lambdaAlpha, + LoadingLambdaBeta lambdaBeta) const -> Neon::set::Container +{ + std::vector sequence; + auto containerAlpha = mData->alphaGrid.newContainer(name + "Alpha", + lambdaAlpha); + auto containerBeta = mData->betaGrid.newContainer(name + "Beta", + lambdaBeta); + + sequence.push_back(containerAlpha); + sequence.push_back(containerBeta); + + Neon::set::Container exec = Neon::set::Container::factorySequence(name + "Sequence", sequence); + return exec; +} + +template +auto bGrid:: + getBlockViewGrid() + const -> BlockView::Grid& +{ + return mData->blockViewGrid; +} + +template +auto bGrid:: + getActiveBitMask() + const -> BlockView::Field& +{ + return mData->activeBitField; +} + +/** + * Helper function to retrieve the discrete index spacing used for the multi-resolution + */ +template +template +auto bGrid::helGetMultiResDiscreteIdxSpacing() const + -> std::enable_if_t +{ + return mData->mMultiResDiscreteIdxSpacing; +} + +template +auto bGrid:: + helpGetBlockConnectivity() + const -> BlockView::Field& +{ + return 
mData->blockConnectivity; +} +template +auto bGrid:: + helpGetDataBlockOriginField() + const -> Neon::aGrid::Field& +{ + return mData->mDataBlockOriginField; +} +template +auto bGrid::getSpan(Neon::Execution execution, + SetIdx setIdx, + Neon::DataView dataView) -> const bGrid::Span& +{ + return mData->spanTable.getSpan(execution, setIdx, dataView); +} + +template +bGrid::~bGrid() +{ +} +template +auto bGrid::getSetIdx(const index_3d& idx) const -> int32_t +{ + typename GridBaseTemplate::CellProperties cellProperties; + + cellProperties.setIsInside(this->isInsideDomain(idx)); + if (!cellProperties.isInside()) { + return -1; + } + Neon::SetIdx setIdx = cellProperties.getSetIdx(); + return setIdx; +} +template +auto bGrid::getLaunchParameters(Neon::DataView dataView, + const index_3d&, + const size_t& sharedMem) const -> Neon::set::LaunchParameters +{ + auto res = mData->launchParametersTable.get(dataView); + res.forEachSeq([&](SetIdx const& /*setIdx*/, + Neon::set::LaunchParameters::launchInfo_e& launchParams) -> void { + launchParams.setShm(sharedMem); + }); + return res; +} + +template +auto bGrid:: + helpGetStencilIdTo3dOffset() + const -> Neon::set::MemSet& +{ + return mData->stencilIdTo3dOffset; +} + +template +auto bGrid::isInsideDomain(const index_3d& idx) const -> bool +{ + // 1. check if the block is active + const BlockView::index_3d blockIdx3d = idx / (SBlock::memBlockSize3D.template newType() * mData->mMultiResDiscreteIdxSpacing); + auto blockProperties = mData->blockViewGrid.getProperties(blockIdx3d); + + if (!blockProperties.isInside()) { + return false; + } + // 2. 
The block is active, check the element in the block + typename SBlock::BitMask const& bitMask = mData->activeBitField.getReference(blockIdx3d, 0); + + bool isActive = bitMask.isActive((idx.x / mData->mMultiResDiscreteIdxSpacing) % SBlock::memBlockSize3D.x, + (idx.y / mData->mMultiResDiscreteIdxSpacing) % SBlock::memBlockSize3D.y, + (idx.z / mData->mMultiResDiscreteIdxSpacing) % SBlock::memBlockSize3D.z); + return isActive; +} + +template +auto bGrid::getProperties(const index_3d& idx) + const -> typename GridBaseTemplate::CellProperties +{ + typename GridBaseTemplate::CellProperties cellProperties; + + cellProperties.setIsInside(this->isInsideDomain(idx)); + if (!cellProperties.isInside()) { + return cellProperties; + } + + if (this->getDevSet().setCardinality() == 1) { + cellProperties.init(0, DataView::INTERNAL); + } else { + const index_3d blockIdx3d = idx / SBlock::memBlockSize3D.template newType(); + auto blockViewProperty = mData->blockViewGrid.getProperties(blockIdx3d); + + cellProperties.init(blockViewProperty.getSetIdx(), + blockViewProperty.getDataView()); + } + return cellProperties; +} + +template +auto bGrid::helpGetSetIdxAndGridIdx(Neon::index_3d idx) + const -> std::tuple +{ + const index_3d blockIdx3d = idx / (SBlock::memBlockSize3D.template newType() * mData->mMultiResDiscreteIdxSpacing); + auto [setIdx, bvGridIdx] = mData->blockViewGrid.helpGetSetIdxAndGridIdx(blockIdx3d); + Idx bIdx; + bIdx.mDataBlockIdx = bvGridIdx.helpGet(); + bIdx.mInDataBlockIdx.x = static_cast((idx.x / mData->mMultiResDiscreteIdxSpacing) % SBlock::memBlockSize3D.x); + bIdx.mInDataBlockIdx.y = static_cast((idx.y / mData->mMultiResDiscreteIdxSpacing) % SBlock::memBlockSize3D.y); + bIdx.mInDataBlockIdx.z = static_cast((idx.z / mData->mMultiResDiscreteIdxSpacing) % SBlock::memBlockSize3D.z); + + return {setIdx, bIdx}; +} + +template +auto bGrid::helpGetPartitioner1D() -> Neon::domain::tool::Partitioner1D& +{ + return mData->partitioner1D; +} + +template +template +auto 
bGrid::init_mask_field(ActiveCellLambda activeCellLambda) -> void +{ + using returTypeOfLambda = typename std::invoke_result::type; + if constexpr (std::is_same_v) { + + + auto maskField = this->newField("maskField", 1, 0, Neon::DataUse::HOST_DEVICE); + maskField.getGrid().template newContainer( + "maskFieldInit", + [&](Neon::set::Loader& loader) { + auto maskFieldPartition = loader.load(maskField); + return [&, maskFieldPartition](const auto& gIdx) mutable { + details::cGrid::ClassSelector voxelClass = activeCellLambda(gIdx); + maskFieldPartition(gIdx, 0) = static_cast(voxelClass); + }; + }) + .run(Neon::Backend::mainStreamIdx); + this->getBackend().sync(Neon::Backend::mainStreamIdx); + maskField.updateDeviceData(Neon::Backend::mainStreamIdx); + this->getBackend().sync(Neon::Backend::mainStreamIdx); + this->mData->maskClassField = maskField; + return; + } +} + +template +auto bGrid::helpGetClassField() -> Field& +{ + return mData->maskClassField; +} + +} // namespace Neon::domain::details::disaggregated::bGridMask \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMask/bIndex.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/bIndex.h new file mode 100644 index 00000000..7bc7bf74 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/bIndex.h @@ -0,0 +1,142 @@ +#pragma once + +#include "Neon/core/core.h" + + +namespace Neon::domain::details::disaggregated::bGridMask { + +// Common forward declarations +template +class bGrid; +template +class bSpan; +template +class bPartition; + +class MicroIndex +{ + public: + using TrayIdx = int32_t; + using InTrayIdx = int8_3d; + + NEON_CUDA_HOST_DEVICE inline explicit MicroIndex() + : MicroIndex(0, 0, 0, 0) + { + } + + NEON_CUDA_HOST_DEVICE inline explicit MicroIndex(const TrayIdx& blockIdx, + const InTrayIdx::Integer& x, + const InTrayIdx::Integer& y, + const InTrayIdx::Integer& z) + { + mTrayBlockIdx = blockIdx; + mInTrayBlockIdx.x = x; + 
mInTrayBlockIdx.y = y; + mInTrayBlockIdx.z = z; + } + + NEON_CUDA_HOST_DEVICE inline auto getInTrayBlockIdx() const -> InTrayIdx const& + { + return mInTrayBlockIdx; + } + + NEON_CUDA_HOST_DEVICE inline auto getTrayBlockIdx() const -> TrayIdx const& + { + return mTrayBlockIdx; + } + + NEON_CUDA_HOST_DEVICE inline auto setInTrayBlockIdx(InTrayIdx const& inTrayIdx) -> void + { + mInTrayBlockIdx = inTrayIdx; + } + + NEON_CUDA_HOST_DEVICE inline auto setTrayBlockIdx(TrayIdx const& trayIdx) -> void + { + mTrayBlockIdx = trayIdx; + } + + InTrayIdx mInTrayBlockIdx; + TrayIdx mTrayBlockIdx{}; +}; + +template +class bIndex +{ + public: + template + friend class bSpan; + using OuterIdx = bIndex; + + using NghIdx = int8_3d; + template + friend class bPartition; + + template + friend class bField; + + template + friend class bSpan; + template + friend class bGrid; + + + using TrayIdx = MicroIndex::TrayIdx; + using InTrayIdx = MicroIndex::InTrayIdx; + + using DataBlockCount = std::make_unsigned_t; + using DataBlockIdx = std::make_unsigned_t; + using InDataBlockIdx = InTrayIdx; + + bIndex() = default; + ~bIndex() = default; + + NEON_CUDA_HOST_DEVICE inline explicit bIndex(const DataBlockIdx& blockIdx, + const InDataBlockIdx::Integer& x, + const InDataBlockIdx::Integer& y, + const InDataBlockIdx::Integer& z); + + NEON_CUDA_HOST_DEVICE inline auto getMicroIndex() -> MicroIndex; + NEON_CUDA_HOST_DEVICE inline auto init(MicroIndex const&) -> void; + + NEON_CUDA_HOST_DEVICE inline auto getInDataBlockIdx() const -> InDataBlockIdx const&; + NEON_CUDA_HOST_DEVICE inline auto getDataBlockIdx() const -> DataBlockIdx const&; + NEON_CUDA_HOST_DEVICE inline auto setInDataBlockIdx(InDataBlockIdx const&) -> void; + NEON_CUDA_HOST_DEVICE inline auto setDataBlockIdx(DataBlockIdx const&) -> void; + NEON_CUDA_HOST_DEVICE inline auto isActive() const -> bool; + // the local index within the block + InDataBlockIdx mInDataBlockIdx; + DataBlockIdx mDataBlockIdx{}; +}; + +template 
+NEON_CUDA_HOST_DEVICE auto bIndex::setDataBlockIdx(const bIndex::DataBlockIdx& dataBlockIdx) -> void +{ + mDataBlockIdx = dataBlockIdx; +} + +template +NEON_CUDA_HOST_DEVICE auto bIndex::setInDataBlockIdx(const bIndex::InDataBlockIdx& inDataBlockIdx) -> void +{ + mInDataBlockIdx = inDataBlockIdx; +} + +template +NEON_CUDA_HOST_DEVICE auto bIndex::getDataBlockIdx() const -> const bIndex::DataBlockIdx& +{ + return mDataBlockIdx; +} +template +NEON_CUDA_HOST_DEVICE auto bIndex::getInDataBlockIdx() const -> const bIndex::InDataBlockIdx& +{ + return mInDataBlockIdx; +} + +template +NEON_CUDA_HOST_DEVICE auto bIndex::isActive() const -> bool +{ + return mDataBlockIdx != std::numeric_limits::max(); +} + +} // namespace Neon::domain::details::disaggregated::bGrid + +#include "Neon/domain/details/bGridDisgMask/bIndex_imp.h" diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMask/bIndex_imp.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/bIndex_imp.h new file mode 100644 index 00000000..10ad51b5 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/bIndex_imp.h @@ -0,0 +1,67 @@ +#pragma once +#include "Neon/domain/details/bGridDisgMask/bIndex.h" + +namespace Neon::domain::details::disaggregated::bGridMask { + +template +NEON_CUDA_HOST_DEVICE inline bIndex:: + bIndex(const DataBlockIdx& blockIdx, + const InDataBlockIdx::Integer& x, + const InDataBlockIdx::Integer& y, + const InDataBlockIdx::Integer& z) +{ + mDataBlockIdx = blockIdx; + mInDataBlockIdx.x = x; + mInDataBlockIdx.y = y; + mInDataBlockIdx.z = z; +} + + +template +NEON_CUDA_HOST_DEVICE inline auto bIndex::getMicroIndex() -> MicroIndex +{ + + + TrayIdx const exBlockOffset = mDataBlockIdx * (SBlock::blockRatioX * SBlock::blockRatioY * SBlock::blockRatioZ); + TrayIdx const exTrayOffset = [&] { + TrayIdx const trayBlockIdxX = mInDataBlockIdx.x / SBlock::userBlockSizeX; + TrayIdx const trayBlockIdxY = mInDataBlockIdx.y / SBlock::userBlockSizeY; + TrayIdx const trayBlockIdxZ 
= mInDataBlockIdx.z / SBlock::userBlockSizeZ; + + TrayIdx const res = trayBlockIdxX + trayBlockIdxY * SBlock::blockRatioX + + trayBlockIdxZ * (SBlock::blockRatioX * SBlock::blockRatioY); + return res; + }(); + MicroIndex res; + res.setTrayBlockIdx(exBlockOffset + exTrayOffset); + res.setInTrayBlockIdx({static_cast(mInDataBlockIdx.x % SBlock::userBlockSizeX), + static_cast(mInDataBlockIdx.y % SBlock::userBlockSizeY), + static_cast(mInDataBlockIdx.z % SBlock::userBlockSizeZ)}); + return res; +} + + +template +NEON_CUDA_HOST_DEVICE inline auto bIndex::init(MicroIndex const& microIndex) -> void +{ + constexpr uint32_t memBlockSize = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ; + constexpr uint32_t userBlockSize = SBlock::userBlockSizeX * SBlock::userBlockSizeY * SBlock::userBlockSizeZ; + constexpr uint32_t blockRatioSize = memBlockSize / userBlockSize; + + constexpr uint32_t blockRatioX = SBlock::memBlockSizeX / SBlock::userBlockSizeX; + constexpr uint32_t blockRatioY = SBlock::memBlockSizeY / SBlock::userBlockSizeY; + + mDataBlockIdx = microIndex.getTrayBlockIdx() / (blockRatioSize); + + uint32_t reminder = microIndex.getTrayBlockIdx() % (blockRatioSize); + + const uint32_t reminderInZ = reminder / (blockRatioX * blockRatioY); + mInDataBlockIdx.z = static_cast(microIndex.getInTrayBlockIdx().z + reminderInZ * SBlock::userBlockSizeZ); + reminder = reminder % (blockRatioX * blockRatioY); + const uint32_t reminderInY = reminder / (blockRatioX); + mInDataBlockIdx.y = static_cast(microIndex.getInTrayBlockIdx().y + reminderInY * SBlock::userBlockSizeY); + const uint32_t reminderInX = reminder % blockRatioX; + mInDataBlockIdx.x = static_cast(microIndex.getInTrayBlockIdx().x + reminderInX * SBlock::userBlockSizeX); +} + +} // namespace Neon::domain::details::disaggregated::bGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMask/bPartition.h 
b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/bPartition.h new file mode 100644 index 00000000..031e50b5 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/bPartition.h @@ -0,0 +1,194 @@ +#pragma once + +#include "Neon/domain/details/bGridDisgMask/bIndex.h" +#include "Neon/domain/details/bGridDisgMask/bSpan.h" + +#include "Neon/domain/interface/NghData.h" + +#include "Neon/sys/memory/CUDASharedMemoryUtil.h" + +namespace Neon::domain::details::disaggregated::bGridMask { + + template + class bSpan; + + template + class bPartition { + public: + using Span = bSpan; + using Idx = bIndex; + using NghIdx = typename Idx::NghIdx; + using Type = T; + using NghData = Neon::domain::NghData; + + using BlockViewGrid = Neon::domain::tool::GridTransformer::Grid; + using BlockViewGridIdx = BlockViewGrid::Idx; + + public: + bPartition(); + + ~bPartition() = default; + + explicit bPartition(int setIdx, + int mCardinality, + T *mMem, + typename Idx::DataBlockIdx *mBlockConnectivity, + typename SBlock::BitMask const *NEON_RESTRICT mMask, + Neon::int32_3d *mOrigin, + NghIdx *mStencilNghIndex, + Neon::int32_3d mDomainSize); + + /** + * Retrieve the cardinality of the field. + */ + inline NEON_CUDA_HOST_DEVICE auto + cardinality() + const -> int; + + /** + * Gets the field metadata at a cartesian point. + */ + inline NEON_CUDA_HOST_DEVICE auto + operator()(const Idx &cell, + int card) + -> T &; + + /** + * Gets the field metadata at a cartesian point. + */ + inline NEON_CUDA_HOST_DEVICE auto + operator()(const Idx &cell, + int card) + const -> const T &; + + /** + * Gets the field metadata at a neighbour cartesian point. + */ + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx &cell, + const NghIdx &offset, + const int card) + const -> NghData; + + /** + * Gets the field metadata at a neighbour cartesian point. 
+ */ + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx &eId, + uint8_t nghID, + int card) + const -> NghData; + + /** + * Gets the field metadata at a neighbour cartesian point. + */ + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx &eId, + int card) + const -> NghData; + + /** + * Gets the field metadata at a neighbour cartesian point. + */ + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx &eId, + int card, + T defaultValue) + const -> NghData; + + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx &gidx, + int card, + LambdaVALID funIfValid, + LambdaNOTValid funIfNOTValid = nullptr) + const -> std::enable_if_t && (std::is_invocable_v || + std::is_same_v), void>; + + template + NEON_CUDA_HOST_DEVICE inline auto + writeNghData(const Idx &gidx, + int card, + T value) + -> bool; + + /** + * Gets the global coordinates of the cartesian point. + */ + NEON_CUDA_HOST_DEVICE inline auto + getGlobalIndex(const Idx &cell) + const -> Neon::index_3d; + + NEON_CUDA_HOST_DEVICE inline auto + isActive(const Idx &cell, + const typename SBlock::BitMask *mask = nullptr) const -> bool; + + + NEON_CUDA_HOST_DEVICE inline auto + getDomainSize() + const -> Neon::index_3d; + + NEON_CUDA_HOST_DEVICE + auto mem() const -> T const *; + + /** + * Gets the Idx for in the block view space. 
+ */ + NEON_CUDA_HOST_DEVICE inline auto + getBlockViewIdx(const Idx &cell) + const -> BlockViewGridIdx; + + + NEON_CUDA_HOST_DEVICE inline auto + helpGetPitch(const Idx &cell, int card) + const -> uint32_t; + + NEON_CUDA_HOST_DEVICE inline auto + helpGetValidIdxPitchExplicit(const Idx &idx, int card) + const -> uint32_t; + + NEON_CUDA_HOST_DEVICE inline auto + helpNghPitch(const Idx &nghIdx, int card) + const -> std::tuple; + + NEON_CUDA_HOST_DEVICE inline auto + helpGetNghIdx(const Idx &idx, const NghIdx &offset) + const -> Idx; + + template + NEON_CUDA_HOST_DEVICE inline auto + helpGetNghIdx(const Idx &idx) + const -> Idx; + + NEON_CUDA_HOST_DEVICE inline auto + helpGetNghIdx(const Idx &idx, const NghIdx &offset, const typename Idx::DataBlockIdx *blockConnectivity) + const -> Idx; + + template + NEON_CUDA_HOST_DEVICE inline auto + helpGetNghIdx(const Idx &idx, const typename Idx::DataBlockIdx *blockConnectivity) + const -> Idx; + + + int mCardinality; + T *mMem; + NghIdx const *NEON_RESTRICT mStencilNghIndex; + typename Idx::DataBlockIdx const *NEON_RESTRICT mBlockConnectivity; + typename SBlock::BitMask const *NEON_RESTRICT mMask; + Neon::int32_3d const *NEON_RESTRICT mOrigin; + int mSetIdx; + int mMultiResDiscreteIdxSpacing = 1; + Neon::int32_3d mDomainSize; + }; + +} // namespace Neon::domain::details::disaggregated::bGrid + +#include "Neon/domain/details/bGridDisgMask/bPartition_imp.h" \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMask/bPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/bPartition_imp.h new file mode 100644 index 00000000..80b635ba --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/bPartition_imp.h @@ -0,0 +1,454 @@ +#pragma once + +#include "Neon/domain/details/bGridDisgMask/bGrid.h" +#include "Neon/domain/details/bGridDisgMask/bSpan.h" + +namespace Neon::domain::details::disaggregated::bGridMask { + +template +bPartition::bPartition() + : 
mCardinality(0), + mMem(nullptr), + mStencilNghIndex(), + mBlockConnectivity(nullptr), + mMask(nullptr), + mOrigin(0), + mSetIdx(0) +{ +} + +template +bPartition:: + bPartition(int setIdx, + int cardinality, + T* mem, + typename Idx::DataBlockIdx* blockConnectivity, + typename SBlock::BitMask const* NEON_RESTRICT mask, + Neon::int32_3d* origin, + NghIdx* stencilNghIndex, + Neon::int32_3d mDomainSize) + : mCardinality(cardinality), + mMem(mem), + mStencilNghIndex(stencilNghIndex), + mBlockConnectivity(blockConnectivity), + mMask(mask), + mOrigin(origin), + mSetIdx(setIdx), + mDomainSize(mDomainSize) +{ +} + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getGlobalIndex(const Idx& gidx) + const -> Neon::index_3d +{ + auto location = mOrigin[gidx.mDataBlockIdx]; + location.x += gidx.mInDataBlockIdx.x; + location.y += gidx.mInDataBlockIdx.y; + location.z += gidx.mInDataBlockIdx.z; + if constexpr (SBlock::isMultiResMode) { + return location * mMultiResDiscreteIdxSpacing; + } + return location; +} + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getDomainSize() + const -> Neon::index_3d +{ + return mDomainSize; +} + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: +mem() const -> T const *{ + return mMem; +} + + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getBlockViewIdx(const Idx& gidx) + const -> BlockViewGridIdx +{ + BlockViewGridIdx res; + res.manualSet(gidx.getDataBlockIdx()); + return res; +} + +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: + cardinality() + const -> int +{ + return mCardinality; +} + +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: +operator()(const Idx& cell, + int card) -> T& +{ + return mMem[helpGetPitch(cell, card)]; +} + +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: +operator()(const Idx& cell, + int card) const -> const T& +{ + return mMem[helpGetPitch(cell, card)]; +} + +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: + helpGetPitch(const Idx& idx, 
int card) + const -> uint32_t +{ + uint32_t const pitch = helpGetValidIdxPitchExplicit(idx, card); + return pitch; +} + + +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: + helpGetValidIdxPitchExplicit(const Idx& idx, int card) + const -> uint32_t +{ + uint32_t constexpr blockPitchByCard = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ; + uint32_t const inBlockInCardPitch = idx.mInDataBlockIdx.x + + SBlock::memBlockSizeX * idx.mInDataBlockIdx.y + + (SBlock::memBlockSizeX * SBlock::memBlockSizeY) * idx.mInDataBlockIdx.z; + uint32_t const blockAdnCardPitch = (idx.mDataBlockIdx * mCardinality + card) * blockPitchByCard; + uint32_t const pitch = blockAdnCardPitch + inBlockInCardPitch; + return pitch; +} + +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: + helpNghPitch(const Idx& nghIdx, int card) + const -> std::tuple +{ + if (nghIdx.mDataBlockIdx == Span::getInvalidBlockId()) { + return {false, 0}; + } + + const bool isActive = mMask[nghIdx.mDataBlockIdx].isActive(nghIdx.mInDataBlockIdx.x, nghIdx.mInDataBlockIdx.y, nghIdx.mInDataBlockIdx.z); + if (!isActive) { + return {false, 0}; + } + auto const offset = helpGetValidIdxPitchExplicit(nghIdx, card); + return {true, offset}; +} + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + helpGetNghIdx(const Idx& idx, + const NghIdx& offset) + const -> Idx +{ + return this->helpGetNghIdx(idx, offset, mBlockConnectivity); +} + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + helpGetNghIdx(const Idx& idx, + const NghIdx& offset, + const typename Idx::DataBlockIdx* blockConnectivity) + const -> Idx +{ + + typename Idx::InDataBlockIdx ngh(idx.mInDataBlockIdx.x + offset.x, + idx.mInDataBlockIdx.y + offset.y, + idx.mInDataBlockIdx.z + offset.z); + + /** + * 0 if no offset on the direction + * 1 positive offset + * -1 negative offset + */ + const int xFlag = ngh.x < 0 ? -1 : (ngh.x >= SBlock::memBlockSizeX ? +1 : 0); + const int yFlag = ngh.y < 0 ? 
-1 : (ngh.y >= SBlock::memBlockSizeX ? +1 : 0); + const int zFlag = ngh.z < 0 ? -1 : (ngh.z >= SBlock::memBlockSizeX ? +1 : 0); + + const bool isLocal = (xFlag | yFlag | zFlag) == 0; + if (!(isLocal)) { + typename Idx::InDataBlockIdx remoteInBlockOffset; + /** + * Example + * - 8 block (1D case) + * Case 1: + * |0,1,2,3|0,1,2,3|0,1,2,3| + * ^ ^ + * -3 starting point + * + * - idx.inBlock = 2 + * - offset = -1 + * - remote.x = (2-3) - ((-1) * 4) = -1 + 4 = 3 + * Case 2: + * |0,1,2,3|0,1,2,3|0,1,2,3| + * ^ ^ + * starting point +3 from 3 + * + * - idx.inBlock = 3 + * - offset = (+3,0) + * - remote.x = (7+3) - ((+1) * 8) = 10 - 8 = 2 + * + * |0,1,2,3|0,1,2,3|0,1,2,3| + * ^ ^ + * -3 from 0 +3 from 3 + * + * NOTE: if in one direction the neighbour offet is zero, xFalg is 0; + * */ + + Idx remoteNghIdx; + remoteNghIdx.mInDataBlockIdx.x = ngh.x - xFlag * SBlock::memBlockSizeX; + remoteNghIdx.mInDataBlockIdx.y = ngh.y - yFlag * SBlock::memBlockSizeX; + remoteNghIdx.mInDataBlockIdx.z = ngh.z - zFlag * SBlock::memBlockSizeX; + + int connectivityJump = idx.mDataBlockIdx * 27 + + (xFlag + 1) + + (yFlag + 1) * 3 + + (zFlag + 1) * 9; + remoteNghIdx.mDataBlockIdx = blockConnectivity[connectivityJump]; + + return remoteNghIdx; + } else { + Idx localNghIdx; + localNghIdx.mDataBlockIdx = idx.mDataBlockIdx; + localNghIdx.mInDataBlockIdx = ngh; + return localNghIdx; + } +} + +template +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + helpGetNghIdx(const Idx& idx) + const -> Idx +{ + return this->helpGetNghIdx(idx, mBlockConnectivity); +} + +template +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + helpGetNghIdx(const Idx& idx, const typename Idx::DataBlockIdx* blockConnectivity) + const -> Idx +{ + + typename Idx::InDataBlockIdx ngh(idx.mInDataBlockIdx.x + xOff, + idx.mInDataBlockIdx.y + yOff, + idx.mInDataBlockIdx.z + zOff); + + /** + * 0 if no offset on the direction + * 1 positive offset + * -1 negative offset + */ + const int xFlag = [&] { + if constexpr (xOff 
== 0) { + return 0; + } else { + return ngh.x < 0 ? -1 : (ngh.x >= SBlock::memBlockSizeX ? +1 : 0); + } + }(); + + + const int yFlag = [&] { + if constexpr (yOff == 0) { + return 0; + } else { + return ngh.y < 0 ? -1 : (ngh.y >= SBlock::memBlockSizeX ? +1 : 0); + } + }(); + const int zFlag = [&] { + if constexpr (zOff == 0) { + return 0; + } else { + return ngh.z < 0 ? -1 : (ngh.z >= SBlock::memBlockSizeX ? +1 : 0); + } + }(); + + const bool isLocal = (xFlag | yFlag | zFlag) == 0; + if (!(isLocal)) { + typename Idx::InDataBlockIdx remoteInBlockOffset; + /** + * Example + * - 8 block (1D case) + * Case 1: + * |0,1,2,3|0,1,2,3|0,1,2,3| + * ^ ^ + * -3 starting point + * + * - idx.inBlock = 2 + * - offset = -1 + * - remote.x = (2-3) - ((-1) * 4) = -1 + 4 = 3 + * Case 2: + * |0,1,2,3|0,1,2,3|0,1,2,3| + * ^ ^ + * starting point +3 from 3 + * + * - idx.inBlock = 3 + * - offset = (+3,0) + * - remote.x = (7+3) - ((+1) * 8) = 10 - 8 = 2 + * + * |0,1,2,3|0,1,2,3|0,1,2,3| + * ^ ^ + * -3 from 0 +3 from 3 + * + * NOTE: if in one direction the neighbour offet is zero, xFalg is 0; + * */ + + Idx remoteNghIdx; + remoteNghIdx.mInDataBlockIdx.x = ngh.x - xFlag * SBlock::memBlockSizeX; + remoteNghIdx.mInDataBlockIdx.y = ngh.y - yFlag * SBlock::memBlockSizeX; + remoteNghIdx.mInDataBlockIdx.z = ngh.z - zFlag * SBlock::memBlockSizeX; + + int connectivityJump = idx.mDataBlockIdx * 27 + + (xFlag + 1) + + (yFlag + 1) * 3 + + (zFlag + 1) * 9; + remoteNghIdx.mDataBlockIdx = blockConnectivity[connectivityJump]; + + return remoteNghIdx; + } else { + Idx localNghIdx; + localNghIdx.mDataBlockIdx = idx.mDataBlockIdx; + localNghIdx.mInDataBlockIdx = ngh; + return localNghIdx; + } +} + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getNghData(const Idx& eId, + uint8_t nghID, + int card) + const -> NghData +{ + NghIdx nghOffset = mStencilNghIndex[nghID]; + return getNghData(eId, nghOffset, card); +} + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getNghData(const Idx& idx, 
+ const NghIdx& offset, + const int card) + const -> NghData +{ + NghData result; + bIndex nghIdx = helpGetNghIdx(idx, offset); + auto [isValid, pitch] = helpNghPitch(nghIdx, card); + if (!isValid) { + result.invalidate(); + return result; + } + auto const value = mMem[pitch]; + result.set(value, true); + return result; +} + +template +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getNghData(const Idx& idx, + int card) + const -> NghData +{ + NghData result; + bIndex nghIdx = helpGetNghIdx(idx); + auto [isValid, pitch] = helpNghPitch(nghIdx, card); + if (!isValid) { + result.invalidate(); + return result; + } + auto const value = mMem[pitch]; + result.set(value, true); + return result; +} + +template +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getNghData(const Idx& idx, + int card, + T defaultValue) + const -> NghData +{ + NghData result; + bIndex nghIdx = helpGetNghIdx(idx); + auto [isValid, pitch] = helpNghPitch(nghIdx, card); + if (!isValid) { + result.set(defaultValue, false); + return result; + } + auto const value = mMem[pitch]; + result.set(value, true); + return result; +} + +template + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getNghData(const Idx& gidx, + int card, + LambdaVALID funIfValid, + LambdaNOTValid funIfNOTValid) + const -> std::enable_if_t && (std::is_invocable_v || std::is_same_v), void> +{ + NghData result; + bIndex nghIdx = helpGetNghIdx(gidx); + auto [isValid, pitch] = helpNghPitch(nghIdx, card); + + if (isValid) { + auto const& value = mMem[pitch]; + funIfValid(value); + return; + } + + if constexpr (!std::is_same_v) { + funIfNOTValid(); + } + return; +} + +template +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + writeNghData(const Idx& gidx, + int card, + T value) + -> bool +{ + NghData result; + bIndex nghIdx = helpGetNghIdx(gidx); + auto [isValid, pitch] = helpNghPitch(nghIdx, card); + if (!isValid) { + return false; + } + mMem[pitch] = value; + return true; +} + +template 
+NEON_CUDA_HOST_DEVICE inline auto +bPartition::isActive(const Idx& cell, + const typename SBlock::BitMask* mask) const -> bool +{ + if (!mask) { + return mMask[cell.mDataBlockIdx].isActive(cell.mInDataBlockIdx.x, cell.mInDataBlockIdx.y, cell.mInDataBlockIdx.z); + } else { + return mask[cell.mDataBlockIdx].isActive(cell.mInDataBlockIdx.x, cell.mInDataBlockIdx.y, cell.mInDataBlockIdx.z); + } +} + +} // namespace Neon::domain::details::disaggregated::bGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMask/bSpan.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/bSpan.h new file mode 100644 index 00000000..f2cb0a93 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/bSpan.h @@ -0,0 +1,55 @@ +#pragma once + +#include "Neon/domain/details/bGridDisgMask/bIndex.h" + +namespace Neon::domain::details::disaggregated::bGridMask { + +template +class bSpan +{ + public: + // bit mask information + using BitMaskWordType = uint64_t; + + static constexpr uint32_t bitMaskStorageBitWidth = 64; + static constexpr Neon::MemoryLayout activeMaskMemoryLayout = Neon::MemoryLayout::arrayOfStructs; + static constexpr uint32_t log2OfbitMaskWordSize = 6; + + using Idx = bIndex; + friend class bGrid; + + static constexpr int SpaceDim = 3; + + bSpan() = default; + virtual ~bSpan() = default; + + NEON_CUDA_HOST_DEVICE inline static auto getInvalidBlockId() + -> typename Idx::DataBlockIdx + { + return std::numeric_limits::max(); + } + + inline bSpan( + typename Idx::DataBlockCount mFirstDataBlockOffset, + typename SBlock::BitMask const* NEON_RESTRICT mActiveMask, + Neon::DataView mDataView); + + NEON_CUDA_HOST_DEVICE inline auto setAndValidateCPUDevice( + Idx& bidx, + uint32_t const& threadIdx, + uint32_t const& x, + uint32_t const& y, + uint32_t const& z) const -> bool; + + NEON_CUDA_HOST_DEVICE inline auto setAndValidateGPUDevice( + Idx& bidx) const -> bool; + + + // We don't need to have a count on active blocks + 
typename Idx::DataBlockCount mFirstDataBlockOffset; + typename SBlock::BitMask const* NEON_RESTRICT mActiveMask; + Neon::DataView mDataView; +}; +} // namespace Neon::domain::details::disaggregated::bGrid + +#include "Neon/domain/details/bGridDisgMask/bSpan_imp.h" \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMask/bSpan_imp.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/bSpan_imp.h new file mode 100644 index 00000000..8c23df74 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMask/bSpan_imp.h @@ -0,0 +1,52 @@ +#include "Neon/domain/details/bGridDisgMask/bSpan.h" + +namespace Neon::domain::details::disaggregated::bGridMask { + +template +NEON_CUDA_HOST_DEVICE inline auto +bSpan::setAndValidateGPUDevice([[maybe_unused]] Idx& bidx) const -> bool +{ +#ifdef NEON_PLACE_CUDA_DEVICE + bidx.mDataBlockIdx = blockIdx.x + mFirstDataBlockOffset; + bidx.mInDataBlockIdx.x = threadIdx.x; + bidx.mInDataBlockIdx.y = threadIdx.y; + bidx.mInDataBlockIdx.z = threadIdx.z; + + const bool isActive = mActiveMask[bidx.mDataBlockIdx].isActive(bidx.mInDataBlockIdx.x, bidx.mInDataBlockIdx.y, bidx.mInDataBlockIdx.z); + + return isActive; +#else + NEON_THROW_UNSUPPORTED_OPERATION("Operation supported only on GPU"); +#endif +} + +template +NEON_CUDA_HOST_DEVICE inline auto +bSpan::setAndValidateCPUDevice(Idx& bidx, + uint32_t const& dataBlockIdx, + uint32_t const& x, + uint32_t const& y, + uint32_t const& z) const -> bool +{ + + bidx.mDataBlockIdx = dataBlockIdx + mFirstDataBlockOffset; + ; + bidx.mInDataBlockIdx.x = static_cast(x); + bidx.mInDataBlockIdx.y = static_cast(y); + bidx.mInDataBlockIdx.z = static_cast(z); + const bool isActive = mActiveMask[dataBlockIdx].isActive(bidx.mInDataBlockIdx.x, bidx.mInDataBlockIdx.y, bidx.mInDataBlockIdx.z); + return isActive; +} + +template +bSpan::bSpan(typename Idx::DataBlockCount firstDataBlockOffset, + typename SBlock::BitMask const* NEON_RESTRICT activeMask, + 
Neon::DataView dataView) + : mFirstDataBlockOffset(firstDataBlockOffset), + mActiveMask(activeMask), + mDataView(dataView) +{ +} + + +} // namespace Neon::domain::details::disaggregated::bGridMask \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView.h new file mode 100644 index 00000000..5ef7df3b --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView.h @@ -0,0 +1,29 @@ +#include "Neon/domain/details/bGridDisgMgpu//BlockView/BlockViewGrid.h" +#include "Neon/domain/tools/GridTransformer.h" + +namespace Neon::domain::details::bGridMgpu { + +struct BlockView +{ + public: + using Grid = Neon::domain::tool::GridTransformer::Grid; + template + using Field = Grid::template Field; + using index_3d = Neon::index_3d; + + template + static auto helpGetReference(T* mem, const int idx, const int card) -> std::enable_if_t + { + return mem[idx * card]; + } + + template + static auto helpGetReference(T* mem, const int idx, const int card) -> std::enable_if_t + { + return mem[idx * C]; + } + + static constexpr Neon::MemoryLayout layout = Neon::MemoryLayout::arrayOfStructs; +}; + +} // namespace Neon::domain::details::bGridMgpu \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewGrid.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewGrid.h new file mode 100644 index 00000000..6047bb88 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewGrid.h @@ -0,0 +1,97 @@ +#pragma once +#include + +#include "Neon/core/core.h" +#include "Neon/core/types/DataUse.h" +#include "Neon/core/types/Macros.h" + +#include "Neon/set/BlockConfig.h" +#include "Neon/set/Containter.h" +#include "Neon/set/DevSet.h" +#include "Neon/set/MemoryOptions.h" + +#include "Neon/sys/memory/MemDevice.h" + +#include "Neon/domain/aGrid.h" + 
+#include "Neon/domain/interface/GridBaseTemplate.h" +#include "Neon/domain/interface/GridConcept.h" +#include "Neon/domain/interface/KernelConfig.h" +#include "Neon/domain/interface/LaunchConfig.h" +#include "Neon/domain/interface/Stencil.h" +#include "Neon/domain/interface/common.h" + +#include "Neon/domain/tools/GridTransformer.h" +#include "Neon/domain/tools/SpanTable.h" + +#include "Neon/domain/details/eGrid/eGrid.h" +#include "Neon/domain/patterns/PatternScalar.h" + +#include "BlockViewPartition.h" + +namespace Neon::domain::details::bGridMgpu { + +namespace details { +struct GridTransformation +{ + template + using Partition = BlockViewPartition; + using Span = Neon::domain::details::eGrid::eSpan; + static constexpr Neon::set::internal::ContainerAPI::DataViewSupport dataViewSupport = Neon::set::internal::ContainerAPI::DataViewSupport::on; + + using FoundationGrid = Neon::domain::details::eGrid::eGrid; + static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = FoundationGrid::executionThreadSpan; + using ExecutionThreadSpanIndexType = int32_t; + using Idx = FoundationGrid::Idx; + + static auto getDefaultBlock(FoundationGrid& foundationGrid) -> Neon::index_3d const& + { + return foundationGrid.getDefaultBlock(); + } + + static auto initSpan(FoundationGrid& foundationGrid, Neon::domain::tool::SpanTable& spanTable) -> void + { + spanTable.forEachConfiguration([&](Neon::Execution execution, + Neon::SetIdx setIdx, + Neon::DataView dw, + Span& span) { + span = foundationGrid.getSpan(execution, setIdx, dw); + }); + } + + static auto initLaunchParameters(FoundationGrid& foundationGrid, + Neon::DataView dataView, + const Neon::index_3d& blockSize, + const size_t& shareMem) -> Neon::set::LaunchParameters + { + return foundationGrid.getLaunchParameters(dataView, blockSize, shareMem); + } + + static auto helpGetGridIdx(FoundationGrid&, + Neon::SetIdx const&, + FoundationGrid::Idx const& fgIdx) + -> GridTransformation::Idx + { + 
GridTransformation::Idx tgIdx = fgIdx; + return tgIdx; + } + + template + static auto initFieldPartition(FoundationGrid::Field& foundationField, + Neon::domain::tool::PartitionTable>& partitionTable) -> void + { + partitionTable.forEachConfiguration( + [&](Neon::Execution execution, + Neon::SetIdx setIdx, + Neon::DataView dw, + Partition& partition) { + auto& foundationPartition = foundationField.getPartition(execution, setIdx, dw); + partition = Partition(foundationPartition); + }); + } +}; +using BlockViewGrid = Neon::domain::tool::GridTransformer::Grid; + +} // namespace details + +} // namespace Neon::domain::details::bGridMgpu diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewPartition.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewPartition.h new file mode 100644 index 00000000..e1d7be2b --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewPartition.h @@ -0,0 +1,42 @@ +#pragma once +#include +#include "Neon/core/core.h" +#include "Neon/core/types/Macros.h" +#include "Neon/domain/details/eGrid/eGrid.h" +#include "Neon/domain/details/eGrid/eIndex.h" +#include "Neon/domain/interface/NghData.h" +#include "Neon/set/DevSet.h" +#include "Neon/sys/memory/CudaIntrinsics.h" +#include "cuda_fp16.h" + +namespace Neon::domain::details::bGridMgpu { + +template +class BlockViewPartition : public Neon::domain::details::eGrid::ePartition +{ + public: + BlockViewPartition() + { + } + BlockViewPartition(Neon::domain::details::eGrid::ePartition ePartition) + : Neon::domain::details::eGrid::ePartition(ePartition) + { + } + + template + static auto getInBlockIdx(typename Neon::domain::details::eGrid::ePartition::Idx const& idx, + uint8_3d const& inBlockLocation) -> BlockIdexType + { + BlockIdexType blockIdx(idx.helpGet(), inBlockLocation); + return blockIdx; // fix: return the constructed block index, not the raw in-block offset + } + + auto getCountAllocated() const -> int32_t; +}; +template +auto BlockViewPartition::getCountAllocated()
const -> int32_t +{ + return this->mCountAllocated; +} +} // namespace Neon::domain::details::bGrid diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewPartition_imp.h new file mode 100644 index 00000000..4464f686 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewPartition_imp.h @@ -0,0 +1,184 @@ +#pragma once + +#include "Neon/domain/details//eGrid/ePartition.h" + +namespace Neon::domain::details::bGridMgpu { + + +template +NEON_CUDA_HOST_DEVICE auto +ePartition::prtID() const + -> int +{ + return mPrtID; +} + +template +template +inline NEON_CUDA_HOST_DEVICE auto +ePartition::cardinality() const + -> std::enable_if_t +{ + return mCardinality; +} + +template +template +constexpr inline NEON_CUDA_HOST_DEVICE auto +ePartition::cardinality() const + -> std::enable_if_t +{ + return C; +} + +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::operator()(eIndex eId, int cardinalityIdx) const + -> T +{ + Offset jump = getOffset(eId, cardinalityIdx); + return mMem[jump]; +} + +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::operator()(eIndex eId, int cardinalityIdx) -> T& +{ + Offset jump = getOffset(eId, cardinalityIdx); + return mMem[jump]; +} + +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::getNghData(eIndex eId, + NghIdx nghIdx, + int card, + const Type& alternativeVal) + const -> NghData +{ + eIndex eIdxNgh; + const bool isValidNeighbour = isValidNgh(eId, nghIdx, eIdxNgh); + T val = (isValidNeighbour) ? 
this->operator()(eIdxNgh, card) : alternativeVal; + // printf("(prtId %d)getNghData id %d card %d eIdxNgh %d val %d\n", + // mPrtID, eId.mIdx, card, eIdxNgh.mIdx, int(val)); + return NghData(val, isValidNeighbour); +} + +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::getNghData(eIndex eId, + const Neon::int8_3d& ngh3dIdx, + int card, + const Type& alternativeVal) + const -> NghData +{ + int tablePithc = (ngh3dIdx.x + mStencilRadius) + + (ngh3dIdx.y + mStencilRadius) * mStencilTableYPitch + + (ngh3dIdx.z + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch; + NghIdx nghIdx = mStencil3dTo1dOffset[tablePithc]; + NghData res = getNghData(eId, nghIdx, card, alternativeVal); + + return res; +} + +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::isValidNgh(eIndex eId, + NghIdx nghIdx, + eIndex& neighbourIdx) const + -> bool +{ + const eIndex::Offset connectivityJumo = mCountAllocated * nghIdx + eId.get(); + neighbourIdx.set() = NEON_CUDA_CONST_LOAD((mConnectivity + connectivityJumo)); + const bool isValidNeighbour = (neighbourIdx.mIdx > -1); +// printf("(prtId %d) getNghData id %d eIdxNgh %d connectivityJumo %d\n", +// mPrtID, +// eId.mIdx, neighbourIdx.mIdx, connectivityJumo); + return isValidNeighbour; +} + +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::getGlobalIndex(eIndex eIndex) const + -> Neon::index_3d +{ + Neon::index_3d loc; + const auto baseAddr = mOrigins + eIndex.get(); + loc = mOrigins[eIndex.get()]; + return loc; +} + +template +ePartition::ePartition(int prtId, + T* mem, + ePitch pitch, + int32_t cardinality, + int32_t countAllocated, + Offset* connRaw, + Neon::index_3d* toGlobal, + int8_t* stencil3dTo1dOffset, + int32_t stencilRadius) +{ + mPrtID = prtId; + mMem = mem; + mPitch = pitch; + mCardinality = cardinality; + mCountAllocated = countAllocated; + + mConnectivity = connRaw; + mOrigins = toGlobal; + + mStencil3dTo1dOffset = stencil3dTo1dOffset; + mStencilTableYPitch = 2 * stencilRadius + 1; + + mStencilRadius = 
stencilRadius; +} + +template +NEON_CUDA_HOST_DEVICE auto +ePartition::pointer(eIndex eId, int cardinalityIdx) const + -> const Type* +{ + Offset jump = getOffset(eId, cardinalityIdx); + return mMem + jump; +} + +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::getOffset(eIndex eId, int cardinalityIdx) const + -> Offset +{ + return Offset(eId.get() * mPitch.x + cardinalityIdx * mPitch.y); +} + +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::mem() + -> T* +{ + return mMem; +} + +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::mem() const + -> const T* +{ + return mMem; +} + +} // namespace Neon::domain::details::bGridMgpu diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/StaticBlock.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/StaticBlock.h new file mode 100644 index 00000000..7fdd4577 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/StaticBlock.h @@ -0,0 +1,106 @@ +#pragma once + +#include "Neon/domain/details/bGridDisgMgpu/bSpan.h" + +namespace Neon::domain::details::bGridMgpu { + +template +struct StaticBlock +{ + public: + constexpr static uint32_t memBlockSizeX = memBlockSizeX_; + constexpr static uint32_t memBlockSizeY = memBlockSizeY_; + constexpr static uint32_t memBlockSizeZ = memBlockSizeZ_; + constexpr static Neon::uint32_3d memBlockSize3D = Neon::uint32_3d(memBlockSizeX, memBlockSizeY, memBlockSizeZ); + + constexpr static uint32_t userBlockSizeX = userBlockSizeX_; + constexpr static uint32_t userBlockSizeY = userBlockSizeY_; + constexpr static uint32_t userBlockSizeZ = userBlockSizeZ_; + constexpr static Neon::uint32_3d userBlockSize3D = Neon::uint32_3d(userBlockSizeX, userBlockSizeY, userBlockSizeZ); + + constexpr static uint32_t blockRatioX = memBlockSizeX / userBlockSizeX; + constexpr static uint32_t blockRatioY = memBlockSizeY / userBlockSizeY; + constexpr static uint32_t blockRatioZ = memBlockSizeZ / userBlockSizeZ; + + constexpr static uint32_t memBlockPitchX = 1; + 
constexpr static uint32_t memBlockPitchY = memBlockSizeX; + constexpr static uint32_t memBlockPitchZ = memBlockSizeX * memBlockSizeY; + + constexpr static bool isMultiResMode = isMultiResMode_; + + constexpr static uint32_t memBlockCountElements = memBlockSizeX * memBlockSizeY * memBlockSizeZ; + + static_assert(memBlockSizeX >= userBlockSizeX); + static_assert(memBlockSizeY >= userBlockSizeY); + static_assert(memBlockSizeZ >= userBlockSizeZ); + + static_assert(memBlockSizeX % userBlockSizeX == 0); + static_assert(memBlockSizeY % userBlockSizeY == 0); + static_assert(memBlockSizeZ % userBlockSizeZ == 0); + + struct BitMask + { + using BitMaskWordType = uint32_t; + auto reset() -> void + { + for (BitMaskWordType i = 0; i < nWords; ++i) { + bits[i] = 0; + } + } + + auto setActive(int threadX, + int threadY, + int threadZ) -> void + { + BitMaskWordType mask; + uint32_t wordIdx; + getMaskAndWordI(threadX, threadY, threadZ, mask, wordIdx); + auto& word = bits[wordIdx]; + word = word | mask; + } + + inline auto NEON_CUDA_HOST_DEVICE isActive(int threadX, + int threadY, + int threadZ) const -> bool + { + BitMaskWordType mask; + uint32_t wordIdx; + getMaskAndWordI(threadX, threadY, threadZ, mask, wordIdx); + auto& word = bits[wordIdx]; + return (word & mask) != 0; + } + + static inline auto NEON_CUDA_HOST_DEVICE getMaskAndWordI(int threadX, + int threadY, + int threadZ, + NEON_OUT BitMaskWordType& mask, + NEON_OUT uint32_t& wordIdx) -> void + { + const uint32_t threadPitch = threadX * memBlockPitchX + + threadY * memBlockPitchY + + threadZ * memBlockPitchZ; + + // threadPitch >> log2_of_bitPerWord + // the same as: threadPitch / 2^{log2_of_bitPerWord} + wordIdx = threadPitch >> log2_of_bitPerWord; + // threadPitch & ((bitMaskWordType(bitMaskStorageBitWidth)) - 1); + // same as threadPitch % 2^{log2OfbitMaskWordSize} + const uint32_t offsetInWord = threadPitch & ((BitMaskWordType(bitPerWord)) - 1); + mask = BitMaskWordType(1) << offsetInWord; + } + + constexpr static 
BitMaskWordType nWords = (memBlockCountElements + 31) / 32; + static constexpr uint32_t log2_of_bitPerWord = 5; + static constexpr uint32_t bitPerWord = 32; + + BitMaskWordType bits[nWords]; + }; +}; + +} // namespace Neon::domain::details::bGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bField.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bField.h new file mode 100644 index 00000000..ef911c45 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bField.h @@ -0,0 +1,120 @@ +#pragma once +#include "Neon/domain/details/bGridDisgMgpu/bPartition.h" +#include "Neon/domain/interface/FieldBaseTemplate.h" +#include "Neon/set/patterns/BlasSet.h" + +#include "Neon/core/core.h" +#include "Neon/core/types/Macros.h" + +#include "Neon/set/DevSet.h" +#include "Neon/set/HuOptions.h" +#include "Neon/set/MemoryTransfer.h" + +#include "Neon/domain/interface/FieldBaseTemplate.h" +#include "Neon/domain/tools/HaloUpdateTable1DPartitioning.h" +#include "Neon/domain/tools/PartitionTable.h" +#include "bPartition.h" + +namespace Neon::domain::details::bGridMgpu { + + +template +class bField : public Neon::domain::interface::FieldBaseTemplate, + bPartition, + int> +{ + friend bGrid; + + public: + using Type = T; + using Grid = bGrid; + using Field = bField; + using Partition = bPartition; + using Idx = bIndex; + using BlockViewGrid = Neon::domain::tool::GridTransformer::Grid; + template + using BlockViewField = BlockViewGrid::template Field; + + using NghIdx = typename Partition::NghIdx; + using NghData = typename Partition::NghData; + + bField(const std::string& fieldUserName, + Neon::DataUse dataUse, + Neon::MemoryOptions memoryOptions, + const Grid& grid, + int cardinality, + T inactiveValue); + + bField(); + + virtual ~bField() = default; + + auto getPartition(Neon::Execution, + Neon::SetIdx, + const Neon::DataView& dataView) const -> const Partition& final; + + auto getPartition(Neon::Execution, + 
Neon::SetIdx, + const Neon::DataView& dataView) -> Partition& final; + + auto isInsideDomain(const Neon::index_3d& idx) const -> bool; + + + auto operator()(const Neon::index_3d& idx, + const int& cardinality) const -> T final; + + auto getReference(const Neon::index_3d& idx, + const int& cardinality) -> T& final; + + auto updateHostData(int streamId = 0) -> void final; + + auto updateDeviceData(int streamId = 0) -> void final; + + auto newHaloUpdate(Neon::set::StencilSemantic semantic, + Neon::set::TransferMode transferMode, + Neon::Execution execution) + const -> Neon::set::Container; + + auto getMemoryField() -> BlockViewGrid::Field&; + + + private: + auto getRef(const Neon::index_3d& idx, const int& cardinality) const -> T&; + + auto initHaloUpdateTable() -> void; + + + struct Data + { + Data() = default; + Data(Neon::Backend const& bk) + { + mPartitionTable.init(bk); + } + + enum EndPoints + { + src = 1, + dst = 0 + }; + + struct EndPointsUtils + { + static constexpr int nConfigs = 2; + }; + + std::shared_ptr grid; + BlockViewField memoryField; + int cardinality; + + Neon::domain::tool::HaloTable1DPartitioning mLatticeHaloUpdateTable; + Neon::domain::tool::HaloTable1DPartitioning mStandardHaloUpdateTable; + Neon::domain::tool::PartitionTable mPartitionTable; + }; + std::shared_ptr mData; +}; + + +} // namespace Neon::domain::details::bGridMgpu diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bField_imp.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bField_imp.h new file mode 100644 index 00000000..80f807f6 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bField_imp.h @@ -0,0 +1,523 @@ +#pragma once + +#include "Neon/domain/details/bGridDisgMgpu/bField.h" + +namespace Neon::domain::details::bGridMgpu { + +template +bField::bField() +{ + mData = std::make_shared(); +} + +template +bField::bField(const std::string& fieldUserName, + Neon::DataUse dataUse, + Neon::MemoryOptions memoryOptions, + const Grid& 
grid, + int cardinality, + T inactiveValue) + : Neon::domain::interface::FieldBaseTemplate(&grid, + fieldUserName, + "bField", + cardinality, + inactiveValue, + dataUse, + memoryOptions, + Neon::domain::haloStatus_et::e::ON) +{ + mData = std::make_shared(grid.getBackend()); + mData->grid = std::make_shared(grid); + + if (memoryOptions.getOrder() == Neon::MemoryLayout::arrayOfStructs) { + NEON_WARNING("bField does not support MemoryLayout::arrayOfStructs, enforcing MemoryLayout::structOfArrays"); + memoryOptions.setOrder(Neon::MemoryLayout::structOfArrays); + } + // the allocation size is the number of blocks x block size x cardinality + mData->memoryField = mData->grid->getBlockViewGrid().template newField( + "BitMask", + [&] { + int elPerBlock = SBlock::memBlockCountElements * cardinality; + return elPerBlock; + }(), + inactiveValue, + dataUse, + mData->grid->getBackend().getMemoryOptions(bSpan::activeMaskMemoryLayout)); + + + { // Setting up mPartitionTable + // const int setCardinality = mData->grid->getBackend().getDeviceCount(); + mData->mPartitionTable.forEachConfiguration( + [&](Neon::Execution execution, + Neon::SetIdx setIdx, + Neon::DataView, + Partition& partition) { + auto& partitioner = mData->grid->helpGetPartitioner1D(); + auto firstBup = partitioner.getSpanLayout().getBoundsBoundary(setIdx, Neon::domain::tool::partitioning::ByDirection::up).first; + auto firstBdw = partitioner.getSpanLayout().getBoundsBoundary(setIdx, Neon::domain::tool::partitioning::ByDirection::down).first; + auto firstGup = partitioner.getSpanLayout().getGhostBoundary(setIdx, Neon::domain::tool::partitioning::ByDirection::up).first; + auto firstGdw = partitioner.getSpanLayout().getGhostBoundary(setIdx, Neon::domain::tool::partitioning::ByDirection::down).first; + auto lastGdw = firstGdw + partitioner.getSpanLayout().getGhostBoundary(setIdx, Neon::domain::tool::partitioning::ByDirection::down).count; + + auto& memoryFieldPartition = mData->memoryField.getPartition(execution, 
setIdx, Neon::DataView::STANDARD); + auto& blockConnectivity = mData->grid->helpGetBlockConnectivity().getPartition(execution, setIdx, Neon::DataView::STANDARD); + auto& bitmask = mData->grid->getActiveBitMask().getPartition(execution, setIdx, Neon::DataView::STANDARD); + auto& dataBlockOrigins = mData->grid->helpGetDataBlockOriginField().getPartition(execution, setIdx, Neon::DataView::STANDARD); + + partition = bPartition(setIdx, + cardinality, + memoryFieldPartition.mem(), + blockConnectivity.mem(), + bitmask.mem(), + dataBlockOrigins.mem(), + mData->grid->helpGetStencilIdTo3dOffset().rawMem(execution, setIdx), + mData->grid->getDimension(), + firstBup, + firstBdw, + firstGup, + firstGdw, + lastGdw); + }); + } + + initHaloUpdateTable(); +} + +template +auto bField::getMemoryField() -> BlockViewGrid::Field& +{ + return mData->memoryField; +} + +template +auto bField::isInsideDomain(const Neon::index_3d& idx) const -> bool +{ + return mData->grid->isInsideDomain(idx); +} + +template +auto bField::getReference(const Neon::index_3d& cartesianIdx, + const int& cardinality) -> T& +{ + if constexpr (SBlock::isMultiResMode) { + auto& grid = this->getGrid(); + auto uniformCartesianIdx = cartesianIdx / grid.helGetMultiResDiscreteIdxSpacing(); + + if (cartesianIdx.x % grid.helGetMultiResDiscreteIdxSpacing() != 0 || + cartesianIdx.y % grid.helGetMultiResDiscreteIdxSpacing() != 0 || + cartesianIdx.z % grid.helGetMultiResDiscreteIdxSpacing() != 0) { + NeonException exp("bField::getReference"); + exp << "Input index is not multiple of the grid resolution"; + exp << "Index = " << cartesianIdx; + NEON_THROW(exp); + } + auto [setIdx, bIdx] = grid.helpGetSetIdxAndGridIdx(uniformCartesianIdx); + auto& partition = getPartition(Neon::Execution::host, setIdx, Neon::DataView::STANDARD); + auto& result = partition(bIdx, cardinality); + return result; + } else { + auto& grid = this->getGrid(); + auto [setIdx, bIdx] = grid.helpGetSetIdxAndGridIdx(cartesianIdx); + auto& partition = 
getPartition(Neon::Execution::host, setIdx, Neon::DataView::STANDARD); + auto& result = partition(bIdx, cardinality); + return result; + } +} + +template +auto bField::operator()(const Neon::index_3d& cartesianIdx, + const int& cardinality) const -> T +{ + auto& grid = this->getGrid(); + auto [setIdx, bIdx] = grid.helpGetSetIdxAndGridIdx(cartesianIdx); + if (setIdx.idx() == -1) { + return this->getOutsideValue(); + } + auto& partition = getPartition(Neon::Execution::host, setIdx, Neon::DataView::STANDARD); + auto& result = partition(bIdx, cardinality); + return result; +} + +template +auto bField::updateHostData(int streamId) -> void +{ + mData->memoryField.updateHostData(streamId); +} + +template +auto bField::updateDeviceData(int streamId) -> void +{ + mData->memoryField.updateDeviceData(streamId); +} + +template +auto bField::getPartition(Neon::Execution execution, + Neon::SetIdx setIdx, + const Neon::DataView& dataView) const -> const Partition& +{ + const Neon::DataUse dataUse = this->getDataUse(); + bool isOk = Neon::ExecutionUtils::checkCompatibility(dataUse, execution); + if (isOk) { + Partition const& result = mData->mPartitionTable.getPartition(execution, setIdx, dataView); + return result; + } + std::stringstream message; + message << "The requested execution mode ( " << execution << " ) is not compatible with the field DataUse (" << dataUse << ")"; + NEON_THROW_UNSUPPORTED_OPERATION(message.str()); +} + +template +auto bField::getPartition(Neon::Execution execution, + Neon::SetIdx setIdx, + const Neon::DataView& dataView) -> Partition& +{ + const Neon::DataUse dataUse = this->getDataUse(); + bool isOk = Neon::ExecutionUtils::checkCompatibility(dataUse, execution); + if (isOk) { + Partition& result = mData->mPartitionTable.getPartition(execution, setIdx, dataView); + return result; + } + std::stringstream message; + message << "The requested execution mode ( " << execution << " ) is not compatible with the field DataUse (" << dataUse << ")"; + 
NEON_THROW_UNSUPPORTED_OPERATION(message.str()); +} + +template +auto bField::newHaloUpdate(Neon::set::StencilSemantic stencilSemantic, + Neon::set::TransferMode transferMode, + Neon::Execution execution) const -> Neon::set::Container +{ + + + // We need to define a graph of Containers + // One for the actual memory transfer + // One for the synchronization + // The order depends on the transfer mode: put or get + Neon::set::Container dataTransferContainer; + auto const& bk = this->getGrid().getBackend(); + + if (stencilSemantic == Neon::set::StencilSemantic::standard) { + auto transfers = bk.template newDataSet>(); + + if (this->getMemoryOptions().getOrder() == Neon::MemoryLayout::structOfArrays) { + for (auto byDirection : {tool::partitioning::ByDirection::up, + tool::partitioning::ByDirection::down}) { + + auto const& tableEntryByDir = mData->mStandardHaloUpdateTable.get(transferMode, + execution, + byDirection); + + tableEntryByDir.forEachSeq([&](SetIdx setIdx, auto const& tableEntryByDirBySetIdx) { + transfers[setIdx].insert(std::end(transfers[setIdx]), + std::begin(tableEntryByDirBySetIdx), + std::end(tableEntryByDirBySetIdx)); + }); + } + dataTransferContainer = + Neon::set::Container::factoryDataTransfer( + *this, + transferMode, + stencilSemantic, + transfers, + execution); + + + } else { + NEON_THROW_UNSUPPORTED_OPERATION(""); + } + } else { + auto transfers = bk.template newDataSet>(); + + if (this->getMemoryOptions().getOrder() == Neon::MemoryLayout::structOfArrays) { + for (auto byDirection : {tool::partitioning::ByDirection::up, + tool::partitioning::ByDirection::down}) { + + auto const& tableEntryByDir = mData->mLatticeHaloUpdateTable.get(transferMode, + execution, + byDirection); + + tableEntryByDir.forEachSeq([&](SetIdx setIdx, auto const& tableEntryByDirBySetIdx) { + transfers[setIdx].insert(std::end(transfers[setIdx]), + std::begin(tableEntryByDirBySetIdx), + std::end(tableEntryByDirBySetIdx)); + }); + } + dataTransferContainer = + 
Neon::set::Container::factoryDataTransfer( + *this, + transferMode, + stencilSemantic, + transfers, + execution); + + + } else { + NEON_THROW_UNSUPPORTED_OPERATION(""); + } + } + Neon::set::Container SyncContainer = + Neon::set::Container::factorySynchronization( + *this, + Neon::set::SynchronizationContainerType::hostOmpBarrier); + + Neon::set::container::Graph graph(this->getBackend()); + const auto& dataTransferNode = graph.addNode(dataTransferContainer); + const auto& syncNode = graph.addNode(SyncContainer); + + switch (transferMode) { + case Neon::set::TransferMode::put: + graph.addDependency(dataTransferNode, syncNode, Neon::GraphDependencyType::data); + break; + case Neon::set::TransferMode::get: + graph.addDependency(syncNode, dataTransferNode, Neon::GraphDependencyType::data); + break; + default: + NEON_THROW_UNSUPPORTED_OPTION(); + break; + } + + graph.removeRedundantDependencies(); + + Neon::set::Container output = + Neon::set::Container::factoryGraph("dGrid-Halo-Update", + graph, + [](Neon::SetIdx, Neon::set::Loader&) {}); + return output; +} + +template +auto bField::initHaloUpdateTable() -> void +{ + // NEON_THROW_UNSUPPORTED_OPERATION(""); + auto& grid = this->getGrid(); + auto bk = grid.getBackend(); + auto getNghSetIdx = [&](SetIdx setIdx, Neon::domain::tool::partitioning::ByDirection direction) { + int res; + if (direction == Neon::domain::tool::partitioning::ByDirection::up) { + res = (setIdx + 1) % bk.getDeviceCount(); + } else { + res = (setIdx + bk.getDeviceCount() - 1) % bk.getDeviceCount(); + } + return res; + }; + + mData->mStandardHaloUpdateTable.forEachPutConfiguration( + bk, [&]( + Neon::SetIdx setIdxSend, + Execution execution, + Neon::domain::tool::partitioning::ByDirection byDirection, + std::vector& transfersVec) { + { + using namespace Neon::domain::tool::partitioning; + + if (ByDirection::up == byDirection && bk.isLastDevice(setIdxSend)) { + return; + } + + if (ByDirection::down == byDirection && bk.isFirstDevice(setIdxSend)) { + 
return; + } + + + Neon::SetIdx setIdxRecv = getNghSetIdx(setIdxSend, byDirection); + + Partition* partitionsRecv = &this->getPartition(execution, setIdxRecv, Neon::DataView::STANDARD); + Partition* partitionsSend = &this->getPartition(execution, setIdxSend, Neon::DataView::STANDARD); + + auto const recvDirection = byDirection == ByDirection::up + ? ByDirection::down + : ByDirection::up; + auto const sendDirection = byDirection; + + int const ghostSectorFirstBlockIdx = + recvDirection == ByDirection::up + ? partitionsRecv->helpGetSectorFirstBlock(Partition::Sectors::gUp) + : partitionsRecv->helpGetSectorFirstBlock(Partition::Sectors::gDw); + + int const boundarySectorFirstBlockIdx = + sendDirection == ByDirection::up + ? partitionsSend->helpGetSectorFirstBlock(Partition::Sectors::bUp) + : partitionsSend->helpGetSectorFirstBlock(Partition::Sectors::bDw); + + auto const msgLengthInBlocks = partitionsSend->helpGetSectorLength(sendDirection == ByDirection::up + ? Partition::Sectors::bUp + : Partition::Sectors::bDw); + + + for (int c = 0; c < this->getCardinality(); c++) { + + auto const recvPitch = [&] { + Idx idx; + typename Idx::InDataBlockIdx inDataBlockIdx; + if (recvDirection == ByDirection::up) { + inDataBlockIdx = typename Idx::InDataBlockIdx(0, 0, 0); + } else { + inDataBlockIdx = typename Idx::InDataBlockIdx(0, 0, SBlock::memBlockSizeZ - 1); + } + idx.setInDataBlockIdx(inDataBlockIdx); + idx.setDataBlockIdx(ghostSectorFirstBlockIdx); + + auto pitch = partitionsRecv->helpGetPitch(idx, c); + return pitch; + }(); + + auto const sendPitch = [&] { + typename Idx::InDataBlockIdx inDataBlockIdx; + if (sendDirection == ByDirection::up) { + inDataBlockIdx = typename Idx::InDataBlockIdx(0, 0, SBlock::memBlockSizeZ - 1); + } else { + inDataBlockIdx = typename Idx::InDataBlockIdx(0, 0, 0); + } + Idx idx; + idx.setInDataBlockIdx(inDataBlockIdx); + idx.setDataBlockIdx(boundarySectorFirstBlockIdx); + auto pitch = partitionsSend->helpGetPitch(idx, c); + return pitch; + }(); + 
+ auto const msgSizePerCardinality = [&] { + // All blocks are mapped into a 3D grid, where blocks are places one after the other in a 1D mapping + // for each block we send only the top or bottom slice + // Therefore the size of the message is equal to the number of blocks in the sector + // by the size of element in a slice of a block... + auto size = msgLengthInBlocks * SBlock::memBlockSizeX * SBlock::memBlockSizeY; + return size; + }(); + + T const* sendMem = partitionsSend->mem(); + T const* recvMem = partitionsRecv->mem(); + + + Neon::set::MemoryTransfer transfer({setIdxRecv, (void*)(recvMem + recvPitch)}, + {setIdxSend, (void*)(sendMem + sendPitch)}, + sizeof(T) * msgSizePerCardinality); + + transfersVec.push_back(transfer); + } + } + }); + + mData->mLatticeHaloUpdateTable.forEachPutConfiguration( + bk, [&]( + Neon::SetIdx setIdxSend, + Execution execution, + Neon::domain::tool::partitioning::ByDirection byDirection, + std::vector& transfersVec) { + { + using namespace Neon::domain::tool::partitioning; + + if (ByDirection::up == byDirection && bk.isLastDevice(setIdxSend)) { + return; + } + + if (ByDirection::down == byDirection && bk.isFirstDevice(setIdxSend)) { + return; + } + + + Neon::SetIdx setIdxRecv = getNghSetIdx(setIdxSend, byDirection); + Partition* partitionsRecv = &this->getPartition(execution, setIdxRecv, Neon::DataView::STANDARD); + Partition* partitionsSend = &this->getPartition(execution, setIdxSend, Neon::DataView::STANDARD); + + auto const recvDirection = byDirection == ByDirection::up + ? ByDirection::down + : ByDirection::up; + auto const sendDirection = byDirection; + + int const ghostSectorFirstBlockIdx = + recvDirection == ByDirection::up + ? partitionsRecv->helpGetSectorFirstBlock(Partition::Sectors::gUp) + : partitionsRecv->helpGetSectorFirstBlock(Partition::Sectors::gDw); + + int const boundarySectorFirstBlockIdx = + sendDirection == ByDirection::up + ? 
partitionsSend->helpGetSectorFirstBlock(Partition::Sectors::bUp) + : partitionsSend->helpGetSectorFirstBlock(Partition::Sectors::bDw); + + auto const msgLengthInBlocks = partitionsSend->helpGetSectorLength(sendDirection == ByDirection::up + ? Partition::Sectors::bUp + : Partition::Sectors::bDw); + + bool canBeFusedWithPrevious = false; + + for (int c = 0; c < this->getCardinality(); c++) { + auto const& stencil = this->getGrid().getStencil(); + + auto const recvPitch = [&] { + Idx idx; + typename Idx::InDataBlockIdx inDataBlockIdx; + if (recvDirection == ByDirection::up) { + inDataBlockIdx = typename Idx::InDataBlockIdx(0, 0, 0); + } else { + inDataBlockIdx = typename Idx::InDataBlockIdx(0, 0, SBlock::memBlockSizeZ - 1); + } + idx.setInDataBlockIdx(inDataBlockIdx); + idx.setDataBlockIdx(ghostSectorFirstBlockIdx); + + auto pitch = partitionsRecv->helpGetPitch(idx, c); + return pitch; + }(); + + auto const sendPitch = [&] { + typename Idx::InDataBlockIdx inDataBlockIdx; + if (sendDirection == ByDirection::up) { + inDataBlockIdx = typename Idx::InDataBlockIdx(0, 0, SBlock::memBlockSizeZ - 1); + } else { + inDataBlockIdx = typename Idx::InDataBlockIdx(0, 0, 0); + } + Idx idx; + idx.setInDataBlockIdx(inDataBlockIdx); + idx.setDataBlockIdx(boundarySectorFirstBlockIdx); + auto pitch = partitionsSend->helpGetPitch(idx, c); + return pitch; + }(); + + auto const msgSizePerCardinality = [&] { + // All blocks are mapped into a 3D grid, where blocks are places one after the other in a 1D mapping + // for each block we send only the top or bottom slice + // Therefore the size of the message is equal to the number of blocks in the sector + // by the size of element in a slice of a block... 
+ auto size = msgLengthInBlocks * SBlock::memBlockSizeX * SBlock::memBlockSizeY; + return size; + }(); + + T const* sendMem = partitionsSend->mem(); + T const* recvMem = partitionsRecv->mem(); + + + Neon::set::MemoryTransfer transfer({setIdxRecv, (void*)(recvMem + recvPitch)}, + {setIdxSend, (void*)(sendMem + sendPitch)}, + sizeof(T) * msgSizePerCardinality); + + if (ByDirection::up == sendDirection && !(stencil.points()[c].z > 0)) { + std::cout << "c " << c << " " << stencil.points()[c] << "skipped" << std::endl; + canBeFusedWithPrevious = false; + continue; + } + if (ByDirection::down == sendDirection && !(stencil.points()[c].z < 0)) { + std::cout << "c " << c << " " << stencil.points()[c] << "skipped" << std::endl; + canBeFusedWithPrevious = false; + continue; + } + if (canBeFusedWithPrevious) { + + const T* begin = (recvMem + recvPitch); + const T* previous = (((T*)(transfersVec[transfersVec.size() - 1].src.mem)) + msgSizePerCardinality); + if (begin != previous) { + NEON_THROW_UNSUPPORTED_OPTION("begin != transfersVec[transfersVec.size() - 1].dst"); + } + transfersVec[transfersVec.size() - 1].size += sizeof(T) * msgSizePerCardinality; + std::cout << "c " << c << " " << stencil.points()[c] << "fused" << std::endl + << "new size = " << transfersVec[transfersVec.size() - 1].size << std::endl; + } else { + transfersVec.push_back(transfer); + std::cout << "c " << c << " " << stencil.points()[c] << "added " << transfer.toString() << std::endl; + canBeFusedWithPrevious = false; + } + } + } + }); +} + + +} // namespace Neon::domain::details::bGridMgpu diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bGrid.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bGrid.h new file mode 100644 index 00000000..b7d42d94 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bGrid.h @@ -0,0 +1,243 @@ +#pragma once +#include "Neon/core/core.h" + +#include "Neon/domain/aGrid.h" +#include 
"Neon/domain/details/bGridDisgMgpu/BlockView.h" +#include "Neon/domain/details/bGridDisgMgpu/StaticBlock.h" +#include "Neon/domain/details/bGridDisgMgpu/bField.h" +#include "Neon/domain/details/bGridDisgMgpu/bIndex.h" +#include "Neon/domain/details/bGridDisgMgpu/bPartition.h" +#include "Neon/domain/details/bGridDisgMgpu/bSpan.h" +#include "Neon/domain/interface/GridBaseTemplate.h" +#include "Neon/domain/patterns/PatternScalar.h" +#include "Neon/domain/tools/Partitioner1D.h" +#include "Neon/domain/tools/PointHashTable.h" +#include "Neon/domain/tools/SpanTable.h" +#include "Neon/set/Containter.h" +#include "Neon/set/LaunchParametersTable.h" +#include "Neon/set/memory/memSet.h" + +#include "bField.h" +#include "bPartition.h" +#include "bSpan.h" + +namespace Neon::domain::details::bGridMgpu { + + +template +class bField; + +template +class bGrid : public Neon::domain::interface::GridBaseTemplate, + bIndex> +{ + public: + using Grid = bGrid; + template + using Partition = bPartition; + template + using Field = Neon::domain::details::bGridMgpu::bField; + + using Span = bSpan; + using NghIdx = typename Partition::NghIdx; + using GridBaseTemplate = Neon::domain::interface::GridBaseTemplate>; + + using Idx = bIndex; + static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = Neon::set::details::ExecutionThreadSpan::d1b3; + using ExecutionThreadSpanIndexType = uint32_t; + + using BlockIdx = uint32_t; + + bGrid() = default; + virtual ~bGrid(); + + /** + * Constructor for the vanilla block data structure with depth of 1 + */ + template + bGrid(const Neon::Backend& backend, + const Neon::int32_3d& domainSize, + const ActiveCellLambda activeCellLambda, + const Neon::domain::Stencil& stencil, + const double_3d& spacingData = double_3d(1, 1, 1), + const double_3d& origin = double_3d(0, 0, 0), + Neon::domain::tool::spaceCurves::EncoderType encoderType = Neon::domain::tool::spaceCurves::EncoderType::sweep); + + + /** + * Constructor for bGrid. 
This constructor should be directly used only by mGrid + */ + template + bGrid(const Neon::Backend& backend /**< Neon backend for the computation */, + const Neon::int32_3d& domainSize /**< Size of the bounded Cartesian */, + const ActiveCellLambda activeCellLambda /**< Function that identify the user domain inside the boxed Cartesian discretization */, + const Neon::domain::Stencil& stencil /**< union of tall the stencil that will be used in the computation */, + const int multiResDiscreteIdxSpacing /**< Parameter for the multi-resolution. Index i and index (i+1) may be remapped as i*voxelSpacing and (i+1)* voxelSpacing. + * For a uniform bGrid, i.e outside the context of multi-resolution this parameter is always 1 */ + , + const double_3d& spacingData /** Physical spacing between two consecutive data points in the Cartesian domain */, + const double_3d& origin /** Physical location in space of the origin of the Cartesian discretization */, + Neon::domain::tool::spaceCurves::EncoderType encoderType = Neon::domain::tool::spaceCurves::EncoderType::sweep); + + /** + * Returns some properties for a given cartesian in the Cartesian domain. 
The provided index may be inside or outside the user-defined bounded Cartesian domain
+ */ + auto getSpan(Neon::Execution execution, + SetIdx setIdx, + Neon::DataView dataView) -> const Span&; + + /** + * Retrieve the block vew grid internally used. + * This grid can be leverage to allocate data at the block level. + */ + auto getBlockViewGrid() const -> BlockView::Grid&; + + + /** + * Retrieve the block vew grid internally used. + * This grid can be leverage to allocate data at the block level. + */ + auto getActiveBitMask() const -> BlockView::Field&; + + /** + * Helper function to retrieve the discrete index spacing used for the multi-resolution + */ + template + auto helGetMultiResDiscreteIdxSpacing() const -> std::enable_if_t; + + + /** + * Help function to retrieve the block connectivity as a BlockViewGrid field + */ + auto helpGetBlockConnectivity() const -> BlockView::Field&; + + /** + * Help function to retrieve the block origin as a BlockViewGrid field + */ + auto helpGetDataBlockOriginField() const -> Neon::aGrid::Field&; + + /** + * Help function to retrieve the map that converts a stencil point id to 3d offset + */ + auto helpGetStencilIdTo3dOffset() const -> Neon::set::MemSet&; + + auto helpGetPartitioner1D() -> Neon::domain::tool::Partitioner1D&; + + /** + * Help function retriev the device and the block index associated to a point in the BlockViewGrid grid + */ + auto helpGetSetIdxAndGridIdx(Neon::index_3d idx) const -> std::tuple; + + struct Data + { + auto init(const Neon::Backend& bk) + { + spanTable.init(bk); + launchParametersTable.init(bk); + } + + Neon::domain::tool::SpanTable spanTable /** Span for each data view configurations */; + Neon::set::LaunchParametersTable launchParametersTable; + + Neon::domain::tool::Partitioner1D partitioner1D; + Stencil stencil; + Neon::sys::patterns::Engine reduceEngine; + + Neon::aGrid memoryGrid /** memory allocator for fields */; + Neon::aGrid::Field mDataBlockOriginField; + Neon::set::MemSet mStencil3dTo1dOffset; + + BlockView::Grid blockViewGrid; + BlockView::Field activeBitField; + 
BlockView::Field blockConnectivity; + Neon::set::MemSet stencilIdTo3dOffset; + + int mMultiResDiscreteIdxSpacing; + + // number of active voxels in each block + Neon::set::DataSet mNumActiveVoxel; + + + // Stencil neighbor indices + Neon::set::MemSet mStencilNghIndex; + }; + std::shared_ptr mData; +}; + +constexpr int defaultBlockSize = 4; +using bGridMgpuDefault = bGrid>; +extern template class bGrid>; +} // namespace Neon::domain::details::bGridMgpu + +#include "bField_imp.h" +#include "bGrid_imp.h" diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bGrid_imp.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bGrid_imp.h new file mode 100644 index 00000000..1ebc0468 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bGrid_imp.h @@ -0,0 +1,470 @@ +#include "Neon/domain/details/bGridDisgMgpu/bGrid.h" +#include "Neon/domain/tools/SpaceCurves.h" + +namespace Neon::domain::details::bGridMgpu { + +template +template +bGrid::bGrid(const Neon::Backend& backend, + const Neon::int32_3d& domainSize, + const ActiveCellLambda activeCellLambda, + const Neon::domain::Stencil& stencil, + const double_3d& spacingData, + const double_3d& origin, + Neon::domain::tool::spaceCurves::EncoderType encoderType) + : bGrid(backend, domainSize, activeCellLambda, stencil, 1, spacingData, origin, encoderType) +{ +} + +template +template +bGrid::bGrid(const Neon::Backend& backend, + const Neon::int32_3d& domainSize, + const ActiveCellLambda activeCellLambda, + const Neon::domain::Stencil& stencil, + const int multiResDiscreteIdxSpacing, + const double_3d& spacingData, + const double_3d& origin, + Neon::domain::tool::spaceCurves::EncoderType encoderType) +{ + + + mData = std::make_shared(); + mData->init(backend); + + mData->mMultiResDiscreteIdxSpacing = multiResDiscreteIdxSpacing; + mData->stencil = stencil; + const index_3d defaultKernelBlockSize(SBlock::memBlockSizeX, + SBlock::memBlockSizeY, + SBlock::memBlockSizeZ); + + std::stringstream 
gridName; + gridName << "bGridMgpu_" << SBlock::memBlockSizeX << "_" + << SBlock::memBlockSizeY << "_" + << SBlock::memBlockSizeZ; + { + auto nElementsPerPartition = backend.devSet().template newDataSet(0); + // We do an initialization with nElementsPerPartition to zero, + // then we reset to the computed number. + + bGrid::GridBase::init(gridName.str(), + backend, + domainSize, + stencil, + nElementsPerPartition, + defaultKernelBlockSize, + multiResDiscreteIdxSpacing, + origin, + encoderType, + defaultKernelBlockSize); + } + + { // Initialization of the partitioner + + mData->partitioner1D = Neon::domain::tool::Partitioner1D( + backend, + activeCellLambda, + nullptr, + SBlock::memBlockSize3D.template newType(), + domainSize, + Neon::domain::Stencil::s27_t(false), + encoderType, + multiResDiscreteIdxSpacing); + + mData->mDataBlockOriginField = mData->partitioner1D.getGlobalMapping(); + mData->mStencil3dTo1dOffset = mData->partitioner1D.getStencil3dTo1dOffset(); + mData->memoryGrid = mData->partitioner1D.getMemoryGrid(); + } + + { // BlockViewGrid + Neon::domain::details::eGrid::eGrid egrid( + backend, + mData->partitioner1D.getBlockSpan(), + mData->partitioner1D, + Neon::domain::Stencil::s27_t(false), + spacingData * SBlock::memBlockSize3D, + origin); + + mData->blockViewGrid = BlockView::Grid(egrid); + } + + { // Active bitmask + mData->activeBitField = mData->blockViewGrid.template newField( + "BlockViewBitMask", + 1, + [] { + typename SBlock::BitMask outsideBitMask; + outsideBitMask.reset(); + return outsideBitMask; + }(), + Neon::DataUse::HOST_DEVICE, backend.getMemoryOptions(BlockView::layout)); + + mData->mNumActiveVoxel = backend.devSet().template newDataSet(); + + mData->activeBitField + .getGrid() + .template newContainer( + "activeBitMaskInit", + [&, this](Neon::set::Loader& loader) { + auto bitMaskPartition = loader.load(mData->activeBitField); + return [&, bitMaskPartition](const auto& bitMaskIdx) mutable { + auto prtIdx = bitMaskPartition.prtID(); + 
int countActive = 0; + auto const blockOrigin = bitMaskPartition.getGlobalIndex(bitMaskIdx); + typename SBlock::BitMask& bitMask = bitMaskPartition(bitMaskIdx, 0); + bitMask.reset(); + + for (int k = 0; k < SBlock::memBlockSize3D.template newType().z; k++) { + for (int j = 0; j < SBlock::memBlockSize3D.template newType().y; j++) { + for (int i = 0; i < SBlock::memBlockSize3D.template newType().x; i++) { + auto globalPosition = blockOrigin + Neon::int32_3d(i * this->mData->mMultiResDiscreteIdxSpacing, + j * this->mData->mMultiResDiscreteIdxSpacing, + k * this->mData->mMultiResDiscreteIdxSpacing); + bool const isInDomain = globalPosition < domainSize * this->mData->mMultiResDiscreteIdxSpacing; + bool const isActive = activeCellLambda(globalPosition); + if (isActive && isInDomain) { + countActive++; + bitMask.setActive(i, j, k); + } + } + } + } +#pragma omp critical + { + this->mData->mNumActiveVoxel[prtIdx] += countActive; + } + }; + }) + .run(Neon::Backend::mainStreamIdx); + + + mData->activeBitField.updateDeviceData(Neon::Backend::mainStreamIdx); + this->getBackend().sync(Neon::Backend::mainStreamIdx); + mData->activeBitField.newHaloUpdate(Neon::set::StencilSemantic::standard, + Neon::set::TransferMode::put, + Neon::Execution::device) + .run(Neon::Backend::mainStreamIdx); + } + + + { // Neighbor blocks + mData->blockConnectivity = mData->blockViewGrid.template newField("blockConnectivity", + 27, + Span::getInvalidBlockId(), + Neon::DataUse::HOST_DEVICE, + Neon::MemoryLayout::arrayOfStructs); + + mData->blockConnectivity.getGrid().template newContainer( + "blockConnectivityInit", + [&](Neon::set::Loader& loader) { + auto blockConnectivity = loader.load(mData->blockConnectivity); + return [&, blockConnectivity](auto const& idx) mutable { + for (int8_t k = 0; k < 3; k++) { + for (int8_t j = 0; j < 3; j++) { + for (int8_t i = 0; i < 3; i++) { + auto targetDirection = i + 3 * j + 3 * 3 * k; + BlockIdx blockNghIdx = Span::getInvalidBlockId(); + typename 
decltype(blockConnectivity)::Idx nghIdx; + Neon::int8_3d stencilPoint(i - int8_t(1), + j - int8_t(1), + k - int8_t(1)); + bool isValid = blockConnectivity.getNghIndex(idx, stencilPoint, nghIdx); + if (isValid) { + blockNghIdx = static_cast(nghIdx.helpGet()); + } + blockConnectivity(idx, targetDirection) = blockNghIdx; + } + } + } + }; + }) + .run(Neon::Backend::mainStreamIdx); + mData->blockConnectivity.updateDeviceData(Neon::Backend::mainStreamIdx); + } + + // Initialization of the SPAN table + mData->spanTable.forEachConfiguration([&](Neon::Execution execution, + Neon::SetIdx setIdx, + Neon::DataView dw, + Span& span) { + span.mDataView = dw; + switch (dw) { + case Neon::DataView::STANDARD: { + span.mFirstDataBlockOffset = 0; + span.mDataView = dw; + span.mActiveMask = mData->activeBitField.getPartition(execution, setIdx, dw).mem(); + break; + } + case Neon::DataView::BOUNDARY: { + span.mFirstDataBlockOffset = mData->partitioner1D.getSpanClassifier().countInternal(setIdx); + span.mDataView = dw; + span.mActiveMask = mData->activeBitField.getPartition(execution, setIdx, dw).mem(); + + break; + } + case Neon::DataView::INTERNAL: { + span.mFirstDataBlockOffset = 0; + span.mDataView = dw; + span.mActiveMask = mData->activeBitField.getPartition(execution, setIdx, dw).mem(); + break; + } + default: { + NeonException exc("dFieldDev"); + NEON_THROW(exc); + } + } + }); + + { // Stencil Idx to 3d offset + auto nPoints = backend.devSet().newDataSet(stencil.nNeighbours()); + mData->stencilIdTo3dOffset = backend.devSet().template newMemSet(Neon::DataUse::HOST_DEVICE, + 1, + backend.getMemoryOptions(), + nPoints); + for (int i = 0; i < stencil.nNeighbours(); ++i) { + for (int devIdx = 0; devIdx < backend.devSet().setCardinality(); devIdx++) { + index_3d pLong = stencil.neighbours()[i]; + Neon::int8_3d pShort = pLong.newType(); + mData->stencilIdTo3dOffset.eRef(devIdx, i) = pShort; + } + } + mData->stencilIdTo3dOffset.updateDeviceData(backend, Neon::Backend::mainStreamIdx); + } 
+ // Init the base grid + bGrid::GridBase::init(gridName.str(), + backend, + domainSize, + Neon::domain::Stencil(), + mData->mNumActiveVoxel, + SBlock::memBlockSize3D.template newType(), + spacingData, + origin, + encoderType, + defaultKernelBlockSize); + { // setting launchParameters + mData->launchParametersTable.forEachSeq([&](Neon::DataView dw, + Neon::set::LaunchParameters& bLaunchParameters) { + auto defEGridBlock = mData->blockViewGrid.getDefaultBlock(); + auto eGridParams = mData->blockViewGrid.getLaunchParameters(dw, defEGridBlock, 0); + eGridParams.forEachSeq([&](Neon::SetIdx setIdx, Neon::sys::GpuLaunchInfo const& launchSingleDev) { + auto eDomainGridSize = launchSingleDev.domainGrid(); + assert(eDomainGridSize.y == 1); + assert(eDomainGridSize.z == 1); + int nBlocks = static_cast(eDomainGridSize.x); + bLaunchParameters.get(setIdx).set(Neon::sys::GpuLaunchInfo::mode_e::cudaGridMode, + nBlocks, SBlock::memBlockSize3D.template newType(), 0); + }); + }); + } +} + +template +template +auto bGrid::newField(const std::string name, + int cardinality, + T inactiveValue, + Neon::DataUse dataUse, + Neon::MemoryOptions memoryOptions) const -> Field +{ + memoryOptions = this->getDevSet().sanitizeMemoryOption(memoryOptions); + Field field(name, dataUse, memoryOptions, *this, cardinality, inactiveValue); + + return field; +} + +template +template +auto bGrid::newBlockViewField(const std::string name, + int cardinality, + T inactiveValue, + Neon::DataUse dataUse, + Neon::MemoryOptions memoryOptions) const -> BlockView::Field +{ + memoryOptions = this->getDevSet().sanitizeMemoryOption(memoryOptions); + BlockView::Field blockViewField = mData->blockViewGrid.template newField(name, cardinality, inactiveValue, dataUse, memoryOptions); + return blockViewField; +} + +template +template +auto bGrid::newContainer(const std::string& name, + index_3d blockSize, + size_t sharedMem, + LoadingLambda lambda) const -> Neon::set::Container +{ + Neon::set::Container kContainer = 
Neon::set::Container::factory(name, + Neon::set::internal::ContainerAPI::DataViewSupport::on, + *this, + lambda, + blockSize, + [sharedMem](const Neon::index_3d&) { return sharedMem; }); + return kContainer; +} + +template +template +auto bGrid::newContainer(const std::string& name, + LoadingLambda lambda) const -> Neon::set::Container +{ + const Neon::index_3d& defaultBlockSize = this->getDefaultBlock(); + Neon::set::Container kContainer = Neon::set::Container::factory(name, + Neon::set::internal::ContainerAPI::DataViewSupport::on, + *this, + lambda, + defaultBlockSize, + [](const Neon::index_3d&) { return 0; }); + return kContainer; +} + +template +auto bGrid:: + getBlockViewGrid() + const -> BlockView::Grid& +{ + return mData->blockViewGrid; +} + +template +auto bGrid:: + getActiveBitMask() + const -> BlockView::Field& +{ + return mData->activeBitField; +} + +/** + * Helper function to retrieve the discrete index spacing used for the multi-resolution + */ +template +template +auto bGrid::helGetMultiResDiscreteIdxSpacing() const + -> std::enable_if_t +{ + return mData->mMultiResDiscreteIdxSpacing; +} + +template +auto bGrid:: + helpGetBlockConnectivity() + const -> BlockView::Field& +{ + return mData->blockConnectivity; +} +template +auto bGrid:: + helpGetDataBlockOriginField() + const -> Neon::aGrid::Field& +{ + return mData->mDataBlockOriginField; +} +template +auto bGrid::getSpan(Neon::Execution execution, + SetIdx setIdx, + Neon::DataView dataView) -> const bGrid::Span& +{ + return mData->spanTable.getSpan(execution, setIdx, dataView); +} + +template +bGrid::~bGrid() +{ +} +template +auto bGrid::getSetIdx(const index_3d& idx) const -> int32_t +{ + typename GridBaseTemplate::CellProperties cellProperties; + + cellProperties.setIsInside(this->isInsideDomain(idx)); + if (!cellProperties.isInside()) { + return -1; + } + Neon::SetIdx setIdx = cellProperties.getSetIdx(); + return setIdx; +} +template +auto bGrid::getLaunchParameters(Neon::DataView dataView, + const 
index_3d&, + const size_t& sharedMem) const -> Neon::set::LaunchParameters +{ + auto res = mData->launchParametersTable.get(dataView); + res.forEachSeq([&](SetIdx const& /*setIdx*/, + Neon::set::LaunchParameters::launchInfo_e& launchParams) -> void { + launchParams.setShm(sharedMem); + }); + return res; +} + +template +auto bGrid:: + helpGetStencilIdTo3dOffset() + const -> Neon::set::MemSet& +{ + return mData->stencilIdTo3dOffset; +} + +template +auto bGrid::isInsideDomain(const index_3d& idx) const -> bool +{ + // 1. check if the block is active + const BlockView::index_3d blockIdx3d = idx / (SBlock::memBlockSize3D.template newType() * mData->mMultiResDiscreteIdxSpacing); + auto blockProperties = mData->blockViewGrid.getProperties(blockIdx3d); + + if (!blockProperties.isInside()) { + return false; + } + // 2. The block is active, check the element in the block + typename SBlock::BitMask const& bitMask = mData->activeBitField.getReference(blockIdx3d, 0); + + bool isActive = bitMask.isActive((idx.x / mData->mMultiResDiscreteIdxSpacing) % SBlock::memBlockSize3D.x, + (idx.y / mData->mMultiResDiscreteIdxSpacing) % SBlock::memBlockSize3D.y, + (idx.z / mData->mMultiResDiscreteIdxSpacing) % SBlock::memBlockSize3D.z); + return isActive; +} + +template +auto bGrid::getProperties(const index_3d& idx) + const -> typename GridBaseTemplate::CellProperties +{ + typename GridBaseTemplate::CellProperties cellProperties; + + cellProperties.setIsInside(this->isInsideDomain(idx)); + if (!cellProperties.isInside()) { + return cellProperties; + } + + if (this->getDevSet().setCardinality() == 1) { + cellProperties.init(0, DataView::INTERNAL); + } else { + const index_3d blockIdx3d = idx / SBlock::memBlockSize3D.template newType(); + auto blockViewProperty = mData->blockViewGrid.getProperties(blockIdx3d); + + cellProperties.init(blockViewProperty.getSetIdx(), + blockViewProperty.getDataView()); + } + return cellProperties; +} + +template +auto 
bGrid::helpGetSetIdxAndGridIdx(Neon::index_3d idx) + const -> std::tuple +{ + const index_3d blockIdx3d = idx / (SBlock::memBlockSize3D.template newType() * mData->mMultiResDiscreteIdxSpacing); + auto [setIdx, bvGridIdx] = mData->blockViewGrid.helpGetSetIdxAndGridIdx(blockIdx3d); + Idx bIdx; + bIdx.mDataBlockIdx = bvGridIdx.helpGet(); + bIdx.mInDataBlockIdx.x = static_cast((idx.x / mData->mMultiResDiscreteIdxSpacing) % SBlock::memBlockSize3D.x); + bIdx.mInDataBlockIdx.y = static_cast((idx.y / mData->mMultiResDiscreteIdxSpacing) % SBlock::memBlockSize3D.y); + bIdx.mInDataBlockIdx.z = static_cast((idx.z / mData->mMultiResDiscreteIdxSpacing) % SBlock::memBlockSize3D.z); + + return {setIdx, bIdx}; +} + +template +auto bGrid::helpGetPartitioner1D() -> Neon::domain::tool::Partitioner1D& +{ + return mData->partitioner1D; +} + +} // namespace Neon::domain::details::bGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bIndex.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bIndex.h new file mode 100644 index 00000000..0452cfc9 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bIndex.h @@ -0,0 +1,142 @@ +#pragma once + +#include "Neon/core/core.h" + + +namespace Neon::domain::details::bGridMgpu { + +// Common forward declarations +template +class bGrid; +template +class bSpan; +template +class bPartition; + +class MicroIndex +{ + public: + using TrayIdx = int32_t; + using InTrayIdx = int8_3d; + + NEON_CUDA_HOST_DEVICE inline explicit MicroIndex() + : MicroIndex(0, 0, 0, 0) + { + } + + NEON_CUDA_HOST_DEVICE inline explicit MicroIndex(const TrayIdx& blockIdx, + const InTrayIdx::Integer& x, + const InTrayIdx::Integer& y, + const InTrayIdx::Integer& z) + { + mTrayBlockIdx = blockIdx; + mInTrayBlockIdx.x = x; + mInTrayBlockIdx.y = y; + mInTrayBlockIdx.z = z; + } + + NEON_CUDA_HOST_DEVICE inline auto getInTrayBlockIdx() const -> InTrayIdx const& + { + return mInTrayBlockIdx; + } + + 
NEON_CUDA_HOST_DEVICE inline auto getTrayBlockIdx() const -> TrayIdx const& + { + return mTrayBlockIdx; + } + + NEON_CUDA_HOST_DEVICE inline auto setInTrayBlockIdx(InTrayIdx const& inTrayIdx) -> void + { + mInTrayBlockIdx = inTrayIdx; + } + + NEON_CUDA_HOST_DEVICE inline auto setTrayBlockIdx(TrayIdx const& trayIdx) -> void + { + mTrayBlockIdx = trayIdx; + } + + InTrayIdx mInTrayBlockIdx; + TrayIdx mTrayBlockIdx{}; +}; + +template +class bIndex +{ + public: + template + friend class bSpan; + using OuterIdx = bIndex; + + using NghIdx = int8_3d; + template + friend class bPartition; + + template + friend class bField; + + template + friend class bSpan; + template + friend class bGrid; + + + using TrayIdx = MicroIndex::TrayIdx; + using InTrayIdx = MicroIndex::InTrayIdx; + + using DataBlockCount = std::make_unsigned_t; + using DataBlockIdx = std::make_unsigned_t; + using InDataBlockIdx = InTrayIdx; + + bIndex() = default; + ~bIndex() = default; + + NEON_CUDA_HOST_DEVICE inline explicit bIndex(const DataBlockIdx& blockIdx, + const InDataBlockIdx::Integer& x, + const InDataBlockIdx::Integer& y, + const InDataBlockIdx::Integer& z); + + NEON_CUDA_HOST_DEVICE inline auto getMicroIndex() -> MicroIndex; + NEON_CUDA_HOST_DEVICE inline auto init(MicroIndex const&) -> void; + + NEON_CUDA_HOST_DEVICE inline auto getInDataBlockIdx() const -> InDataBlockIdx const&; + NEON_CUDA_HOST_DEVICE inline auto getDataBlockIdx() const -> DataBlockIdx const&; + NEON_CUDA_HOST_DEVICE inline auto setInDataBlockIdx(InDataBlockIdx const&) -> void; + NEON_CUDA_HOST_DEVICE inline auto setDataBlockIdx(DataBlockIdx const&) -> void; + NEON_CUDA_HOST_DEVICE inline auto isActive() const -> bool; + // the local index within the block + InDataBlockIdx mInDataBlockIdx; + DataBlockIdx mDataBlockIdx{}; +}; + +template +NEON_CUDA_HOST_DEVICE auto bIndex::setDataBlockIdx(const bIndex::DataBlockIdx& dataBlockIdx) -> void +{ + mDataBlockIdx = dataBlockIdx; +} + +template +NEON_CUDA_HOST_DEVICE auto 
bIndex::setInDataBlockIdx(const bIndex::InDataBlockIdx& inDataBlockIdx) -> void +{ + mInDataBlockIdx = inDataBlockIdx; +} + +template +NEON_CUDA_HOST_DEVICE auto bIndex::getDataBlockIdx() const -> const bIndex::DataBlockIdx& +{ + return mDataBlockIdx; +} +template +NEON_CUDA_HOST_DEVICE auto bIndex::getInDataBlockIdx() const -> const bIndex::InDataBlockIdx& +{ + return mInDataBlockIdx; +} + +template +NEON_CUDA_HOST_DEVICE auto bIndex::isActive() const -> bool +{ + return mDataBlockIdx != std::numeric_limits::max(); +} + +} // namespace Neon::domain::details::bGrid + +#include "Neon/domain/details/bGrid/bIndex_imp.h" diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bIndex_imp.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bIndex_imp.h new file mode 100644 index 00000000..9da43b62 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bIndex_imp.h @@ -0,0 +1,67 @@ +#pragma once +#include "Neon/domain/details/bGridDisgMgpu/bIndex.h" + +namespace Neon::domain::details::bGridMgpu { + +template +NEON_CUDA_HOST_DEVICE inline bIndex:: + bIndex(const DataBlockIdx& blockIdx, + const InDataBlockIdx::Integer& x, + const InDataBlockIdx::Integer& y, + const InDataBlockIdx::Integer& z) +{ + mDataBlockIdx = blockIdx; + mInDataBlockIdx.x = x; + mInDataBlockIdx.y = y; + mInDataBlockIdx.z = z; +} + + +template +NEON_CUDA_HOST_DEVICE inline auto bIndex::getMicroIndex() -> MicroIndex +{ + + + TrayIdx const exBlockOffset = mDataBlockIdx * (SBlock::blockRatioX * SBlock::blockRatioY * SBlock::blockRatioZ); + TrayIdx const exTrayOffset = [&] { + TrayIdx const trayBlockIdxX = mInDataBlockIdx.x / SBlock::userBlockSizeX; + TrayIdx const trayBlockIdxY = mInDataBlockIdx.y / SBlock::userBlockSizeY; + TrayIdx const trayBlockIdxZ = mInDataBlockIdx.z / SBlock::userBlockSizeZ; + + TrayIdx const res = trayBlockIdxX + trayBlockIdxY * SBlock::blockRatioX + + trayBlockIdxZ * (SBlock::blockRatioX * SBlock::blockRatioY); + return res; + }(); + 
MicroIndex res; + res.setTrayBlockIdx(exBlockOffset + exTrayOffset); + res.setInTrayBlockIdx({static_cast(mInDataBlockIdx.x % SBlock::userBlockSizeX), + static_cast(mInDataBlockIdx.y % SBlock::userBlockSizeY), + static_cast(mInDataBlockIdx.z % SBlock::userBlockSizeZ)}); + return res; +} + + +template +NEON_CUDA_HOST_DEVICE inline auto bIndex::init(MicroIndex const& microIndex) -> void +{ + constexpr uint32_t memBlockSize = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ; + constexpr uint32_t userBlockSize = SBlock::userBlockSizeX * SBlock::userBlockSizeY * SBlock::userBlockSizeZ; + constexpr uint32_t blockRatioSize = memBlockSize / userBlockSize; + + constexpr uint32_t blockRatioX = SBlock::memBlockSizeX / SBlock::userBlockSizeX; + constexpr uint32_t blockRatioY = SBlock::memBlockSizeY / SBlock::userBlockSizeY; + + mDataBlockIdx = microIndex.getTrayBlockIdx() / (blockRatioSize); + + uint32_t reminder = microIndex.getTrayBlockIdx() % (blockRatioSize); + + const uint32_t reminderInZ = reminder / (blockRatioX * blockRatioY); + mInDataBlockIdx.z = static_cast(microIndex.getInTrayBlockIdx().z + reminderInZ * SBlock::userBlockSizeZ); + reminder = reminder % (blockRatioX * blockRatioY); + const uint32_t reminderInY = reminder / (blockRatioX); + mInDataBlockIdx.y = static_cast(microIndex.getInTrayBlockIdx().y + reminderInY * SBlock::userBlockSizeY); + const uint32_t reminderInX = reminder % blockRatioX; + mInDataBlockIdx.x = static_cast(microIndex.getInTrayBlockIdx().x + reminderInX * SBlock::userBlockSizeX); +} + +} // namespace Neon::domain::details::bGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bPartition.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bPartition.h new file mode 100644 index 00000000..666f813b --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bPartition.h @@ -0,0 +1,218 @@ +#pragma once + +#include 
"Neon/domain/details/bGridDisgMgpu//bIndex.h" +#include "Neon/domain/details/bGridDisgMgpu//bSpan.h" + +#include "Neon/domain/interface/NghData.h" + +#include "Neon/sys/memory/CUDASharedMemoryUtil.h" + +namespace Neon::domain::details::bGridMgpu { + +template +class bSpan; + +template +class bPartition +{ + public: + enum Sectors + { + bUp = 0, + bDw = 1, + gUp = 2, + gDw = 3, + after = 4, + first = bUp + }; + + using Span = bSpan; + using Idx = bIndex; + using NghIdx = typename Idx::NghIdx; + using Type = T; + using NghData = Neon::domain::NghData; + + using BlockViewGrid = Neon::domain::tool::GridTransformer::Grid; + using BlockViewGridIdx = BlockViewGrid::Idx; + + public: + bPartition(); + + ~bPartition() = default; + + explicit bPartition(int setIdx, + int mCardinality, + T* mMem, + typename Idx::DataBlockIdx* mBlockConnectivity, + typename SBlock::BitMask const* NEON_RESTRICT mMask, + Neon::int32_3d* mOrigin, + NghIdx* mStencilNghIndex, + Neon::int32_3d mDomainSize, + typename Idx::DataBlockCount mFirstDataBUP, + typename Idx::DataBlockCount mFirstDataBDW, + typename Idx::DataBlockCount mFirstDataGUP, + typename Idx::DataBlockCount mFirstDataGDW, + typename Idx::DataBlockCount mLastDataGDW); + + /** + * Retrieve the cardinality of the field. + */ + inline NEON_CUDA_HOST_DEVICE auto + cardinality() + const -> int; + + /** + * Gets the field metadata at a cartesian point. + */ + inline NEON_CUDA_HOST_DEVICE auto + operator()(const Idx& cell, + int card) + -> T&; + + /** + * Gets the field metadata at a cartesian point. + */ + inline NEON_CUDA_HOST_DEVICE auto + operator()(const Idx& cell, + int card) + const -> const T&; + + /** + * Gets the field metadata at a neighbour cartesian point. + */ + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& cell, + const NghIdx& offset, + const int card) + const -> NghData; + + /** + * Gets the field metadata at a neighbour cartesian point. 
+ */ + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& eId, + uint8_t nghID, + int card) + const -> NghData; + + /** + * Gets the field metadata at a neighbour cartesian point. + */ + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& eId, + int card) + const -> NghData; + + /** + * Gets the field metadata at a neighbour cartesian point. + */ + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& eId, + int card, + T defaultValue) + const -> NghData; + + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& gidx, + int card, + LambdaVALID funIfValid, + LambdaNOTValid funIfNOTValid = nullptr) + const -> std::enable_if_t && (std::is_invocable_v || std::is_same_v), void>; + + template + NEON_CUDA_HOST_DEVICE inline auto + writeNghData(const Idx& gidx, + int card, + T value) + -> bool; + + /** + * Gets the global coordinates of the cartesian point. + */ + NEON_CUDA_HOST_DEVICE inline auto + getGlobalIndex(const Idx& cell) + const -> Neon::index_3d; + + NEON_CUDA_HOST_DEVICE inline auto + isActive(const Idx& cell, + const typename SBlock::BitMask* mask = nullptr) const -> bool; + + + NEON_CUDA_HOST_DEVICE inline auto + getDomainSize() + const -> Neon::index_3d; + + NEON_CUDA_HOST_DEVICE + auto mem() const -> T const*; + + /** + * Gets the Idx for in the block view space. 
+ */ + NEON_CUDA_HOST_DEVICE inline auto + getBlockViewIdx(const Idx& cell) + const -> BlockViewGridIdx; + + + NEON_CUDA_HOST_DEVICE inline auto + helpGetPitch(const Idx& cell, int card) + const -> uint32_t; + + NEON_CUDA_HOST_DEVICE inline auto + helpGetValidIdxPitchExplicit(const Idx& idx, int card) + const -> uint32_t; + + NEON_CUDA_HOST_DEVICE inline auto + helpNghPitch(const Idx& nghIdx, int card) + const -> std::tuple; + + NEON_CUDA_HOST_DEVICE inline auto + helpGetNghIdx(const Idx& idx, const NghIdx& offset) + const -> Idx; + + template + NEON_CUDA_HOST_DEVICE inline auto + helpGetNghIdx(const Idx& idx) + const -> Idx; + + NEON_CUDA_HOST_DEVICE inline auto + helpGetNghIdx(const Idx& idx, const NghIdx& offset, const typename Idx::DataBlockIdx* blockConnectivity) + const -> Idx; + + template + NEON_CUDA_HOST_DEVICE inline auto + helpGetNghIdx(const Idx& idx, const typename Idx::DataBlockIdx* blockConnectivity) + const -> Idx; + + auto NEON_CUDA_HOST_DEVICE helpGetSectorFirstBlock(Sectors sector) const + -> typename Idx::DataBlockCount; + + auto NEON_CUDA_HOST_DEVICE helpGetSectorLength(Sectors sector) const + -> typename Idx::DataBlockCount; + + int mCardinality; + T* mMem; + NghIdx const* NEON_RESTRICT mStencilNghIndex; + typename Idx::DataBlockIdx const* NEON_RESTRICT mBlockConnectivity; + typename SBlock::BitMask const* NEON_RESTRICT mMask; + Neon::int32_3d const* NEON_RESTRICT mOrigin; + int mSetIdx; + int mMultiResDiscreteIdxSpacing = 1; + Neon::int32_3d mDomainSize; + + + + typename Idx::DataBlockCount mSectorFirstBlockIdx[Sectors::after + 1]; +}; + +} // namespace Neon::domain::details::bGridMgpu + +#include "Neon/domain/details/bGridDisgMgpu//bPartition_imp.h" \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bPartition_imp.h new file mode 100644 index 00000000..7fdf8bc4 --- /dev/null +++ 
b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bPartition_imp.h @@ -0,0 +1,517 @@ +#pragma once + +#include "Neon/domain/details/bGridDisgMgpu//bGrid.h" +#include "Neon/domain/details/bGridDisgMgpu/bSpan.h" + +namespace Neon::domain::details::bGridMgpu { + +template +bPartition::bPartition() + : mCardinality(0), + mMem(nullptr), + mStencilNghIndex(), + mBlockConnectivity(nullptr), + mMask(nullptr), + mOrigin(0), + mSetIdx(0) +{ +} + +template +bPartition:: + bPartition(int setIdx, + int cardinality, + T* mem, + typename Idx::DataBlockIdx* blockConnectivity, + typename SBlock::BitMask const* NEON_RESTRICT mask, + Neon::int32_3d* origin, + NghIdx* stencilNghIndex, + Neon::int32_3d mDomainSize, + typename Idx::DataBlockCount mFirstDataBUP, + typename Idx::DataBlockCount mFirstDataBDW, + typename Idx::DataBlockCount mFirstDataGUP, + typename Idx::DataBlockCount mFirstDataGDW, + typename Idx::DataBlockCount mLastDataGDW) + : mCardinality(cardinality), + mMem(mem), + mStencilNghIndex(stencilNghIndex), + mBlockConnectivity(blockConnectivity), + mMask(mask), + mOrigin(origin), + mSetIdx(setIdx), + mDomainSize(mDomainSize) +{ + mSectorFirstBlockIdx[Sectors::bUp] = mFirstDataBUP; + mSectorFirstBlockIdx[Sectors::bDw] = mFirstDataBDW; + mSectorFirstBlockIdx[Sectors::gUp] = mFirstDataGUP; + mSectorFirstBlockIdx[Sectors::gDw] = mFirstDataGDW; + mSectorFirstBlockIdx[Sectors::after] = mLastDataGDW; +} + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getGlobalIndex(const Idx& gidx) + const -> Neon::index_3d +{ + auto location = mOrigin[gidx.mDataBlockIdx]; + location.x += gidx.mInDataBlockIdx.x; + location.y += gidx.mInDataBlockIdx.y; + location.z += gidx.mInDataBlockIdx.z; + if constexpr (SBlock::isMultiResMode) { + return location * mMultiResDiscreteIdxSpacing; + } + return location; +} + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getDomainSize() + const -> Neon::index_3d +{ + return mDomainSize; +} + +template +NEON_CUDA_HOST_DEVICE 
inline auto bPartition:: + getBlockViewIdx(const Idx& gidx) + const -> BlockViewGridIdx +{ + BlockViewGridIdx res; + res.manualSet(gidx.getDataBlockIdx()); + return res; +} + +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: + cardinality() + const -> int +{ + return mCardinality; +} + +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: +operator()(const Idx& cell, + int card) -> T& +{ + return mMem[helpGetPitch(cell, card)]; +} + +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: +operator()(const Idx& cell, + int card) const -> const T& +{ + return mMem[helpGetPitch(cell, card)]; +} + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + mem() const -> T const* +{ + return mMem; +} + +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: + helpGetPitch(const Idx& idx, int card) + const -> uint32_t +{ + uint32_t const pitch = helpGetValidIdxPitchExplicit(idx, card); + return pitch; +} + +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: + helpGetValidIdxPitchExplicit(const Idx& idx, int card) + const -> uint32_t +{ + // we are in the internal sector, so we have the standard AoSoA + if (idx.getDataBlockIdx() < mSectorFirstBlockIdx[Sectors::first]) { + + uint32_t constexpr blockPitchByCard = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ; + uint32_t const inBlockInCardPitch = idx.mInDataBlockIdx.x + + SBlock::memBlockSizeX * idx.mInDataBlockIdx.y + + (SBlock::memBlockSizeX * SBlock::memBlockSizeY) * idx.mInDataBlockIdx.z; + uint32_t const blockAdnCardPitch = (idx.mDataBlockIdx * mCardinality + card) * blockPitchByCard; + uint32_t const pitch = blockAdnCardPitch + inBlockInCardPitch; + return pitch; + } + + // We switch to the other sector where we have a SoA + int sectorLenght = 0; + int myFirstBlock = 0; + + auto const myBlockIdx = idx.getDataBlockIdx(); + for (int mySector = Sectors::gDw; mySector >= 0; mySector--) { + if (myBlockIdx >= mSectorFirstBlockIdx[mySector]) { + sectorLenght = 
mSectorFirstBlockIdx[mySector + 1] - mSectorFirstBlockIdx[mySector]; + myFirstBlock = mSectorFirstBlockIdx[mySector]; + break; + } + } + int const denseX = (myBlockIdx - myFirstBlock) * SBlock::memBlockSizeX + idx.getInDataBlockIdx().x; + int const denseY = idx.mInDataBlockIdx.y; + int const denseZ = idx.mInDataBlockIdx.z; + + int const xStride = 1; + int const yStride = SBlock::memBlockSizeX * sectorLenght; + int const zStride = SBlock::memBlockSizeY * yStride; + + int const pitch = (SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ) * + card * + sectorLenght + + denseX * xStride + + denseY * yStride + + denseZ * zStride; + + int const fullPitch = myFirstBlock * + SBlock::memBlockSizeX * + SBlock::memBlockSizeY * + SBlock::memBlockSizeZ * + cardinality() + + pitch; + return fullPitch; +} + +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: + helpNghPitch(const Idx& nghIdx, int card) + const -> std::tuple +{ + if (nghIdx.mDataBlockIdx == Span::getInvalidBlockId()) { + return {false, 0}; + } + + const bool isActive = mMask[nghIdx.mDataBlockIdx].isActive(nghIdx.mInDataBlockIdx.x, nghIdx.mInDataBlockIdx.y, nghIdx.mInDataBlockIdx.z); + if (!isActive) { + return {false, 0}; + } + auto const offset = helpGetValidIdxPitchExplicit(nghIdx, card); + return {true, offset}; +} + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + helpGetNghIdx(const Idx& idx, + const NghIdx& offset) + const -> Idx +{ + return this->helpGetNghIdx(idx, offset, mBlockConnectivity); +} + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + helpGetNghIdx(const Idx& idx, + const NghIdx& offset, + const typename Idx::DataBlockIdx* blockConnectivity) + const -> Idx +{ + + typename Idx::InDataBlockIdx ngh(idx.mInDataBlockIdx.x + offset.x, + idx.mInDataBlockIdx.y + offset.y, + idx.mInDataBlockIdx.z + offset.z); + + /** + * 0 if no offset on the direction + * 1 positive offset + * -1 negative offset + */ + const int xFlag = ngh.x < 0 ? 
-1 : (ngh.x >= SBlock::memBlockSizeX ? +1 : 0); + const int yFlag = ngh.y < 0 ? -1 : (ngh.y >= SBlock::memBlockSizeX ? +1 : 0); + const int zFlag = ngh.z < 0 ? -1 : (ngh.z >= SBlock::memBlockSizeX ? +1 : 0); + + const bool isLocal = (xFlag | yFlag | zFlag) == 0; + if (!(isLocal)) { + typename Idx::InDataBlockIdx remoteInBlockOffset; + /** + * Example + * - 8 block (1D case) + * Case 1: + * |0,1,2,3|0,1,2,3|0,1,2,3| + * ^ ^ + * -3 starting point + * + * - idx.inBlock = 2 + * - offset = -1 + * - remote.x = (2-3) - ((-1) * 4) = -1 + 4 = 3 + * Case 2: + * |0,1,2,3|0,1,2,3|0,1,2,3| + * ^ ^ + * starting point +3 from 3 + * + * - idx.inBlock = 3 + * - offset = (+3,0) + * - remote.x = (7+3) - ((+1) * 8) = 10 - 8 = 2 + * + * |0,1,2,3|0,1,2,3|0,1,2,3| + * ^ ^ + * -3 from 0 +3 from 3 + * + * NOTE: if in one direction the neighbour offet is zero, xFalg is 0; + * */ + + Idx remoteNghIdx; + remoteNghIdx.mInDataBlockIdx.x = ngh.x - xFlag * SBlock::memBlockSizeX; + remoteNghIdx.mInDataBlockIdx.y = ngh.y - yFlag * SBlock::memBlockSizeX; + remoteNghIdx.mInDataBlockIdx.z = ngh.z - zFlag * SBlock::memBlockSizeX; + + int connectivityJump = idx.mDataBlockIdx * 27 + + (xFlag + 1) + + (yFlag + 1) * 3 + + (zFlag + 1) * 9; + remoteNghIdx.mDataBlockIdx = blockConnectivity[connectivityJump]; + + return remoteNghIdx; + } else { + Idx localNghIdx; + localNghIdx.mDataBlockIdx = idx.mDataBlockIdx; + localNghIdx.mInDataBlockIdx = ngh; + return localNghIdx; + } +} + +template +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + helpGetNghIdx(const Idx& idx) + const -> Idx +{ + return this->helpGetNghIdx(idx, mBlockConnectivity); +} + +template +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + helpGetNghIdx(const Idx& idx, const typename Idx::DataBlockIdx* blockConnectivity) + const -> Idx +{ + + typename Idx::InDataBlockIdx ngh(idx.mInDataBlockIdx.x + xOff, + idx.mInDataBlockIdx.y + yOff, + idx.mInDataBlockIdx.z + zOff); + + /** + * 0 if no offset on the direction + * 1 positive 
offset + * -1 negative offset + */ + const int xFlag = [&] { + if constexpr (xOff == 0) { + return 0; + } else { + return ngh.x < 0 ? -1 : (ngh.x >= SBlock::memBlockSizeX ? +1 : 0); + } + }(); + + + const int yFlag = [&] { + if constexpr (yOff == 0) { + return 0; + } else { + return ngh.y < 0 ? -1 : (ngh.y >= SBlock::memBlockSizeX ? +1 : 0); + } + }(); + const int zFlag = [&] { + if constexpr (zOff == 0) { + return 0; + } else { + return ngh.z < 0 ? -1 : (ngh.z >= SBlock::memBlockSizeX ? +1 : 0); + } + }(); + + const bool isLocal = (xFlag | yFlag | zFlag) == 0; + if (!(isLocal)) { + typename Idx::InDataBlockIdx remoteInBlockOffset; + /** + * Example + * - 8 block (1D case) + * Case 1: + * |0,1,2,3|0,1,2,3|0,1,2,3| + * ^ ^ + * -3 starting point + * + * - idx.inBlock = 2 + * - offset = -1 + * - remote.x = (2-3) - ((-1) * 4) = -1 + 4 = 3 + * Case 2: + * |0,1,2,3|0,1,2,3|0,1,2,3| + * ^ ^ + * starting point +3 from 3 + * + * - idx.inBlock = 3 + * - offset = (+3,0) + * - remote.x = (7+3) - ((+1) * 8) = 10 - 8 = 2 + * + * |0,1,2,3|0,1,2,3|0,1,2,3| + * ^ ^ + * -3 from 0 +3 from 3 + * + * NOTE: if in one direction the neighbour offet is zero, xFalg is 0; + * */ + + Idx remoteNghIdx; + remoteNghIdx.mInDataBlockIdx.x = ngh.x - xFlag * SBlock::memBlockSizeX; + remoteNghIdx.mInDataBlockIdx.y = ngh.y - yFlag * SBlock::memBlockSizeX; + remoteNghIdx.mInDataBlockIdx.z = ngh.z - zFlag * SBlock::memBlockSizeX; + + int connectivityJump = idx.mDataBlockIdx * 27 + + (xFlag + 1) + + (yFlag + 1) * 3 + + (zFlag + 1) * 9; + remoteNghIdx.mDataBlockIdx = blockConnectivity[connectivityJump]; + + return remoteNghIdx; + } else { + Idx localNghIdx; + localNghIdx.mDataBlockIdx = idx.mDataBlockIdx; + localNghIdx.mInDataBlockIdx = ngh; + return localNghIdx; + } +} + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getNghData(const Idx& eId, + uint8_t nghID, + int card) + const -> NghData +{ + NghIdx nghOffset = mStencilNghIndex[nghID]; + return getNghData(eId, nghOffset, card); +} + 
+template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getNghData(const Idx& idx, + const NghIdx& offset, + const int card) + const -> NghData +{ + NghData result; + bIndex nghIdx = helpGetNghIdx(idx, offset); + auto [isValid, pitch] = helpNghPitch(nghIdx, card); + if (!isValid) { + result.invalidate(); + return result; + } + auto const value = mMem[pitch]; + result.set(value, true); + return result; +} + +template +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getNghData(const Idx& idx, + int card) + const -> NghData +{ + NghData result; + bIndex nghIdx = helpGetNghIdx(idx); + auto [isValid, pitch] = helpNghPitch(nghIdx, card); + if (!isValid) { + result.invalidate(); + return result; + } + auto const value = mMem[pitch]; + result.set(value, true); + return result; +} + +template +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getNghData(const Idx& idx, + int card, + T defaultValue) + const -> NghData +{ + NghData result; + bIndex nghIdx = helpGetNghIdx(idx); + auto [isValid, pitch] = helpNghPitch(nghIdx, card); + if (!isValid) { + result.set(defaultValue, false); + return result; + } + auto const value = mMem[pitch]; + result.set(value, true); + return result; +} + +template + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getNghData(const Idx& gidx, + int card, + LambdaVALID funIfValid, + LambdaNOTValid funIfNOTValid) + const -> std::enable_if_t && (std::is_invocable_v || std::is_same_v), void> +{ + NghData result; + bIndex nghIdx = helpGetNghIdx(gidx); + auto [isValid, pitch] = helpNghPitch(nghIdx, card); + + if (isValid) { + auto const& value = mMem[pitch]; + funIfValid(value); + return; + } + + if constexpr (!std::is_same_v) { + funIfNOTValid(); + } + return; +} + +template +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + writeNghData(const Idx& gidx, + int card, + T value) + -> bool +{ + NghData result; + bIndex nghIdx = helpGetNghIdx(gidx); + auto [isValid, pitch] = helpNghPitch(nghIdx, card); + if 
(!isValid) { + return false; + } + mMem[pitch] = value; + return true; +} + +template +NEON_CUDA_HOST_DEVICE inline auto +bPartition::isActive(const Idx& cell, + const typename SBlock::BitMask* mask) const -> bool +{ + if (!mask) { + return mMask[cell.mDataBlockIdx].isActive(cell.mInDataBlockIdx.x, cell.mInDataBlockIdx.y, cell.mInDataBlockIdx.z); + } else { + return mask[cell.mDataBlockIdx].isActive(cell.mInDataBlockIdx.x, cell.mInDataBlockIdx.y, cell.mInDataBlockIdx.z); + } +} + +template +NEON_CUDA_HOST_DEVICE inline auto +bPartition::helpGetSectorFirstBlock(Sectors sector) const + -> typename Idx::DataBlockCount +{ + return mSectorFirstBlockIdx[sector]; +} + +template +NEON_CUDA_HOST_DEVICE inline auto +bPartition::helpGetSectorLength(Sectors sector) const + -> typename Idx::DataBlockCount +{ + return mSectorFirstBlockIdx[sector + 1] - mSectorFirstBlockIdx[sector]; +} +} // namespace Neon::domain::details::bGridMgpu \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bSpan.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bSpan.h new file mode 100644 index 00000000..397133c8 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bSpan.h @@ -0,0 +1,55 @@ +#pragma once + +#include "Neon/domain/details/bGridDisgMgpu/bIndex.h" + +namespace Neon::domain::details::bGridMgpu { + +template +class bSpan +{ + public: + // bit mask information + using BitMaskWordType = uint64_t; + + static constexpr uint32_t bitMaskStorageBitWidth = 64; + static constexpr Neon::MemoryLayout activeMaskMemoryLayout = Neon::MemoryLayout::arrayOfStructs; + static constexpr uint32_t log2OfbitMaskWordSize = 6; + + using Idx = bIndex; + friend class bGrid; + + static constexpr int SpaceDim = 3; + + bSpan() = default; + virtual ~bSpan() = default; + + NEON_CUDA_HOST_DEVICE inline static auto getInvalidBlockId() + -> typename Idx::DataBlockIdx + { + return std::numeric_limits::max(); + } + + inline bSpan( + typename 
Idx::DataBlockCount mFirstDataBlockOffset, + typename SBlock::BitMask const* NEON_RESTRICT mActiveMask, + Neon::DataView mDataView); + + NEON_CUDA_HOST_DEVICE inline auto setAndValidateCPUDevice( + Idx& bidx, + uint32_t const& threadIdx, + uint32_t const& x, + uint32_t const& y, + uint32_t const& z) const -> bool; + + NEON_CUDA_HOST_DEVICE inline auto setAndValidateGPUDevice( + Idx& bidx) const -> bool; + + + // We don't need to have a count on active blocks + typename Idx::DataBlockCount mFirstDataBlockOffset; + typename SBlock::BitMask const* NEON_RESTRICT mActiveMask; + Neon::DataView mDataView; +}; +} // namespace Neon::domain::details::bGrid + +#include "Neon/domain/details/bGridDisgMgpu/bSpan_imp.h" \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bSpan_imp.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bSpan_imp.h new file mode 100644 index 00000000..45e40eeb --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bSpan_imp.h @@ -0,0 +1,51 @@ +#include "Neon/domain/details/bGridDisgMgpu/bSpan.h" + +namespace Neon::domain::details::bGridMgpu { + +template +NEON_CUDA_HOST_DEVICE inline auto +bSpan::setAndValidateGPUDevice([[maybe_unused]] Idx& bidx) const -> bool +{ +#ifdef NEON_PLACE_CUDA_DEVICE + bidx.mDataBlockIdx = blockIdx.x + mFirstDataBlockOffset; + bidx.mInDataBlockIdx.x = threadIdx.x; + bidx.mInDataBlockIdx.y = threadIdx.y; + bidx.mInDataBlockIdx.z = threadIdx.z; + + const bool isActive = mActiveMask[bidx.mDataBlockIdx].isActive(bidx.mInDataBlockIdx.x, bidx.mInDataBlockIdx.y, bidx.mInDataBlockIdx.z); + + return isActive; +#else + NEON_THROW_UNSUPPORTED_OPERATION("Operation supported only on GPU"); +#endif +} + +template +NEON_CUDA_HOST_DEVICE inline auto +bSpan::setAndValidateCPUDevice(Idx& bidx, + uint32_t const& dataBlockIdx, + uint32_t const& x, + uint32_t const& y, + uint32_t const& z) const -> bool +{ + + bidx.mDataBlockIdx = dataBlockIdx + mFirstDataBlockOffset; + 
bidx.mInDataBlockIdx.x = static_cast(x); + bidx.mInDataBlockIdx.y = static_cast(y); + bidx.mInDataBlockIdx.z = static_cast(z); + const bool isActive = mActiveMask[dataBlockIdx].isActive(bidx.mInDataBlockIdx.x, bidx.mInDataBlockIdx.y, bidx.mInDataBlockIdx.z); + return isActive; +} + +template +bSpan::bSpan(typename Idx::DataBlockCount firstDataBlockOffset, + typename SBlock::BitMask const* NEON_RESTRICT activeMask, + Neon::DataView dataView) + : mFirstDataBlockOffset(firstDataBlockOffset), + mActiveMask(activeMask), + mDataView(dataView) +{ +} + + +} // namespace Neon::domain::details::bGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/eField_imp.h b/libNeonDomain/include/Neon/domain/details/eGrid/eField_imp.h index 2427dc57..7fab92b1 100644 --- a/libNeonDomain/include/Neon/domain/details/eGrid/eField_imp.h +++ b/libNeonDomain/include/Neon/domain/details/eGrid/eField_imp.h @@ -75,7 +75,7 @@ eField::eField(const std::string& fieldUserName, } #if 0 { // Setting Reduction information - mData->partitionTable.forEachConfigurationWithUserData( + mData->mPartitionTable.forEachConfigurationWithUserData( [&](Neon::Execution, Neon::SetIdx setIdx, Neon::DataView dw, @@ -322,7 +322,7 @@ auto eField::helpHaloUpdate(SetIdx setIdx, T* src = [&]() { auto southDevice = setId; - auto& partition = mData->partitionTable.getPartition(execution, + auto& partition = mData->mPartitionTable.getPartition(execution, southDevice, Neon::DataView::STANDARD); dIndex firstBoundaryNorthCell(0, 0, partition.dim.z - mData->zHaloDim); @@ -332,7 +332,7 @@ auto eField::helpHaloUpdate(SetIdx setIdx, T* dst = [&]() { auto northDevice = setId + 1; - auto& partition = mData->partitionTable.getPartition(execution, + auto& partition = mData->mPartitionTable.getPartition(execution, northDevice, Neon::DataView::STANDARD); dIndex firstBoundarySouthCell(0, 0, 0); @@ -355,7 +355,7 @@ auto eField::helpHaloUpdate(SetIdx setIdx, const size_t transferBytes = sizeof(T) * 
mData->zHaloDim * mData->pitch[setId].z; if (setId != setCardinality - 1) { // Addressing all partitions that needs to send data north - auto& partition = mData->partitionTable.getPartition(Neon::Execution::device, + auto& partition = mData->mPartitionTable.getPartition(Neon::Execution::device, setId, Neon::DataView::STANDARD); diff --git a/libNeonDomain/include/Neon/domain/tools/partitioning/SpanClassifier.h b/libNeonDomain/include/Neon/domain/tools/partitioning/SpanClassifier.h index 9ab62b43..ef52e7ef 100644 --- a/libNeonDomain/include/Neon/domain/tools/partitioning/SpanClassifier.h +++ b/libNeonDomain/include/Neon/domain/tools/partitioning/SpanClassifier.h @@ -272,7 +272,6 @@ namespace Neon::domain::tool::partitioning { if (activeCellLambda(globalId)) { isActiveBlock = true; if (whatdomain == ByDomain::bc) { - //std::cout << "HERE " << blockOrigin << std::endl; byDomain = ByDomain::bc; doBreak = true; } diff --git a/libNeonDomain/src/domain/details/bGridDisgMask/bFieldReduceKernels.cu b/libNeonDomain/src/domain/details/bGridDisgMask/bFieldReduceKernels.cu new file mode 100644 index 00000000..678565ea --- /dev/null +++ b/libNeonDomain/src/domain/details/bGridDisgMask/bFieldReduceKernels.cu @@ -0,0 +1,62 @@ +#include "Neon/domain/details/dGrid/dGrid.h" +#include "Neon/domain/patterns/ReduceKernels.cuh" + +namespace Neon::domain::details::bGridMask { +#if 0 +template +auto dFieldDev::dotCUB( + Neon::set::patterns::BlasSet& blasSet, + const dFieldDev& input, + Neon::set::MemDevSet& output, + const Neon::DataView& dataView) -> void +{ + Neon::domain::details::dotCUB(blasSet, + grid(), + *this, + input, + output, + dataView); +} + + +template +auto dFieldDev::norm2CUB( + Neon::set::patterns::BlasSet& blasSet, + Neon::set::MemDevSet& output, + const Neon::DataView& dataView) -> void +{ + Neon::domain::details::norm2CUB(blasSet, + grid(), + *this, + output, + dataView); +} + +template void dFieldDev::dotCUB(Neon::set::patterns::BlasSet&, + const dFieldDev&, + 
Neon::set::MemDevSet&, + const Neon::DataView&); + +template void dFieldDev::dotCUB(Neon::set::patterns::BlasSet&, + const dFieldDev&, + Neon::set::MemDevSet&, + const Neon::DataView&); + +template void dFieldDev::dotCUB(Neon::set::patterns::BlasSet&, + const dFieldDev&, + Neon::set::MemDevSet&, + const Neon::DataView&); + +template void dFieldDev::norm2CUB(Neon::set::patterns::BlasSet&, + Neon::set::MemDevSet&, + const Neon::DataView&); + +template void dFieldDev::norm2CUB(Neon::set::patterns::BlasSet&, + Neon::set::MemDevSet&, + const Neon::DataView&); + +template void dFieldDev::norm2CUB(Neon::set::patterns::BlasSet&, + Neon::set::MemDevSet&, + const Neon::DataView&); +#endif +} // namespace Neon::domain::details::dGrid \ No newline at end of file diff --git a/libNeonDomain/src/domain/details/bGridDisgMask/bGrid.cpp b/libNeonDomain/src/domain/details/bGridDisgMask/bGrid.cpp new file mode 100644 index 00000000..d5e3126d --- /dev/null +++ b/libNeonDomain/src/domain/details/bGridDisgMask/bGrid.cpp @@ -0,0 +1,8 @@ +#include "Neon/domain/details/bGridDisgMask/bGrid.h" + +namespace Neon::domain::details::disaggregated::bGridMask { + + +template class bGrid>; + +} // namespace Neon::domain::details::bGrid \ No newline at end of file diff --git a/libNeonDomain/src/domain/details/bGridDisgMgpu/bFieldReduceKernels.cu b/libNeonDomain/src/domain/details/bGridDisgMgpu/bFieldReduceKernels.cu new file mode 100644 index 00000000..e69de29b diff --git a/libNeonDomain/src/domain/details/bGridDisgMgpu/bGrid.cpp b/libNeonDomain/src/domain/details/bGridDisgMgpu/bGrid.cpp new file mode 100644 index 00000000..a54e64ca --- /dev/null +++ b/libNeonDomain/src/domain/details/bGridDisgMgpu/bGrid.cpp @@ -0,0 +1,7 @@ +#include "Neon/domain/details/bGridDisgMgpu/bGrid.h" + +namespace Neon::domain::details::bGridMgpu { + +template class bGrid>; + +} // namespace Neon::domain::details::bGrid \ No newline at end of file diff --git a/libNeonDomain/tests/domain-globalIdx/src/globalIdx.cu 
b/libNeonDomain/tests/domain-globalIdx/src/globalIdx.cu index e74c61bc..21c75926 100644 --- a/libNeonDomain/tests/domain-globalIdx/src/globalIdx.cu +++ b/libNeonDomain/tests/domain-globalIdx/src/globalIdx.cu @@ -102,5 +102,6 @@ template auto run(TestData&) - template auto run(TestData&) -> void; template auto run(TestData&) -> void; template auto run(TestData&) -> void; +template auto run(TestData&) -> void; } // namespace globalIdx \ No newline at end of file diff --git a/libNeonDomain/tests/domain-globalIdx/src/globalIdx.h b/libNeonDomain/tests/domain-globalIdx/src/globalIdx.h index 616c46f0..117fdcd5 100644 --- a/libNeonDomain/tests/domain-globalIdx/src/globalIdx.h +++ b/libNeonDomain/tests/domain-globalIdx/src/globalIdx.h @@ -19,6 +19,7 @@ extern template auto run(TestData(TestData&) -> void; extern template auto run(TestData&) -> void; extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; } // namespace globalIdx diff --git a/libNeonDomain/tests/domain-globalIdx/src/gtests.cpp b/libNeonDomain/tests/domain-globalIdx/src/gtests.cpp index 4e2f193d..d6b91edd 100644 --- a/libNeonDomain/tests/domain-globalIdx/src/gtests.cpp +++ b/libNeonDomain/tests/domain-globalIdx/src/gtests.cpp @@ -41,6 +41,15 @@ TEST(domain_globalIdx, dGridDisg) 1); } +TEST(domain_globalIdx, bGridMgpu) +{ + int nGpus = 3; + using Type = int64_t; + runAllTestConfiguration(std::function(globalIdx::run), + nGpus, + 1); +} + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/libNeonDomain/tests/domain-map-disg/src/gtests.cpp b/libNeonDomain/tests/domain-map-disg/src/gtests.cpp index b5b77c56..3664f646 100644 --- a/libNeonDomain/tests/domain-map-disg/src/gtests.cpp +++ b/libNeonDomain/tests/domain-map-disg/src/gtests.cpp @@ -4,7 +4,7 @@ #include "map.h" #include "runHelper.h" -TEST(domain_map, bGridDisg) +TEST(domain_map_disg, bGridDisg) { int nGpus = 1; using Type = int64_t; @@ -13,7 +13,7 @@ TEST(domain_map, bGridDisg) 1); } 
-TEST(domain_map_dataView, bGridDisg) +TEST(domain_map_disg_dataView, bGridDisg) { int nGpus = 1; using Type = int64_t; diff --git a/libNeonDomain/tests/domain-map-disg/src/map.cu b/libNeonDomain/tests/domain-map-disg/src/map.cu index 991ac8dd..434802ef 100644 --- a/libNeonDomain/tests/domain-map-disg/src/map.cu +++ b/libNeonDomain/tests/domain-map-disg/src/map.cu @@ -138,7 +138,7 @@ auto run(TestData& data) -> void if (!isInside) { return Neon::domain::details::disaggregated::bGrid::details::cGrid::ClassSelector::outside; } - if (idx.rSum() % 2 == 0) { + if (idx.x == 0 || idx.y == 0 || idx.z == 0 || idx.x == grid.getDimension().x - 1 || idx.y == grid.getDimension().y - 1 || idx.z == grid.getDimension().z - 1) { return Neon::domain::details::disaggregated::bGrid::details::cGrid::ClassSelector::beta; } return Neon::domain::details::disaggregated::bGrid::details::cGrid::ClassSelector::alpha; @@ -151,6 +151,7 @@ auto run(TestData& data) -> void grid = tmp; + grid.helpGetClassField().template ioToVtk("classField", "classField"); const std::string appName = TestInformation::fullName(grid.getImplementationName()); data.resetValuesToLinear(1, 100); diff --git a/libNeonDomain/tests/domain-map/src/gtests.cpp b/libNeonDomain/tests/domain-map/src/gtests.cpp index 6240ad66..f6977444 100644 --- a/libNeonDomain/tests/domain-map/src/gtests.cpp +++ b/libNeonDomain/tests/domain-map/src/gtests.cpp @@ -42,7 +42,7 @@ TEST(domain_map, bGrid) TEST(domain_map, dGridDisg) { - int nGpus = 1; + int nGpus = 3; using Type = int64_t; runAllTestConfiguration(std::function(map::run), nGpus, @@ -58,6 +58,16 @@ TEST(domain_map, dGridSoA) 1); } +TEST(domain_map, bGridMgpu) +{ + int nGpus = 3; + using Type = int64_t; + // extern template auto run(TestData&) -> void; + runAllTestConfiguration(std::function(map::run), + nGpus, + 1); +} + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/libNeonDomain/tests/domain-map/src/map.cu 
b/libNeonDomain/tests/domain-map/src/map.cu index 810cb067..d2dc5e5f 100644 --- a/libNeonDomain/tests/domain-map/src/map.cu +++ b/libNeonDomain/tests/domain-map/src/map.cu @@ -1,7 +1,7 @@ #include #include "Neon/domain/Grids.h" -#include "Neon/domain/details/dGridDisg/dGrid.h" #include "Neon/domain/details/bGridDisg/bGrid.h" +#include "Neon/domain/details/dGridDisg/dGrid.h" #include "Neon/domain/details/dGridSoA/dGridSoA.h" #include "Neon/domain/tools/TestData.h" @@ -36,15 +36,15 @@ auto mapContainer_axpy(int streamIdx, template auto mapContainer_add(int streamIdx, - typename Field::Type& val, - Field& fieldB) + typename Field::Type& val, + Field& fieldB) -> Neon::set::Container { const auto& grid = fieldB.getGrid(); return grid.newContainer( "mapContainer_axpy", [&, val](Neon::set::Loader& loader) { - auto b = loader.load(fieldB); + auto b = loader.load(fieldB); return [=] NEON_CUDA_HOST_DEVICE(const typename Field::Idx& e) mutable { for (int i = 0; i < b.cardinality(); i++) { @@ -101,6 +101,7 @@ template auto run(TestData&) - template auto run(TestData&) -> void; template auto run(TestData&) -> void; template auto run(TestData&) -> void; +template auto run(TestData&) -> void; namespace dataView { template @@ -151,6 +152,7 @@ template auto run(TestData&) - template auto run(TestData&) -> void; template auto run(TestData&) -> void; template auto run(TestData&) -> void; +template auto run(TestData&) -> void; } // namespace dataView } // namespace map \ No newline at end of file diff --git a/libNeonDomain/tests/domain-map/src/map.h b/libNeonDomain/tests/domain-map/src/map.h index f6f8bc7c..73038353 100644 --- a/libNeonDomain/tests/domain-map/src/map.h +++ b/libNeonDomain/tests/domain-map/src/map.h @@ -19,6 +19,7 @@ extern template auto run(TestData(TestData&) -> void; extern template auto run(TestData&) -> void; extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; namespace dataView { @@ -30,6 +31,7 @@ extern template auto 
run(TestData(TestData&) -> void; extern template auto run(TestData&) -> void; extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; } // namespace dataView diff --git a/libNeonDomain/tests/domain-map/src/runHelper.h b/libNeonDomain/tests/domain-map/src/runHelper.h index 593e31c2..4c622f82 100644 --- a/libNeonDomain/tests/domain-map/src/runHelper.h +++ b/libNeonDomain/tests/domain-map/src/runHelper.h @@ -73,7 +73,7 @@ void runAllTestConfiguration( Neon::Backend backend(ids, runtime); Neon::MemoryOptions memoryOptions = backend.getMemoryOptions(); - if constexpr (std::is_same_v) { + if constexpr (G::executionThreadSpan == Neon::set::details::ExecutionThreadSpan::d1b3) { if (dim.z < 8 * ngpu * 3) { dim.z = ngpu * 3 * 8; } diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp b/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp index 3ea17257..2376b72f 100644 --- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp +++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp @@ -49,6 +49,15 @@ TEST(domain_neighbour_globalIdx, dGridDisg) 1); } +TEST(domain_neighbour_globalIdx, bGridMgpu) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(globalIdx::run), + nGpus, + 1); +} + /////////////////////////////////////////// TEST(domain_neighbour_globalIdx, dGrid_template) @@ -87,6 +96,17 @@ TEST(domain_neighbour_globalIdx, dGridSoA_template) 1); } +TEST(domain_neighbour_globalIdx,bGridMgpu_template) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(globalIdx::runTemplate), + nGpus, + 1); +} + + + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.cu b/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.cu index 17f9f0ec..fe25f54e 100644 --- 
a/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.cu +++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.cu @@ -407,6 +407,7 @@ template auto run(TestData&) - template auto run(TestData&) -> void; template auto run(TestData&) -> void; template auto run(TestData&) -> void; +template auto run(TestData&) -> void; template auto runTemplate(TestData&) -> void; @@ -414,5 +415,6 @@ template auto runTemplate(TestData(TestData&) -> void; template auto runTemplate(TestData&) -> void; template auto runTemplate(TestData&) -> void; +template auto runTemplate(TestData&) -> void; } // namespace globalIdx \ No newline at end of file diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.h b/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.h index 03550960..92542034 100644 --- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.h +++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.h @@ -22,11 +22,13 @@ extern template auto run(TestData(TestData&) -> void; extern template auto run(TestData&) -> void; extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; extern template auto runTemplate(TestData&) -> void; extern template auto runTemplate(TestData&) -> void; extern template auto runTemplate(TestData&) -> void; extern template auto runTemplate(TestData&) -> void; extern template auto runTemplate(TestData&) -> void; +extern template auto runTemplate(TestData&) -> void; } // namespace map diff --git a/libNeonDomain/tests/domain-stencil/src/gtests.cpp b/libNeonDomain/tests/domain-stencil/src/gtests.cpp index ada2bd6b..3d5f436c 100644 --- a/libNeonDomain/tests/domain-stencil/src/gtests.cpp +++ b/libNeonDomain/tests/domain-stencil/src/gtests.cpp @@ -94,6 +94,15 @@ TEST(domain_stencil, dGridDisg_Template) 1); } +TEST(domain_stencil, bGridDisgMgpu_Template) +{ + int nGpus = 5; + using Type = int64_t; + 
runAllTestConfiguration(std::function(map::runTemplate), + nGpus, + 1); +} + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/libNeonDomain/tests/domain-stencil/src/stencil.cu b/libNeonDomain/tests/domain-stencil/src/stencil.cu index 6deb55f7..4cf31f05 100644 --- a/libNeonDomain/tests/domain-stencil/src/stencil.cu +++ b/libNeonDomain/tests/domain-stencil/src/stencil.cu @@ -117,7 +117,7 @@ auto laplaceTemplate(const Field& filedA, count++; }); }); - + b(idx, i) = a(idx, i) - count * partial; } }; @@ -281,11 +281,13 @@ template auto runNoTemplate(TestData(TestData&) -> void; template auto runNoTemplate(TestData&) -> void; template auto runNoTemplate(TestData&) -> void; +template auto runNoTemplate(TestData&) -> void; template auto runTemplate(TestData&) -> void; template auto runTemplate(TestData&) -> void; template auto runTemplate(TestData&) -> void; template auto runTemplate(TestData&) -> void; template auto runTemplate(TestData&) -> void; +template auto runTemplate(TestData&) -> void; } // namespace map \ No newline at end of file diff --git a/libNeonDomain/tests/test-template/CMakeLists.txt b/libNeonDomain/tests/test-template/CMakeLists.txt new file mode 100644 index 00000000..c077f20f --- /dev/null +++ b/libNeonDomain/tests/test-template/CMakeLists.txt @@ -0,0 +1,19 @@ +cmake_minimum_required(VERSION 3.19 FATAL_ERROR) + +set(APP_NAME domain-neighbour-globalIdx) +file(GLOB_RECURSE SrcFiles src/*.*) + +add_executable(${APP_NAME} ${SrcFiles}) + +target_link_libraries(${APP_NAME} + PUBLIC libNeonDomain + PUBLIC gtest_main) + +set_target_properties(${APP_NAME} PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_RESOLVE_DEVICE_SYMBOLS ON) + +set_target_properties(${APP_NAME} PROPERTIES FOLDER "libNeonDomain") +source_group(TREE ${CMAKE_CURRENT_LIST_DIR} PREFIX "${APP_NAME}" FILES ${SrcFiles}) + +add_test(NAME ${APP_NAME} COMMAND ${APP_NAME}) \ No newline at end of file diff --git 
a/libNeonDomain/tests/test-template/src/testsAndContainers.cu b/libNeonDomain/tests/test-template/src/testsAndContainers.cu new file mode 100644 index 00000000..91368f7c --- /dev/null +++ b/libNeonDomain/tests/test-template/src/testsAndContainers.cu @@ -0,0 +1,75 @@ +#include +#include "Neon/domain/Grids.h" +#include "Neon/domain/details/dGridDisg/dGrid.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" + +#include "Neon/domain/tools/TestData.h" + +auto NEON_CUDA_HOST_DEVICE idxToInt(Neon::index_3d idx) +{ + return 33 * 1000000 + 10000 * idx.x + 100 * idx.y + idx.z; +}; + +int main() +{ + using Grid = Neon::bGridMgpu; + Neon::init(); + Neon::Backend bk({0, 0}, Neon::Runtime::openmp); + int blockSize = Neon::domain::details::bGridMgpu::defaultBlockSize; + Grid bGridMgpu( + bk, + {blockSize, blockSize, 6 * blockSize}, + [](Neon::index_3d idx) { return true; }, + Neon::domain::Stencil::s6_Jacobi_t(), + {1, 1, 1}, + {0, 0, 0}, + Neon::domain::tool::spaceCurves::EncoderType::sweep); + using Field = typename Grid::Field; + Field A = bGridMgpu.newField("A", 1, 0); + + auto setupOp = bGridMgpu.newContainer( + "setup", + [&A](Neon::set::Loader& loader) { + auto a = loader.load(A); + return [=] NEON_CUDA_HOST_DEVICE(const typename Field::Idx& gIdx) mutable { + auto const global_idx = a.getGlobalIndex(gIdx); + a(gIdx, 0) = idxToInt(global_idx); + }; + }); + + setupOp.run(0); + bk.sync(0); + + A.newHaloUpdate( + Neon::set::StencilSemantic::standard, + Neon::set::TransferMode::get, + Neon::Execution::device) + .run(0); + + bk.sync(0); + + auto stencilOp = bGridMgpu.newContainer( + "setup", + [&A](Neon::set::Loader& loader) { + auto a = loader.load(A); + return [=] NEON_CUDA_HOST_DEVICE(const typename Field::Idx& gIdx) mutable { + auto const global_idx = a.getGlobalIndex(gIdx); + auto expectedNgh = global_idx + Neon::index_3d(0, 0, 1); + auto val = a.getNghData<0, 0, 1>(gIdx, 0, -1).getData(); + if (val != -1 && + val != idxToInt(expectedNgh)) { + printf("ERROR: %d (%d %d 
%d) %d\n", val, + global_idx.x, global_idx.y, global_idx.z, + a(gIdx, 0)); + if (global_idx.x == 0 && global_idx.y == 0 && global_idx.z == 11) { + printf("ERROR: %d (%d %d %d) %d\n", val, + global_idx.x, global_idx.y, global_idx.z, + a(gIdx, 0)); + val = a.getNghData<0, 0, 1>(gIdx, 0, -1).getData(); + } + } + }; + }); + stencilOp.run(0); + bk.sync(0); +}; diff --git a/libNeonSet/include/Neon/set/DevSet.h b/libNeonSet/include/Neon/set/DevSet.h index 5e8b03b7..114408d3 100644 --- a/libNeonSet/include/Neon/set/DevSet.h +++ b/libNeonSet/include/Neon/set/DevSet.h @@ -401,10 +401,18 @@ class DevSet executor = (void*)Neon::set::details::blockSpan::launchLambdaOnSpanCUDAWithCompilerHints; } } - dev.kernel.template cudaLaunchKernel(gpuStreamSet[setIdx.idx()], - launchInfoSet[setIdx.idx()], - executor, - untypedParams); + auto launchInfo = launchInfoSet[setIdx.idx()]; + auto cudaGrid = launchInfo.cudaGrid(); + if (cudaGrid.x * cudaGrid.y * cudaGrid.z != 0) { + + dev.kernel.template cudaLaunchKernel(gpuStreamSet[setIdx.idx()], + launchInfo, + executor, + untypedParams); + } else { + NEON_WARNING("Cuda grid with zero number of element was detected. The kernel will be skipped."); + ; + } } #else NeonException exp("DevSet"); @@ -447,10 +455,18 @@ class DevSet } else { executor = (void*)Neon::set::details::blockSpan::launchLambdaOnSpanCUDA; } - dev.kernel.template cudaLaunchKernel(gpuStreamSet[setIdx.idx()], - launchInfoSet[setIdx.idx()], - executor, - untypedParams); + auto launchInfo = launchInfoSet[setIdx.idx()]; + auto cudaGrid = launchInfo.cudaGrid(); + if (cudaGrid.x * cudaGrid.y * cudaGrid.z != 0) { + + dev.kernel.template cudaLaunchKernel(gpuStreamSet[setIdx.idx()], + launchInfo, + executor, + untypedParams); + } else { + NEON_WARNING("Cuda grid with zero number of element was detected. 
The kernel will be skipped."); + ; + } } if (kernelConfig.runMode() == Neon::run_et::sync) { gpuStreamSet.sync(); @@ -502,9 +518,14 @@ class DevSet const Neon::Integer_3d blockSize(cudaBlock.x, cudaBlock.y, cudaBlock.z); const Neon::Integer_3d gridSize(cudaGrid.x, cudaGrid.y, cudaGrid.z); - Neon::set::details::blockSpan::launchLambdaOnSpanOMP(blockSize, gridSize, iterator, lambda); + if (cudaGrid.x * cudaGrid.y * cudaGrid.z != 0) { + + Neon::set::details::blockSpan::launchLambdaOnSpanOMP(blockSize, gridSize, iterator, lambda); + } else { + NEON_WARNING("Omp grid with zero number of element was detected. The kernel will be skipped."); + } } } } @@ -543,10 +564,14 @@ class DevSet auto const& cudaGrid = launchInfoSet[setIdx].cudaGrid(); const Neon::Integer_3d blockSize(cudaBlock.x, cudaBlock.y, cudaBlock.z); const Neon::Integer_3d gridSize(cudaGrid.x, cudaGrid.y, cudaGrid.z); + if (cudaGrid.x * cudaGrid.y * cudaGrid.z != 0) { Neon::set::details::blockSpan::launchLambdaOnSpanOMP(blockSize, gridSize, iterator, lambda); + } else { + NEON_WARNING("Omp grid with zero number of element was detected. The kernel will be skipped."); + } } return; }