diff --git a/cmake/ManageCompilationFlags.cmake b/cmake/ManageCompilationFlags.cmake index dca53533..937fafe1 100644 --- a/cmake/ManageCompilationFlags.cmake +++ b/cmake/ManageCompilationFlags.cmake @@ -35,12 +35,13 @@ set(NeonCXXFlags #Add GCC specific compiler flags here #-Wno-class-memaccess for "writing to an object of type XXX with no trivial copy-assignment; use copy-assignment or copy-initialization instead" - $<$:-m64 -Wall -Wextra -Werror -Wno-unused-function -Wno-deprecated-declarations -Wno-class-memaccess> + $<$:-m64 -Wall -Wextra -Werror -Wno-unused-function -Wno-deprecated-declarations -Wno-class-memaccess -Wno-deprecated-declarations> #Add Clang specific compiler flags here - $<$:-m64 -Wall -Wextra -Werror -Wno-unused-function -Wno-deprecated-declarations> + $<$:-m64 -Wall -Wextra -Werror -Wno-unused-function -Wno-deprecated-declarations -Wno-deprecated-copy -Wno-unused-parameter -Wno-unused-private-field -Wno-braced-scalar-init -Wno-unused-variable -Wno-unused-but-set-variable -Wno-deprecated-declarations > ) +set(MSVC_XCOMPILER_FLAGS "/openmp /std:c++17") set(NeonCUDAFlags # Optimization flags for Release $<$: $<$:-O3> > @@ -51,7 +52,7 @@ set(NeonCUDAFlags # Host compiler $<$:-Xcompiler -fopenmp -std=c++17 $<$:-O3> $<$:-O0> > $<$:-Xcompiler -fopenmp -std=c++17 $<$:-O3> $<$:-O0>> - $<$:-Xcompiler /std:c++17> + $<$:-Xcompiler ${MSVC_XCOMPILER_FLAGS}> #Disables warning #177-D "function XXX was declared but never referenced" -Xcudafe "--display_error_number --diag_suppress=177" diff --git a/cmake/Nvtx.cmake b/cmake/Nvtx.cmake index 50ad7242..d7ff6df2 100644 --- a/cmake/Nvtx.cmake +++ b/cmake/Nvtx.cmake @@ -9,4 +9,4 @@ if (${NEON_USE_NVTX}) message(STATUS "NVTX Ranges is enabled") else () message(STATUS "NVTX Ranges is disabled") -endif () \ No newline at end of file +endif () diff --git a/libNeonCore/include/Neon/core/tools/Logger.h b/libNeonCore/include/Neon/core/tools/Logger.h index 913a738c..3fa791e3 100644 --- a/libNeonCore/include/Neon/core/tools/Logger.h +++ b/libNeonCore/include/Neon/core/tools/Logger.h @@ -53,7 +53,12 @@ LIBNEONCORE_EXPORT extern Logger LoggerObj; } // namespace Neon +#if defined( NEON_ACTIVETE_TRACING) #define NEON_TRACE(...) ::Neon::globalSpace::LoggerObj.getLogger()->trace(__VA_ARGS__) +#else +#define NEON_TRACE(...) +#endif + #define NEON_INFO(...) ::Neon::globalSpace::LoggerObj.getLogger()->info(__VA_ARGS__) #define NEON_WARNING(...) \ ::Neon::globalSpace::LoggerObj.getLogger()->warn("Line {} File {}", __LINE__, __FILE__); \ diff --git a/libNeonDomain/include/Neon/domain/interface/FieldBase.h b/libNeonDomain/include/Neon/domain/interface/FieldBase.h index b1f197af..4e9006fc 100644 --- a/libNeonDomain/include/Neon/domain/interface/FieldBase.h +++ b/libNeonDomain/include/Neon/domain/interface/FieldBase.h @@ -4,6 +4,7 @@ #include "Neon/core/tools/io/IODense.h" #include "Neon/core/types/Macros.h" +#include "Neon/set/Containter.h" #include "Neon/set/DataConfig.h" #include "Neon/set/DevSet.h" #include "Neon/set/HuOptions.h" @@ -12,7 +13,6 @@ #include "GridBase.h" #include "Neon/domain/interface/common.h" - namespace Neon::domain::interface { template @@ -25,8 +25,8 @@ class FieldBase FieldBase(); - FieldBase(const std::string fieldUserName, - const std::string fieldClassName, + FieldBase(const std::string& fieldUserName, + const std::string& fieldClassName, const Neon::index_3d& dimension, int cardinality, T outsideVal, @@ -57,6 +57,10 @@ class FieldBase virtual auto haloUpdate(Neon::set::HuOptions& opt) -> void = 0; + virtual auto haloUpdateContainer(Neon::set::TransferMode transferMode, + Neon::set::StencilSemantic stencilSemantic) const + -> Neon::set::Container; + auto getDimension() const -> const Neon::index_3d&; diff --git a/libNeonDomain/include/Neon/domain/interface/FieldBaseTemplate.h b/libNeonDomain/include/Neon/domain/interface/FieldBaseTemplate.h index 2ba43eca..fa913e2e 100644 --- a/libNeonDomain/include/Neon/domain/interface/FieldBaseTemplate.h +++ b/libNeonDomain/include/Neon/domain/interface/FieldBaseTemplate.h @@ -6,7 +6,7 @@ #include "Neon/set/DataConfig.h" #include "Neon/set/DevSet.h" -#include "Neon/set/MultiDeviceObjectInterface.h" +#include "Neon/set/MultiXpuDataInterface.h" #include "Neon/set/memory/memSet.h" #include "Neon/domain/interface/FieldBase.h" @@ -21,7 +21,7 @@ template class FieldBaseTemplate : public FieldBase, - public Neon::set::interface::MultiDeviceObjectInterface + public Neon::set::interface::MultiXpuDataInterface { public: using Partition = P; diff --git a/libNeonDomain/include/Neon/domain/interface/FieldBaseTemplate_imp.h b/libNeonDomain/include/Neon/domain/interface/FieldBaseTemplate_imp.h index 7156200d..870f859e 100644 --- a/libNeonDomain/include/Neon/domain/interface/FieldBaseTemplate_imp.h +++ b/libNeonDomain/include/Neon/domain/interface/FieldBaseTemplate_imp.h @@ -157,7 +157,7 @@ auto FieldBaseTemplate::swapUIDBeforeFullSwap(FieldBaseTemplate:: NEON_THROW(exp); } - Neon::set::interface::MultiDeviceObjectInterface::swapUIDs(A,B); + Neon::set::interface::MultiXpuDataInterface::swapUIDs(A,B); } diff --git a/libNeonDomain/include/Neon/domain/interface/FieldBase_imp.h b/libNeonDomain/include/Neon/domain/interface/FieldBase_imp.h index 098c325d..326759c7 100644 --- a/libNeonDomain/include/Neon/domain/interface/FieldBase_imp.h +++ b/libNeonDomain/include/Neon/domain/interface/FieldBase_imp.h @@ -13,8 +13,8 @@ FieldBase::FieldBase() } template -FieldBase::FieldBase(const std::string FieldBaseUserName, - const std::string fieldClassName, +FieldBase::FieldBase(const std::string& FieldBaseUserName, + const std::string& fieldClassName, const Neon::index_3d& dimension, int cardinality, T outsideVal, @@ -274,6 +274,13 @@ auto FieldBase::getClassName() const -> const std::string& return mStorage->className; } +template +auto FieldBase::haloUpdateContainer(Neon::set::TransferMode, + Neon::set::StencilSemantic) const -> Neon::set::Container +{ + NEON_THROW_UNSUPPORTED_OPERATION(""); +} + template FieldBase::Storage::Storage(const std::string FieldBaseUserName, const std::string fieldClassName, diff --git a/libNeonDomain/include/Neon/domain/internal/bGrid/bGrid_imp.h b/libNeonDomain/include/Neon/domain/internal/bGrid/bGrid_imp.h index 3bdef78c..ee36e209 100644 --- a/libNeonDomain/include/Neon/domain/internal/bGrid/bGrid_imp.h +++ b/libNeonDomain/include/Neon/domain/internal/bGrid/bGrid_imp.h @@ -309,6 +309,7 @@ auto bGrid::dot(const std::string& name, return Neon::set::Container::factoryOldManaged( name, Neon::set::internal::ContainerAPI::DataViewSupport::on, + Neon::set::ContainerPatternType::reduction, *this, [&](Neon::set::Loader& loader) { loader.load(input1); if (input1.getUid() != input2.getUid()) { @@ -372,6 +373,7 @@ auto bGrid::norm2(const std::string& name, return Neon::set::Container::factoryOldManaged( name, Neon::set::internal::ContainerAPI::DataViewSupport::on, + Neon::set::ContainerPatternType::reduction, *this, [&](Neon::set::Loader& loader) { loader.load(input); diff --git a/libNeonDomain/include/Neon/domain/internal/dGrid/dField.h b/libNeonDomain/include/Neon/domain/internal/dGrid/dField.h index e9f1632b..ff70f7a8 100644 --- a/libNeonDomain/include/Neon/domain/internal/dGrid/dField.h +++ b/libNeonDomain/include/Neon/domain/internal/dGrid/dField.h @@ -43,7 +43,7 @@ class dField : public Neon::domain::interface::FieldBaseTemplate Self&; @@ -61,14 +61,18 @@ class dField : public Neon::domain::interface::FieldBaseTemplate void final; + auto haloUpdateContainer(Neon::set::TransferMode, + Neon::set::StencilSemantic) + const -> Neon::set::Container final; + auto haloUpdate(SetIdx setIdx, Neon::set::HuOptions& opt) const - -> void; //TODO add this function to the API if performance boost is reasonable -> void final; + -> void; // TODO add this function to the API if performance boost is reasonable -> void final; auto haloUpdate(Neon::set::HuOptions& opt) -> void final; auto haloUpdate(SetIdx setIdx, Neon::set::HuOptions& opt) - -> void; //TODO add this function to the API if performance boost is reasonable -> void final; + -> void; // TODO add this function to the API if performance boost is reasonable -> void final; virtual auto getReference(const Neon::index_3d& idx, const int& cardinality) @@ -188,4 +192,6 @@ class dField : public Neon::domain::interface::FieldBaseTemplate Neon::set::MultiDeviceObjectUid; + auto uid() const -> Neon::set::dataDependency::MultiXpuDataUid; auto grid() -> grid_t&; diff --git a/libNeonDomain/include/Neon/domain/internal/dGrid/dFieldDev_imp.h b/libNeonDomain/include/Neon/domain/internal/dGrid/dFieldDev_imp.h index 04449e69..c9bfd809 100644 --- a/libNeonDomain/include/Neon/domain/internal/dGrid/dFieldDev_imp.h +++ b/libNeonDomain/include/Neon/domain/internal/dGrid/dFieldDev_imp.h @@ -74,10 +74,10 @@ auto dFieldDev::operator=(dFieldDev&& other) -> dFieldDev& } template -auto dFieldDev::uid() const -> Neon::set::MultiDeviceObjectUid +auto dFieldDev::uid() const -> Neon::set::dataDependency::MultiXpuDataUid { void* addr = static_cast(m_data.get()); - Neon::set::MultiDeviceObjectUid uidRes = (size_t)addr; + Neon::set::dataDependency::MultiXpuDataUid uidRes = (size_t)addr; return uidRes; } diff --git a/libNeonDomain/include/Neon/domain/internal/dGrid/dField_imp.h b/libNeonDomain/include/Neon/domain/internal/dGrid/dField_imp.h index a5ecb9cd..9902675f 100644 --- a/libNeonDomain/include/Neon/domain/internal/dGrid/dField_imp.h +++ b/libNeonDomain/include/Neon/domain/internal/dGrid/dField_imp.h @@ -351,7 +351,7 @@ auto dField::getPartition(Neon::Execution execution, -> const Partition& { const Neon::DataUse dataUse = this->getDataUse(); - bool isOk = Neon::ExecutionUtils::checkCompatibility(dataUse, execution); + bool isOk = Neon::ExecutionUtils::checkCompatibility(dataUse, execution); if (isOk) { if (execution == Neon::Execution::device) { return m_gpu.getPartition(Neon::DeviceType::CUDA, setIdx, dataView); @@ -438,6 +438,7 @@ template auto dField::haloUpdate(Neon::set::HuOptions& opt) const -> void { + NEON_TRACE("haloUpdate stream {} transferMode {} ", opt.streamSetIdx(), Neon::set::TransferModeUtils::toString(opt.transferMode())); auto& bk = self().getBackend(); auto fieldDev = field(bk.devType()); switch (opt.transferMode()) { @@ -458,13 +459,19 @@ auto dField::haloUpdate(Neon::SetIdx setIdx, Neon::set::HuOptions& opt) const -> void { + + auto& bk = self().getBackend(); auto fieldDev = field(bk.devType()); switch (opt.transferMode()) { case Neon::set::TransferMode::put: + NEON_TRACE("TRACE haloUpdate PUT setIdx {} stream {} transferMode {} ", setIdx.idx(), opt.streamSetIdx(), Neon::set::TransferModeUtils::toString(opt.transferMode())); + fieldDev.template haloUpdate(setIdx, bk, -1, opt.startWithBarrier(), opt.streamSetIdx()); break; case Neon::set::TransferMode::get: + NEON_TRACE("TRACE haloUpdate GET setIdx {} stream {} transferMode {} ", setIdx.idx(), opt.streamSetIdx(), Neon::set::TransferModeUtils::toString(opt.transferMode())); + fieldDev.template haloUpdate(setIdx, bk, -1, opt.startWithBarrier(), opt.streamSetIdx()); break; default: @@ -477,6 +484,8 @@ template auto dField::haloUpdate(Neon::set::HuOptions& opt) -> void { + NEON_TRACE("haloUpdate stream {} transferMode {} ", opt.streamSetIdx(), Neon::set::TransferModeUtils::toString(opt.transferMode())); + auto& bk = self().getBackend(); auto fieldDev = field(bk.devType()); switch (opt.transferMode()) { @@ -501,9 +510,17 @@ auto dField::haloUpdate(Neon::SetIdx setIdx, auto fieldDev = field(bk.devType()); switch (opt.transferMode()) { case Neon::set::TransferMode::put: +#pragma omp critical + { + NEON_TRACE("TRACE haloUpdate PUT setIdx {} stream {} transferMode {} ", setIdx.idx(), opt.streamSetIdx(), Neon::set::TransferModeUtils::toString(opt.transferMode())); + } fieldDev.template haloUpdate(setIdx, bk, -1, opt.startWithBarrier(), opt.streamSetIdx()); break; case Neon::set::TransferMode::get: +#pragma omp critical + { + NEON_TRACE("TRACE haloUpdate GET setIdx {} stream {} transferMode {} ", setIdx.idx(), opt.streamSetIdx(), Neon::set::TransferModeUtils::toString(opt.transferMode())); + } fieldDev.template haloUpdate(setIdx, bk, -1, opt.startWithBarrier(), opt.streamSetIdx()); break; default: @@ -512,6 +529,46 @@ auto dField::haloUpdate(Neon::SetIdx setIdx, } } + +template +auto dField:: + haloUpdateContainer(Neon::set::TransferMode transferMode, + Neon::set::StencilSemantic stencilSemantic) + const -> Neon::set::Container +{ + Neon::set::Container dataTransferContainer = + Neon::set::Container::factoryDataTransfer(*this, + transferMode, + stencilSemantic); + + Neon::set::Container SyncContainer = + Neon::set::Container::factorySynchronization(*this, + Neon::set::SynchronizationContainerType::hostOmpBarrier); + Neon::set::container::Graph graph(this->getBackend()); + const auto& dataTransferNode = graph.addNode(dataTransferContainer); + const auto& syncNode = graph.addNode(SyncContainer); + + switch (transferMode) { + case Neon::set::TransferMode::put: + graph.addDependency(dataTransferNode, syncNode, Neon::GraphDependencyType::data); + break; + case Neon::set::TransferMode::get: + graph.addDependency(syncNode, dataTransferNode, Neon::GraphDependencyType::data); + break; + default: + NEON_THROW_UNSUPPORTED_OPTION(); + break; + } + + graph.removeRedundantDependencies(); + + Neon::set::Container output = + Neon::set::Container::factoryGraph("dGrid-Halo-Update", + graph, + [](Neon::SetIdx, Neon::set::Loader&) {}); + return output; +} + template auto dField::dot(Neon::set::patterns::BlasSet& blasSet, const dField& input, @@ -584,4 +641,5 @@ auto dField::swap(dField::Field& A, dField::Field& B) -> void std::swap(A, B); } + } // namespace Neon::domain::internal::dGrid diff --git a/libNeonDomain/include/Neon/domain/internal/dGrid/dGrid.h b/libNeonDomain/include/Neon/domain/internal/dGrid/dGrid.h index c58212ac..c4c40120 100644 --- a/libNeonDomain/include/Neon/domain/internal/dGrid/dGrid.h +++ b/libNeonDomain/include/Neon/domain/internal/dGrid/dGrid.h @@ -58,7 +58,7 @@ class dGrid : public Neon::domain::interface::GridBaseTemplate dGrid(const dGrid& rhs) = default; - ~dGrid() = default; + virtual ~dGrid() = default; /** * Constructor compatible with the general grid API @@ -88,7 +88,7 @@ class dGrid : public Neon::domain::interface::GridBaseTemplate * Creates a new Field */ template - auto newField(const std::string fieldUserName, + auto newField(const std::string& fieldUserName, int cardinality, T inactiveValue, Neon::DataUse dataUse = Neon::DataUse::IO_COMPUTE, @@ -183,7 +183,7 @@ class dGrid : public Neon::domain::interface::GridBaseTemplate Neon::index_3d halo; std::vector> partitionIndexSpaceVec; - Neon::sys::patterns::Engine reduceEngine; + Neon::sys::patterns::Engine reduceEngine; }; std::shared_ptr m_data; }; diff --git a/libNeonDomain/include/Neon/domain/internal/dGrid/dGrid_imp.h b/libNeonDomain/include/Neon/domain/internal/dGrid/dGrid_imp.h index f40e51f5..2270e533 100644 --- a/libNeonDomain/include/Neon/domain/internal/dGrid/dGrid_imp.h +++ b/libNeonDomain/include/Neon/domain/internal/dGrid/dGrid_imp.h @@ -110,7 +110,7 @@ dGrid::dGrid(const Neon::Backend& backend, template -auto dGrid::newField(const std::string fieldUserName, +auto dGrid::newField(const std::string& fieldUserName, int cardinality, [[maybe_unused]] T inactiveValue, Neon::DataUse dataUse, @@ -162,11 +162,11 @@ auto dGrid::getContainer(const std::string& name, { const Neon::index_3d& defaultBlockSize = getDefaultBlock(); Neon::set::Container kContainer = Neon::set::Container::factory(name, - Neon::set::internal::ContainerAPI::DataViewSupport::on, - *this, - lambda, - defaultBlockSize, - [](const Neon::index_3d&) { return size_t(0); }); + Neon::set::internal::ContainerAPI::DataViewSupport::on, + *this, + lambda, + defaultBlockSize, + [](const Neon::index_3d&) { return size_t(0); }); return kContainer; } @@ -180,11 +180,11 @@ auto dGrid::getContainer(const std::string& name, { const Neon::index_3d& defaultBlockSize = getDefaultBlock(); Neon::set::Container kContainer = Neon::set::Container::factory(name, - Neon::set::internal::ContainerAPI::DataViewSupport::on, - *this, - lambda, - blockSize, - [sharedMem](const Neon::index_3d&) { return sharedMem; }); + Neon::set::internal::ContainerAPI::DataViewSupport::on, + *this, + lambda, + blockSize, + [sharedMem](const Neon::index_3d&) { return sharedMem; }); return kContainer; } @@ -219,6 +219,7 @@ auto dGrid::dot(const std::string& name, return Neon::set::Container::factoryOldManaged( name, Neon::set::internal::ContainerAPI::DataViewSupport::on, + Neon::set::ContainerPatternType::reduction, *this, [&](Neon::set::Loader& loader) { loader.load(input1); if (input1.getUid() != input2.getUid()) { @@ -245,6 +246,7 @@ auto dGrid::dot(const std::string& name, return Neon::set::Container::factoryOldManaged( name, Neon::set::internal::ContainerAPI::DataViewSupport::on, + Neon::set::ContainerPatternType::reduction, *this, [&](Neon::set::Loader& loader) { loader.load(input1); if (input1.getUid() != input2.getUid()) { @@ -307,6 +309,7 @@ auto dGrid::norm2(const std::string& name, return Neon::set::Container::factoryOldManaged( name, Neon::set::internal::ContainerAPI::DataViewSupport::on, + Neon::set::ContainerPatternType::reduction, *this, [&](Neon::set::Loader& loader) { loader.load(input); @@ -330,6 +333,7 @@ auto dGrid::norm2(const std::string& name, return Neon::set::Container::factoryOldManaged( name, Neon::set::internal::ContainerAPI::DataViewSupport::on, + Neon::set::ContainerPatternType::reduction, *this, [&](Neon::set::Loader& loader) { loader.load(input); diff --git a/libNeonDomain/include/Neon/domain/internal/eGrid/eField.h b/libNeonDomain/include/Neon/domain/internal/eGrid/eField.h index 36936198..7ee651f5 100644 --- a/libNeonDomain/include/Neon/domain/internal/eGrid/eField.h +++ b/libNeonDomain/include/Neon/domain/internal/eGrid/eField.h @@ -142,6 +142,14 @@ class eField : public Neon::domain::interface::FieldBaseTemplate void final; + auto haloUpdate(Neon::SetIdx setIdx, + Neon::set::HuOptions& opt) + -> void; + + auto haloUpdateContainer(Neon::set::TransferMode transferMode, + Neon::set::StencilSemantic stencilSemantic) + const -> Neon::set::Container final; + static auto swap(Field& A, Field& B) -> void; private: @@ -201,6 +209,7 @@ class eField : public Neon::domain::interface::FieldBaseTemplate void; + /** * * @param streamSet @@ -223,4 +232,5 @@ class eField : public Neon::domain::interface::FieldBaseTemplate, - Neon::set::TransferSemanticUtils::nOptions>, + Neon::set::StencilSemanticUtils::nOptions>, Neon::set::TransferModeUtils::nOptions> m_haloUpdateInfo; }; @@ -125,7 +125,7 @@ class eFieldDevice_t for (auto mode : {Neon::set::TransferMode::get, Neon::set::TransferMode::put}) { { // (GET,PUT), FORWARD, GRID - const auto structure = Neon::set::TransferSemantic::grid; + const auto structure = Neon::set::StencilSemantic::standard; auto& transfers = h_haloUpdateInfo(mode, structure); Neon::set::HuOptions huOptions(mode, transfers, structure); this->haloUpdate__(m_data->grid->getBackend(), huOptions); @@ -142,8 +142,8 @@ class eFieldDevice_t } } - auto h_haloUpdateInfo(Neon::set::TransferMode mode, - Neon::set::TransferSemantic structure) + auto h_haloUpdateInfo(Neon::set::TransferMode mode, + Neon::set::StencilSemantic structure) -> std::vector& { return m_data->m_haloUpdateInfo[static_cast(mode)] @@ -200,10 +200,10 @@ class eFieldDevice_t * Returns a unique identifier for this type of DataSet * @return */ - auto uid() const -> Neon::set::MultiDeviceObjectUid + auto uid() const -> Neon::set::dataDependency::MultiXpuDataUid { - void* addr = static_cast(m_data.get()); - Neon::set::MultiDeviceObjectUid uidRes = (size_t)addr; + void* addr = static_cast(m_data.get()); + Neon::set::dataDependency::MultiXpuDataUid uidRes = (size_t)addr; return uidRes; } @@ -491,7 +491,7 @@ class eFieldDevice_t #pragma omp parallel for num_threads(ompNDevs) default(shared) for (int setIdx = 0; setIdx < nDevs; setIdx++) { for (int cardIdx = 0; cardIdx < self().cardinality(); cardIdx++) { - constexpr auto structure = Neon::set::TransferSemantic::grid; + constexpr auto structure = Neon::set::StencilSemantic::standard; auto& peerTransferOpt = opt.getPeerTransferOpt(bk); h_huSoAByCardSingleDevFwd(peerTransferOpt, @@ -562,6 +562,106 @@ class eFieldDevice_t } //------------------------------------------------ } + auto haloUpdate__(const Neon::Backend& bk, + Neon::SetIdx setIdx, + Neon::set::HuOptions& opt) const + -> void + { + + // No halo update operation can be performed on a filed where halo was not activated. + if (m_data->haloStatus != Neon::domain::haloStatus_et::ON) { + NEON_THROW_UNSUPPORTED_OPERATION("Halo support was not activated for this field."); + } + + // We don't need any update if the number of devices is one. + if (m_data->devSet.setCardinality() == 1) { + return; + } + + // If we are in execution mode, then we use on omp thread per device + // If we are in storeInfo mode, then we use only one thread, which will + // insert information on the transfer vectors sequentially + + // This sync goes before the following loop. + // This is a complete barrier over the stream and + // it can not be put inside the loop + // The sync is done only if opt.isExecuteMode() == true + if (opt.startWithBarrier() && opt.isExecuteMode()) { + bk.sync(opt.streamSetIdx()); + } + + // Different behaviour base on the data layout + switch (m_data->memOrder) { + case Neon::memLayout_et::order_e::structOfArrays: { + for (int cardIdx = 0; cardIdx < self().cardinality(); cardIdx++) { + constexpr auto structure = Neon::set::StencilSemantic::standard; + + auto& peerTransferOpt = opt.getPeerTransferOpt(bk); + h_huSoAByCardSingleDevFwd(peerTransferOpt, + setIdx, + cardIdx, + structure); + } + + break; + } + //------------------------------------------------ + case Neon::memLayout_et::order_e::arrayOfStructs: { + + // ARRAYS OF STRUCTURES + // -> for each voxel the components are stored contiguously + // -> We follow the same configuration either for Lattice or Standard + // -> We use the Standard as reference + + const int dstIdx = setIdx.idx(); + LocalIndexingInfo_t& dst = m_data->frame_shp->template localIndexingInfo(dstIdx); + + const std::array + srcIdx = {dst.nghIdx(ComDirection_e::COM_DW), + dst.nghIdx(ComDirection_e::COM_UP)}; + + for (const auto& comDirection : {ComDirection_e::COM_DW, + ComDirection_e::COM_UP}) { + + // In terms of elements we need to a number of values + // equivalent to the number of elements (voxels) + // by the number of values per element (=cardinality) + count_t transferEl = dst.remoteBdrCount(comDirection) * self().cardinality(); + Cell::Offset remoteOffset = dst.remoteBdrOff(comDirection); + Cell::Offset localOffset = dst.ghostOff(comDirection); + + T_ta* dstMem = m_data->memoryStorage.mem(dstIdx); + const T_ta* srcMem = m_data->memoryStorage.mem(srcIdx[comDirection]); + + T_ta* dstBuf = dstMem + localOffset; + const T_ta* srcBuf = srcMem + remoteOffset; + + // For partition 0, communication are only in the UP direction + if (setIdx.idx() == 0 && comDirection == ComDirection_e::COM_DW) + continue; + + // For the last, communication are only in the DW direction + if (setIdx.idx() == (bk.devSet().setCardinality() - 1) && comDirection == ComDirection_e::COM_UP) + continue; + + assert(transferEl > -1); + if (transferEl > 0) { + // auto& streamSet = bk.streamSet(opt.streamSetIdx()); + Neon::set::Transfer::Endpoint_t srcEndPoint(srcIdx[comDirection], (void*)srcBuf); + Neon::set::Transfer::Endpoint_t dstEndPoint(dstIdx, (void*)dstBuf); + + Neon::set::Transfer transfer(opt.transferMode(), + dstEndPoint, + srcEndPoint, + transferEl * sizeof(T_ta)); + + m_data->devSet.peerTransfer(opt.getPeerTransferOpt(bk), transfer); + } + } + + } + } //------------------------------------------------ + } /** * Do Update by cardinality * @param bk @@ -656,7 +756,7 @@ class eFieldDevice_t Neon::set::PeerTransferOption& peerTransferOpt = huOptions.getPeerTransferOpt(bk); // LATTICE + FORWARD - constexpr auto structure = Neon::set::TransferSemantic::lattice; + constexpr auto structure = Neon::set::StencilSemantic::streaming; h_huSoAByCardSingleDevFwd(peerTransferOpt, gpuIdx, cardIdx, @@ -808,7 +908,7 @@ class eFieldDevice_t auto h_huSoAByCardSingleDevFwd(Neon::set::PeerTransferOption opt, int devIdx, const int cardIdx, - Neon::set::TransferSemantic structure) const + Neon::set::StencilSemantic structure) const -> void { const Neon::Backend& bk = m_data->grid->getBackend(); @@ -846,7 +946,7 @@ class eFieldDevice_t } } - if (structure == Neon::set::TransferSemantic::lattice) { + if (structure == Neon::set::StencilSemantic::streaming) { index_3d dir = grid().getStencil().neighbours().at(cardIdx); switch (comDirection) { case ComDirection_e::COM_DW: { diff --git a/libNeonDomain/include/Neon/domain/internal/eGrid/eField_imp.h b/libNeonDomain/include/Neon/domain/internal/eGrid/eField_imp.h index cc7be9bf..f0eda664 100644 --- a/libNeonDomain/include/Neon/domain/internal/eGrid/eField_imp.h +++ b/libNeonDomain/include/Neon/domain/internal/eGrid/eField_imp.h @@ -372,6 +372,32 @@ auto eField::haloUpdate(Neon::set::HuOptions& opt) const fieldDev.haloUpdate__(bk, opt); } +template +auto eField::haloUpdate(Neon::SetIdx setIdx, + Neon::set::HuOptions& opt) + -> void +{ + auto field = [this](const Neon::DeviceType& devType) { + switch (devType) { + case Neon::DeviceType::CPU: + case Neon::DeviceType::OMP: { + return mCpu; + } + case Neon::DeviceType::CUDA: { + return mGpu; + } + default: { + NeonException exp("eField_t"); + exp << "Incompatible device parameter."; + NEON_THROW(exp); + } + } + }; + auto& bk = self().getBackend(); + auto fieldDev = field(bk.devType()); + fieldDev.haloUpdate__(bk, setIdx, opt); +} + template auto eField::haloUpdate(Neon::set::HuOptions& opt) -> void @@ -397,10 +423,51 @@ auto eField::haloUpdate(Neon::set::HuOptions& opt) fieldDev.haloUpdate__(bk, opt); } + +template +auto eField:: + haloUpdateContainer(Neon::set::TransferMode transferMode, + Neon::set::StencilSemantic stencilSemantic) + const -> Neon::set::Container +{ + Neon::set::Container dataTransferContainer = + Neon::set::Container::factoryDataTransfer(*this, + transferMode, + stencilSemantic); + + Neon::set::Container SyncContainer = + Neon::set::Container::factorySynchronization(*this, + Neon::set::SynchronizationContainerType::hostOmpBarrier); + + Neon::set::container::Graph graph(this->getBackend()); + const auto& dataTransferNode = graph.addNode(dataTransferContainer); + const auto& syncNode = graph.addNode(SyncContainer); + + switch (transferMode) { + case Neon::set::TransferMode::put: + graph.addDependency(dataTransferNode, syncNode, Neon::GraphDependencyType::data); + break; + case Neon::set::TransferMode::get: + graph.addDependency(syncNode, dataTransferNode, Neon::GraphDependencyType::data); + break; + default: + NEON_THROW_UNSUPPORTED_OPTION(); + break; + } + + graph.removeRedundantDependencies(); + + Neon::set::Container output = + Neon::set::Container::factoryGraph("dGrid-Halo-Update", + graph, + [](Neon::SetIdx, Neon::set::Loader&) {}); + return output; +} + template auto eField::swap(Field& A, Field& B) -> void { - Neon::domain::interface::FieldBaseTemplate::swapUIDBeforeFullSwap(A,B); + Neon::domain::interface::FieldBaseTemplate::swapUIDBeforeFullSwap(A, B); std::swap(A, B); } diff --git a/libNeonDomain/include/Neon/domain/patterns/PatternScalar.h b/libNeonDomain/include/Neon/domain/patterns/PatternScalar.h index a4fdff78..1653f202 100644 --- a/libNeonDomain/include/Neon/domain/patterns/PatternScalar.h +++ b/libNeonDomain/include/Neon/domain/patterns/PatternScalar.h @@ -1,13 +1,13 @@ #pragma once #include "Neon/set/Backend.h" -#include "Neon/set/MultiDeviceObjectInterface.h" +#include "Neon/set/MultiXpuDataInterface.h" #include "Neon/set/patterns/BlasSet.h" namespace Neon { template class PatternScalar - : public set::interface::MultiDeviceObjectInterface, int> + : public set::interface::MultiXpuDataInterface, int> { public: @@ -37,7 +37,7 @@ class PatternScalar /** * Returns a unique identifier to be used for the loading process */ - auto uid() const -> Neon::set::MultiDeviceObjectUid; + auto uid() const -> Neon::set::dataDependency::MultiXpuDataUid; auto getPartition(const Neon::DeviceType& devType, const Neon::SetIdx& idx, @@ -81,6 +81,8 @@ class PatternScalar */ auto operator()(const Neon::DataView& dataView) -> T&; + auto getName() const -> std::string; + private: auto updateIO(int streamId = 0) -> void final; @@ -110,6 +112,8 @@ class PatternScalar T standardResult; }; + + } // namespace Neon #include "Neon/domain/patterns/PatternScalar_imp.h" \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/patterns/PatternScalar_imp.h b/libNeonDomain/include/Neon/domain/patterns/PatternScalar_imp.h index 06d516b8..23215dc4 100644 --- a/libNeonDomain/include/Neon/domain/patterns/PatternScalar_imp.h +++ b/libNeonDomain/include/Neon/domain/patterns/PatternScalar_imp.h @@ -45,10 +45,10 @@ auto PatternScalar::operator()() const -> const T& } template -auto PatternScalar::uid() const -> Neon::set::MultiDeviceObjectUid +auto PatternScalar::uid() const -> Neon::set::dataDependency::MultiXpuDataUid { void* addr = static_cast(mData.get()); - Neon::set::MultiDeviceObjectUid uidRes = (size_t)addr; + Neon::set::dataDependency::MultiXpuDataUid uidRes = (size_t)addr; return uidRes; } @@ -189,6 +189,11 @@ auto PatternScalar::getBlasSet(const Neon::DataView& dataView) -> Neon::set:: } } +template +auto PatternScalar::getName() const -> std::string +{ + return "PatternScalar"; +} extern template class PatternScalar; extern template class PatternScalar; diff --git a/libNeonDomain/include/Neon/domain/tools/TestData.h b/libNeonDomain/include/Neon/domain/tools/TestData.h index 97c1b9c7..6cafe8c2 100644 --- a/libNeonDomain/include/Neon/domain/tools/TestData.h +++ b/libNeonDomain/include/Neon/domain/tools/TestData.h @@ -4,6 +4,7 @@ #include "Neon/set/Backend.h" #include "Neon/set/MemoryOptions.h" +#include "Neon/Neon.h" #include "Neon/domain/interface/Stencil.h" #include "Neon/domain/tools/Geometries.h" #include "Neon/domain/tools/IODomain.h" @@ -131,6 +132,8 @@ TestData::TestData(const Neon::Backend& backend, const domain::Stencil& stencil, Type outsideValue) { + Neon::init(); + mGeometry = geometry; Neon::domain::tool::GeometryMask geometryMask(geometry, dimension, diff --git a/libNeonDomain/src/domain/internal/dGrid/dGrid.cpp b/libNeonDomain/src/domain/internal/dGrid/dGrid.cpp index 94e2e007..3d837b8e 100644 --- a/libNeonDomain/src/domain/internal/dGrid/dGrid.cpp +++ b/libNeonDomain/src/domain/internal/dGrid/dGrid.cpp @@ -104,7 +104,8 @@ auto dGrid::getLaunchParameters(const Neon::DataView dataView, dims[i].z = dims[i].z - m_zBoundaryRadius * 2; if (dims[i].z <= 0 && dims.size() > 1) { NeonException exp("dGrid"); - exp << "The grid size is too small to support the data view model correctly"; + exp << "The grid size is too small to support the data view model correctly \n"; + exp << dims[i] << " for setIdx "<< i << " and device " << getDevSet().devId(i); NEON_THROW(exp); } } diff --git a/libNeonDomain/tests/unit/domain-unit-test-staggered-grid/src/RunHelper.h b/libNeonDomain/tests/unit/domain-unit-test-staggered-grid/src/RunHelper.h index bcb34847..d8008953 100644 --- a/libNeonDomain/tests/unit/domain-unit-test-staggered-grid/src/RunHelper.h +++ b/libNeonDomain/tests/unit/domain-unit-test-staggered-grid/src/RunHelper.h @@ -35,7 +35,7 @@ void runAllTestConfiguration(const std::string&, std::vector cardinalityTest{1}; // nGpuTest = std::vector(1,1); - std::vector dimTest{{60, 10, 250}, {10, 50, 80}}; + std::vector dimTest{{60, 40, 250}, {10, 50, 80}}; std::vector runtimeE{Neon::Runtime::openmp, Neon::Runtime::stream}; // std::vector dimTest{{3}}; // std::vector runtimeE{Neon::Runtime::stream}; diff --git a/libNeonDomain/tests/unit/domain-unit-test-staggered-grid/src/staggeredGrid.cpp b/libNeonDomain/tests/unit/domain-unit-test-staggered-grid/src/staggeredGrid.cpp index 3f28187c..9c506bce 100644 --- a/libNeonDomain/tests/unit/domain-unit-test-staggered-grid/src/staggeredGrid.cpp +++ b/libNeonDomain/tests/unit/domain-unit-test-staggered-grid/src/staggeredGrid.cpp @@ -235,7 +235,7 @@ int getNGpus() } } // namespace -TEST(Map, dGrid) +TEST(DISABLED_Map, dGrid) { Neon::init(); int nGpus = getNGpus(); @@ -244,7 +244,7 @@ TEST(Map, dGrid) runAllTestConfiguration("staggeredGrid", StaggeredGrid_Map, nGpus, 1); } -TEST(VoxToNodes, dGrid) +TEST(DISABLED_CodeVoxToNodes, dGrid) { Neon::init(); int nGpus = getNGpus(); @@ -253,7 +253,7 @@ TEST(VoxToNodes, dGrid) runAllTestConfiguration("staggeredGrid", StaggeredGrid_VoxToNodes, nGpus, 1); } -TEST(NodeToVoxels, dGrid) +TEST(DISABLED_NodeToVoxels, dGrid) { Neon::init(); int nGpus = getNGpus(); diff --git a/libNeonDomain/tests/unit/domainUt_sGrid/src/sGrid.cu b/libNeonDomain/tests/unit/domainUt_sGrid/src/sGrid.cu index 6cc1d031..98856f54 100644 --- a/libNeonDomain/tests/unit/domainUt_sGrid/src/sGrid.cu +++ b/libNeonDomain/tests/unit/domainUt_sGrid/src/sGrid.cu @@ -6,7 +6,7 @@ #include "Neon/domain/aGrid.h" #include "Neon/domain/dGrid.h" #include "Neon/domain/eGrid.h" -//#include "Neon/domain/sGrid.h" +// #include "Neon/domain/sGrid.h" #include "Neon/skeleton/Options.h" #include "Neon/skeleton/Skeleton.h" @@ -114,16 +114,16 @@ void sGridTestContainerRun(TestData& data) auto& X = data.getField(FieldNames::X); auto& Y = data.getField(FieldNames::Y); - dim.forEach([&](int x, int y, int z) { -#pragma omp critial - { - index_3d newE(x, y, z); - if (X.isInsideDomain(newE)) { - if (X(newE, 0) % 2 == 0) { - elements.push_back(newE); - } + dim.forEach([&](int x, int y, int z) { + // #pragma omp critial + // { + index_3d newE(x, y, z); + if (X.isInsideDomain(newE)) { + if (X(newE, 0) % 2 == 0) { + elements.push_back(newE); } } + // } }); Neon::domain::sGrid sGrid(grid, elements); @@ -223,15 +223,15 @@ void sGridTestSkeleton(TestData& data) auto& Y = data.getField(FieldNames::Y); dim.template forEach([&](int x, int y, int z) { -#pragma omp critial - { - index_3d newE(x, y, z); - if (X.isInsideDomain(newE)) { - if (X(newE, 0) % 2 == 0) { - elements.push_back(newE); - } + // #pragma omp critial + // { + index_3d newE(x, y, z); + if (X.isInsideDomain(newE)) { + if (X(newE, 0) % 2 == 0) { + elements.push_back(newE); } } + // } }); Neon::domain::sGrid sGrid(grid, elements); diff --git a/libNeonSet/CMakeLists.txt b/libNeonSet/CMakeLists.txt index 3a867537..98d5d5fc 100644 --- a/libNeonSet/CMakeLists.txt +++ b/libNeonSet/CMakeLists.txt @@ -27,6 +27,10 @@ target_include_directories(libNeonSet PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/includ source_group(TREE ${CMAKE_CURRENT_LIST_DIR} PREFIX "libNeonSet" FILES ${libNeonSetFiles}) +if(${NEON_USE_NVTX}) + target_link_libraries(libNeonSet PUBLIC ${CUDA_nvToolsExt_LIBRARY}) +endif() + if (${BUILD_NEON_TESTING}) add_subdirectory("tests") endif() diff --git a/libNeonSet/include/Neon/set/Backend.h b/libNeonSet/include/Neon/set/Backend.h index 22f40fcc..bdfa34b7 100644 --- a/libNeonSet/include/Neon/set/Backend.h +++ b/libNeonSet/include/Neon/set/Backend.h @@ -139,34 +139,34 @@ class Backend auto eventSet(Neon::EventIdx eventdx) -> Neon::set::GpuEventSet&; - /** - * Extract a stream base in the provided streamIds. - * The method uses a circular policy to select the stream. - * The parameter rotateIdx is the index used to store the - * state of the circular polity in between calls. - * - * @param rotateIdx - * @param streamIdxVec - * @return - */ - auto streamSetRotate(int& rotateIdx, - const std::vector& streamIdxVec) - const - -> const Neon::set::StreamSet&; - - /** - * Extract a stream base in the provided streamIds. - * The method uses a circular policy to select the stream. - * The parameter rotateIdx is the index used to store the - * state of the circular polity in between calls. - * - * @param rotateIdx - * @param streamIdxVec - * @return - */ - static auto streamSetIdxRotate(int& rotateIdx, - const std::vector& streamIdxVec) - -> int; +// /** +// * Extract a stream base in the provided streamIds. +// * The method uses a circular policy to select the stream. +// * The parameter rotateIdx is the index used to store the +// * state of the circular polity in between calls. +// * +// * @param rotateIdx +// * @param streamIdxVec +// * @return +// */ +// auto streamSetRotate(int& rotateIdx, +// const std::vector& streamIdxVec) +// const +// -> const Neon::set::StreamSet&; +// +// /** +// * Extract a stream base in the provided streamIds. +// * The method uses a circular policy to select the stream. +// * The parameter rotateIdx is the index used to store the +// * state of the circular polity in between calls. +// * +// * @param rotateIdx +// * @param streamIdxVec +// * @return +// */ +// static auto streamSetIdxRotate(int& rotateIdx, +// const std::vector& streamIdxVec) +// -> int; /** * @@ -213,14 +213,14 @@ class Backend auto waitEventOnStream(Neon::SetIdx setIdx, int eventId, int streamId) -> void; - /** - * Create a set of cuda events to create an exit barrier. - * I.e. one streams sync with all the others - * The stream holding the barrier is the first in the streamIdxVec vector. - * - * @param streamIdxVec - */ - auto streamEventBarrier(const std::vector& streamIdxVec) -> void; +// /** +// * Create a set of cuda events to create an exit barrier. +// * I.e. one streams sync with all the others +// * The stream holding the barrier is the first in the streamIdxVec vector. +// * +// * @param streamIdxVec +// */ +// auto streamEventBarrier(const std::vector& streamIdxVec) -> void; auto getMemoryOptions(Neon::MemoryLayout order) const -> Neon::MemoryOptions; @@ -242,6 +242,7 @@ class Backend std::string toString() const; auto toReport(Neon::Report& report, Report::SubBlock* subdocAPI = nullptr) const -> void; + void syncEvent(SetIdx setIdx, int eventIdx) const; }; } // namespace Neon diff --git a/libNeonSet/include/Neon/set/Containter.h b/libNeonSet/include/Neon/set/Containter.h index b9114e14..2d4cc22a 100644 --- a/libNeonSet/include/Neon/set/Containter.h +++ b/libNeonSet/include/Neon/set/Containter.h @@ -1,21 +1,22 @@ #pragma once #include "Neon/set/DevSet.h" -#include "Neon/set/dependencyTools/DataParsing.h" #include "functional" #include "type_traits" #include "Neon/set/container/ContainerAPI.h" -#include "Neon/set/container/HostManagedSyncType.h" -#include "Neon/set/container/Loader.h" +#include "Neon/set/container/types/HostManagedSyncType.h" +#include "Neon/set/container/types/SynchronizationContainerType.h" namespace Neon::set { +struct Loader; struct Container { public: Container() = default; + virtual ~Container() = default; /** * Run a Neon Container on a given stream and with a given data view @@ -74,6 +75,7 @@ struct Container typename UserLoadingLambdaT> static auto factoryOldManaged(const std::string& name, Neon::set::internal::ContainerAPI::DataViewSupport dataViewSupport, + Neon::set::ContainerPatternType patternType, DataContainerT a, const UserLoadingLambdaT& f) -> Container; @@ -105,6 +107,17 @@ struct Container Container& host) -> Container; + template + static auto factoryDataTransfer(const MultiXpuDataT& multiXpuData, + Neon::set::TransferMode transferMode, + Neon::set::StencilSemantic transferSemantic) + -> Neon::set::Container; + + template + static auto factorySynchronization(const MxpuDataT& multiXpuData, + SynchronizationContainerType syncType) + -> Container; + static auto factoryAnchor(const std::string& name /**< A user's string to identify the computation done by the Container. */) -> Container; diff --git a/libNeonSet/include/Neon/set/Containter_imp.h b/libNeonSet/include/Neon/set/Containter_imp.h index 3c401836..a8cbd7da 100644 --- a/libNeonSet/include/Neon/set/Containter_imp.h +++ b/libNeonSet/include/Neon/set/Containter_imp.h @@ -1,17 +1,21 @@ #pragma once #include "Neon/set/DevSet.h" -#include "Neon/set/dependencyTools/DataParsing.h" + #include "functional" #include "type_traits" +#include "Neon/set/container/Loader.h" + +#include "Neon/set/container/DataTransferContainer.h" #include "Neon/set/container/DeviceContainer.h" #include "Neon/set/container/DeviceManagedContainer.h" #include "Neon/set/container/DeviceThenHostManagedContainer.h" +#include "Neon/set/container/GraphContainer.h" #include "Neon/set/container/HostManagedContainer.h" #include "Neon/set/container/OldDeviceManagedContainer.h" +#include "Neon/set/container/SynchronizationContainer.h" -#include "Neon/set/container/GraphContainer.h" namespace Neon::set { @@ -30,23 +34,27 @@ auto Container::factory(const std::string& name, blockSize, shMemSizeFun); std::shared_ptr tmp(k); - return Container(tmp); + return {tmp}; } template auto Container::factoryOldManaged(const std::string& name, Neon::set::internal::ContainerAPI::DataViewSupport dataViewSupport, + Neon::set::ContainerPatternType patternType, DataContainerT a, const UserLoadingLambdaT& f) -> Container { using ManagedLaunch = typename std::invoke_result::type; - auto k = new Neon::set::internal::OldDeviceManagedContainer(name, dataViewSupport, - a, f); + auto k = new Neon::set::internal::OldDeviceManagedContainer(name, + dataViewSupport, + patternType, + a, + f); std::shared_ptr tmp(k); - return Container(tmp); + return {tmp}; } template tmp(k); - return Container(tmp); + return {tmp}; } template tmp(k); - return Container(tmp); + return {tmp}; } +template +auto Container:: + factoryDataTransfer(const MultiXpuDataT& multiXpuData, + Neon::set::TransferMode transferMode, + Neon::set::StencilSemantic transferSemantic) + -> Neon::set::Container +{ + auto k = new Neon::set::internal::DataTransferContainer(multiXpuData, + transferMode, + transferSemantic); + + std::shared_ptr tmp(k); + return {tmp}; +} + +template +auto Container:: + factorySynchronization(const MxpuDataT& multiXpuData, + SynchronizationContainerType syncType) -> Container +{ + auto k = new Neon::set::internal::SynchronizationContainer(multiXpuData, + syncType); + + std::shared_ptr tmp(k); + return {tmp}; +} } // namespace Neon::set diff --git a/libNeonSet/include/Neon/set/DataSet.h b/libNeonSet/include/Neon/set/DataSet.h index b729db7b..782b2d77 100644 --- a/libNeonSet/include/Neon/set/DataSet.h +++ b/libNeonSet/include/Neon/set/DataSet.h @@ -2,7 +2,7 @@ #include #include "Neon/core/types/DataView.h" -#include "Neon/set/MultiDeviceObjectUid.h" +#include "Neon/set/MultiXpuDataUid.h" #include "Neon/sys/devices/DevInterface.h" namespace Neon { @@ -103,10 +103,11 @@ struct DataSet * Returns a unique identifier for the data set. * @return */ - auto uid() -> MultiDeviceObjectUid + auto uid() + const -> Neon::set::dataDependency::MultiXpuDataUid { - T_ta* addr = m_data.get(); - MultiDeviceObjectUid uidRes = (size_t)addr; + T_ta* addr = m_data.get(); + auto uidRes = (Neon::set::dataDependency::MultiXpuDataUid)addr; return uidRes; } @@ -114,7 +115,8 @@ struct DataSet * * @return */ - auto local(Neon::DeviceType, SetIdx setIdx, const Neon::DataView& dataView = Neon::DataView::STANDARD) + auto local(Neon::DeviceType, SetIdx setIdx, + const Neon::DataView& dataView = Neon::DataView::STANDARD) -> T_ta& { (void)dataView; diff --git a/libNeonSet/include/Neon/set/DevSet.h b/libNeonSet/include/Neon/set/DevSet.h index c5d76092..6ade26ee 100644 --- a/libNeonSet/include/Neon/set/DevSet.h +++ b/libNeonSet/include/Neon/set/DevSet.h @@ -12,6 +12,7 @@ #include #include #include +#include //#include "Neon/set/backend.h" #include "Neon/set/DataSet.h" @@ -311,8 +312,9 @@ class DevSet const LaunchParameters& launchInfoSet = kernelConfig.launchInfoSet(); const int nGpus = int(m_devIds.size()); { -#pragma omp parallel for num_threads(nGpus) default(shared) firstprivate(lambdaHolder) - for (int idx = 0; idx < nGpus; idx++) { +#pragma omp parallel num_threads(nGpus) default(shared) firstprivate(lambdaHolder) + { + int idx = omp_get_thread_num(); const Neon::sys::GpuDevice& dev = Neon::sys::globalSpace::gpuSysObj().dev(m_devIds[idx]); // std::tupleargsForIthGpuFunction(parametersVec.at(i) ...); diff --git a/libNeonSet/include/Neon/set/GpuStreamSet.h b/libNeonSet/include/Neon/set/GpuStreamSet.h index 23910675..03ea2200 100644 --- a/libNeonSet/include/Neon/set/GpuStreamSet.h +++ b/libNeonSet/include/Neon/set/GpuStreamSet.h @@ -2,6 +2,7 @@ #include "Neon/set/GpuEventSet.h" #include "Neon/sys/devices/gpu/GpuStream.h" +#include namespace Neon { namespace set { @@ -143,8 +144,9 @@ class StreamSet // Without nDev>0, VS on debug mode displays this annoying message //"User Error 1001: argument to num_threads clause must be positive" if (run_et::et::sync == runMode && nDev > 0) { -#pragma omp parallel for num_threads(nDev) - for (int idx = 0; idx < nDev; idx++) { +#pragma omp parallel num_threads(nDev) + { + const int idx = omp_get_thread_num(); m_streamVec[idx].sync(); } return; @@ -192,7 +194,7 @@ class StreamSet */ auto validateId(SetIdx id) const -> void; -}; +}; // namespace set } // namespace set } // End of namespace Neon diff --git a/libNeonSet/include/Neon/set/HuOptions.h b/libNeonSet/include/Neon/set/HuOptions.h index 779b79b8..5aff1a52 100644 --- a/libNeonSet/include/Neon/set/HuOptions.h +++ b/libNeonSet/include/Neon/set/HuOptions.h @@ -11,17 +11,17 @@ struct HuOptions bool m_startWithBarrier = true; int m_streamSetIdx = 0; Neon::set::PeerTransferOption m_peerTransferOpt; - Neon::set::TransferSemantic m_structure; + Neon::set::StencilSemantic m_structure; public: HuOptions(Neon::set::TransferMode transferMode /*< Mode of the transfer: put or get */, bool startWithBarrier /*< If true a barrier is executed before initiating the halo update */, int streamSetIdx = Neon::Backend::mainStreamIdx /*< Target stream for the halo update */, - Neon::set::TransferSemantic structure = Neon::set::TransferSemantic::grid /*< Structure on top of which the transfer is one: grid or lattice */); + Neon::set::StencilSemantic structure = Neon::set::StencilSemantic::standard /*< Structure on top of which the transfer is one: grid or lattice */); HuOptions(Neon::set::TransferMode transferMode, NEON_OUT std::vector& transfers, - Neon::set::TransferSemantic structure = Neon::set::TransferSemantic::grid); + Neon::set::StencilSemantic structure = Neon::set::StencilSemantic::standard); auto getPeerTransferOpt(const Neon::Backend& bk) -> Neon::set::PeerTransferOption&; auto startWithBarrier() const -> bool; @@ -30,7 +30,7 @@ struct HuOptions auto operationMode() const -> Neon::set::PeerTransferOption::operationMode_e; auto transferMode() const -> Neon::set::TransferMode; auto isExecuteMode() const -> bool; - auto structure() -> Neon::set::TransferSemantic; + auto getSemantic() const -> Neon::set::StencilSemantic; }; } // namespace set } // namespace Neon diff --git a/libNeonSet/include/Neon/set/MultiDeviceObjectUid.h b/libNeonSet/include/Neon/set/MultiDeviceObjectUid.h deleted file mode 100644 index d15fe9bc..00000000 --- a/libNeonSet/include/Neon/set/MultiDeviceObjectUid.h +++ /dev/null @@ -1,7 +0,0 @@ -#pragma once - -namespace Neon::set { - -using MultiDeviceObjectUid = size_t; - -} // namespace Neon::set diff --git a/libNeonSet/include/Neon/set/MultiDeviceObjectInterface.h b/libNeonSet/include/Neon/set/MultiXpuDataInterface.h similarity index 69% rename from libNeonSet/include/Neon/set/MultiDeviceObjectInterface.h rename to libNeonSet/include/Neon/set/MultiXpuDataInterface.h index f745f961..2eb64549 100644 --- a/libNeonSet/include/Neon/set/MultiDeviceObjectInterface.h +++ b/libNeonSet/include/Neon/set/MultiXpuDataInterface.h @@ -4,22 +4,22 @@ #include "Neon/core/core.h" #include "Neon/core/types/Execution.h" -#include "Neon/set/MultiDeviceObjectUid.h" +#include "Neon/set/MultiXpuDataUid.h" namespace Neon::set::interface { template -class MultiDeviceObjectInterface +class MultiXpuDataInterface { public: using Partition = P; using Storage = S; - using Self = MultiDeviceObjectInterface; + using Self = MultiXpuDataInterface; - virtual ~MultiDeviceObjectInterface() = default; + virtual ~MultiXpuDataInterface() = default; - MultiDeviceObjectInterface(); + MultiXpuDataInterface(); virtual auto updateIO(int streamId = 0) -> void = 0; @@ -44,7 +44,7 @@ class MultiDeviceObjectInterface auto getStorage() const -> const Storage&; - auto getUid() const -> Neon::set::MultiDeviceObjectUid; + auto getUid() const -> Neon::set::dataDependency::MultiXpuDataUid; protected: static auto swapUIDs(Self& A, Self& B) -> void; @@ -56,33 +56,33 @@ class MultiDeviceObjectInterface }; template -auto MultiDeviceObjectInterface::getStorage() -> Storage& +auto MultiXpuDataInterface::getStorage() -> Storage& { return *(mStorage.get()); } template -auto MultiDeviceObjectInterface::getStorage() const -> const Storage& +auto MultiXpuDataInterface::getStorage() const -> const Storage& { return *(mStorage.get()); } template -auto MultiDeviceObjectInterface::getUid() const -> Neon::set::MultiDeviceObjectUid +auto MultiXpuDataInterface::getUid() const -> Neon::set::dataDependency::MultiXpuDataUid { void* addr = static_cast(mUid.get()); - Neon::set::MultiDeviceObjectUid uidRes = (size_t)addr; + Neon::set::dataDependency::MultiXpuDataUid uidRes = (size_t)addr; return uidRes; } template -MultiDeviceObjectInterface::MultiDeviceObjectInterface() +MultiXpuDataInterface::MultiXpuDataInterface() { mStorage = std::make_shared(); mUid = std::make_shared(); } template -auto MultiDeviceObjectInterface::swapUIDs(MultiDeviceObjectInterface::Self& A, MultiDeviceObjectInterface::Self& B) -> void +auto MultiXpuDataInterface::swapUIDs(MultiXpuDataInterface::Self& A, MultiXpuDataInterface::Self& B) -> void { std::swap(A.mUid,B.mUid); } diff --git a/libNeonSet/include/Neon/set/MultiXpuDataUid.h b/libNeonSet/include/Neon/set/MultiXpuDataUid.h new file mode 100644 index 00000000..490e9b99 --- /dev/null +++ b/libNeonSet/include/Neon/set/MultiXpuDataUid.h @@ -0,0 +1,8 @@ +#pragma once + +namespace Neon::set::dataDependency { + +using MultiXpuDataUid = size_t; +using MultiXpuDataIdx = size_t; + +} // namespace Neon::set::dataDependency diff --git a/libNeonSet/include/Neon/set/Replica.h b/libNeonSet/include/Neon/set/Replica.h index ee7bafa7..508b7944 100644 --- a/libNeonSet/include/Neon/set/Replica.h +++ b/libNeonSet/include/Neon/set/Replica.h @@ -3,7 +3,7 @@ #include #include "Neon/set/Containter.h" #include "Neon/set/DevSet.h" -#include "Neon/set/MultiDeviceObjectInterface.h" +#include "Neon/set/MultiXpuDataInterface.h" namespace Neon::set::internal::datum { @@ -69,7 +69,7 @@ namespace Neon::set { * @tparam Obj */ template -class Replica : public Neon::set::interface::MultiDeviceObjectInterface, +class Replica : public Neon::set::interface::MultiXpuDataInterface, Neon::set::internal::datum::Storage> { public: diff --git a/libNeonSet/include/Neon/set/SingletonSet.h b/libNeonSet/include/Neon/set/SingletonSet.h index 6cdc7d12..bd91a577 100644 --- a/libNeonSet/include/Neon/set/SingletonSet.h +++ b/libNeonSet/include/Neon/set/SingletonSet.h @@ -72,10 +72,10 @@ struct SingletonSet * Returns a unique identifier for the data set. * @return */ - auto uid() -> MultiDeviceObjectUid + auto uid() ->Neon::set::dataDependency::MultiXpuDataUid { T_ta* addr = m_data.get(); - MultiDeviceObjectUid uidRes = (size_t)addr; + Neon::set::dataDependency::MultiXpuDataUid uidRes = (size_t)addr; return uidRes; } @@ -83,10 +83,10 @@ struct SingletonSet * Returns a unique identifier for the data set. * @return */ - auto uid() const -> MultiDeviceObjectUid + auto uid() const ->Neon::set::dataDependency::MultiXpuDataUid { T_ta* addr = m_data.get(); - MultiDeviceObjectUid uidRes = (size_t)addr; + Neon::set::dataDependency::MultiXpuDataUid uidRes = (size_t)addr; return uidRes; } diff --git a/libNeonSet/include/Neon/set/StencilSemantic.h b/libNeonSet/include/Neon/set/StencilSemantic.h new file mode 100644 index 00000000..cd512ae7 --- /dev/null +++ b/libNeonSet/include/Neon/set/StencilSemantic.h @@ -0,0 +1,41 @@ +#pragma once +#include +#include + +#include "Neon/core/core.h" + +namespace Neon::set { + +enum struct StencilSemantic +{ + standard = 0 /*< Transfer for halo update on grid structure */, + streaming = 1 /*< Transfer for halo update on lattice structure */ +}; + + +struct StencilSemanticUtils +{ + static constexpr int nOptions = 2; + + static auto toString(StencilSemantic opt) -> std::string; + static auto fromString(const std::string& opt) -> StencilSemantic; + static auto getOptions() -> std::array; + + struct Cli + { + explicit Cli(std::string); + explicit Cli(StencilSemantic model); + Cli(); + + auto getOption() -> StencilSemantic; + auto set(const std::string& opt) -> void; + auto getStringOptions() -> std::string; + + private: + bool mSet = false; + StencilSemantic mOption; + }; +}; + + +} // namespace Neon::set diff --git a/libNeonSet/include/Neon/set/Transfer.h b/libNeonSet/include/Neon/set/Transfer.h index dbece3b9..1d2053b7 100644 --- a/libNeonSet/include/Neon/set/Transfer.h +++ b/libNeonSet/include/Neon/set/Transfer.h @@ -3,8 +3,8 @@ #include #include "Neon/core/core.h" +#include "Neon/set/StencilSemantic.h" #include "Neon/set/TransferMode.h" -#include "Neon/set/TransferSemantic.h" namespace Neon { namespace set { @@ -31,14 +31,14 @@ struct Transfer Endpoint_t m_dst; Endpoint_t m_src; size_t m_size{0}; - TransferSemantic m_structure; + StencilSemantic m_structure; public: Transfer(TransferMode mode, const Endpoint_t& dst, const Endpoint_t& src, size_t size, - TransferSemantic structure = TransferSemantic::grid) + StencilSemantic structure = StencilSemantic::standard) : m_mode(mode), m_dst(dst), m_src(src), m_size(size), m_structure(structure) { } @@ -96,14 +96,14 @@ struct PeerTransferOption operationMode_e m_operationMode = Neon::set::PeerTransferOption::execute; TransferMode m_transferMode = Neon::set::TransferMode::get; - Neon::set::TransferSemantic m_structure; + Neon::set::StencilSemantic m_structure; public: /** * Constructor with stream parameter */ explicit PeerTransferOption(TransferMode tranferMode, - Neon::set::TransferSemantic structure) + Neon::set::StencilSemantic structure) : m_transfers(nullptr), m_streamSet(nullptr), m_operationMode(operationMode_e::execute), @@ -120,7 +120,7 @@ struct PeerTransferOption */ PeerTransferOption(TransferMode tranferMode, std::vector& NEON_OUT transfers, - Neon::set::TransferSemantic structure) + Neon::set::StencilSemantic structure) : m_transfers(&transfers), m_streamSet(nullptr), m_operationMode(operationMode_e::storeInfo), @@ -180,7 +180,7 @@ struct PeerTransferOption return *m_transfers; } - auto structure() const -> const Neon::set::TransferSemantic& + auto structure() const -> const Neon::set::StencilSemantic& { return m_structure; } diff --git a/libNeonSet/include/Neon/set/TransferSemantic.h b/libNeonSet/include/Neon/set/TransferSemantic.h deleted file mode 100644 index ae347fac..00000000 --- a/libNeonSet/include/Neon/set/TransferSemantic.h +++ /dev/null @@ -1,42 +0,0 @@ -#pragma once -#include -#include - -#include "Neon/core/core.h" - -namespace Neon::set { - - -enum struct TransferSemantic -{ - grid = 0 /*< Transfer for halo update on grid structure */, - lattice = 1 /*< Transfer for halo update on lattice structure */ -}; - - -struct TransferSemanticUtils -{ - static constexpr int nOptions = 2; - - static auto toString(TransferSemantic opt) -> std::string; - static auto fromString(const std::string& opt) -> TransferSemantic; - static auto getOptions() -> std::array; - - struct Cli - { - explicit Cli(std::string); - explicit Cli(TransferSemantic model); - Cli(); - - auto getOption() -> TransferSemantic; - auto set(const std::string& opt) -> void; - auto getStringOptions() -> std::string; - - private: - bool mSet = false; - TransferSemantic mOption; - }; -}; - - -} // namespace Neon::set diff --git a/libNeonSet/include/Neon/set/container/AnchorContainer.h b/libNeonSet/include/Neon/set/container/AnchorContainer.h index 945885b3..68069bed 100644 --- a/libNeonSet/include/Neon/set/container/AnchorContainer.h +++ b/libNeonSet/include/Neon/set/container/AnchorContainer.h @@ -23,29 +23,27 @@ struct AnchorContainer : ContainerAPI public: AnchorContainer(const std::string& name); - auto parse() -> const std::vector& override; - - auto getHostContainer() -> std::shared_ptr final; - - virtual auto getDeviceContainer() -> std::shared_ptr final; + auto parse() -> const std::vector& override; /** * Run container over streams * @param streamIdx * @param dataView */ - virtual auto run(int streamIdx = 0, Neon::DataView dataView = Neon::DataView::STANDARD) -> void override; + virtual auto run(int streamIdx = 0, + Neon::DataView dataView = Neon::DataView::STANDARD) -> void override; /** * Run container over streams * @param streamIdx * @param dataView */ - virtual auto run(Neon::SetIdx setIdx, int streamIdx, Neon::DataView dataView) -> void override; + virtual auto run(Neon::SetIdx setIdx, + int streamIdx, + Neon::DataView dataView) -> void override; private: - std::vector mEmtpy; - + std::vector mEmtpy; }; } // namespace internal diff --git a/libNeonSet/include/Neon/set/container/ContainerAPI.h b/libNeonSet/include/Neon/set/container/ContainerAPI.h index 28805856..cf83a46e 100644 --- a/libNeonSet/include/Neon/set/container/ContainerAPI.h +++ b/libNeonSet/include/Neon/set/container/ContainerAPI.h @@ -1,11 +1,10 @@ #pragma once -#include "Neon/set/container/ContainerOperationType.h" -#include "Neon/set/container/ContainerPatternType.h" - #include "Neon/set/DevSet.h" -#include "Neon/set/container/ContainerExecutionType.h" -#include "Neon/set/dependencyTools/DataParsing.h" +#include "Neon/set/container/types/ContainerExecutionType.h" +#include "Neon/set/container/types/ContainerOperationType.h" +#include "Neon/set/container/types/ContainerPatternType.h" +// #include "Neon/set/dependency/Token.h" #include "functional" #include "type_traits" @@ -16,6 +15,11 @@ struct Loader; namespace Neon::set::container { struct Graph; +struct GraphNode; +} // namespace Neon::set::container + +namespace Neon::set::dataDependency { +struct Token; } namespace Neon::set::internal { @@ -26,6 +30,8 @@ namespace Neon::set::internal { */ struct ContainerAPI { + virtual auto configureWithScheduling(Neon::set::container::GraphNode& graphNode) -> void; + public: friend Neon::set::Loader; @@ -43,13 +49,16 @@ struct ContainerAPI /** * Run this Container over a stream. */ - virtual auto run(int streamIdx, Neon::DataView dataView = Neon::DataView::STANDARD) + virtual auto run(int streamIdx, + Neon::DataView dataView = Neon::DataView::STANDARD) -> void = 0; /** * Run this Container over a stream. */ - virtual auto run(Neon::SetIdx idx, int streamIdx, Neon::DataView dataView) + virtual auto run(Neon::SetIdx idx, + int streamIdx, + Neon::DataView dataView) -> void = 0; /** @@ -67,7 +76,7 @@ struct ContainerAPI /** * Returns a handle to the internal graph of Containers. */ - virtual auto getGraph() + virtual auto getGraph() const -> const Neon::set::container::Graph&; /** @@ -75,8 +84,10 @@ struct ContainerAPI * @return */ virtual auto parse() - -> const std::vector& = 0; + -> const std::vector&; + virtual auto getTransferMode() const + -> Neon::set::TransferMode; /** * Returns a name associated to the container. @@ -88,13 +99,7 @@ struct ContainerAPI * Returns a list of tokens as result of parsing the Container loading lambda. */ auto getTokens() const - -> const std::vector&; - - /** - * Returns a list of tokens as result of parsing the Container loading lambda. - */ - auto getTokenRef() - -> std::vector&; + -> const std::vector&; /** * Get the execution type for the Container. @@ -126,10 +131,16 @@ struct ContainerAPI auto toLog(uint64_t ContainerUid) -> void; protected: + /** + * Returns a list of tokens as result of parsing the Container loading lambda. + */ + auto getTokensRef() + -> std::vector&; + /** * Add a new token */ - auto addToken(Neon::set::internal::dependencyTools::DataToken& dataParsing) + auto addToken(Neon::set::dataDependency::Token& dataParsing) -> void; /** @@ -154,7 +165,7 @@ struct ContainerAPI * Generate a string that will be printed in case or exceptions * @return */ - auto helpGetNameForError() + auto helpGetNameForError() const -> std::string; /** @@ -179,7 +190,7 @@ struct ContainerAPI * Set the patter for this Container based on a list of tokens. * @param tokens */ - auto setContainerPattern(const std::vector& tokens) + auto setContainerPattern(const std::vector& tokens) -> void; /** @@ -196,7 +207,7 @@ struct ContainerAPI -> void; private: - using TokenList = std::vector; + using TokenList = std::vector; std::string mName{"Anonymous"}; /**< Name of the Container */ bool mParsingDataUpdated = false; diff --git a/libNeonSet/include/Neon/set/container/DataTransferContainer.h b/libNeonSet/include/Neon/set/container/DataTransferContainer.h new file mode 100644 index 00000000..1d7c737d --- /dev/null +++ b/libNeonSet/include/Neon/set/container/DataTransferContainer.h @@ -0,0 +1,70 @@ +#pragma once +#include "Neon/core/core.h" + +#include "Neon/set/container/ContainerAPI.h" +#include "Neon/set/container/Loader.h" + +namespace Neon::set::internal { + +template +struct DataTransferContainer + : ContainerAPI +{ + virtual ~DataTransferContainer() override = default; + + DataTransferContainer(const MxpuDataT& multiXpuData, + Neon::set::TransferMode transferMode, + Neon::set::StencilSemantic transferSemantic) + : mMultiXpuData(multiXpuData), + mTransferMode(transferMode), + mTransferSemantic(transferSemantic) + { + setName("DataTransferContainer"); + + setContainerExecutionType(ContainerExecutionType::deviceManaged); + setContainerOperationType(ContainerOperationType::communication); + setDataViewSupport(DataViewSupport::off); + + mDataTransferFun = [&](Neon::SetIdx setIdx, + int streamIdx) { + Neon::set::HuOptions options(this->mTransferMode, + false, + streamIdx, + mTransferSemantic); + this->mMultiXpuData.haloUpdate(setIdx, options); + }; + } + + auto run(int streamIdx, + Neon::DataView dataView) -> void override + { + const Neon::Backend& bk = mMultiXpuData.getBackend(); + const int setCardinality = bk.devSet().setCardinality(); + +#pragma omp parallel for num_threads(setCardinality) + for (int i = 0; i < setCardinality; ++i) { + run(Neon::SetIdx(i), streamIdx, dataView); + } + } + + auto run(Neon::SetIdx setIdx, + int streamIdx, + Neon::DataView /*dataView*/) -> void override + { + if (ContainerExecutionType::deviceManaged == this->getContainerExecutionType()) { + mDataTransferFun(setIdx, streamIdx); + return; + } + NEON_THROW_UNSUPPORTED_OPTION(""); + } + + private: + std::function + mDataTransferFun; + MxpuDataT mMultiXpuData; + Neon::set::TransferMode mTransferMode; + Neon::set::StencilSemantic mTransferSemantic; +}; + +} // namespace Neon::set::internal diff --git a/libNeonSet/include/Neon/set/container/DeviceContainer.h b/libNeonSet/include/Neon/set/container/DeviceContainer.h index 28b90294..cab76c1f 100644 --- a/libNeonSet/include/Neon/set/container/DeviceContainer.h +++ b/libNeonSet/include/Neon/set/container/DeviceContainer.h @@ -4,17 +4,8 @@ #include "Neon/set/container/ContainerAPI.h" #include "Neon/set/container/Loader.h" -namespace Neon { -namespace set { -namespace internal { - -/** - * Specialized implementation of KContainer_i - * - * - * @tparam DataIteratorContainerT - * @tparam UserComputeLambdaT - */ +namespace Neon::set::internal { + template struct DeviceContainer : ContainerAPI @@ -22,7 +13,6 @@ struct DeviceContainer : ContainerAPI public: virtual ~DeviceContainer() override = default; - public: DeviceContainer(const std::string& name, ContainerAPI::DataViewSupport dataViewSupport, const DataIteratorContainerT& dataIteratorContainer, @@ -78,7 +68,7 @@ struct DeviceContainer : ContainerAPI return parser; } - auto parse() -> const std::vector& override + auto parse() -> const std::vector& override { if (!this->isParsingDataUpdated()) { auto parser = newParser(); @@ -90,23 +80,13 @@ struct DeviceContainer : ContainerAPI return getTokens(); } - - auto - getHostContainer() -> std::shared_ptr final - { - NEON_THROW_UNSUPPORTED_OPTION("This Container type can not be decoupled."); - } - - virtual auto getDeviceContainer() -> std::shared_ptr final - { - NEON_THROW_UNSUPPORTED_OPTION("This Container type can not be decoupled."); - } /** * Run container over streams * @param streamIdx * @param dataView */ - virtual auto run(int streamIdx = 0, Neon::DataView dataView = Neon::DataView::STANDARD) -> void override + virtual auto run(int streamIdx = 0, + Neon::DataView dataView = Neon::DataView::STANDARD) -> void override { const Neon::Backend& bk = m_dataIteratorContainer.getBackend(); @@ -132,18 +112,30 @@ struct DeviceContainer : ContainerAPI * @param streamIdx * @param dataView */ - virtual auto run(Neon::SetIdx setIdx, int streamIdx, Neon::DataView dataView) -> void override + virtual auto run(Neon::SetIdx setIdx, + int streamIdx, + Neon::DataView dataView) -> void override { const Neon::Backend& bk = m_dataIteratorContainer.getBackend(); Neon::set::KernelConfig kernelConfig(dataView, bk, streamIdx, this->getLaunchParameters(dataView)); +#pragma omp critical + { + const int threadRank = omp_get_thread_num(); + NEON_TRACE("TRACE DeviceContainer run rank {} setIdx {} stream {} dw {}", + threadRank, setIdx.idx(), kernelConfig.stream(), Neon::DataViewUtil::toString(kernelConfig.dataView())); + }; + if (ContainerExecutionType::device == this->getContainerExecutionType()) { bk.devSet().template kernelLambdaWithIterator( setIdx, kernelConfig, m_dataIteratorContainer, - [&](Neon::DeviceType devE, Neon::SetIdx setIdx, Neon::DataView dataView) -> UserComputeLambdaT { + [&](Neon::DeviceType devE, + Neon::SetIdx setIdx, + Neon::DataView dataView) + -> UserComputeLambdaT { Loader loader = this->newLoader(devE, setIdx, dataView, LoadingMode_e::EXTRACT_LAMBDA); UserComputeLambdaT userLambda = this->m_loadingLambda(loader); return userLambda; @@ -163,6 +155,4 @@ struct DeviceContainer : ContainerAPI DataIteratorContainerT m_dataIteratorContainer; }; -} // namespace internal -} // namespace set -} // namespace Neon +} // namespace Neon::set::internal diff --git a/libNeonSet/include/Neon/set/container/DeviceManagedContainer.h b/libNeonSet/include/Neon/set/container/DeviceManagedContainer.h index f8e9cdec..515b721a 100644 --- a/libNeonSet/include/Neon/set/container/DeviceManagedContainer.h +++ b/libNeonSet/include/Neon/set/container/DeviceManagedContainer.h @@ -62,7 +62,7 @@ struct DeviceManagedContainer : ContainerAPI return parser; } - auto parse() -> const std::vector& override + auto parse() -> const std::vector& override { Neon::SetIdx setIdx(0); if (!this->mParsingDataUpdated) { @@ -108,9 +108,6 @@ struct DeviceManagedContainer : ContainerAPI if (ContainerExecutionType::deviceManaged == this->getContainerType()) { const Neon::Backend& bk = mDataContainer.getBackend(); - // We use device 0 as a dummy setIdx to create a loader. - // The actual value is not important as the managed container will take care of launching on all devices. - SetIdx dummyTargetSetIdx = 0; Loader loader = this->newLoader(bk.devType(), setIdx, dataView, LoadingMode_e::EXTRACT_LAMBDA); ComputeLambdaT computeLambda = this->mLoadingLambda(setIdx, loader); computeLambda(streamIdx, dataView); diff --git a/libNeonSet/include/Neon/set/container/DeviceThenHostManagedContainer.h b/libNeonSet/include/Neon/set/container/DeviceThenHostManagedContainer.h index 42b9aa6a..ae8ee3ff 100644 --- a/libNeonSet/include/Neon/set/container/DeviceThenHostManagedContainer.h +++ b/libNeonSet/include/Neon/set/container/DeviceThenHostManagedContainer.h @@ -64,7 +64,7 @@ struct DeviceThenHostManagedContainer : ContainerAPI return parser; } - auto parse() -> const std::vector& override + auto parse() -> const std::vector& override { mHost->parse(); mDevice->parse(); @@ -73,12 +73,12 @@ struct DeviceThenHostManagedContainer : ContainerAPI auto const& devTokens = mDevice->getTokens(); for (auto const& token : devTokens) { - getTokenRef().push_back(token); + getTokensRef().push_back(token); } - std::vector filtered; + std::vector filtered; for (auto const& token : hostTokens) { bool foundMatch = false; - for (auto& acceptedTokens : getTokenRef()) { + for (auto& acceptedTokens : getTokensRef()) { if (token.uid() == acceptedTokens.uid()) { acceptedTokens.mergeAccess(token.access()); foundMatch = true; @@ -90,7 +90,7 @@ struct DeviceThenHostManagedContainer : ContainerAPI } for (auto const& token : filtered) { - getTokenRef().push_back(token); + getTokensRef().push_back(token); } return getTokens(); diff --git a/libNeonSet/include/Neon/set/container/Graph.h b/libNeonSet/include/Neon/set/container/Graph.h index 5403fd02..1914c923 100644 --- a/libNeonSet/include/Neon/set/container/Graph.h +++ b/libNeonSet/include/Neon/set/container/Graph.h @@ -5,8 +5,19 @@ #include "Neon/set/container/graph/Bfs.h" #include "Neon/set/container/graph/GraphDependency.h" +#include "Neon/set/container/graph/GraphDependencyType.h" #include "Neon/set/container/graph/GraphNode.h" + +namespace Neon::skeleton::internal { +struct MultiXpuGraph; +} + +namespace Neon::set::container { +struct GraphNode; +struct GraphDependency; +} // namespace Neon::set::container + namespace Neon::set::container { /** @@ -15,13 +26,15 @@ namespace Neon::set::container { * directed edges of the graph are dependencies between the containers. * Dependencies my be data driven or user provided. * - * */ struct Graph { using Uid = GraphData::Uid; using Index = GraphData::Index; friend struct Bfs; + friend Neon::set::container::GraphNode; + friend Neon::set::container::GraphDependency; + friend Neon::skeleton::internal::MultiXpuGraph; public: Graph(); @@ -76,6 +89,23 @@ struct Graph GraphDependencyType type) -> GraphDependency&; + /** + * Adds a dependency between two node of the graph + */ + auto addDependency(const GraphNode& nodeA, + const GraphNode& nodeB, + const Neon::set::dataDependency::Token& token) + -> GraphDependency&; + + auto removeDependency(const GraphDependency&) + -> void; + + + /** + * Helper - it removes redundant dependencies + */ + auto removeRedundantDependencies() -> void; + /** * Returns the dependency type between two nodes. */ @@ -83,6 +113,14 @@ struct Graph const GraphNode& nodeB) -> GraphDependencyType; + auto getDependency(const GraphNode& nodeA, + const GraphNode& nodeB) + const -> const GraphDependency&; + + auto getMutableDependency(const GraphNode& nodeA, + const GraphNode& nodeB) + -> GraphDependency&; + /** * Clone a node and return a reference to the new clone. * The cloning process connects the clone the the same nodes of the original @@ -105,7 +143,9 @@ struct Graph */ auto getSubsequentGraphNodes(const GraphNode& graphNode, const std::vector& dependencyTypes = {GraphDependencyType::user, - GraphDependencyType::data}) -> std::vector; + GraphDependencyType::data}) + -> std::vector; + /** * Set the stream to run the graph. * We provide a preset function so that some initialization @@ -146,7 +186,24 @@ struct Graph -> void; auto getNumberOfNodes() - ->int; + -> int; + + auto forEachDependency(const std::function& fun) + const -> void; + + auto forEachNode(const std::function& fun) + const -> void; + + /** + * Adding a graph between two nodes A and B + * The method works only on Graph type containers. + * Finally, it returns the number of added nodes. + */ + auto expandAndMerge(const GraphNode& A, + const Container& graphOriginal, + const GraphNode& B, + bool propagateSchedulingHints) + -> int; protected: /** @@ -159,10 +216,6 @@ struct Graph */ auto helpCheckBackendStatus() -> void; - /** - * Helper - it removes redundant dependencies - */ - auto helpRemoveRedundantDependencies() -> void; /** * Compute BFS @@ -173,9 +226,8 @@ struct Graph * Returns the out-neighbour of a target node */ auto helpGetOutNeighbors(GraphData::Uid, - bool fileterOutEnd = true, - const std::vector& dependencyTypes = {GraphDependencyType::user, - GraphDependencyType::data}) + bool fileterOutEnd, + const std::vector& dependencyTypes) -> std::set; /** @@ -231,19 +283,21 @@ struct Graph * - order of execution * - mapping between streams and graph nodes */ - auto helpComputeScheduling(bool filterOutAnchors, int anchorStream) + auto helpComputeScheduling(bool filterOutAnchors, + int anchorStream) -> void; /** * Helper - it executes the graph on all devices */ - auto helpExecute(int anchorStream) + auto helpExecuteWithOmpAtNodeLevel(int anchorStream) -> void; /** * Helper - it executes the graph on a target device */ - auto helpExecute(Neon::SetIdx setIdx, int anchorStream) + auto helpExecute(Neon::SetIdx setIdx, + int anchorStream) -> void; /** * Helper - It resets node scheduling data @@ -261,7 +315,9 @@ struct Graph * Helper - it maps node to streams. * Returns the max stream Id used by the scheduling */ - auto helpComputeScheduling_02_mappingStreams(Bfs& bfs, bool filterOutAnchors, int anchorStream) + auto helpComputeScheduling_02_mappingStreams(Bfs& bfs, + bool filterOutAnchors, + int anchorStream) -> int; /** @@ -274,9 +330,15 @@ struct Graph /** * Helper - it Books the required resources from the backend. */ - auto helpComputeScheduling_04_ensureResources(int maxStreamId, int maxEventId) + auto helpComputeScheduling_04_ensureResources(int maxStreamId, + int maxEventId) -> void; + /** + * Helper - it Books the required resources from the backend. + */ + auto helpComputeScheduling_05_executionOrder(bool filterOutAnchors, Bfs& bfs) + -> void; using RawGraph = DiGraph; diff --git a/libNeonSet/include/Neon/set/container/GraphContainer.h b/libNeonSet/include/Neon/set/container/GraphContainer.h index 3630ce1a..f10ead04 100644 --- a/libNeonSet/include/Neon/set/container/GraphContainer.h +++ b/libNeonSet/include/Neon/set/container/GraphContainer.h @@ -17,39 +17,30 @@ struct GraphContainer : ContainerAPI public: ~GraphContainer() override = default; - /** - * User facing API to define a kernel - * @param data - * @param userLambda - */ GraphContainer(const std::string& name, const Neon::set::container::Graph& containerGraph, std::function loadingLambda); - auto newParser() -> Loader; + auto newParser() + -> Loader; - auto parse() -> const std::vector& override; + auto parse() + -> const std::vector& override; - auto getGraph() -> const Neon::set::container::Graph& override; + auto getGraph() + const -> const Neon::set::container::Graph& override; - auto getHostContainer() -> std::shared_ptr override; - - auto getDeviceContainer() -> std::shared_ptr override; - /** - * Run container over streams - * @param streamIdx - * @param dataView - */ - auto run(int streamIdx = 0, + auto run(int streamIdx = 0, Neon::DataView dataView = Neon::DataView::STANDARD) -> void override; auto run(Neon::SetIdx setIdx, int streamIdx = 0, - Neon::DataView dataView = Neon::DataView::STANDARD) -> void override; + Neon::DataView dataView = Neon::DataView::STANDARD) + -> void override; private: - std::function mLoadingLambda; + std::function mLoadingLambda; std::shared_ptr mGraph; }; diff --git a/libNeonSet/include/Neon/set/container/HaloUpdateContainer.h b/libNeonSet/include/Neon/set/container/HaloUpdateContainer.h new file mode 100644 index 00000000..a5a48df7 --- /dev/null +++ b/libNeonSet/include/Neon/set/container/HaloUpdateContainer.h @@ -0,0 +1,34 @@ +#pragma once + +#include "Neon/core/core.h" + +#include "Neon/set/container/ContainerAPI.h" +#include "Neon/set/container/Graph.h" +#include "Neon/set/container/GraphContainer.h" +#include "Neon/set/container/Loader.h" +#include "Neon/set/container/types/SynchronizationContainerType.h" + +namespace Neon::set { +struct Container; +} + +namespace Neon::set::internal { + +template +struct HaloUpdateContainer + : public GraphContainer +{ + + public: + ~HaloUpdateContainer() override = default; + + HaloUpdateContainer(const Neon::Backend& bk, + const Neon::set::Container& dataTransferContainer, + const Neon::set::Container& syncContainer); + + private: +}; + +} // namespace Neon::set::internal + +#include "Neon/set/container/HaloUpdateContainer_imp.h" \ No newline at end of file diff --git a/libNeonSet/include/Neon/set/container/HaloUpdateContainer_imp.h b/libNeonSet/include/Neon/set/container/HaloUpdateContainer_imp.h new file mode 100644 index 00000000..7746ee4e --- /dev/null +++ b/libNeonSet/include/Neon/set/container/HaloUpdateContainer_imp.h @@ -0,0 +1,35 @@ +#include "Neon/core/core.h" + +#include "Neon/set/container/HaloUpdateContainer.h" + +namespace Neon::set::internal { + +template +HaloUpdateContainer:: + HaloUpdateContainer(const Neon::Backend& bk, + const Neon::set::Container& dataTransferContainer, + const Neon::set::Container& syncContainer) +{ + Neon::set::container::Graph graph(bk); + + auto dataTranferNode = graph.addNode(dataTransferContainer); + auto syncNode = graph.addNode(syncContainer); + + if (dataTransferContainer.getContainerInterface().getTransferMode() == + Neon::set::TransferMode::get) { + graph.addDependency(syncNode, dataTranferNode, GraphDependencyType::data); + } else { + graph.addDependency(dataTranferNode, syncNode, GraphDependencyType::data); + } + + auto name = std::string("HaloUpdate"); + GraphContainer graphContainer(graph, [&](Neon::set::Loader& loader) { + // Nothing to load + }); + + this->GraphContainer = graphContainer; + + setContainerOperationType(ContainerOperationType::communication); + setDataViewSupport(DataViewSupport::off); +} +} // namespace Neon::set::internal diff --git a/libNeonSet/include/Neon/set/container/HostManagedContainer.h b/libNeonSet/include/Neon/set/container/HostManagedContainer.h index 20c1b306..99429d20 100644 --- a/libNeonSet/include/Neon/set/container/HostManagedContainer.h +++ b/libNeonSet/include/Neon/set/container/HostManagedContainer.h @@ -1,8 +1,8 @@ #pragma once #include "Neon/set/container/ContainerAPI.h" -#include "Neon/set/container/HostManagedSyncType.h" #include "Neon/set/container/Loader.h" +#include "Neon/set/container/types/HostManagedSyncType.h" namespace Neon::set::internal { @@ -71,7 +71,7 @@ struct HostManagedContainer : ContainerAPI return parser; } - auto parse() -> const std::vector& override + auto parse() -> const std::vector& override { auto parser = newParser(); Neon::SetIdx setIdx(0); diff --git a/libNeonSet/include/Neon/set/container/Loader.h b/libNeonSet/include/Neon/set/container/Loader.h index bdbfe74e..bb7180b7 100644 --- a/libNeonSet/include/Neon/set/container/Loader.h +++ b/libNeonSet/include/Neon/set/container/Loader.h @@ -1,10 +1,15 @@ #pragma once -//#include + +#include "type_traits" + #include "Neon/set/DevSet.h" -#include "Neon/set/dependencyTools/DataParsing.h" +#include "Neon/set/HuOptions.h" +#include "Neon/set/StencilSemantic.h" #include "Neon/set/container/ContainerAPI.h" +#include "Neon/set/dependency/AccessType.h" +#include "Neon/set/dependency/ComputeType.h" +#include "Neon/set/dependency/Token.h" -#include "type_traits" namespace Neon::set { namespace internal { @@ -19,115 +24,6 @@ struct LoadingMode_e LoadingMode_e(const LoadingMode_e&) = delete; }; -namespace tmp { -// From: -// https://en.cppreference.com/w/cpp/experimental/is_detected -namespace detail { -template class Op, class... Args> -struct detector -{ - using value_t = std::false_type; - using type = Default; -}; - -template class Op, class... Args> -struct detector>, Op, Args...> -{ - using value_t = std::true_type; - using type = Op; -}; - -} // namespace detail - -struct nonesuch -{ - ~nonesuch() = delete; - nonesuch(nonesuch const&) = delete; - void operator=(nonesuch const&) = delete; -}; - -template