From 8e4adc8520a545a6aab5a8e7dbca5ad5020ffbdb Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Fri, 5 Jan 2024 16:44:55 +0100 Subject: [PATCH] wip: adding new multi-gpu bGrid. --- libNeonDomain/include/Neon/domain/Grids.h | 1 + .../include/Neon/domain/bGridMgpuDisg.h | 6 + .../domain/details/bGridDisgMgpu/BlockView.h | 29 + .../bGridDisgMgpu/BlockView/BlockViewGrid.h | 97 ++++ .../BlockView/BlockViewPartition.h | 42 ++ .../BlockView/BlockViewPartition_imp.h | 184 +++++++ .../details/bGridDisgMgpu/StaticBlock.h | 106 ++++ .../domain/details/bGridDisgMgpu/bField.h | 121 +++++ .../domain/details/bGridDisgMgpu/bField_imp.h | 349 ++++++++++++ .../Neon/domain/details/bGridDisgMgpu/bGrid.h | 239 ++++++++ .../domain/details/bGridDisgMgpu/bGrid_imp.h | 470 ++++++++++++++++ .../domain/details/bGridDisgMgpu/bIndex.h | 142 +++++ .../domain/details/bGridDisgMgpu/bIndex_imp.h | 67 +++ .../domain/details/bGridDisgMgpu/bPartition.h | 215 ++++++++ .../details/bGridDisgMgpu/bPartition_imp.h | 510 ++++++++++++++++++ .../Neon/domain/details/bGridDisgMgpu/bSpan.h | 55 ++ .../domain/details/bGridDisgMgpu/bSpan_imp.h | 51 ++ .../bGridDisgMgpu/bFieldReduceKernels.cu | 0 .../domain/details/bGridDisgMgpu/bGrid.cpp | 7 + .../tests/domain-map-disg/src/gtests.cpp | 4 +- libNeonDomain/tests/domain-map/src/gtests.cpp | 12 +- libNeonDomain/tests/domain-map/src/map.cu | 10 +- libNeonDomain/tests/domain-map/src/map.h | 2 + .../tests/domain-map/src/runHelper.h | 2 +- 24 files changed, 2713 insertions(+), 8 deletions(-) create mode 100644 libNeonDomain/include/Neon/domain/bGridMgpuDisg.h create mode 100644 libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView.h create mode 100644 libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewGrid.h create mode 100644 libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewPartition.h create mode 100644 libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewPartition_imp.h create 
mode 100644 libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/StaticBlock.h create mode 100644 libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bField.h create mode 100644 libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bField_imp.h create mode 100644 libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bGrid.h create mode 100644 libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bGrid_imp.h create mode 100644 libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bIndex.h create mode 100644 libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bIndex_imp.h create mode 100644 libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bPartition.h create mode 100644 libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bPartition_imp.h create mode 100644 libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bSpan.h create mode 100644 libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bSpan_imp.h create mode 100644 libNeonDomain/src/domain/details/bGridDisgMgpu/bFieldReduceKernels.cu create mode 100644 libNeonDomain/src/domain/details/bGridDisgMgpu/bGrid.cpp diff --git a/libNeonDomain/include/Neon/domain/Grids.h b/libNeonDomain/include/Neon/domain/Grids.h index a7c9a878..6e48f584 100644 --- a/libNeonDomain/include/Neon/domain/Grids.h +++ b/libNeonDomain/include/Neon/domain/Grids.h @@ -5,3 +5,4 @@ #include "Neon/domain/bGrid.h" #include "Neon/domain/dGridSoA.h" #include "Neon/domain/bGridDisg.h" +#include "Neon/domain/bGridMgpuDisg.h" diff --git a/libNeonDomain/include/Neon/domain/bGridMgpuDisg.h b/libNeonDomain/include/Neon/domain/bGridMgpuDisg.h new file mode 100644 index 00000000..2d2a1c9f --- /dev/null +++ b/libNeonDomain/include/Neon/domain/bGridMgpuDisg.h @@ -0,0 +1,6 @@ +#pragma once +#include "Neon/domain/details/bGridDisgMgpu//bGrid.h" + +namespace Neon { +using bGridMgpu = Neon::domain::details::bGridMgpu::bGrid>; +} \ No newline at end of file diff --git 
a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView.h new file mode 100644 index 00000000..5ef7df3b --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView.h @@ -0,0 +1,29 @@ +#include "Neon/domain/details/bGridDisgMgpu//BlockView/BlockViewGrid.h" +#include "Neon/domain/tools/GridTransformer.h" + +namespace Neon::domain::details::bGridMgpu { + +struct BlockView +{ + public: + using Grid = Neon::domain::tool::GridTransformer::Grid; + template + using Field = Grid::template Field; + using index_3d = Neon::index_3d; + + template + static auto helpGetReference(T* mem, const int idx, const int card) -> std::enable_if_t + { + return mem[idx * card]; + } + + template + static auto helpGetReference(T* mem, const int idx, const int card) -> std::enable_if_t + { + return mem[idx * C]; + } + + static constexpr Neon::MemoryLayout layout = Neon::MemoryLayout::arrayOfStructs; +}; + +} // namespace Neon::domain::details::bGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewGrid.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewGrid.h new file mode 100644 index 00000000..6047bb88 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewGrid.h @@ -0,0 +1,97 @@ +#pragma once +#include + +#include "Neon/core/core.h" +#include "Neon/core/types/DataUse.h" +#include "Neon/core/types/Macros.h" + +#include "Neon/set/BlockConfig.h" +#include "Neon/set/Containter.h" +#include "Neon/set/DevSet.h" +#include "Neon/set/MemoryOptions.h" + +#include "Neon/sys/memory/MemDevice.h" + +#include "Neon/domain/aGrid.h" + +#include "Neon/domain/interface/GridBaseTemplate.h" +#include "Neon/domain/interface/GridConcept.h" +#include "Neon/domain/interface/KernelConfig.h" +#include "Neon/domain/interface/LaunchConfig.h" +#include 
"Neon/domain/interface/Stencil.h" +#include "Neon/domain/interface/common.h" + +#include "Neon/domain/tools/GridTransformer.h" +#include "Neon/domain/tools/SpanTable.h" + +#include "Neon/domain/details/eGrid/eGrid.h" +#include "Neon/domain/patterns/PatternScalar.h" + +#include "BlockViewPartition.h" + +namespace Neon::domain::details::bGridMgpu { + +namespace details { +struct GridTransformation +{ + template + using Partition = BlockViewPartition; + using Span = Neon::domain::details::eGrid::eSpan; + static constexpr Neon::set::internal::ContainerAPI::DataViewSupport dataViewSupport = Neon::set::internal::ContainerAPI::DataViewSupport::on; + + using FoundationGrid = Neon::domain::details::eGrid::eGrid; + static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = FoundationGrid::executionThreadSpan; + using ExecutionThreadSpanIndexType = int32_t; + using Idx = FoundationGrid::Idx; + + static auto getDefaultBlock(FoundationGrid& foundationGrid) -> Neon::index_3d const& + { + return foundationGrid.getDefaultBlock(); + } + + static auto initSpan(FoundationGrid& foundationGrid, Neon::domain::tool::SpanTable& spanTable) -> void + { + spanTable.forEachConfiguration([&](Neon::Execution execution, + Neon::SetIdx setIdx, + Neon::DataView dw, + Span& span) { + span = foundationGrid.getSpan(execution, setIdx, dw); + }); + } + + static auto initLaunchParameters(FoundationGrid& foundationGrid, + Neon::DataView dataView, + const Neon::index_3d& blockSize, + const size_t& shareMem) -> Neon::set::LaunchParameters + { + return foundationGrid.getLaunchParameters(dataView, blockSize, shareMem); + } + + static auto helpGetGridIdx(FoundationGrid&, + Neon::SetIdx const&, + FoundationGrid::Idx const& fgIdx) + -> GridTransformation::Idx + { + GridTransformation::Idx tgIdx = fgIdx; + return tgIdx; + } + + template + static auto initFieldPartition(FoundationGrid::Field& foundationField, + Neon::domain::tool::PartitionTable>& partitionTable) -> void + { + 
partitionTable.forEachConfiguration( + [&](Neon::Execution execution, + Neon::SetIdx setIdx, + Neon::DataView dw, + Partition& partition) { + auto& foundationPartition = foundationField.getPartition(execution, setIdx, dw); + partition = Partition(foundationPartition); + }); + } +}; +using BlockViewGrid = Neon::domain::tool::GridTransformer::Grid; + +} // namespace details + +} // namespace Neon::domain::details::bGrid diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewPartition.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewPartition.h new file mode 100644 index 00000000..e1d7be2b --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewPartition.h @@ -0,0 +1,42 @@ +#pragma once +#include +#include "Neon/core/core.h" +#include "Neon/core/types/Macros.h" +#include "Neon/domain/details/eGrid/eGrid.h" +#include "Neon/domain/details/eGrid/eIndex.h" +#include "Neon/domain/interface/NghData.h" +#include "Neon/set/DevSet.h" +#include "Neon/sys/memory/CudaIntrinsics.h" +#include "cuda_fp16.h" + +namespace Neon::domain::details::bGridMgpu { + +template +class BlockViewPartition : public Neon::domain::details::eGrid::ePartition +{ + public: + BlockViewPartition() + { + } + BlockViewPartition(Neon::domain::details::eGrid::ePartition ePartition) + : Neon::domain::details::eGrid::ePartition(ePartition) + { + } + + template + static auto getInBlockIdx(typename Neon::domain::details::eGrid::ePartition::Idx const& idx, + uint8_3d const& inBlockLocation) -> BlockIdexType + { + BlockIdexType blockIdx(idx.helpGet(), inBlockLocation); + return inBlockLocation; + } + + auto getCountAllocated() const -> int32_t; +}; +template +auto BlockViewPartition::getCountAllocated() const -> int32_t +{ + return this->mCountAllocated; +} +} // namespace Neon::domain::details::bGrid diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewPartition_imp.h 
b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewPartition_imp.h new file mode 100644 index 00000000..4464f686 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/BlockView/BlockViewPartition_imp.h @@ -0,0 +1,184 @@ +#pragma once + +#include "Neon/domain/details//eGrid/ePartition.h" + +namespace Neon::domain::details::bGridMgpu { + + +template +NEON_CUDA_HOST_DEVICE auto +ePartition::prtID() const + -> int +{ + return mPrtID; +} + +template +template +inline NEON_CUDA_HOST_DEVICE auto +ePartition::cardinality() const + -> std::enable_if_t +{ + return mCardinality; +} + +template +template +constexpr inline NEON_CUDA_HOST_DEVICE auto +ePartition::cardinality() const + -> std::enable_if_t +{ + return C; +} + +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::operator()(eIndex eId, int cardinalityIdx) const + -> T +{ + Offset jump = getOffset(eId, cardinalityIdx); + return mMem[jump]; +} + +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::operator()(eIndex eId, int cardinalityIdx) -> T& +{ + Offset jump = getOffset(eId, cardinalityIdx); + return mMem[jump]; +} + +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::getNghData(eIndex eId, + NghIdx nghIdx, + int card, + const Type& alternativeVal) + const -> NghData +{ + eIndex eIdxNgh; + const bool isValidNeighbour = isValidNgh(eId, nghIdx, eIdxNgh); + T val = (isValidNeighbour) ? 
this->operator()(eIdxNgh, card) : alternativeVal; + // printf("(prtId %d)getNghData id %d card %d eIdxNgh %d val %d\n", + // mPrtID, eId.mIdx, card, eIdxNgh.mIdx, int(val)); + return NghData(val, isValidNeighbour); +} + +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::getNghData(eIndex eId, + const Neon::int8_3d& ngh3dIdx, + int card, + const Type& alternativeVal) + const -> NghData +{ + int tablePithc = (ngh3dIdx.x + mStencilRadius) + + (ngh3dIdx.y + mStencilRadius) * mStencilTableYPitch + + (ngh3dIdx.z + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch; + NghIdx nghIdx = mStencil3dTo1dOffset[tablePithc]; + NghData res = getNghData(eId, nghIdx, card, alternativeVal); + + return res; +} + +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::isValidNgh(eIndex eId, + NghIdx nghIdx, + eIndex& neighbourIdx) const + -> bool +{ + const eIndex::Offset connectivityJumo = mCountAllocated * nghIdx + eId.get(); + neighbourIdx.set() = NEON_CUDA_CONST_LOAD((mConnectivity + connectivityJumo)); + const bool isValidNeighbour = (neighbourIdx.mIdx > -1); +// printf("(prtId %d) getNghData id %d eIdxNgh %d connectivityJumo %d\n", +// mPrtID, +// eId.mIdx, neighbourIdx.mIdx, connectivityJumo); + return isValidNeighbour; +} + +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::getGlobalIndex(eIndex eIndex) const + -> Neon::index_3d +{ + Neon::index_3d loc; + const auto baseAddr = mOrigins + eIndex.get(); + loc = mOrigins[eIndex.get()]; + return loc; +} + +template +ePartition::ePartition(int prtId, + T* mem, + ePitch pitch, + int32_t cardinality, + int32_t countAllocated, + Offset* connRaw, + Neon::index_3d* toGlobal, + int8_t* stencil3dTo1dOffset, + int32_t stencilRadius) +{ + mPrtID = prtId; + mMem = mem; + mPitch = pitch; + mCardinality = cardinality; + mCountAllocated = countAllocated; + + mConnectivity = connRaw; + mOrigins = toGlobal; + + mStencil3dTo1dOffset = stencil3dTo1dOffset; + mStencilTableYPitch = 2 * stencilRadius + 1; + + mStencilRadius = 
stencilRadius; +} + +template +NEON_CUDA_HOST_DEVICE auto +ePartition::pointer(eIndex eId, int cardinalityIdx) const + -> const Type* +{ + Offset jump = getOffset(eId, cardinalityIdx); + return mMem + jump; +} + +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::getOffset(eIndex eId, int cardinalityIdx) const + -> Offset +{ + return Offset(eId.get() * mPitch.x + cardinalityIdx * mPitch.y); +} + +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::mem() + -> T* +{ + return mMem; +} + +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::mem() const + -> const T* +{ + return mMem; +} + +} // namespace Neon::domain::details::eGrid diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/StaticBlock.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/StaticBlock.h new file mode 100644 index 00000000..7fdd4577 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/StaticBlock.h @@ -0,0 +1,106 @@ +#pragma once + +#include "Neon/domain/details/bGridDisgMgpu/bSpan.h" + +namespace Neon::domain::details::bGridMgpu { + +template +struct StaticBlock +{ + public: + constexpr static uint32_t memBlockSizeX = memBlockSizeX_; + constexpr static uint32_t memBlockSizeY = memBlockSizeY_; + constexpr static uint32_t memBlockSizeZ = memBlockSizeZ_; + constexpr static Neon::uint32_3d memBlockSize3D = Neon::uint32_3d(memBlockSizeX, memBlockSizeY, memBlockSizeZ); + + constexpr static uint32_t userBlockSizeX = userBlockSizeX_; + constexpr static uint32_t userBlockSizeY = userBlockSizeY_; + constexpr static uint32_t userBlockSizeZ = userBlockSizeZ_; + constexpr static Neon::uint32_3d userBlockSize3D = Neon::uint32_3d(userBlockSizeX, userBlockSizeY, userBlockSizeZ); + + constexpr static uint32_t blockRatioX = memBlockSizeX / userBlockSizeX; + constexpr static uint32_t blockRatioY = memBlockSizeY / userBlockSizeY; + constexpr static uint32_t blockRatioZ = memBlockSizeZ / userBlockSizeZ; + + constexpr static uint32_t memBlockPitchX = 1; + 
constexpr static uint32_t memBlockPitchY = memBlockSizeX; + constexpr static uint32_t memBlockPitchZ = memBlockSizeX * memBlockSizeY; + + constexpr static bool isMultiResMode = isMultiResMode_; + + constexpr static uint32_t memBlockCountElements = memBlockSizeX * memBlockSizeY * memBlockSizeZ; + + static_assert(memBlockSizeX >= userBlockSizeX); + static_assert(memBlockSizeY >= userBlockSizeY); + static_assert(memBlockSizeZ >= userBlockSizeZ); + + static_assert(memBlockSizeX % userBlockSizeX == 0); + static_assert(memBlockSizeY % userBlockSizeY == 0); + static_assert(memBlockSizeZ % userBlockSizeZ == 0); + + struct BitMask + { + using BitMaskWordType = uint32_t; + auto reset() -> void + { + for (BitMaskWordType i = 0; i < nWords; ++i) { + bits[i] = 0; + } + } + + auto setActive(int threadX, + int threadY, + int threadZ) -> void + { + BitMaskWordType mask; + uint32_t wordIdx; + getMaskAndWordI(threadX, threadY, threadZ, mask, wordIdx); + auto& word = bits[wordIdx]; + word = word | mask; + } + + inline auto NEON_CUDA_HOST_DEVICE isActive(int threadX, + int threadY, + int threadZ) const -> bool + { + BitMaskWordType mask; + uint32_t wordIdx; + getMaskAndWordI(threadX, threadY, threadZ, mask, wordIdx); + auto& word = bits[wordIdx]; + return (word & mask) != 0; + } + + static inline auto NEON_CUDA_HOST_DEVICE getMaskAndWordI(int threadX, + int threadY, + int threadZ, + NEON_OUT BitMaskWordType& mask, + NEON_OUT uint32_t& wordIdx) -> void + { + const uint32_t threadPitch = threadX * memBlockPitchX + + threadY * memBlockPitchY + + threadZ * memBlockPitchZ; + + // threadPitch >> log2_of_bitPerWord + // the same as: threadPitch / 2^{log2_of_bitPerWord} + wordIdx = threadPitch >> log2_of_bitPerWord; + // threadPitch & ((bitMaskWordType(bitMaskStorageBitWidth)) - 1); + // same as threadPitch % 2^{log2OfbitMaskWordSize} + const uint32_t offsetInWord = threadPitch & ((BitMaskWordType(bitPerWord)) - 1); + mask = BitMaskWordType(1) << offsetInWord; + } + + constexpr static 
BitMaskWordType nWords = (memBlockCountElements + 31) / 32; + static constexpr uint32_t log2_of_bitPerWord = 5; + static constexpr uint32_t bitPerWord = 32; + + BitMaskWordType bits[nWords]; + }; +}; + +} // namespace Neon::domain::details::bGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bField.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bField.h new file mode 100644 index 00000000..251fdbab --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bField.h @@ -0,0 +1,121 @@ +#pragma once +#include "Neon/domain/details/bGridDisgMgpu/bPartition.h" +#include "Neon/domain/interface/FieldBaseTemplate.h" +#include "Neon/set/patterns/BlasSet.h" + +#include "Neon/core/core.h" +#include "Neon/core/types/Macros.h" + +#include "Neon/set/DevSet.h" +#include "Neon/set/HuOptions.h" +#include "Neon/set/MemoryTransfer.h" + +#include "Neon/domain/interface/FieldBaseTemplate.h" +#include "Neon/domain/tools/HaloUpdateTable1DPartitioning.h" +#include "Neon/domain/tools/PartitionTable.h" +#include "bPartition.h" + +namespace Neon::domain::details::bGridMgpu { + + +template +class bField : public Neon::domain::interface::FieldBaseTemplate, + bPartition, + int> +{ + friend bGrid; + + public: + using Type = T; + using Grid = bGrid; + using Field = bField; + using Partition = bPartition; + using Idx = bIndex; + using BlockViewGrid = Neon::domain::tool::GridTransformer::Grid; + template + using BlockViewField = BlockViewGrid::template Field; + + using NghIdx = typename Partition::NghIdx; + using NghData = typename Partition::NghData; + + bField(const std::string& fieldUserName, + Neon::DataUse dataUse, + Neon::MemoryOptions memoryOptions, + const Grid& grid, + int cardinality, + T inactiveValue); + + bField(); + + virtual ~bField() = default; + + auto getPartition(Neon::Execution, + Neon::SetIdx, + const Neon::DataView& dataView) const -> const Partition& final; + + auto getPartition(Neon::Execution, + 
Neon::SetIdx, + const Neon::DataView& dataView) -> Partition& final; + + auto isInsideDomain(const Neon::index_3d& idx) const -> bool; + + + auto operator()(const Neon::index_3d& idx, + const int& cardinality) const -> T final; + + auto getReference(const Neon::index_3d& idx, + const int& cardinality) -> T& final; + + auto updateHostData(int streamId = 0) -> void final; + + auto updateDeviceData(int streamId = 0) -> void final; + + auto newHaloUpdate(Neon::set::StencilSemantic semantic, + Neon::set::TransferMode transferMode, + Neon::Execution execution) + const -> Neon::set::Container; + + auto getMemoryField() -> BlockViewGrid::Field&; + + + private: + auto getRef(const Neon::index_3d& idx, const int& cardinality) const -> T&; + + auto initHaloUpdateTable() -> void; + + + struct Data + { + Data() = default; + Data(Neon::Backend const& bk) + { + partitionTable.init(bk); + } + + enum EndPoints + { + src = 1, + dst = 0 + }; + + struct EndPointsUtils + { + static constexpr int nConfigs = 2; + }; + + std::shared_ptr grid; + BlockViewField memoryField; + int cardinality; + + // Neon::domain::tool::HaloTable1DPartitioning latticeHaloUpdateTable; + Neon::domain::tool::HaloTable1DPartitioning soaHaloUpdateTable; + // Neon::domain::tool::HaloTable1DPartitioning aosHaloUpdateTable; + Neon::domain::tool::PartitionTable partitionTable; + }; + std::shared_ptr mData; +}; + + +} // namespace Neon::domain::details::bGrid diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bField_imp.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bField_imp.h new file mode 100644 index 00000000..ead142d1 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bField_imp.h @@ -0,0 +1,349 @@ +#pragma once + +#include "Neon/domain/details/bGridDisgMgpu/bField.h" + +namespace Neon::domain::details::bGridMgpu { + +template +bField::bField() +{ + mData = std::make_shared(); +} + +template +bField::bField(const std::string& fieldUserName, + Neon::DataUse 
dataUse, + Neon::MemoryOptions memoryOptions, + const Grid& grid, + int cardinality, + T inactiveValue) + : Neon::domain::interface::FieldBaseTemplate(&grid, + fieldUserName, + "bField", + cardinality, + inactiveValue, + dataUse, + memoryOptions, + Neon::domain::haloStatus_et::e::ON) +{ + mData = std::make_shared(grid.getBackend()); + mData->grid = std::make_shared(grid); + + if (memoryOptions.getOrder() == Neon::MemoryLayout::arrayOfStructs) { + NEON_WARNING("bField does not support MemoryLayout::arrayOfStructs, enforcing MemoryLayout::structOfArrays"); + memoryOptions.setOrder(Neon::MemoryLayout::structOfArrays); + } + // the allocation size is the number of blocks x block size x cardinality + mData->memoryField = mData->grid->getBlockViewGrid().template newField( + "BitMask", + [&] { + int elPerBlock = SBlock::memBlockCountElements * cardinality; + return elPerBlock; + }(), + inactiveValue, + dataUse, + mData->grid->getBackend().getMemoryOptions(bSpan::activeMaskMemoryLayout)); + + + { // Setting up partitionTable + // const int setCardinality = mData->grid->getBackend().getDeviceCount(); + mData->partitionTable.forEachConfiguration( + [&](Neon::Execution execution, + Neon::SetIdx setIdx, + Neon::DataView, + Partition& partition) { + auto& partitioner = mData->grid->helpGetPartitioner1D(); + auto firstBup = partitioner.getSpanLayout().getBoundsBoundary(setIdx, Neon::domain::tool::partitioning::ByDirection::up).first; + auto firstBdw = partitioner.getSpanLayout().getBoundsBoundary(setIdx, Neon::domain::tool::partitioning::ByDirection::down).first; + auto firstGup = partitioner.getSpanLayout().getGhostBoundary(setIdx, Neon::domain::tool::partitioning::ByDirection::up).first; + auto firstGdw = partitioner.getSpanLayout().getGhostBoundary(setIdx, Neon::domain::tool::partitioning::ByDirection::down).first; + auto lastGdw = firstGdw + partitioner.getSpanLayout().getGhostBoundary(setIdx, Neon::domain::tool::partitioning::ByDirection::down).count; + + auto& 
memoryFieldPartition = mData->memoryField.getPartition(execution, setIdx, Neon::DataView::STANDARD); + auto& blockConnectivity = mData->grid->helpGetBlockConnectivity().getPartition(execution, setIdx, Neon::DataView::STANDARD); + auto& bitmask = mData->grid->getActiveBitMask().getPartition(execution, setIdx, Neon::DataView::STANDARD); + auto& dataBlockOrigins = mData->grid->helpGetDataBlockOriginField().getPartition(execution, setIdx, Neon::DataView::STANDARD); + + partition = bPartition(setIdx, + cardinality, + memoryFieldPartition.mem(), + blockConnectivity.mem(), + bitmask.mem(), + dataBlockOrigins.mem(), + mData->grid->helpGetStencilIdTo3dOffset().rawMem(execution, setIdx), + mData->grid->getDimension(), + firstBup, + firstBdw, + firstGup, + firstGdw, + lastGdw); + }); + } + + initHaloUpdateTable(); +} + +template +auto bField::getMemoryField() -> BlockViewGrid::Field& +{ + return mData->memoryField; +} + +template +auto bField::isInsideDomain(const Neon::index_3d& idx) const -> bool +{ + return mData->grid->isInsideDomain(idx); +} + +template +auto bField::getReference(const Neon::index_3d& cartesianIdx, + const int& cardinality) -> T& +{ + if constexpr (SBlock::isMultiResMode) { + auto& grid = this->getGrid(); + auto uniformCartesianIdx = cartesianIdx / grid.helGetMultiResDiscreteIdxSpacing(); + + if (cartesianIdx.x % grid.helGetMultiResDiscreteIdxSpacing() != 0 || + cartesianIdx.y % grid.helGetMultiResDiscreteIdxSpacing() != 0 || + cartesianIdx.z % grid.helGetMultiResDiscreteIdxSpacing() != 0) { + NeonException exp("bField::getReference"); + exp << "Input index is not multiple of the grid resolution"; + exp << "Index = " << cartesianIdx; + NEON_THROW(exp); + } + auto [setIdx, bIdx] = grid.helpGetSetIdxAndGridIdx(uniformCartesianIdx); + auto& partition = getPartition(Neon::Execution::host, setIdx, Neon::DataView::STANDARD); + auto& result = partition(bIdx, cardinality); + return result; + } else { + auto& grid = this->getGrid(); + auto [setIdx, bIdx] = 
grid.helpGetSetIdxAndGridIdx(cartesianIdx); + auto& partition = getPartition(Neon::Execution::host, setIdx, Neon::DataView::STANDARD); + auto& result = partition(bIdx, cardinality); + return result; + } +} + +template +auto bField::operator()(const Neon::index_3d& cartesianIdx, + const int& cardinality) const -> T +{ + auto& grid = this->getGrid(); + auto [setIdx, bIdx] = grid.helpGetSetIdxAndGridIdx(cartesianIdx); + if (setIdx.idx() == -1) { + return this->getOutsideValue(); + } + auto& partition = getPartition(Neon::Execution::host, setIdx, Neon::DataView::STANDARD); + auto& result = partition(bIdx, cardinality); + return result; +} + +template +auto bField::updateHostData(int streamId) -> void +{ + mData->memoryField.updateHostData(streamId); +} + +template +auto bField::updateDeviceData(int streamId) -> void +{ + mData->memoryField.updateDeviceData(streamId); +} + +template +auto bField::getPartition(Neon::Execution execution, + Neon::SetIdx setIdx, + const Neon::DataView& dataView) const -> const Partition& +{ + const Neon::DataUse dataUse = this->getDataUse(); + bool isOk = Neon::ExecutionUtils::checkCompatibility(dataUse, execution); + if (isOk) { + Partition const& result = mData->partitionTable.getPartition(execution, setIdx, dataView); + return result; + } + std::stringstream message; + message << "The requested execution mode ( " << execution << " ) is not compatible with the field DataUse (" << dataUse << ")"; + NEON_THROW_UNSUPPORTED_OPERATION(message.str()); +} + +template +auto bField::getPartition(Neon::Execution execution, + Neon::SetIdx setIdx, + const Neon::DataView& dataView) -> Partition& +{ + const Neon::DataUse dataUse = this->getDataUse(); + bool isOk = Neon::ExecutionUtils::checkCompatibility(dataUse, execution); + if (isOk) { + Partition& result = mData->partitionTable.getPartition(execution, setIdx, dataView); + return result; + } + std::stringstream message; + message << "The requested execution mode ( " << execution << " ) is not 
compatible with the field DataUse (" << dataUse << ")"; + NEON_THROW_UNSUPPORTED_OPERATION(message.str()); +} + +template +auto bField::newHaloUpdate(Neon::set::StencilSemantic stencilSemantic, + Neon::set::TransferMode transferMode, + Neon::Execution execution) const -> Neon::set::Container +{ + + + // We need to define a graph of Containers + // One for the actual memory transfer + // One for the synchronization + // The order depends on the transfer mode: put or get + Neon::set::Container dataTransferContainer; + auto const& bk = this->getGrid().getBackend(); + + if (stencilSemantic == Neon::set::StencilSemantic::standard) { + auto transfers = bk.template newDataSet>(); + + if (this->getMemoryOptions().getOrder() == Neon::MemoryLayout::structOfArrays) { + for (auto byDirection : {tool::partitioning::ByDirection::up, + tool::partitioning::ByDirection::down}) { + + auto const& tableEntryByDir = mData->soaHaloUpdateTable.get(transferMode, + execution, + byDirection); + + tableEntryByDir.forEachSeq([&](SetIdx setIdx, auto const& tableEntryByDirBySetIdx) { + transfers[setIdx].insert(std::end(transfers[setIdx]), + std::begin(tableEntryByDirBySetIdx), + std::end(tableEntryByDirBySetIdx)); + }); + } + dataTransferContainer = + Neon::set::Container::factoryDataTransfer( + *this, + transferMode, + stencilSemantic, + transfers, + execution); + + + } else { + NEON_THROW_UNSUPPORTED_OPERATION(""); + } + } else { + NEON_DEV_UNDER_CONSTRUCTION(""); + } + Neon::set::Container SyncContainer = + Neon::set::Container::factorySynchronization( + *this, + Neon::set::SynchronizationContainerType::hostOmpBarrier); + + Neon::set::container::Graph graph(this->getBackend()); + const auto& dataTransferNode = graph.addNode(dataTransferContainer); + const auto& syncNode = graph.addNode(SyncContainer); + + switch (transferMode) { + case Neon::set::TransferMode::put: + graph.addDependency(dataTransferNode, syncNode, Neon::GraphDependencyType::data); + break; + case 
Neon::set::TransferMode::get: + graph.addDependency(syncNode, dataTransferNode, Neon::GraphDependencyType::data); + break; + default: + NEON_THROW_UNSUPPORTED_OPTION(); + break; + } + + graph.removeRedundantDependencies(); + + Neon::set::Container output = + Neon::set::Container::factoryGraph("dGrid-Halo-Update", + graph, + [](Neon::SetIdx, Neon::set::Loader&) {}); + return output; +} + +template +auto bField::initHaloUpdateTable() -> void +{ + // NEON_THROW_UNSUPPORTED_OPERATION(""); + auto& grid = this->getGrid(); + auto bk = grid.getBackend(); + auto getNghSetIdx = [&](SetIdx setIdx, Neon::domain::tool::partitioning::ByDirection direction) { + int res; + if (direction == Neon::domain::tool::partitioning::ByDirection::up) { + res = (setIdx + 1) % bk.getDeviceCount(); + } else { + res = (setIdx + bk.getDeviceCount() - 1) % bk.getDeviceCount(); + } + return res; + }; + + mData->soaHaloUpdateTable.forEachPutConfiguration( + bk, [&](Neon::SetIdx setIdxSrc, + Execution execution, + Neon::domain::tool::partitioning::ByDirection byDirection, + std::vector& transfersVec) { + { + using namespace Neon::domain::tool::partitioning; + + Neon::SetIdx setIdxDst = getNghSetIdx(setIdxSrc, byDirection); + Neon::SetIdx setIdxVec[2]; + setIdxVec[Data::EndPoints::dst] = setIdxDst; + setIdxVec[Data::EndPoints::src] = setIdxSrc; + + std::array partitions; + std::array*, Data::EndPointsUtils::nConfigs> blockViewPartitions; + std::array, Data::EndPointsUtils::nConfigs> ghostZBeginIdx; + std::array, Data::EndPointsUtils::nConfigs> boundaryZBeginIdx; + std::array memPhyDim; + + partitions[Data::EndPoints::dst] = &this->getPartition(execution, setIdxDst, Neon::DataView::STANDARD); + partitions[Data::EndPoints::src] = &this->getPartition(execution, setIdxSrc, Neon::DataView::STANDARD); + blockViewPartitions[Data::EndPoints::dst] = &(mData->memoryField.getPartition(execution, setIdxDst, Neon::DataView::STANDARD)); + blockViewPartitions[Data::EndPoints::src] = 
&(mData->memoryField.getPartition(execution, setIdxSrc, Neon::DataView::STANDARD)); + + for (auto endPoint : {Data::EndPoints::dst, Data::EndPoints::src}) { + for (auto direction : {ByDirection::down, ByDirection::up}) { + auto ghostFirst = mData->grid->mData->partitioner1D.getSpanLayout().getGhostBoundary(setIdxVec[endPoint], direction).first; + auto boundaryFirst = mData->grid->mData->partitioner1D.getSpanLayout().getBoundsBoundary(setIdxVec[endPoint], direction).first; + ghostZBeginIdx[endPoint][static_cast(direction)] = ghostFirst; + boundaryZBeginIdx[endPoint][static_cast(direction)] = boundaryFirst; + } + + memPhyDim[endPoint] = Neon::size_4d( + SBlock::memBlockCountElements, + 1, + 1, + size_t(blockViewPartitions[endPoint]->getCountAllocated()) * SBlock::memBlockCountElements); + } + + if (ByDirection::up == byDirection && bk.isLastDevice(setIdxSrc)) { + return; + } + + if (ByDirection::down == byDirection && bk.isFirstDevice(setIdxSrc)) { + return; + } + + T* srcMem = blockViewPartitions[Data::EndPoints::src]->mem(); + T* dstMem = blockViewPartitions[Data::EndPoints::dst]->mem(); + + Neon::size_4d dstGhostBuff(ghostZBeginIdx[Data::EndPoints::dst][static_cast(ByDirectionUtils::invert(byDirection))], 0, 0, 0); + Neon::size_4d srcBoundaryBuff(boundaryZBeginIdx[Data::EndPoints::src][static_cast(byDirection)], 0, 0, 0); + + size_t transferDataBlockCount = mData->grid->mData->partitioner1D.getSpanLayout().getBoundsBoundary(setIdxVec[Data::EndPoints::src], byDirection).count; + + // std::cout << "To " << dstGhostBuff << " prt " << blockViewPartitions[Data::EndPoints::dst]->prtID() << " From " << srcBoundaryBuff << " prt " << blockViewPartitions[Data::EndPoints::src]->prtID() << std::endl; + // std::cout << "dst mem " << blockViewPartitions[Data::EndPoints::dst]->mem() << " " << std::endl; + // std::cout << "dst transferDataBlockCount " << transferDataBlockCount << " " << std::endl; + // std::cout << "dst pitch " << (dstGhostBuff * 
memPhyDim[Data::EndPoints::dst]).rSum() << " " << std::endl; + // std::cout << "dst dstGhostBuff " << dstGhostBuff << " " << std::endl; + // std::cout << "dst pitch all" << memPhyDim[Data::EndPoints::dst] << " " << std::endl; + + Neon::set::MemoryTransfer transfer({setIdxDst, dstMem + (dstGhostBuff * memPhyDim[Data::EndPoints::dst]).rSum(), dstGhostBuff}, + {setIdxSrc, srcMem + (srcBoundaryBuff * memPhyDim[Data::EndPoints::src]).rSum(), srcBoundaryBuff}, + sizeof(T) * SBlock::memBlockCountElements * transferDataBlockCount); + + transfersVec.push_back(transfer); + } + }); +} + + +} // namespace Neon::domain::details::bGridMgpu diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bGrid.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bGrid.h new file mode 100644 index 00000000..4af026c1 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bGrid.h @@ -0,0 +1,239 @@ +#pragma once +#include "Neon/core/core.h" + +#include "Neon/domain/aGrid.h" +#include "Neon/domain/details/bGridDisgMgpu/BlockView.h" +#include "Neon/domain/details/bGridDisgMgpu/StaticBlock.h" +#include "Neon/domain/details/bGridDisgMgpu/bField.h" +#include "Neon/domain/details/bGridDisgMgpu/bIndex.h" +#include "Neon/domain/details/bGridDisgMgpu/bPartition.h" +#include "Neon/domain/details/bGridDisgMgpu/bSpan.h" +#include "Neon/domain/interface/GridBaseTemplate.h" +#include "Neon/domain/patterns/PatternScalar.h" +#include "Neon/domain/tools/Partitioner1D.h" +#include "Neon/domain/tools/PointHashTable.h" +#include "Neon/domain/tools/SpanTable.h" +#include "Neon/set/Containter.h" +#include "Neon/set/LaunchParametersTable.h" +#include "Neon/set/memory/memSet.h" + +#include "bField.h" +#include "bPartition.h" +#include "bSpan.h" + +namespace Neon::domain::details::bGridMgpu { + + +template +class bField; + +template +class bGrid : public Neon::domain::interface::GridBaseTemplate, + bIndex> +{ + public: + using Grid = bGrid; + template + using Partition = 
bPartition; + template + using Field = Neon::domain::details::bGridMgpu::bField; + + using Span = bSpan; + using NghIdx = typename Partition::NghIdx; + using GridBaseTemplate = Neon::domain::interface::GridBaseTemplate>; + + using Idx = bIndex; + static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = Neon::set::details::ExecutionThreadSpan::d1b3; + using ExecutionThreadSpanIndexType = uint32_t; + + using BlockIdx = uint32_t; + + bGrid() = default; + virtual ~bGrid(); + + /** + * Constructor for the vanilla block data structure with depth of 1 + */ + template + bGrid(const Neon::Backend& backend, + const Neon::int32_3d& domainSize, + const ActiveCellLambda activeCellLambda, + const Neon::domain::Stencil& stencil, + const double_3d& spacingData = double_3d(1, 1, 1), + const double_3d& origin = double_3d(0, 0, 0), + Neon::domain::tool::spaceCurves::EncoderType encoderType = Neon::domain::tool::spaceCurves::EncoderType::sweep); + + + /** + * Constructor for bGrid. This constructor should be directly used only by mGrid + */ + template + bGrid(const Neon::Backend& backend /**< Neon backend for the computation */, + const Neon::int32_3d& domainSize /**< Size of the bounded Cartesian */, + const ActiveCellLambda activeCellLambda /**< Function that identify the user domain inside the boxed Cartesian discretization */, + const Neon::domain::Stencil& stencil /**< union of tall the stencil that will be used in the computation */, + const int multiResDiscreteIdxSpacing /**< Parameter for the multi-resolution. Index i and index (i+1) may be remapped as i*voxelSpacing and (i+1)* voxelSpacing. 
+ * For a uniform bGrid, i.e outside the context of multi-resolution this parameter is always 1 */, + const double_3d& spacingData /** Physical spacing between two consecutive data points in the Cartesian domain */, + const double_3d& origin /** Physical location in space of the origin of the Cartesian discretization */, + Neon::domain::tool::spaceCurves::EncoderType encoderType = Neon::domain::tool::spaceCurves::EncoderType::sweep); + + /** + * Returns some properties for a given cartesian in the Cartesian domain. + * The provide index my be inside or outside the user defined bounded Cartesian domain + */ + auto getProperties(const Neon::index_3d& idx) + const -> typename GridBaseTemplate::CellProperties final; + + /** + * Returns true if the query 3D point is inside the user domain + * @param idx + * @return + */ + auto isInsideDomain(const Neon::index_3d& idx) + const -> bool final; + + /** + * Retrieves the device index that contains the query point + * @param idx + * @return + */ + auto getSetIdx(const Neon::index_3d& idx) + const -> int32_t final; + + /** + * Allocates a new field on the grid + */ + template + auto newField(const std::string name, + int cardinality, + T inactiveValue, + Neon::DataUse dataUse = Neon::DataUse::HOST_DEVICE, + Neon::MemoryOptions memoryOptions = Neon::MemoryOptions()) const + -> Field; + + /** + * Allocates a new field on the block view grid + */ + template + auto newBlockViewField(const std::string name, + int cardinality, + T inactiveValue, + Neon::DataUse dataUse = Neon::DataUse::HOST_DEVICE, + Neon::MemoryOptions memoryOptions = Neon::MemoryOptions()) const + -> BlockView::Field; + + /** + * Allocates a new container to execute some computation in the grid + */ + template + auto newContainer(const std::string& name, + index_3d blockSize, + size_t sharedMem, + LoadingLambda lambda) const -> Neon::set::Container; + + /** + * Allocates a new container to execute some computation in the grid + */ + template + auto 
newContainer(const std::string& name, + LoadingLambda lambda) const -> Neon::set::Container; + + /** + * Defines a new set of parameter to launch a Container + */ + auto getLaunchParameters(Neon::DataView dataView, + const Neon::index_3d& blockSize, + const size_t& sharedMem) const -> Neon::set::LaunchParameters; + + /** + * Retrieve the span associated to the grid w.r.t. some user defined parameters. + */ + auto getSpan(Neon::Execution execution, + SetIdx setIdx, + Neon::DataView dataView) -> const Span&; + + /** + * Retrieve the block vew grid internally used. + * This grid can be leverage to allocate data at the block level. + */ + auto getBlockViewGrid() const -> BlockView::Grid&; + + + /** + * Retrieve the block vew grid internally used. + * This grid can be leverage to allocate data at the block level. + */ + auto getActiveBitMask() const -> BlockView::Field&; + + /** + * Helper function to retrieve the discrete index spacing used for the multi-resolution + */ + template + auto helGetMultiResDiscreteIdxSpacing() const -> std::enable_if_t; + + + /** + * Help function to retrieve the block connectivity as a BlockViewGrid field + */ + auto helpGetBlockConnectivity() const -> BlockView::Field&; + + /** + * Help function to retrieve the block origin as a BlockViewGrid field + */ + auto helpGetDataBlockOriginField() const -> Neon::aGrid::Field&; + + /** + * Help function to retrieve the map that converts a stencil point id to 3d offset + */ + auto helpGetStencilIdTo3dOffset() const -> Neon::set::MemSet&; + + auto helpGetPartitioner1D() -> Neon::domain::tool::Partitioner1D&; + + /** + * Help function retriev the device and the block index associated to a point in the BlockViewGrid grid + */ + auto helpGetSetIdxAndGridIdx(Neon::index_3d idx) const -> std::tuple; + + struct Data + { + auto init(const Neon::Backend& bk) + { + spanTable.init(bk); + launchParametersTable.init(bk); + } + + Neon::domain::tool::SpanTable spanTable /** Span for each data view configurations 
*/; + Neon::set::LaunchParametersTable launchParametersTable; + + Neon::domain::tool::Partitioner1D partitioner1D; + Stencil stencil; + Neon::sys::patterns::Engine reduceEngine; + + Neon::aGrid memoryGrid /** memory allocator for fields */; + Neon::aGrid::Field mDataBlockOriginField; + Neon::set::MemSet mStencil3dTo1dOffset; + + BlockView::Grid blockViewGrid; + BlockView::Field activeBitField; + BlockView::Field blockConnectivity; + Neon::set::MemSet stencilIdTo3dOffset; + + int mMultiResDiscreteIdxSpacing; + + // number of active voxels in each block + Neon::set::DataSet mNumActiveVoxel; + + + // Stencil neighbor indices + Neon::set::MemSet mStencilNghIndex; + }; + std::shared_ptr mData; +}; +extern template class bGrid>; +} // namespace Neon::domain::details::bGrid + +#include "bField_imp.h" +#include "bGrid_imp.h" diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bGrid_imp.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bGrid_imp.h new file mode 100644 index 00000000..89614532 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bGrid_imp.h @@ -0,0 +1,470 @@ +#include "Neon/domain/details/bGridDisgMgpu/bGrid.h" +#include "Neon/domain/tools/SpaceCurves.h" + +namespace Neon::domain::details::bGridMgpu { + +template +template +bGrid::bGrid(const Neon::Backend& backend, + const Neon::int32_3d& domainSize, + const ActiveCellLambda activeCellLambda, + const Neon::domain::Stencil& stencil, + const double_3d& spacingData, + const double_3d& origin, + Neon::domain::tool::spaceCurves::EncoderType encoderType) + : bGrid(backend, domainSize, activeCellLambda, stencil, 1, spacingData, origin, encoderType) +{ +} + +template +template +bGrid::bGrid(const Neon::Backend& backend, + const Neon::int32_3d& domainSize, + const ActiveCellLambda activeCellLambda, + const Neon::domain::Stencil& stencil, + const int multiResDiscreteIdxSpacing, + const double_3d& spacingData, + const double_3d& origin, + 
Neon::domain::tool::spaceCurves::EncoderType encoderType) +{ + + + mData = std::make_shared(); + mData->init(backend); + + mData->mMultiResDiscreteIdxSpacing = multiResDiscreteIdxSpacing; + mData->stencil = stencil; + const index_3d defaultKernelBlockSize(SBlock::memBlockSizeX, + SBlock::memBlockSizeY, + SBlock::memBlockSizeZ); + + std::stringstream gridName; + gridName << "bGrid_" << SBlock::memBlockSizeX << "_" + << SBlock::memBlockSizeY << "_" + << SBlock::memBlockSizeZ; + { + auto nElementsPerPartition = backend.devSet().template newDataSet(0); + // We do an initialization with nElementsPerPartition to zero, + // then we reset to the computed number. + + bGrid::GridBase::init(gridName.str(), + backend, + domainSize, + stencil, + nElementsPerPartition, + defaultKernelBlockSize, + multiResDiscreteIdxSpacing, + origin, + encoderType, + defaultKernelBlockSize); + } + + { // Initialization of the partitioner + + mData->partitioner1D = Neon::domain::tool::Partitioner1D( + backend, + activeCellLambda, + nullptr, + SBlock::memBlockSize3D.template newType(), + domainSize, + Neon::domain::Stencil::s27_t(false), + encoderType, + multiResDiscreteIdxSpacing); + + mData->mDataBlockOriginField = mData->partitioner1D.getGlobalMapping(); + mData->mStencil3dTo1dOffset = mData->partitioner1D.getStencil3dTo1dOffset(); + mData->memoryGrid = mData->partitioner1D.getMemoryGrid(); + } + + { // BlockViewGrid + Neon::domain::details::eGrid::eGrid egrid( + backend, + mData->partitioner1D.getBlockSpan(), + mData->partitioner1D, + Neon::domain::Stencil::s27_t(false), + spacingData * SBlock::memBlockSize3D, + origin); + + mData->blockViewGrid = BlockView::Grid(egrid); + } + + { // Active bitmask + mData->activeBitField = mData->blockViewGrid.template newField( + "BlockViewBitMask", + 1, + [] { + typename SBlock::BitMask outsideBitMask; + outsideBitMask.reset(); + return outsideBitMask; + }(), + Neon::DataUse::HOST_DEVICE, backend.getMemoryOptions(BlockView::layout)); + + 
mData->mNumActiveVoxel = backend.devSet().template newDataSet(); + + mData->activeBitField + .getGrid() + .template newContainer( + "activeBitMaskInit", + [&, this](Neon::set::Loader& loader) { + auto bitMaskPartition = loader.load(mData->activeBitField); + return [&, bitMaskPartition](const auto& bitMaskIdx) mutable { + auto prtIdx = bitMaskPartition.prtID(); + int countActive = 0; + auto const blockOrigin = bitMaskPartition.getGlobalIndex(bitMaskIdx); + typename SBlock::BitMask& bitMask = bitMaskPartition(bitMaskIdx, 0); + bitMask.reset(); + + for (int k = 0; k < SBlock::memBlockSize3D.template newType().z; k++) { + for (int j = 0; j < SBlock::memBlockSize3D.template newType().y; j++) { + for (int i = 0; i < SBlock::memBlockSize3D.template newType().x; i++) { + auto globalPosition = blockOrigin + Neon::int32_3d(i * this->mData->mMultiResDiscreteIdxSpacing, + j * this->mData->mMultiResDiscreteIdxSpacing, + k * this->mData->mMultiResDiscreteIdxSpacing); + bool const isInDomain = globalPosition < domainSize * this->mData->mMultiResDiscreteIdxSpacing; + bool const isActive = activeCellLambda(globalPosition); + if (isActive && isInDomain) { + countActive++; + bitMask.setActive(i, j, k); + } + } + } + } +#pragma omp critical + { + this->mData->mNumActiveVoxel[prtIdx] += countActive; + } + }; + }) + .run(Neon::Backend::mainStreamIdx); + + + mData->activeBitField.updateDeviceData(Neon::Backend::mainStreamIdx); + this->getBackend().sync(Neon::Backend::mainStreamIdx); + mData->activeBitField.newHaloUpdate(Neon::set::StencilSemantic::standard, + Neon::set::TransferMode::put, + Neon::Execution::device) + .run(Neon::Backend::mainStreamIdx); + } + + + { // Neighbor blocks + mData->blockConnectivity = mData->blockViewGrid.template newField("blockConnectivity", + 27, + Span::getInvalidBlockId(), + Neon::DataUse::HOST_DEVICE, + Neon::MemoryLayout::arrayOfStructs); + + mData->blockConnectivity.getGrid().template newContainer( + "blockConnectivityInit", + [&](Neon::set::Loader& 
loader) { + auto blockConnectivity = loader.load(mData->blockConnectivity); + return [&, blockConnectivity](auto const& idx) mutable { + for (int8_t k = 0; k < 3; k++) { + for (int8_t j = 0; j < 3; j++) { + for (int8_t i = 0; i < 3; i++) { + auto targetDirection = i + 3 * j + 3 * 3 * k; + BlockIdx blockNghIdx = Span::getInvalidBlockId(); + typename decltype(blockConnectivity)::Idx nghIdx; + Neon::int8_3d stencilPoint(i - int8_t(1), + j - int8_t(1), + k - int8_t(1)); + bool isValid = blockConnectivity.getNghIndex(idx, stencilPoint, nghIdx); + if (isValid) { + blockNghIdx = static_cast(nghIdx.helpGet()); + } + blockConnectivity(idx, targetDirection) = blockNghIdx; + } + } + } + }; + }) + .run(Neon::Backend::mainStreamIdx); + mData->blockConnectivity.updateDeviceData(Neon::Backend::mainStreamIdx); + } + + // Initialization of the SPAN table + mData->spanTable.forEachConfiguration([&](Neon::Execution execution, + Neon::SetIdx setIdx, + Neon::DataView dw, + Span& span) { + span.mDataView = dw; + switch (dw) { + case Neon::DataView::STANDARD: { + span.mFirstDataBlockOffset = 0; + span.mDataView = dw; + span.mActiveMask = mData->activeBitField.getPartition(execution, setIdx, dw).mem(); + break; + } + case Neon::DataView::BOUNDARY: { + span.mFirstDataBlockOffset = mData->partitioner1D.getSpanClassifier().countInternal(setIdx); + span.mDataView = dw; + span.mActiveMask = mData->activeBitField.getPartition(execution, setIdx, dw).mem(); + + break; + } + case Neon::DataView::INTERNAL: { + span.mFirstDataBlockOffset = 0; + span.mDataView = dw; + span.mActiveMask = mData->activeBitField.getPartition(execution, setIdx, dw).mem(); + break; + } + default: { + NeonException exc("dFieldDev"); + NEON_THROW(exc); + } + } + }); + + { // Stencil Idx to 3d offset + auto nPoints = backend.devSet().newDataSet(stencil.nNeighbours()); + mData->stencilIdTo3dOffset = backend.devSet().template newMemSet(Neon::DataUse::HOST_DEVICE, + 1, + backend.getMemoryOptions(), + nPoints); + for (int i = 0; 
i < stencil.nNeighbours(); ++i) { + for (int devIdx = 0; devIdx < backend.devSet().setCardinality(); devIdx++) { + index_3d pLong = stencil.neighbours()[i]; + Neon::int8_3d pShort = pLong.newType(); + mData->stencilIdTo3dOffset.eRef(devIdx, i) = pShort; + } + } + mData->stencilIdTo3dOffset.updateDeviceData(backend, Neon::Backend::mainStreamIdx); + } + // Init the base grid + bGrid::GridBase::init(gridName.str(), + backend, + domainSize, + Neon::domain::Stencil(), + mData->mNumActiveVoxel, + SBlock::memBlockSize3D.template newType(), + spacingData, + origin, + encoderType, + defaultKernelBlockSize); + { // setting launchParameters + mData->launchParametersTable.forEachSeq([&](Neon::DataView dw, + Neon::set::LaunchParameters& bLaunchParameters) { + auto defEGridBlock = mData->blockViewGrid.getDefaultBlock(); + auto eGridParams = mData->blockViewGrid.getLaunchParameters(dw, defEGridBlock, 0); + eGridParams.forEachSeq([&](Neon::SetIdx setIdx, Neon::sys::GpuLaunchInfo const& launchSingleDev) { + auto eDomainGridSize = launchSingleDev.domainGrid(); + assert(eDomainGridSize.y == 1); + assert(eDomainGridSize.z == 1); + int nBlocks = static_cast(eDomainGridSize.x); + bLaunchParameters.get(setIdx).set(Neon::sys::GpuLaunchInfo::mode_e::cudaGridMode, + nBlocks, SBlock::memBlockSize3D.template newType(), 0); + }); + }); + } +} + +template +template +auto bGrid::newField(const std::string name, + int cardinality, + T inactiveValue, + Neon::DataUse dataUse, + Neon::MemoryOptions memoryOptions) const -> Field +{ + memoryOptions = this->getDevSet().sanitizeMemoryOption(memoryOptions); + Field field(name, dataUse, memoryOptions, *this, cardinality, inactiveValue); + + return field; +} + +template +template +auto bGrid::newBlockViewField(const std::string name, + int cardinality, + T inactiveValue, + Neon::DataUse dataUse, + Neon::MemoryOptions memoryOptions) const -> BlockView::Field +{ + memoryOptions = this->getDevSet().sanitizeMemoryOption(memoryOptions); + BlockView::Field 
blockViewField = mData->blockViewGrid.template newField(name, cardinality, inactiveValue, dataUse, memoryOptions); + return blockViewField; +} + +template +template +auto bGrid::newContainer(const std::string& name, + index_3d blockSize, + size_t sharedMem, + LoadingLambda lambda) const -> Neon::set::Container +{ + Neon::set::Container kContainer = Neon::set::Container::factory(name, + Neon::set::internal::ContainerAPI::DataViewSupport::on, + *this, + lambda, + blockSize, + [sharedMem](const Neon::index_3d&) { return sharedMem; }); + return kContainer; +} + +template +template +auto bGrid::newContainer(const std::string& name, + LoadingLambda lambda) const -> Neon::set::Container +{ + const Neon::index_3d& defaultBlockSize = this->getDefaultBlock(); + Neon::set::Container kContainer = Neon::set::Container::factory(name, + Neon::set::internal::ContainerAPI::DataViewSupport::on, + *this, + lambda, + defaultBlockSize, + [](const Neon::index_3d&) { return 0; }); + return kContainer; +} + +template +auto bGrid:: + getBlockViewGrid() + const -> BlockView::Grid& +{ + return mData->blockViewGrid; +} + +template +auto bGrid:: + getActiveBitMask() + const -> BlockView::Field& +{ + return mData->activeBitField; +} + +/** + * Helper function to retrieve the discrete index spacing used for the multi-resolution + */ +template +template +auto bGrid::helGetMultiResDiscreteIdxSpacing() const + -> std::enable_if_t +{ + return mData->mMultiResDiscreteIdxSpacing; +} + +template +auto bGrid:: + helpGetBlockConnectivity() + const -> BlockView::Field& +{ + return mData->blockConnectivity; +} +template +auto bGrid:: + helpGetDataBlockOriginField() + const -> Neon::aGrid::Field& +{ + return mData->mDataBlockOriginField; +} +template +auto bGrid::getSpan(Neon::Execution execution, + SetIdx setIdx, + Neon::DataView dataView) -> const bGrid::Span& +{ + return mData->spanTable.getSpan(execution, setIdx, dataView); +} + +template +bGrid::~bGrid() +{ +} +template +auto bGrid::getSetIdx(const 
index_3d& idx) const -> int32_t +{ + typename GridBaseTemplate::CellProperties cellProperties; + + cellProperties.setIsInside(this->isInsideDomain(idx)); + if (!cellProperties.isInside()) { + return -1; + } + Neon::SetIdx setIdx = cellProperties.getSetIdx(); + return setIdx; +} +template +auto bGrid::getLaunchParameters(Neon::DataView dataView, + const index_3d&, + const size_t& sharedMem) const -> Neon::set::LaunchParameters +{ + auto res = mData->launchParametersTable.get(dataView); + res.forEachSeq([&](SetIdx const& /*setIdx*/, + Neon::set::LaunchParameters::launchInfo_e& launchParams) -> void { + launchParams.setShm(sharedMem); + }); + return res; +} + +template +auto bGrid:: + helpGetStencilIdTo3dOffset() + const -> Neon::set::MemSet& +{ + return mData->stencilIdTo3dOffset; +} + +template +auto bGrid::isInsideDomain(const index_3d& idx) const -> bool +{ + // 1. check if the block is active + const BlockView::index_3d blockIdx3d = idx / (SBlock::memBlockSize3D.template newType() * mData->mMultiResDiscreteIdxSpacing); + auto blockProperties = mData->blockViewGrid.getProperties(blockIdx3d); + + if (!blockProperties.isInside()) { + return false; + } + // 2. 
The block is active, check the element in the block + typename SBlock::BitMask const& bitMask = mData->activeBitField.getReference(blockIdx3d, 0); + + bool isActive = bitMask.isActive((idx.x / mData->mMultiResDiscreteIdxSpacing) % SBlock::memBlockSize3D.x, + (idx.y / mData->mMultiResDiscreteIdxSpacing) % SBlock::memBlockSize3D.y, + (idx.z / mData->mMultiResDiscreteIdxSpacing) % SBlock::memBlockSize3D.z); + return isActive; +} + +template +auto bGrid::getProperties(const index_3d& idx) + const -> typename GridBaseTemplate::CellProperties +{ + typename GridBaseTemplate::CellProperties cellProperties; + + cellProperties.setIsInside(this->isInsideDomain(idx)); + if (!cellProperties.isInside()) { + return cellProperties; + } + + if (this->getDevSet().setCardinality() == 1) { + cellProperties.init(0, DataView::INTERNAL); + } else { + const index_3d blockIdx3d = idx / SBlock::memBlockSize3D.template newType(); + auto blockViewProperty = mData->blockViewGrid.getProperties(blockIdx3d); + + cellProperties.init(blockViewProperty.getSetIdx(), + blockViewProperty.getDataView()); + } + return cellProperties; +} + +template +auto bGrid::helpGetSetIdxAndGridIdx(Neon::index_3d idx) + const -> std::tuple +{ + const index_3d blockIdx3d = idx / (SBlock::memBlockSize3D.template newType() * mData->mMultiResDiscreteIdxSpacing); + auto [setIdx, bvGridIdx] = mData->blockViewGrid.helpGetSetIdxAndGridIdx(blockIdx3d); + Idx bIdx; + bIdx.mDataBlockIdx = bvGridIdx.helpGet(); + bIdx.mInDataBlockIdx.x = static_cast((idx.x / mData->mMultiResDiscreteIdxSpacing) % SBlock::memBlockSize3D.x); + bIdx.mInDataBlockIdx.y = static_cast((idx.y / mData->mMultiResDiscreteIdxSpacing) % SBlock::memBlockSize3D.y); + bIdx.mInDataBlockIdx.z = static_cast((idx.z / mData->mMultiResDiscreteIdxSpacing) % SBlock::memBlockSize3D.z); + + return {setIdx, bIdx}; +} + +template +auto bGrid::helpGetPartitioner1D() -> Neon::domain::tool::Partitioner1D& +{ + return mData->partitioner1D; +} + +} // namespace 
Neon::domain::details::bGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bIndex.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bIndex.h new file mode 100644 index 00000000..0452cfc9 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bIndex.h @@ -0,0 +1,142 @@ +#pragma once + +#include "Neon/core/core.h" + + +namespace Neon::domain::details::bGridMgpu { + +// Common forward declarations +template +class bGrid; +template +class bSpan; +template +class bPartition; + +class MicroIndex +{ + public: + using TrayIdx = int32_t; + using InTrayIdx = int8_3d; + + NEON_CUDA_HOST_DEVICE inline explicit MicroIndex() + : MicroIndex(0, 0, 0, 0) + { + } + + NEON_CUDA_HOST_DEVICE inline explicit MicroIndex(const TrayIdx& blockIdx, + const InTrayIdx::Integer& x, + const InTrayIdx::Integer& y, + const InTrayIdx::Integer& z) + { + mTrayBlockIdx = blockIdx; + mInTrayBlockIdx.x = x; + mInTrayBlockIdx.y = y; + mInTrayBlockIdx.z = z; + } + + NEON_CUDA_HOST_DEVICE inline auto getInTrayBlockIdx() const -> InTrayIdx const& + { + return mInTrayBlockIdx; + } + + NEON_CUDA_HOST_DEVICE inline auto getTrayBlockIdx() const -> TrayIdx const& + { + return mTrayBlockIdx; + } + + NEON_CUDA_HOST_DEVICE inline auto setInTrayBlockIdx(InTrayIdx const& inTrayIdx) -> void + { + mInTrayBlockIdx = inTrayIdx; + } + + NEON_CUDA_HOST_DEVICE inline auto setTrayBlockIdx(TrayIdx const& trayIdx) -> void + { + mTrayBlockIdx = trayIdx; + } + + InTrayIdx mInTrayBlockIdx; + TrayIdx mTrayBlockIdx{}; +}; + +template +class bIndex +{ + public: + template + friend class bSpan; + using OuterIdx = bIndex; + + using NghIdx = int8_3d; + template + friend class bPartition; + + template + friend class bField; + + template + friend class bSpan; + template + friend class bGrid; + + + using TrayIdx = MicroIndex::TrayIdx; + using InTrayIdx = MicroIndex::InTrayIdx; + + using DataBlockCount = std::make_unsigned_t; + using DataBlockIdx = 
std::make_unsigned_t; + using InDataBlockIdx = InTrayIdx; + + bIndex() = default; + ~bIndex() = default; + + NEON_CUDA_HOST_DEVICE inline explicit bIndex(const DataBlockIdx& blockIdx, + const InDataBlockIdx::Integer& x, + const InDataBlockIdx::Integer& y, + const InDataBlockIdx::Integer& z); + + NEON_CUDA_HOST_DEVICE inline auto getMicroIndex() -> MicroIndex; + NEON_CUDA_HOST_DEVICE inline auto init(MicroIndex const&) -> void; + + NEON_CUDA_HOST_DEVICE inline auto getInDataBlockIdx() const -> InDataBlockIdx const&; + NEON_CUDA_HOST_DEVICE inline auto getDataBlockIdx() const -> DataBlockIdx const&; + NEON_CUDA_HOST_DEVICE inline auto setInDataBlockIdx(InDataBlockIdx const&) -> void; + NEON_CUDA_HOST_DEVICE inline auto setDataBlockIdx(DataBlockIdx const&) -> void; + NEON_CUDA_HOST_DEVICE inline auto isActive() const -> bool; + // the local index within the block + InDataBlockIdx mInDataBlockIdx; + DataBlockIdx mDataBlockIdx{}; +}; + +template +NEON_CUDA_HOST_DEVICE auto bIndex::setDataBlockIdx(const bIndex::DataBlockIdx& dataBlockIdx) -> void +{ + mDataBlockIdx = dataBlockIdx; +} + +template +NEON_CUDA_HOST_DEVICE auto bIndex::setInDataBlockIdx(const bIndex::InDataBlockIdx& inDataBlockIdx) -> void +{ + mInDataBlockIdx = inDataBlockIdx; +} + +template +NEON_CUDA_HOST_DEVICE auto bIndex::getDataBlockIdx() const -> const bIndex::DataBlockIdx& +{ + return mDataBlockIdx; +} +template +NEON_CUDA_HOST_DEVICE auto bIndex::getInDataBlockIdx() const -> const bIndex::InDataBlockIdx& +{ + return mInDataBlockIdx; +} + +template +NEON_CUDA_HOST_DEVICE auto bIndex::isActive() const -> bool +{ + return mDataBlockIdx != std::numeric_limits::max(); +} + +} // namespace Neon::domain::details::bGrid + +#include "Neon/domain/details/bGrid/bIndex_imp.h" diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bIndex_imp.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bIndex_imp.h new file mode 100644 index 00000000..9da43b62 --- /dev/null +++ 
b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bIndex_imp.h @@ -0,0 +1,67 @@ +#pragma once +#include "Neon/domain/details/bGridDisgMgpu/bIndex.h" + +namespace Neon::domain::details::bGridMgpu { + +template +NEON_CUDA_HOST_DEVICE inline bIndex:: + bIndex(const DataBlockIdx& blockIdx, + const InDataBlockIdx::Integer& x, + const InDataBlockIdx::Integer& y, + const InDataBlockIdx::Integer& z) +{ + mDataBlockIdx = blockIdx; + mInDataBlockIdx.x = x; + mInDataBlockIdx.y = y; + mInDataBlockIdx.z = z; +} + + +template +NEON_CUDA_HOST_DEVICE inline auto bIndex::getMicroIndex() -> MicroIndex +{ + + + TrayIdx const exBlockOffset = mDataBlockIdx * (SBlock::blockRatioX * SBlock::blockRatioY * SBlock::blockRatioZ); + TrayIdx const exTrayOffset = [&] { + TrayIdx const trayBlockIdxX = mInDataBlockIdx.x / SBlock::userBlockSizeX; + TrayIdx const trayBlockIdxY = mInDataBlockIdx.y / SBlock::userBlockSizeY; + TrayIdx const trayBlockIdxZ = mInDataBlockIdx.z / SBlock::userBlockSizeZ; + + TrayIdx const res = trayBlockIdxX + trayBlockIdxY * SBlock::blockRatioX + + trayBlockIdxZ * (SBlock::blockRatioX * SBlock::blockRatioY); + return res; + }(); + MicroIndex res; + res.setTrayBlockIdx(exBlockOffset + exTrayOffset); + res.setInTrayBlockIdx({static_cast(mInDataBlockIdx.x % SBlock::userBlockSizeX), + static_cast(mInDataBlockIdx.y % SBlock::userBlockSizeY), + static_cast(mInDataBlockIdx.z % SBlock::userBlockSizeZ)}); + return res; +} + + +template +NEON_CUDA_HOST_DEVICE inline auto bIndex::init(MicroIndex const& microIndex) -> void +{ + constexpr uint32_t memBlockSize = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ; + constexpr uint32_t userBlockSize = SBlock::userBlockSizeX * SBlock::userBlockSizeY * SBlock::userBlockSizeZ; + constexpr uint32_t blockRatioSize = memBlockSize / userBlockSize; + + constexpr uint32_t blockRatioX = SBlock::memBlockSizeX / SBlock::userBlockSizeX; + constexpr uint32_t blockRatioY = SBlock::memBlockSizeY / SBlock::userBlockSizeY; + + 
mDataBlockIdx = microIndex.getTrayBlockIdx() / (blockRatioSize); + + uint32_t reminder = microIndex.getTrayBlockIdx() % (blockRatioSize); + + const uint32_t reminderInZ = reminder / (blockRatioX * blockRatioY); + mInDataBlockIdx.z = static_cast(microIndex.getInTrayBlockIdx().z + reminderInZ * SBlock::userBlockSizeZ); + reminder = reminder % (blockRatioX * blockRatioY); + const uint32_t reminderInY = reminder / (blockRatioX); + mInDataBlockIdx.y = static_cast(microIndex.getInTrayBlockIdx().y + reminderInY * SBlock::userBlockSizeY); + const uint32_t reminderInX = reminder % blockRatioX; + mInDataBlockIdx.x = static_cast(microIndex.getInTrayBlockIdx().x + reminderInX * SBlock::userBlockSizeX); +} + +} // namespace Neon::domain::details::bGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bPartition.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bPartition.h new file mode 100644 index 00000000..647a12d7 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bPartition.h @@ -0,0 +1,215 @@ +#pragma once + +#include "Neon/domain/details/bGridDisgMgpu//bIndex.h" +#include "Neon/domain/details/bGridDisgMgpu//bSpan.h" + +#include "Neon/domain/interface/NghData.h" + +#include "Neon/sys/memory/CUDASharedMemoryUtil.h" + +namespace Neon::domain::details::bGridMgpu { + +template +class bSpan; + +template +class bPartition +{ + enum Sectors + { + bUp = 0, + bDw = 1, + gUp = 2, + gDw = 3, + after = 4, + first = bUp + }; + + public: + using Span = bSpan; + using Idx = bIndex; + using NghIdx = typename Idx::NghIdx; + using Type = T; + using NghData = Neon::domain::NghData; + + using BlockViewGrid = Neon::domain::tool::GridTransformer::Grid; + using BlockViewGridIdx = BlockViewGrid::Idx; + + public: + bPartition(); + + ~bPartition() = default; + + explicit bPartition(int setIdx, + int mCardinality, + T* mMem, + typename Idx::DataBlockIdx* mBlockConnectivity, + typename SBlock::BitMask const* 
NEON_RESTRICT mMask, + Neon::int32_3d* mOrigin, + NghIdx* mStencilNghIndex, + Neon::int32_3d mDomainSize, + typename Idx::DataBlockCount mFirstDataBUP, + typename Idx::DataBlockCount mFirstDataBDW, + typename Idx::DataBlockCount mFirstDataGUP, + typename Idx::DataBlockCount mFirstDataGDW, + typename Idx::DataBlockCount mLastDataGDW); + + /** + * Retrieve the cardinality of the field. + */ + inline NEON_CUDA_HOST_DEVICE auto + cardinality() + const -> int; + + /** + * Gets the field metadata at a cartesian point. + */ + inline NEON_CUDA_HOST_DEVICE auto + operator()(const Idx& cell, + int card) + -> T&; + + /** + * Gets the field metadata at a cartesian point. + */ + inline NEON_CUDA_HOST_DEVICE auto + operator()(const Idx& cell, + int card) + const -> const T&; + + /** + * Gets the field metadata at a neighbour cartesian point. + */ + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& cell, + const NghIdx& offset, + const int card) + const -> NghData; + + /** + * Gets the field metadata at a neighbour cartesian point. + */ + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& eId, + uint8_t nghID, + int card) + const -> NghData; + + /** + * Gets the field metadata at a neighbour cartesian point. + */ + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& eId, + int card) + const -> NghData; + + /** + * Gets the field metadata at a neighbour cartesian point. + */ + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& eId, + int card, + T defaultValue) + const -> NghData; + + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& gidx, + int card, + LambdaVALID funIfValid, + LambdaNOTValid funIfNOTValid = nullptr) + const -> std::enable_if_t && (std::is_invocable_v || std::is_same_v), void>; + + template + NEON_CUDA_HOST_DEVICE inline auto + writeNghData(const Idx& gidx, + int card, + T value) + -> bool; + + /** + * Gets the global coordinates of the cartesian point. 
+ */ + NEON_CUDA_HOST_DEVICE inline auto + getGlobalIndex(const Idx& cell) + const -> Neon::index_3d; + + NEON_CUDA_HOST_DEVICE inline auto + isActive(const Idx& cell, + const typename SBlock::BitMask* mask = nullptr) const -> bool; + + + NEON_CUDA_HOST_DEVICE inline auto + getDomainSize() + const -> Neon::index_3d; + + NEON_CUDA_HOST_DEVICE + auto mem() const -> T const*; + + /** + * Gets the Idx for in the block view space. + */ + NEON_CUDA_HOST_DEVICE inline auto + getBlockViewIdx(const Idx& cell) + const -> BlockViewGridIdx; + + + NEON_CUDA_HOST_DEVICE inline auto + helpGetPitch(const Idx& cell, int card) + const -> uint32_t; + + NEON_CUDA_HOST_DEVICE inline auto + helpGetValidIdxPitchExplicit(const Idx& idx, int card) + const -> uint32_t; + + NEON_CUDA_HOST_DEVICE inline auto + helpNghPitch(const Idx& nghIdx, int card) + const -> std::tuple; + + NEON_CUDA_HOST_DEVICE inline auto + helpGetNghIdx(const Idx& idx, const NghIdx& offset) + const -> Idx; + + template + NEON_CUDA_HOST_DEVICE inline auto + helpGetNghIdx(const Idx& idx) + const -> Idx; + + NEON_CUDA_HOST_DEVICE inline auto + helpGetNghIdx(const Idx& idx, const NghIdx& offset, const typename Idx::DataBlockIdx* blockConnectivity) + const -> Idx; + + template + NEON_CUDA_HOST_DEVICE inline auto + helpGetNghIdx(const Idx& idx, const typename Idx::DataBlockIdx* blockConnectivity) + const -> Idx; + + auto NEON_CUDA_HOST_DEVICE helpGetSectorFirstBlock(Sectors sector) const + -> typename Idx::DataBlockCount; + + int mCardinality; + T* mMem; + NghIdx const* NEON_RESTRICT mStencilNghIndex; + typename Idx::DataBlockIdx const* NEON_RESTRICT mBlockConnectivity; + typename SBlock::BitMask const* NEON_RESTRICT mMask; + Neon::int32_3d const* NEON_RESTRICT mOrigin; + int mSetIdx; + int mMultiResDiscreteIdxSpacing = 1; + Neon::int32_3d mDomainSize; + + + + typename Idx::DataBlockCount mSectorFirstBlockIdx[Sectors::after + 1]; +}; + +} // namespace Neon::domain::details::bGridMgpu + +#include 
"Neon/domain/details/bGridDisgMgpu//bPartition_imp.h" \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bPartition_imp.h new file mode 100644 index 00000000..0c1279ad --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bPartition_imp.h @@ -0,0 +1,510 @@ +#pragma once + +#include "Neon/domain/details/bGridDisgMgpu//bGrid.h" +#include "Neon/domain/details/bGridDisgMgpu/bSpan.h" + +namespace Neon::domain::details::bGridMgpu { + +template +bPartition::bPartition() + : mCardinality(0), + mMem(nullptr), + mStencilNghIndex(), + mBlockConnectivity(nullptr), + mMask(nullptr), + mOrigin(0), + mSetIdx(0) +{ +} + +template +bPartition:: + bPartition(int setIdx, + int cardinality, + T* mem, + typename Idx::DataBlockIdx* blockConnectivity, + typename SBlock::BitMask const* NEON_RESTRICT mask, + Neon::int32_3d* origin, + NghIdx* stencilNghIndex, + Neon::int32_3d mDomainSize, + typename Idx::DataBlockCount mFirstDataBUP, + typename Idx::DataBlockCount mFirstDataBDW, + typename Idx::DataBlockCount mFirstDataGUP, + typename Idx::DataBlockCount mFirstDataGDW, + typename Idx::DataBlockCount mLastDataGDW) + : mCardinality(cardinality), + mMem(mem), + mStencilNghIndex(stencilNghIndex), + mBlockConnectivity(blockConnectivity), + mMask(mask), + mOrigin(origin), + mSetIdx(setIdx), + mDomainSize(mDomainSize) +{ + mSectorFirstBlockIdx[Sectors::bUp] = mFirstDataBUP; + mSectorFirstBlockIdx[Sectors::bDw] = mFirstDataBDW; + mSectorFirstBlockIdx[Sectors::gUp] = mFirstDataGUP; + mSectorFirstBlockIdx[Sectors::gDw] = mFirstDataGDW; + mSectorFirstBlockIdx[Sectors::after] = mLastDataGDW; +} + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getGlobalIndex(const Idx& gidx) + const -> Neon::index_3d +{ + auto location = mOrigin[gidx.mDataBlockIdx]; + location.x += gidx.mInDataBlockIdx.x; + location.y += gidx.mInDataBlockIdx.y; + location.z += 
gidx.mInDataBlockIdx.z; + if constexpr (SBlock::isMultiResMode) { + return location * mMultiResDiscreteIdxSpacing; + } + return location; +} + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getDomainSize() + const -> Neon::index_3d +{ + return mDomainSize; +} + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getBlockViewIdx(const Idx& gidx) + const -> BlockViewGridIdx +{ + BlockViewGridIdx res; + res.manualSet(gidx.getDataBlockIdx()); + return res; +} + +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: + cardinality() + const -> int +{ + return mCardinality; +} + +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: +operator()(const Idx& cell, + int card) -> T& +{ + return mMem[helpGetPitch(cell, card)]; +} + +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: +operator()(const Idx& cell, + int card) const -> const T& +{ + return mMem[helpGetPitch(cell, card)]; +} + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + mem() const -> T const* +{ + return mMem; +} + +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: + helpGetPitch(const Idx& idx, int card) + const -> uint32_t +{ + uint32_t const pitch = helpGetValidIdxPitchExplicit(idx, card); + return pitch; +} + +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: + helpGetValidIdxPitchExplicit(const Idx& idx, int card) + const -> uint32_t +{ + // we are in the internal sector, so we have the standard AoSoA + if (idx.getDataBlockIdx() < mSectorFirstBlockIdx[Sectors::first]) { + + uint32_t constexpr blockPitchByCard = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ; + uint32_t const inBlockInCardPitch = idx.mInDataBlockIdx.x + + SBlock::memBlockSizeX * idx.mInDataBlockIdx.y + + (SBlock::memBlockSizeX * SBlock::memBlockSizeY) * idx.mInDataBlockIdx.z; + uint32_t const blockAdnCardPitch = (idx.mDataBlockIdx * mCardinality + card) * blockPitchByCard; + uint32_t const pitch = blockAdnCardPitch + inBlockInCardPitch; + return pitch; + } + 
+ // We switch to the other sector where we have a SoA + int sectorLenght = 0; + int myFirstBlock = 0; + + auto const myBlockIdx = idx.getDataBlockIdx(); + for (int mySector = 3; mySector >= 0; mySector--) { + if (myBlockIdx >= mSectorFirstBlockIdx[mySector]) { + sectorLenght = mSectorFirstBlockIdx[mySector + 1] - mSectorFirstBlockIdx[mySector]; + myFirstBlock = mSectorFirstBlockIdx[mySector]; + break; + } + } + int const denseX = (myBlockIdx - myFirstBlock) * SBlock::memBlockSizeX + idx.getInDataBlockIdx().x; + int const denseY = idx.mInDataBlockIdx.y; + int const denseZ = idx.mInDataBlockIdx.z; + + int const xStride = 1; + int const yStride = SBlock::memBlockSizeX * sectorLenght; + int const zStride = SBlock::memBlockSizeY * yStride; + + int const pitch = (SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ) * + card * + sectorLenght + + denseX * xStride + + denseY * yStride + + denseZ * zStride; + + int const fullPitch = myFirstBlock * + SBlock::memBlockSizeX * + SBlock::memBlockSizeY * + SBlock::memBlockSizeZ * + cardinality() + + pitch; + return fullPitch; +} + +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: + helpNghPitch(const Idx& nghIdx, int card) + const -> std::tuple +{ + if (nghIdx.mDataBlockIdx == Span::getInvalidBlockId()) { + return {false, 0}; + } + + const bool isActive = mMask[nghIdx.mDataBlockIdx].isActive(nghIdx.mInDataBlockIdx.x, nghIdx.mInDataBlockIdx.y, nghIdx.mInDataBlockIdx.z); + if (!isActive) { + return {false, 0}; + } + auto const offset = helpGetValidIdxPitchExplicit(nghIdx, card); + return {true, offset}; +} + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + helpGetNghIdx(const Idx& idx, + const NghIdx& offset) + const -> Idx +{ + return this->helpGetNghIdx(idx, offset, mBlockConnectivity); +} + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + helpGetNghIdx(const Idx& idx, + const NghIdx& offset, + const typename Idx::DataBlockIdx* blockConnectivity) + const -> Idx +{ + + typename 
Idx::InDataBlockIdx ngh(idx.mInDataBlockIdx.x + offset.x, + idx.mInDataBlockIdx.y + offset.y, + idx.mInDataBlockIdx.z + offset.z); + + /** + * 0 if no offset on the direction + * 1 positive offset + * -1 negative offset + */ + const int xFlag = ngh.x < 0 ? -1 : (ngh.x >= SBlock::memBlockSizeX ? +1 : 0); + const int yFlag = ngh.y < 0 ? -1 : (ngh.y >= SBlock::memBlockSizeX ? +1 : 0); + const int zFlag = ngh.z < 0 ? -1 : (ngh.z >= SBlock::memBlockSizeX ? +1 : 0); + + const bool isLocal = (xFlag | yFlag | zFlag) == 0; + if (!(isLocal)) { + typename Idx::InDataBlockIdx remoteInBlockOffset; + /** + * Example + * - 8 block (1D case) + * Case 1: + * |0,1,2,3|0,1,2,3|0,1,2,3| + * ^ ^ + * -3 starting point + * + * - idx.inBlock = 2 + * - offset = -1 + * - remote.x = (2-3) - ((-1) * 4) = -1 + 4 = 3 + * Case 2: + * |0,1,2,3|0,1,2,3|0,1,2,3| + * ^ ^ + * starting point +3 from 3 + * + * - idx.inBlock = 3 + * - offset = (+3,0) + * - remote.x = (7+3) - ((+1) * 8) = 10 - 8 = 2 + * + * |0,1,2,3|0,1,2,3|0,1,2,3| + * ^ ^ + * -3 from 0 +3 from 3 + * + * NOTE: if in one direction the neighbour offet is zero, xFalg is 0; + * */ + + Idx remoteNghIdx; + remoteNghIdx.mInDataBlockIdx.x = ngh.x - xFlag * SBlock::memBlockSizeX; + remoteNghIdx.mInDataBlockIdx.y = ngh.y - yFlag * SBlock::memBlockSizeX; + remoteNghIdx.mInDataBlockIdx.z = ngh.z - zFlag * SBlock::memBlockSizeX; + + int connectivityJump = idx.mDataBlockIdx * 27 + + (xFlag + 1) + + (yFlag + 1) * 3 + + (zFlag + 1) * 9; + remoteNghIdx.mDataBlockIdx = blockConnectivity[connectivityJump]; + + return remoteNghIdx; + } else { + Idx localNghIdx; + localNghIdx.mDataBlockIdx = idx.mDataBlockIdx; + localNghIdx.mInDataBlockIdx = ngh; + return localNghIdx; + } +} + +template +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + helpGetNghIdx(const Idx& idx) + const -> Idx +{ + return this->helpGetNghIdx(idx, mBlockConnectivity); +} + +template +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + helpGetNghIdx(const Idx& idx, const 
typename Idx::DataBlockIdx* blockConnectivity) + const -> Idx +{ + + typename Idx::InDataBlockIdx ngh(idx.mInDataBlockIdx.x + xOff, + idx.mInDataBlockIdx.y + yOff, + idx.mInDataBlockIdx.z + zOff); + + /** + * 0 if no offset on the direction + * 1 positive offset + * -1 negative offset + */ + const int xFlag = [&] { + if constexpr (xOff == 0) { + return 0; + } else { + return ngh.x < 0 ? -1 : (ngh.x >= SBlock::memBlockSizeX ? +1 : 0); + } + }(); + + + const int yFlag = [&] { + if constexpr (yOff == 0) { + return 0; + } else { + return ngh.y < 0 ? -1 : (ngh.y >= SBlock::memBlockSizeX ? +1 : 0); + } + }(); + const int zFlag = [&] { + if constexpr (zOff == 0) { + return 0; + } else { + return ngh.z < 0 ? -1 : (ngh.z >= SBlock::memBlockSizeX ? +1 : 0); + } + }(); + + const bool isLocal = (xFlag | yFlag | zFlag) == 0; + if (!(isLocal)) { + typename Idx::InDataBlockIdx remoteInBlockOffset; + /** + * Example + * - 8 block (1D case) + * Case 1: + * |0,1,2,3|0,1,2,3|0,1,2,3| + * ^ ^ + * -3 starting point + * + * - idx.inBlock = 2 + * - offset = -1 + * - remote.x = (2-3) - ((-1) * 4) = -1 + 4 = 3 + * Case 2: + * |0,1,2,3|0,1,2,3|0,1,2,3| + * ^ ^ + * starting point +3 from 3 + * + * - idx.inBlock = 3 + * - offset = (+3,0) + * - remote.x = (7+3) - ((+1) * 8) = 10 - 8 = 2 + * + * |0,1,2,3|0,1,2,3|0,1,2,3| + * ^ ^ + * -3 from 0 +3 from 3 + * + * NOTE: if in one direction the neighbour offet is zero, xFalg is 0; + * */ + + Idx remoteNghIdx; + remoteNghIdx.mInDataBlockIdx.x = ngh.x - xFlag * SBlock::memBlockSizeX; + remoteNghIdx.mInDataBlockIdx.y = ngh.y - yFlag * SBlock::memBlockSizeX; + remoteNghIdx.mInDataBlockIdx.z = ngh.z - zFlag * SBlock::memBlockSizeX; + + int connectivityJump = idx.mDataBlockIdx * 27 + + (xFlag + 1) + + (yFlag + 1) * 3 + + (zFlag + 1) * 9; + remoteNghIdx.mDataBlockIdx = blockConnectivity[connectivityJump]; + + return remoteNghIdx; + } else { + Idx localNghIdx; + localNghIdx.mDataBlockIdx = idx.mDataBlockIdx; + localNghIdx.mInDataBlockIdx = ngh; + return 
localNghIdx; + } +} + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getNghData(const Idx& eId, + uint8_t nghID, + int card) + const -> NghData +{ + NghIdx nghOffset = mStencilNghIndex[nghID]; + return getNghData(eId, nghOffset, card); +} + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getNghData(const Idx& idx, + const NghIdx& offset, + const int card) + const -> NghData +{ + NghData result; + bIndex nghIdx = helpGetNghIdx(idx, offset); + auto [isValid, pitch] = helpNghPitch(nghIdx, card); + if (!isValid) { + result.invalidate(); + return result; + } + auto const value = mMem[pitch]; + result.set(value, true); + return result; +} + +template +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getNghData(const Idx& idx, + int card) + const -> NghData +{ + NghData result; + bIndex nghIdx = helpGetNghIdx(idx); + auto [isValid, pitch] = helpNghPitch(nghIdx, card); + if (!isValid) { + result.invalidate(); + return result; + } + auto const value = mMem[pitch]; + result.set(value, true); + return result; +} + +template +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getNghData(const Idx& idx, + int card, + T defaultValue) + const -> NghData +{ + NghData result; + bIndex nghIdx = helpGetNghIdx(idx); + auto [isValid, pitch] = helpNghPitch(nghIdx, card); + if (!isValid) { + result.set(defaultValue, false); + return result; + } + auto const value = mMem[pitch]; + result.set(value, true); + return result; +} + +template + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getNghData(const Idx& gidx, + int card, + LambdaVALID funIfValid, + LambdaNOTValid funIfNOTValid) + const -> std::enable_if_t && (std::is_invocable_v || std::is_same_v), void> +{ + NghData result; + bIndex nghIdx = helpGetNghIdx(gidx); + auto [isValid, pitch] = helpNghPitch(nghIdx, card); + + if (isValid) { + auto const& value = mMem[pitch]; + funIfValid(value); + return; + } + + if constexpr (!std::is_same_v) { + funIfNOTValid(); + } + return; +} + 
+template +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + writeNghData(const Idx& gidx, + int card, + T value) + -> bool +{ + NghData result; + bIndex nghIdx = helpGetNghIdx(gidx); + auto [isValid, pitch] = helpNghPitch(nghIdx, card); + if (!isValid) { + return false; + } + mMem[pitch] = value; + return true; +} + +template +NEON_CUDA_HOST_DEVICE inline auto +bPartition::isActive(const Idx& cell, + const typename SBlock::BitMask* mask) const -> bool +{ + if (!mask) { + return mMask[cell.mDataBlockIdx].isActive(cell.mInDataBlockIdx.x, cell.mInDataBlockIdx.y, cell.mInDataBlockIdx.z); + } else { + return mask[cell.mDataBlockIdx].isActive(cell.mInDataBlockIdx.x, cell.mInDataBlockIdx.y, cell.mInDataBlockIdx.z); + } +} + +template +NEON_CUDA_HOST_DEVICE inline auto +bPartition::helpGetSectorFirstBlock(Sectors sector) const + -> typename Idx::DataBlockCount +{ + return mSectorFirstBlockIdx[sector]; +} + +} // namespace Neon::domain::details::bGridMgpu \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bSpan.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bSpan.h new file mode 100644 index 00000000..397133c8 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bSpan.h @@ -0,0 +1,55 @@ +#pragma once + +#include "Neon/domain/details/bGridDisgMgpu/bIndex.h" + +namespace Neon::domain::details::bGridMgpu { + +template +class bSpan +{ + public: + // bit mask information + using BitMaskWordType = uint64_t; + + static constexpr uint32_t bitMaskStorageBitWidth = 64; + static constexpr Neon::MemoryLayout activeMaskMemoryLayout = Neon::MemoryLayout::arrayOfStructs; + static constexpr uint32_t log2OfbitMaskWordSize = 6; + + using Idx = bIndex; + friend class bGrid; + + static constexpr int SpaceDim = 3; + + bSpan() = default; + virtual ~bSpan() = default; + + NEON_CUDA_HOST_DEVICE inline static auto getInvalidBlockId() + -> typename Idx::DataBlockIdx + { + return std::numeric_limits::max(); + 
} + + inline bSpan( + typename Idx::DataBlockCount mFirstDataBlockOffset, + typename SBlock::BitMask const* NEON_RESTRICT mActiveMask, + Neon::DataView mDataView); + + NEON_CUDA_HOST_DEVICE inline auto setAndValidateCPUDevice( + Idx& bidx, + uint32_t const& threadIdx, + uint32_t const& x, + uint32_t const& y, + uint32_t const& z) const -> bool; + + NEON_CUDA_HOST_DEVICE inline auto setAndValidateGPUDevice( + Idx& bidx) const -> bool; + + + // We don't need to have a count on active blocks + typename Idx::DataBlockCount mFirstDataBlockOffset; + typename SBlock::BitMask const* NEON_RESTRICT mActiveMask; + Neon::DataView mDataView; +}; +} // namespace Neon::domain::details::bGridMgpu + +#include "Neon/domain/details/bGridDisgMgpu/bSpan_imp.h" \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bSpan_imp.h b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bSpan_imp.h new file mode 100644 index 00000000..45e40eeb --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGridDisgMgpu/bSpan_imp.h @@ -0,0 +1,51 @@ +#include "Neon/domain/details/bGridDisgMgpu/bSpan.h" + +namespace Neon::domain::details::bGridMgpu { + +template +NEON_CUDA_HOST_DEVICE inline auto +bSpan::setAndValidateGPUDevice([[maybe_unused]] Idx& bidx) const -> bool +{ +#ifdef NEON_PLACE_CUDA_DEVICE + bidx.mDataBlockIdx = blockIdx.x + mFirstDataBlockOffset; + bidx.mInDataBlockIdx.x = threadIdx.x; + bidx.mInDataBlockIdx.y = threadIdx.y; + bidx.mInDataBlockIdx.z = threadIdx.z; + + const bool isActive = mActiveMask[bidx.mDataBlockIdx].isActive(bidx.mInDataBlockIdx.x, bidx.mInDataBlockIdx.y, bidx.mInDataBlockIdx.z); + + return isActive; +#else + NEON_THROW_UNSUPPORTED_OPERATION("Operation supported only on GPU"); +#endif +} + +template +NEON_CUDA_HOST_DEVICE inline auto +bSpan::setAndValidateCPUDevice(Idx& bidx, + uint32_t const& dataBlockIdx, + uint32_t const& x, + uint32_t const& y, + uint32_t const& z) const -> bool +{ + + bidx.mDataBlockIdx = 
dataBlockIdx + mFirstDataBlockOffset; + bidx.mInDataBlockIdx.x = static_cast(x); + bidx.mInDataBlockIdx.y = static_cast(y); + bidx.mInDataBlockIdx.z = static_cast(z); + const bool isActive = mActiveMask[dataBlockIdx].isActive(bidx.mInDataBlockIdx.x, bidx.mInDataBlockIdx.y, bidx.mInDataBlockIdx.z); + return isActive; +} + +template +bSpan::bSpan(typename Idx::DataBlockCount firstDataBlockOffset, + typename SBlock::BitMask const* NEON_RESTRICT activeMask, + Neon::DataView dataView) + : mFirstDataBlockOffset(firstDataBlockOffset), + mActiveMask(activeMask), + mDataView(dataView) +{ +} + + +} // namespace Neon::domain::details::bGridMgpu \ No newline at end of file diff --git a/libNeonDomain/src/domain/details/bGridDisgMgpu/bFieldReduceKernels.cu b/libNeonDomain/src/domain/details/bGridDisgMgpu/bFieldReduceKernels.cu new file mode 100644 index 00000000..e69de29b diff --git a/libNeonDomain/src/domain/details/bGridDisgMgpu/bGrid.cpp b/libNeonDomain/src/domain/details/bGridDisgMgpu/bGrid.cpp new file mode 100644 index 00000000..d207379b --- /dev/null +++ b/libNeonDomain/src/domain/details/bGridDisgMgpu/bGrid.cpp @@ -0,0 +1,7 @@ +#include "Neon/domain/details/bGridDisgMgpu/bGrid.h" + +namespace Neon::domain::details::bGridMgpu { + +template class bGrid>; + +} // namespace Neon::domain::details::bGridMgpu \ No newline at end of file diff --git a/libNeonDomain/tests/domain-map-disg/src/gtests.cpp b/libNeonDomain/tests/domain-map-disg/src/gtests.cpp index b5b77c56..3664f646 100644 --- a/libNeonDomain/tests/domain-map-disg/src/gtests.cpp +++ b/libNeonDomain/tests/domain-map-disg/src/gtests.cpp @@ -4,7 +4,7 @@ #include "map.h" #include "runHelper.h" -TEST(domain_map, bGridDisg) +TEST(domain_map_disg, bGridDisg) { int nGpus = 1; using Type = int64_t; @@ -13,7 +13,7 @@ TEST(domain_map, bGridDisg) 1); } -TEST(domain_map_dataView, bGridDisg) +TEST(domain_map_disg_dataView, bGridDisg) { int nGpus = 1; using Type = int64_t; diff --git a/libNeonDomain/tests/domain-map/src/gtests.cpp 
b/libNeonDomain/tests/domain-map/src/gtests.cpp index 6240ad66..f6977444 100644 --- a/libNeonDomain/tests/domain-map/src/gtests.cpp +++ b/libNeonDomain/tests/domain-map/src/gtests.cpp @@ -42,7 +42,7 @@ TEST(domain_map, bGrid) TEST(domain_map, dGridDisg) { - int nGpus = 1; + int nGpus = 3; using Type = int64_t; runAllTestConfiguration(std::function(map::run), nGpus, @@ -58,6 +58,16 @@ TEST(domain_map, dGridSoA) 1); } +TEST(domain_map, bGridMgpu) +{ + int nGpus = 3; + using Type = int64_t; + // extern template auto run(TestData&) -> void; + runAllTestConfiguration(std::function(map::run), + nGpus, + 1); +} + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/libNeonDomain/tests/domain-map/src/map.cu b/libNeonDomain/tests/domain-map/src/map.cu index 810cb067..d2dc5e5f 100644 --- a/libNeonDomain/tests/domain-map/src/map.cu +++ b/libNeonDomain/tests/domain-map/src/map.cu @@ -1,7 +1,7 @@ #include #include "Neon/domain/Grids.h" -#include "Neon/domain/details/dGridDisg/dGrid.h" #include "Neon/domain/details/bGridDisg/bGrid.h" +#include "Neon/domain/details/dGridDisg/dGrid.h" #include "Neon/domain/details/dGridSoA/dGridSoA.h" #include "Neon/domain/tools/TestData.h" @@ -36,15 +36,15 @@ auto mapContainer_axpy(int streamIdx, template auto mapContainer_add(int streamIdx, - typename Field::Type& val, - Field& fieldB) + typename Field::Type& val, + Field& fieldB) -> Neon::set::Container { const auto& grid = fieldB.getGrid(); return grid.newContainer( "mapContainer_axpy", [&, val](Neon::set::Loader& loader) { - auto b = loader.load(fieldB); + auto b = loader.load(fieldB); return [=] NEON_CUDA_HOST_DEVICE(const typename Field::Idx& e) mutable { for (int i = 0; i < b.cardinality(); i++) { @@ -101,6 +101,7 @@ template auto run(TestData&) - template auto run(TestData&) -> void; template auto run(TestData&) -> void; template auto run(TestData&) -> void; +template auto run(TestData&) -> void; namespace dataView { template @@ -151,6 +152,7 @@ template 
auto run(TestData&) - template auto run(TestData&) -> void; template auto run(TestData&) -> void; template auto run(TestData&) -> void; +template auto run(TestData&) -> void; } // namespace dataView } // namespace map \ No newline at end of file diff --git a/libNeonDomain/tests/domain-map/src/map.h b/libNeonDomain/tests/domain-map/src/map.h index f6f8bc7c..73038353 100644 --- a/libNeonDomain/tests/domain-map/src/map.h +++ b/libNeonDomain/tests/domain-map/src/map.h @@ -19,6 +19,7 @@ extern template auto run(TestData(TestData&) -> void; extern template auto run(TestData&) -> void; extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; namespace dataView { @@ -30,6 +31,7 @@ extern template auto run(TestData(TestData&) -> void; extern template auto run(TestData&) -> void; extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; } // namespace dataView diff --git a/libNeonDomain/tests/domain-map/src/runHelper.h b/libNeonDomain/tests/domain-map/src/runHelper.h index 593e31c2..4c622f82 100644 --- a/libNeonDomain/tests/domain-map/src/runHelper.h +++ b/libNeonDomain/tests/domain-map/src/runHelper.h @@ -73,7 +73,7 @@ void runAllTestConfiguration( Neon::Backend backend(ids, runtime); Neon::MemoryOptions memoryOptions = backend.getMemoryOptions(); - if constexpr (std::is_same_v) { + if constexpr (G::executionThreadSpan == Neon::set::details::ExecutionThreadSpan::d1b3) { if (dim.z < 8 * ngpu * 3) { dim.z = ngpu * 3 * 8; }