diff --git a/MoltenVK/MoltenVK/API/mvk_datatypes.h b/MoltenVK/MoltenVK/API/mvk_datatypes.h
index 4d109957f..98a5f9910 100644
--- a/MoltenVK/MoltenVK/API/mvk_datatypes.h
+++ b/MoltenVK/MoltenVK/API/mvk_datatypes.h
@@ -333,6 +333,14 @@ typedef enum {
     kMVKShaderStageMax = kMVKShaderStageCount    // Public API legacy value
 } MVKShaderStage;
 
+typedef enum {
+    kMVKBarrierStageVertex = 0,
+    kMVKBarrierStageFragment,
+    kMVKBarrierStageCompute,
+    kMVKBarrierStageCopy,
+    kMVKBarrierStageCount
+} MVKBarrierStage;
+
 /** Returns the Metal MTLColorWriteMask corresponding to the specified Vulkan VkColorComponentFlags. */
 MTLColorWriteMask mvkMTLColorWriteMaskFromVkChannelFlags(VkColorComponentFlags vkWriteFlags);

diff --git a/MoltenVK/MoltenVK/API/mvk_private_api.h b/MoltenVK/MoltenVK/API/mvk_private_api.h
index c50ae1bbe..f7088ce0b 100644
--- a/MoltenVK/MoltenVK/API/mvk_private_api.h
+++ b/MoltenVK/MoltenVK/API/mvk_private_api.h
@@ -358,6 +358,7 @@ typedef struct {
     VkBool32 needsCubeGradWorkaround;        /**< If true, sampling from cube textures with explicit gradients is broken and needs a workaround. */
     VkBool32 nativeTextureAtomics;           /**< If true, atomic operations on textures are supported natively. */
     VkBool32 needsArgumentBufferEncoders;    /**< If true, Metal argument buffer encoders are needed to populate argument buffer content. */
+    VkBool32 residencySets;                  /**< If true, the device supports creating residency sets. */
 } MVKPhysicalDeviceMetalFeatures;

diff --git a/MoltenVK/MoltenVK/Commands/MVKCmdPipeline.mm b/MoltenVK/MoltenVK/Commands/MVKCmdPipeline.mm
index ac92eb2c3..ccfe8481f 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCmdPipeline.mm
+++ b/MoltenVK/MoltenVK/Commands/MVKCmdPipeline.mm
@@ -79,6 +79,35 @@
     return VK_SUCCESS;
 }
 
+static uint64_t mvkPipelineStageFlagsToBarrierStages(VkPipelineStageFlags2 flags) {
+    uint64_t result = 0;
+
+    if (mvkIsAnyFlagEnabled(flags, VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT | VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
+                                   VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT | VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
+                                   VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT | VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT |
+                                   VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT | VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT | VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT |
+                                   VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT))
+        result |= 1 << kMVKBarrierStageVertex;
+
+    if (mvkIsAnyFlagEnabled(flags, VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
+                                   VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT |
+                                   VK_PIPELINE_STAGE_2_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR | VK_PIPELINE_STAGE_2_FRAGMENT_DENSITY_PROCESS_BIT_EXT |
+                                   VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT))
+        result |= 1 << kMVKBarrierStageFragment;
+
+    if (mvkIsAnyFlagEnabled(flags, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT | VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT |
+                                   VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT))
+        result |= 1 << kMVKBarrierStageCompute;
+
+    if (mvkIsAnyFlagEnabled(flags, VK_PIPELINE_STAGE_2_BLIT_BIT | VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
+                                   VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT | VK_PIPELINE_STAGE_2_TRANSFER_BIT | VK_PIPELINE_STAGE_2_RESOLVE_BIT |
+                                   VK_PIPELINE_STAGE_2_CLEAR_BIT | VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_COPY_BIT_KHR | VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT |
+                                   VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT))
+        result |= 1 << kMVKBarrierStageCopy;
+
+    return result;
+}
+
 template <size_t N>
 VkResult MVKCmdPipelineBarrier<N>::setContent(MVKCommandBuffer* cmdBuff,
                                               VkPipelineStageFlags srcStageMask,
@@ -157,6 +186,15 @@
     }
 #endif
 
+    if (!cmdEncoder->_mtlRenderEncoder && cmdEncoder->isUsingMetalArgumentBuffers() && cmdEncoder->getDevice()->hasResidencySet()) {
+        cmdEncoder->endCurrentMetalEncoding();
+
+        for (auto& b : _barriers) {
+            uint64_t sourceStageMask = mvkPipelineStageFlagsToBarrierStages(b.srcStageMask), destStageMask = mvkPipelineStageFlagsToBarrierStages(b.dstStageMask);
+            cmdEncoder->setBarrier(sourceStageMask, destStageMask);
+        }
+    }
+
     // Apple GPUs do not support renderpass barriers, and do not support rendering/writing
     // to an attachment and then reading from that attachment within a single renderpass.
     // So, in the case where we are inside a Metal renderpass, we need to split those activities
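Reviewer context, not part of the patch: mvkPipelineStageFlagsToBarrierStages() collapses the fine-grained VkPipelineStageFlags2 bits into the four coarse MVKBarrierStage bits that the fence machinery below tracks. A minimal, self-contained sketch of the resulting masks, using stand-in constants rather than the real Vulkan/MoltenVK definitions:

```cpp
#include <cstdint>
#include <cstdio>

// Stand-ins for the MVKBarrierStage values added in mvk_datatypes.h above.
enum { kVertex, kFragment, kCompute, kCopy };

int main() {
    // A compute -> transfer barrier (srcStageMask = COMPUTE_SHADER,
    // dstStageMask = TRANSFER) reduces to one bit per side:
    uint64_t src = 1ull << kCompute;
    uint64_t dst = 1ull << kCopy;

    // ALL_COMMANDS and TOP/BOTTOM_OF_PIPE appear in every clause of the
    // mapping above, so they reduce to all four stage bits:
    uint64_t all = (1ull << kVertex) | (1ull << kFragment) |
                   (1ull << kCompute) | (1ull << kCopy);

    printf("src=0x%llx dst=0x%llx all=0x%llx\n",
           (unsigned long long)src, (unsigned long long)dst, (unsigned long long)all);
    return 0;
}
```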
diff --git a/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm b/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm
index bd73322dd..565556d68 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm
+++ b/MoltenVK/MoltenVK/Commands/MVKCmdTransfer.mm
@@ -617,6 +617,8 @@ static inline MTLSize mvkClampMTLSize(MTLSize size, MTLOrigin origin, MTLSize ma
         id<MTLRenderCommandEncoder> mtlRendEnc = [cmdEncoder->_mtlCmdBuffer renderCommandEncoderWithDescriptor: mtlRPD];
         cmdEncoder->_cmdBuffer->setMetalObjectLabel(mtlRendEnc, mvkMTLRenderCommandEncoderLabel(commandUse));
 
+        cmdEncoder->barrierWait(kMVKBarrierStageCopy, mtlRendEnc, MTLRenderStageFragment);
+
         float zIncr;
         if (blitKey.srcMTLTextureType == MTLTextureType3D) {
             // In this case, I need to interpolate along the third dimension manually.
@@ -678,7 +680,10 @@ static inline MTLSize mvkClampMTLSize(MTLSize size, MTLOrigin origin, MTLSize ma
                 NSUInteger instanceCount = isLayeredBlit ? mtlRPD.renderTargetArrayLengthMVK : 1;
                 [mtlRendEnc drawPrimitives: MTLPrimitiveTypeTriangleStrip
                                vertexStart: 0
                                vertexCount: kMVKBlitVertexCount
                              instanceCount: instanceCount];
-                [mtlRendEnc popDebugGroup];
+
+                cmdEncoder->barrierUpdate(kMVKBarrierStageCopy, mtlRendEnc, MTLRenderStageFragment);
+
+                [mtlRendEnc popDebugGroup];
                 [mtlRendEnc endEncoding];
             }
         }
@@ -897,6 +902,10 @@ static inline MTLSize mvkClampMTLSize(MTLSize size, MTLOrigin origin, MTLSize ma
             [mtlRendEnc pushDebugGroup: @"vkCmdResolveImage"];
             [mtlRendEnc popDebugGroup];
+
+            cmdEncoder->barrierWait(kMVKBarrierStageCopy, mtlRendEnc, MTLRenderStageFragment);
+            cmdEncoder->barrierUpdate(kMVKBarrierStageCopy, mtlRendEnc, MTLRenderStageFragment);
+
             [mtlRendEnc endEncoding];
         }
     }
@@ -1700,6 +1709,10 @@ static inline MTLSize mvkClampMTLSize(MTLSize size, MTLOrigin origin, MTLSize ma
         id<MTLRenderCommandEncoder> mtlRendEnc = [cmdEncoder->_mtlCmdBuffer renderCommandEncoderWithDescriptor: mtlRPDesc];
         cmdEncoder->_cmdBuffer->setMetalObjectLabel(mtlRendEnc, mtlRendEncName);
+
+        cmdEncoder->barrierWait(kMVKBarrierStageCopy, mtlRendEnc, MTLRenderStageFragment);
+        cmdEncoder->barrierUpdate(kMVKBarrierStageCopy, mtlRendEnc, MTLRenderStageFragment);
+
         [mtlRendEnc endEncoding];
     } else {
         for (uint32_t layer = layerStart; layer < layerEnd; layer++) {
@@ -1715,7 +1728,11 @@ static inline MTLSize mvkClampMTLSize(MTLSize size, MTLOrigin origin, MTLSize ma
             id<MTLRenderCommandEncoder> mtlRendEnc = [cmdEncoder->_mtlCmdBuffer renderCommandEncoderWithDescriptor: mtlRPDesc];
             cmdEncoder->_cmdBuffer->setMetalObjectLabel(mtlRendEnc, mtlRendEncName);
-            [mtlRendEnc endEncoding];
+
+            cmdEncoder->barrierWait(kMVKBarrierStageCopy, mtlRendEnc, MTLRenderStageFragment);
+            cmdEncoder->barrierUpdate(kMVKBarrierStageCopy, mtlRendEnc, MTLRenderStageFragment);
+
+            [mtlRendEnc endEncoding];
         }
     }
 }
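The transfer-side changes above all follow one pairing: each internal render pass that implements a copy-family command waits on the Copy-stage fences before its fragment work, and signals the Copy-stage fence afterward, so it orders correctly against surrounding copy/blit commands. Condensed for review (hypothetical encoder variables; not compilable on its own):

```objc
// The recurring pattern in the MVKCmdTransfer.mm hunks above:
id<MTLRenderCommandEncoder> enc = [mtlCmdBuffer renderCommandEncoderWithDescriptor: rpDesc];
cmdEncoder->barrierWait(kMVKBarrierStageCopy, enc, MTLRenderStageFragment);   // order after earlier copy work
// ... encode the blit/resolve/clear draws ...
cmdEncoder->barrierUpdate(kMVKBarrierStageCopy, enc, MTLRenderStageFragment); // publish this pass as copy work
[enc endEncoding];
```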
diff --git a/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.h b/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.h
index a3d190f75..25e9def23 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.h
+++ b/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.h
@@ -402,6 +402,27 @@ class MVKCommandEncoder : public MVKBaseDeviceObject {
     /** Returns the command encoding pool. */
     MVKCommandEncodingPool* getCommandEncodingPool();
 
+#pragma mark Barriers
+
+    /** Encodes waits in the current command encoder for the stage that corresponds to the given use. */
+    void encodeBarrierWaits(MVKCommandUse use);
+
+    /** Updates fences for the currently executing pipeline stage. */
+    void encodeBarrierUpdates();
+
+    /** Inserts a new execution barrier. */
+    void setBarrier(uint64_t sourceStageMask, uint64_t destStageMask);
+
+    /** Encodes waits for a specific stage in the given encoder. */
+    void barrierWait(MVKBarrierStage stage, id<MTLRenderCommandEncoder> mtlEncoder, MTLRenderStages beforeStages);
+    void barrierWait(MVKBarrierStage stage, id<MTLBlitCommandEncoder> mtlEncoder);
+    void barrierWait(MVKBarrierStage stage, id<MTLComputeCommandEncoder> mtlEncoder);
+
+    /** Encodes an update for a specific stage in the given encoder. */
+    void barrierUpdate(MVKBarrierStage stage, id<MTLRenderCommandEncoder> mtlEncoder, MTLRenderStages afterStages);
+    void barrierUpdate(MVKBarrierStage stage, id<MTLBlitCommandEncoder> mtlEncoder);
+    void barrierUpdate(MVKBarrierStage stage, id<MTLComputeCommandEncoder> mtlEncoder);
+
 #pragma mark Queries
 
     /** Begins an occlusion query. */
@@ -484,6 +505,7 @@ class MVKCommandEncoder : public MVKBaseDeviceObject {
     NSString* getMTLRenderCommandEncoderName(MVKCommandUse cmdUse);
     template<typename T> void retainIfImmediatelyEncoding(T& mtlEnc);
     template<typename T> void endMetalEncoding(T& mtlEnc);
+    id<MTLFence> getBarrierStageFence(MVKBarrierStage stage);
 
     typedef struct GPUCounterQuery {
         MVKGPUCounterQueryPool* queryPool = nullptr;
@@ -511,6 +533,9 @@ class MVKCommandEncoder : public MVKBaseDeviceObject {
     uint32_t _flushCount;
     MVKCommandUse _mtlComputeEncoderUse;
     MVKCommandUse _mtlBlitEncoderUse;
+    uint32_t _updateFenceSlotDirtyBits = ~0;
+    int _updateFenceSlots[kMVKBarrierStageCount] = {};
+    int _waitFenceSlots[kMVKBarrierStageCount][kMVKBarrierStageCount] = {};
     bool _isRenderingEntireAttachment;
 };
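A hedged reading of the new encoder state, inferred from the implementation that follows (not authoritative):

```cpp
// _updateFenceSlots[s]      — index, into the device's per-stage fence ring, of the
//                             fence that stage s will signal next.
// _waitFenceSlots[d][s]     — index of the stage-s fence that stage-d work must wait on.
// _updateFenceSlotDirtyBits — stages whose slot must rotate to a fresh fence before the
//                             next update, so a fence is not signaled a second time.
//
// setBarrier(src, dst) copies _updateFenceSlots[s] into _waitFenceSlots[d][s] for each
// (s, d) pair present in the masks, then marks each source stage dirty.
```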
diff --git a/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.mm b/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.mm
index f79cf62df..19e922dd9 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.mm
+++ b/MoltenVK/MoltenVK/Commands/MVKCommandBuffer.mm
@@ -390,8 +390,21 @@
 }
 
 void MVKCommandEncoder::endEncoding() {
-    endCurrentMetalEncoding();
-    finishQueries();
+    endCurrentMetalEncoding();
+    finishQueries();
+
+    // Synchronize all stages to their fences at index 0, which will be waited on in the next command buffer.
+    if (isUsingMetalArgumentBuffers()) {
+        for (int i = 0; i < kMVKBarrierStageCount; ++i) {
+            auto fenceIndex = _updateFenceSlots[i];
+            if (!fenceIndex) continue;
+
+            auto encoder = [_mtlCmdBuffer blitCommandEncoder];
+            [encoder waitForFence: getDevice()->getFence((MVKBarrierStage)i, fenceIndex)];
+            [encoder updateFence: getDevice()->getFence((MVKBarrierStage)i, 0)];
+            [encoder endEncoding];
+        }
+    }
 }
 
 void MVKCommandEncoder::encodeSecondary(MVKCommandBuffer* secondaryCmdBuffer) {
@@ -486,6 +499,7 @@
                                        uint32_t subpassIndex,
                                        MVKCommandUse cmdUse) {
     encodeStoreActions();
+    encodeBarrierUpdates();
     endMetalRenderEncoding();
 
     _lastMultiviewPassCmd = subpassCmd;
@@ -521,6 +535,161 @@
     mtlEnc = nil;
 }
 
+static MVKBarrierStage commandUseToBarrierStage(MVKCommandUse use) {
+    switch (use) {
+        case kMVKCommandUseNone: return kMVKBarrierStageCount;                         /**< No use defined. */
+        case kMVKCommandUseBeginCommandBuffer: return kMVKBarrierStageCount;           /**< vkBeginCommandBuffer (prefilled VkCommandBuffer). */
+        case kMVKCommandUseQueueSubmit: return kMVKBarrierStageCount;                  /**< vkQueueSubmit. */
+        case kMVKCommandUseAcquireNextImage: return kMVKBarrierStageCount;             /**< vkAcquireNextImageKHR. */
+        case kMVKCommandUseQueuePresent: return kMVKBarrierStageCount;                 /**< vkQueuePresentKHR. */
+        case kMVKCommandUseQueueWaitIdle: return kMVKBarrierStageCount;                /**< vkQueueWaitIdle. */
+        case kMVKCommandUseDeviceWaitIdle: return kMVKBarrierStageCount;               /**< vkDeviceWaitIdle. */
+        case kMVKCommandUseInvalidateMappedMemoryRanges: return kMVKBarrierStageCount; /**< vkInvalidateMappedMemoryRanges. */
+        case kMVKCommandUseBeginRendering: return kMVKBarrierStageCount;               /**< vkCmdBeginRendering. */
+        case kMVKCommandUseBeginRenderPass: return kMVKBarrierStageCount;              /**< vkCmdBeginRenderPass. */
+        case kMVKCommandUseNextSubpass: return kMVKBarrierStageCount;                  /**< vkCmdNextSubpass. */
+        case kMVKCommandUseRestartSubpass: return kMVKBarrierStageCount;               /**< Create a new Metal renderpass due to Metal requirements. */
+        case kMVKCommandUsePipelineBarrier: return kMVKBarrierStageCount;              /**< vkCmdPipelineBarrier. */
+        case kMVKCommandUseBlitImage: return kMVKBarrierStageCopy;                     /**< vkCmdBlitImage. */
+        case kMVKCommandUseCopyImage: return kMVKBarrierStageCopy;                     /**< vkCmdCopyImage. */
+        case kMVKCommandUseResolveImage: return kMVKBarrierStageCopy;                  /**< vkCmdResolveImage - resolve stage. */
+        case kMVKCommandUseResolveExpandImage: return kMVKBarrierStageCopy;            /**< vkCmdResolveImage - expand stage. */
+        case kMVKCommandUseResolveCopyImage: return kMVKBarrierStageCopy;              /**< vkCmdResolveImage - copy stage. */
+        case kMVKCommandUseCopyImageToMemory: return kMVKBarrierStageCopy;             /**< vkCopyImageToMemoryEXT host sync. */
+        case kMVKCommandUseCopyBuffer: return kMVKBarrierStageCopy;                    /**< vkCmdCopyBuffer. */
+        case kMVKCommandUseCopyBufferToImage: return kMVKBarrierStageCopy;             /**< vkCmdCopyBufferToImage. */
+        case kMVKCommandUseCopyImageToBuffer: return kMVKBarrierStageCopy;             /**< vkCmdCopyImageToBuffer. */
+        case kMVKCommandUseFillBuffer: return kMVKBarrierStageCopy;                    /**< vkCmdFillBuffer. */
+        case kMVKCommandUseUpdateBuffer: return kMVKBarrierStageCopy;                  /**< vkCmdUpdateBuffer. */
+        case kMVKCommandUseClearAttachments: return kMVKBarrierStageCount;             /**< vkCmdClearAttachments. */
+        case kMVKCommandUseClearColorImage: return kMVKBarrierStageCopy;               /**< vkCmdClearColorImage. */
+        case kMVKCommandUseClearDepthStencilImage: return kMVKBarrierStageCopy;        /**< vkCmdClearDepthStencilImage. */
+        case kMVKCommandUseResetQueryPool: return kMVKBarrierStageCopy;                /**< vkCmdResetQueryPool. */
+        case kMVKCommandUseDispatch: return kMVKBarrierStageCompute;                   /**< vkCmdDispatch. */
+        case kMVKCommandUseTessellationVertexTessCtl: return kMVKBarrierStageVertex;   /**< vkCmdDraw* - vertex and tessellation control stages. */
+        case kMVKCommandUseDrawIndirectConvertBuffers: return kMVKBarrierStageVertex;  /**< vkCmdDrawIndirect* convert indirect buffers. */
+        case kMVKCommandUseCopyQueryPoolResults: return kMVKBarrierStageCopy;          /**< vkCmdCopyQueryPoolResults. */
+        case kMVKCommandUseAccumOcclusionQuery: return kMVKBarrierStageCount;          /**< Any command terminating a Metal render pass with active visibility buffer. */
+        case kMVKCommandUseRecordGPUCounterSample: return kMVKBarrierStageCount;       /**< Any command triggering the recording of a GPU counter sample. */
+    }
+}
+
+void MVKCommandEncoder::barrierWait(MVKBarrierStage stage, id<MTLRenderCommandEncoder> mtlEncoder, MTLRenderStages beforeStages) {
+    if (!isUsingMetalArgumentBuffers() || !getDevice()->hasResidencySet()) return;
+    for (int i = 0; i < kMVKBarrierStageCount; ++i) {
+        auto fenceIndex = _waitFenceSlots[stage][i];
+        auto fence = _device->getFence((MVKBarrierStage)i, fenceIndex);
+        [mtlEncoder waitForFence: fence beforeStages: beforeStages];
+    }
+}
+
+void MVKCommandEncoder::barrierWait(MVKBarrierStage stage, id<MTLBlitCommandEncoder> mtlEncoder) {
+    if (!isUsingMetalArgumentBuffers() || !getDevice()->hasResidencySet()) return;
+    for (int i = 0; i < kMVKBarrierStageCount; ++i) {
+        auto fenceIndex = _waitFenceSlots[stage][i];
+        auto fence = _device->getFence((MVKBarrierStage)i, fenceIndex);
+        [mtlEncoder waitForFence: fence];
+    }
+}
+
+void MVKCommandEncoder::barrierWait(MVKBarrierStage stage, id<MTLComputeCommandEncoder> mtlEncoder) {
+    if (!isUsingMetalArgumentBuffers() || !getDevice()->hasResidencySet()) return;
+    for (int i = 0; i < kMVKBarrierStageCount; ++i) {
+        auto fenceIndex = _waitFenceSlots[stage][i];
+        auto fence = _device->getFence((MVKBarrierStage)i, fenceIndex);
+        [mtlEncoder waitForFence: fence];
+    }
+}
+
+void MVKCommandEncoder::barrierUpdate(MVKBarrierStage stage, id<MTLRenderCommandEncoder> mtlEncoder, MTLRenderStages afterStages) {
+    if (!isUsingMetalArgumentBuffers() || !getDevice()->hasResidencySet()) return;
+    auto fence = getBarrierStageFence(stage);
+    [mtlEncoder updateFence: fence afterStages: afterStages];
+}
+
+void MVKCommandEncoder::barrierUpdate(MVKBarrierStage stage, id<MTLBlitCommandEncoder> mtlEncoder) {
+    if (!isUsingMetalArgumentBuffers() || !getDevice()->hasResidencySet()) return;
+    auto fence = getBarrierStageFence(stage);
+    [mtlEncoder updateFence: fence];
+}
+
+void MVKCommandEncoder::barrierUpdate(MVKBarrierStage stage, id<MTLComputeCommandEncoder> mtlEncoder) {
+    if (!isUsingMetalArgumentBuffers() || !getDevice()->hasResidencySet()) return;
+    auto fence = getBarrierStageFence(stage);
+    [mtlEncoder updateFence: fence];
+}
+
+id<MTLFence> MVKCommandEncoder::getBarrierStageFence(MVKBarrierStage stage) {
+    if (mvkAreAllFlagsEnabled(_updateFenceSlotDirtyBits, 1 << stage)) {
+        mvkDisableFlags(_updateFenceSlotDirtyBits, 1 << stage);
+
+        _updateFenceSlots[stage] = (_updateFenceSlots[stage] + 1) % kMVKBarrierFenceCount;
+        if (_updateFenceSlots[stage] == 0) _updateFenceSlots[stage] = 1;
+    }
+
+    return _device->getFence(stage, _updateFenceSlots[stage]);
+}
+
+void MVKCommandEncoder::setBarrier(uint64_t sourceStageMask, uint64_t destStageMask) {
+    for (int i = 0; i < kMVKBarrierStageCount; ++i) {
+        if (!mvkIsAnyFlagEnabled(sourceStageMask, 1ull << i)) continue;
+
+        for (int j = 0; j < kMVKBarrierStageCount; ++j) {
+            if (!mvkIsAnyFlagEnabled(destStageMask, 1ull << j)) continue;
+
+            _waitFenceSlots[j][i] = _updateFenceSlots[i];
+        }
+
+        _waitFenceSlots[i][i] = _updateFenceSlots[i];
+        mvkEnableFlags(_updateFenceSlotDirtyBits, 1 << i);
+    }
+}
+
+void MVKCommandEncoder::encodeBarrierWaits(MVKCommandUse use) {
+    if (_mtlRenderEncoder) {
+        [_mtlRenderEncoder insertDebugSignpost: @"Encoding waits"];
+        barrierWait(kMVKBarrierStageVertex, _mtlRenderEncoder, MTLRenderStageVertex);
+        barrierWait(kMVKBarrierStageFragment, _mtlRenderEncoder, MTLRenderStageFragment);
+    }
+    if (_mtlComputeEncoder) {
+        auto stage = commandUseToBarrierStage(use);
+        if (stage != kMVKBarrierStageCount) {
+            barrierWait(stage, _mtlComputeEncoder);
+        }
+    }
+    if (_mtlBlitEncoder) {
+        auto stage = commandUseToBarrierStage(use);
+        if (stage != kMVKBarrierStageCount) {
+            barrierWait(stage, _mtlBlitEncoder);
+        }
+    }
+}
+
+void MVKCommandEncoder::encodeBarrierUpdates() {
+    if (_mtlRenderEncoder) {
+        barrierUpdate(kMVKBarrierStageVertex, _mtlRenderEncoder, MTLRenderStageVertex);
+        barrierUpdate(kMVKBarrierStageFragment, _mtlRenderEncoder, MTLRenderStageFragment);
+    }
+
+    if (_mtlComputeEncoder) {
+        MVKBarrierStage stage = commandUseToBarrierStage(_mtlComputeEncoderUse);
+        if (stage != kMVKBarrierStageCount) {
+            barrierUpdate(stage, _mtlComputeEncoder);
+        }
+    }
+
+    if (_mtlBlitEncoder) {
+        MVKBarrierStage stage = commandUseToBarrierStage(_mtlBlitEncoderUse);
+        if (stage != kMVKBarrierStageCount) {
+            barrierUpdate(stage, _mtlBlitEncoder);
+        }
+    }
+}
+
 // Creates _mtlRenderEncoder and marks cached render state as dirty so it will be set into the _mtlRenderEncoder.
 void MVKCommandEncoder::beginMetalRenderPass(MVKCommandUse cmdUse) {
@@ -584,6 +753,8 @@
     retainIfImmediatelyEncoding(_mtlRenderEncoder);
     _cmdBuffer->setMetalObjectLabel(_mtlRenderEncoder, getMTLRenderCommandEncoderName(cmdUse));
 
+    encodeBarrierWaits(cmdUse);
+
     // We shouldn't clear the render area if we are restarting the Metal renderpass
     // separately from a Vulkan subpass, and we otherwise only need to clear render
     // area if we're not rendering to the entire attachment.
@@ -787,6 +958,7 @@
     }
     encodeStoreActions();
+    encodeBarrierUpdates();
     endMetalRenderEncoding();
     if ( !mvkIsAnyFlagEnabled(_pEncodingContext->getRenderingFlags(), VK_RENDERING_SUSPENDING_BIT) ) {
         _pEncodingContext->setRenderingContext(nullptr, nullptr);
@@ -815,6 +987,7 @@
 }
 
 void MVKCommandEncoder::endCurrentMetalEncoding() {
+    encodeBarrierUpdates();
     endMetalRenderEncoding();
 
     _computePipelineState.markDirty();
@@ -833,7 +1006,9 @@
 }
 
 id<MTLComputeCommandEncoder> MVKCommandEncoder::getMTLComputeEncoder(MVKCommandUse cmdUse, bool markCurrentComputeStateDirty) {
+    bool needWaits = false;
     if ( !_mtlComputeEncoder ) {
+        needWaits = true;
         endCurrentMetalEncoding();
         _mtlComputeEncoder = [_mtlCmdBuffer computeCommandEncoder];
         retainIfImmediatelyEncoding(_mtlComputeEncoder);
@@ -846,22 +1021,32 @@
         _computeResourcesState.markDirty();
     }
     if (_mtlComputeEncoderUse != cmdUse) {
+        needWaits = true;
         _mtlComputeEncoderUse = cmdUse;
         _cmdBuffer->setMetalObjectLabel(_mtlComputeEncoder, mvkMTLComputeCommandEncoderLabel(cmdUse));
     }
+    if (needWaits) {
+        encodeBarrierWaits(cmdUse);
+    }
     return _mtlComputeEncoder;
 }
 
 id<MTLBlitCommandEncoder> MVKCommandEncoder::getMTLBlitEncoder(MVKCommandUse cmdUse) {
+    bool needWaits = false;
     if ( !_mtlBlitEncoder ) {
+        needWaits = true;
        endCurrentMetalEncoding();
         _mtlBlitEncoder = [_mtlCmdBuffer blitCommandEncoder];
         retainIfImmediatelyEncoding(_mtlBlitEncoder);
     }
     if (_mtlBlitEncoderUse != cmdUse) {
+        needWaits = true;
         _mtlBlitEncoderUse = cmdUse;
         _cmdBuffer->setMetalObjectLabel(_mtlBlitEncoder, mvkMTLBlitCommandEncoderLabel(cmdUse));
     }
+    if (needWaits) {
+        encodeBarrierWaits(cmdUse);
+    }
     return _mtlBlitEncoder;
 }
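One subtlety worth calling out: getBarrierStageFence() rotates through the per-stage fence ring but never returns to slot 0, because endEncoding() reserves slot 0 for the handoff that the next command buffer waits on. A standalone sketch of just that rotation (the constant is copied from MVKDevice.h below):

```cpp
#include <cstdio>

static const int kMVKBarrierFenceCount = 64;  // same value as MVKDevice.h

// Mirrors the rotation in MVKCommandEncoder::getBarrierStageFence():
// slot 0 is reserved for the cross-command-buffer handoff, so skip it.
static int nextSlot(int slot) {
    slot = (slot + 1) % kMVKBarrierFenceCount;
    return slot == 0 ? 1 : slot;
}

int main() {
    int slot = 0;
    for (int i = 0; i < 2 * kMVKBarrierFenceCount; ++i) slot = nextSlot(slot);
    printf("slot=%d\n", slot);  // cycles through 1..63, never lands on 0
    return 0;
}
```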
diff --git a/MoltenVK/MoltenVK/Commands/MVKCommandEncoderState.mm b/MoltenVK/MoltenVK/Commands/MVKCommandEncoderState.mm
index 7a0de21ae..d6687f069 100644
--- a/MoltenVK/MoltenVK/Commands/MVKCommandEncoderState.mm
+++ b/MoltenVK/MoltenVK/Commands/MVKCommandEncoderState.mm
@@ -681,6 +681,7 @@ - (void)setDepthBoundsTestAMD:(BOOL)enable minDepth:(float)minDepth maxDepth:(fl
             auto* dslBind = dsLayout->getBindingAt(dslBindIdx);
             if (dslBind->getApplyToStage(stage) && shaderBindingUsage.getBit(dslBindIdx)) {
                 shouldBindArgBuffToStage = true;
+                if (getDevice()->hasResidencySet()) continue;  // resources are already resident; skip per-descriptor tracking
                 uint32_t elemCnt = dslBind->getDescriptorCount(descSet->getVariableDescriptorCount());
                 for (uint32_t elemIdx = 0; elemIdx < elemCnt; elemIdx++) {
                     uint32_t descIdx = dslBind->getDescriptorIndex(elemIdx);

diff --git a/MoltenVK/MoltenVK/Commands/MVKMTLBufferAllocation.mm b/MoltenVK/MoltenVK/Commands/MVKMTLBufferAllocation.mm
index 10ee00e38..54a816c9e 100644
--- a/MoltenVK/MoltenVK/Commands/MVKMTLBufferAllocation.mm
+++ b/MoltenVK/MoltenVK/Commands/MVKMTLBufferAllocation.mm
@@ -46,6 +46,7 @@ void MVKMTLBufferAllocationPool::addMTLBuffer() {
     MTLResourceOptions mbOpts = (_mtlStorageMode << MTLResourceStorageModeShift) | MTLResourceCPUCacheModeDefaultCache;
     _mtlBuffers.push_back({ [getMTLDevice() newBufferWithLength: _mtlBufferLength options: mbOpts], 0 });
+    getDevice()->makeResident(_mtlBuffers.back().mtlBuffer);
     _nextOffset = 0;
 }
 
@@ -106,6 +107,7 @@ MVKMTLBufferAllocationPool::~MVKMTLBufferAllocationPool() {
     for (uint32_t bufferIndex = 0; bufferIndex < _mtlBuffers.size(); ++bufferIndex) {
+        getDevice()->removeResidency(_mtlBuffers[bufferIndex].mtlBuffer);
         [_mtlBuffers[bufferIndex].mtlBuffer release];
     }
     _mtlBuffers.clear();

diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.mm b/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.mm
index a7cd05595..82581be1b 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKBuffer.mm
@@ -185,6 +185,7 @@
         _mtlBuffer = [_deviceMemory->getMTLHeap() newBufferWithLength: getByteCount()
                                                               options: _deviceMemory->getMTLResourceOptions()
                                                                offset: _deviceMemoryOffset];    // retained
+        getDevice()->makeResident(_mtlBuffer);
         propagateDebugName();
         return _mtlBuffer;
     } else {
@@ -202,6 +203,7 @@
             _mtlBufferCache = [getMTLDevice() newBufferWithLength: getByteCount()
                                                           options: MTLResourceStorageModeManaged];    // retained
+            getDevice()->makeResident(_mtlBufferCache);
             flushToDevice(_deviceMemoryOffset, _byteCount);
         }
 #endif
@@ -268,8 +270,10 @@
 void MVKBuffer::detachMemory() {
     if (_deviceMemory) { _deviceMemory->removeBuffer(this); }
     _deviceMemory = nullptr;
+    if (_mtlBuffer) getDevice()->removeResidency(_mtlBuffer);
     [_mtlBuffer release];
     _mtlBuffer = nil;
+    if (_mtlBufferCache) getDevice()->removeResidency(_mtlBufferCache);
     [_mtlBufferCache release];
     _mtlBufferCache = nil;
 }
@@ -327,6 +331,7 @@
         _mtlTexture = [mtlBuff newTextureWithDescriptor: mtlTexDesc
                                                  offset: mtlBuffOffset
                                             bytesPerRow: _mtlBytesPerRow];
+        getDevice()->makeResident(_mtlTexture);
         propagateDebugName();
     }
     return _mtlTexture;
@@ -390,6 +395,7 @@
 // Potentially called twice, from destroy() and destructor, so ensure everything is nulled out.
 void MVKBufferView::detachMemory() {
+    if (_mtlTexture) getDevice()->removeResidency(_mtlTexture);
     [_mtlTexture release];
     _mtlTexture = nil;
 }

diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h
index ca0e7e01b..34b44d033 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.h
@@ -491,6 +491,10 @@ typedef struct MVKMTLBlitEncoder {
     id<MTLCommandBuffer> mtlCmdBuffer = nil;
 } MVKMTLBlitEncoder;
 
+// Arbitrary: after this many barriers from a given source pipeline stage, fence slots wrap
+// around, potentially introducing extra synchronization on earlier invocations of that stage.
+static const uint32_t kMVKBarrierFenceCount = 64;
+
 /** Represents a Vulkan logical GPU device, associated with a physical device. */
 class MVKDevice : public MVKDispatchableVulkanAPIObject {
@@ -819,6 +823,41 @@ class MVKDevice : public MVKDispatchableVulkanAPIObject {
     /** Returns the Metal objects underpinning the Vulkan objects indicated in the pNext chain of pMetalObjectsInfo. */
     void getMetalObjects(VkExportMetalObjectsInfoEXT* pMetalObjectsInfo);
 
+#if !MVK_XCODE_16
+    inline void makeResident(id allocation) {}
+#else
+    inline void makeResident(id<MTLAllocation> allocation) {
+        @synchronized (_residencySet) {
+            [_residencySet addAllocation: allocation];
+            [_residencySet commit];
+        }
+    }
+#endif
+
+#if !MVK_XCODE_16
+    inline void removeResidency(id allocation) {}
+#else
+    inline void removeResidency(id<MTLAllocation> allocation) {
+        @synchronized (_residencySet) {
+            [_residencySet removeAllocation: allocation];
+            [_residencySet commit];
+        }
+    }
+#endif
+
+    inline void addResidencySet(id<MTLCommandQueue> queue) {
+#if MVK_XCODE_16
+        if (_residencySet) [queue addResidencySet: _residencySet];
+#endif
+    }
+
+    inline bool hasResidencySet() {
+#if MVK_XCODE_16
+        return _residencySet != nil;
+#else
+        return false;
+#endif
+    }
+
 #pragma mark Construction
@@ -841,6 +880,15 @@ class MVKDevice : public MVKDispatchableVulkanAPIObject {
         return (MVKDevice*)getDispatchableObject(vkDevice);
     }
 
+#pragma mark Barriers
+
+    /** Returns a Metal fence to update for the given barrier stage. */
+    id<MTLFence> getBarrierStageFence(id<MTLCommandBuffer> mtlCommandBuffer, MVKBarrierStage stage);
+
+    inline id<MTLFence> getFence(MVKBarrierStage stage, int index) {
+        return _barrierFences[stage][index];
+    }
+
 protected:
     friend class MVKDeviceTrackingMixin;
@@ -880,6 +928,8 @@ class MVKDevice : public MVKDispatchableVulkanAPIObject {
     VkPhysicalDevice##structName##Features##extnSfx _enabled##structName##Features;
 #include "MVKDeviceFeatureStructs.def"
 
+    id<MTLFence> _barrierFences[kMVKBarrierStageCount][kMVKBarrierFenceCount];
+
     MVKPerformanceStatistics _performanceStats;
     MVKCommandResourceFactory* _commandResourceFactory = nullptr;
     MVKSmallVector<MVKSmallVector<MVKQueue*>, kMVKQueueFamilyCount> _queuesByQueueFamilyIndex;
@@ -897,6 +947,9 @@ class MVKDevice : public MVKDispatchableVulkanAPIObject {
     id<MTLBuffer> _globalVisibilityResultMTLBuffer = nil;
     id<MTLSamplerState> _defaultMTLSamplerState = nil;
    id<MTLBuffer> _dummyBlitMTLBuffer = nil;
+#if MVK_XCODE_16
+    id<MTLResidencySet> _residencySet = nil;
+#endif
     uint32_t _globalVisibilityQueryCount = 0;
     int _capturePipeFileDesc = -1;
     bool _isPerformanceTracking = false;
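For reviewers new to the Xcode 16 API: an MTLResidencySet pins a group of allocations GPU-resident for every queue it is attached to, which is what lets the argument-buffer binding path above skip per-descriptor useResource: tracking. A hedged end-to-end sketch of the lifecycle the accessors above wrap (macOS 15 / Xcode 16 SDK assumed; error handling elided):

```objc
#import <Metal/Metal.h>

int main() {
    @autoreleasepool {
        id<MTLDevice> dev = MTLCreateSystemDefaultDevice();
        id<MTLCommandQueue> queue = [dev newCommandQueue];

        MTLResidencySetDescriptor* desc = [MTLResidencySetDescriptor new];
        desc.initialCapacity = 16;
        NSError* err = nil;
        id<MTLResidencySet> set = [dev newResidencySetWithDescriptor: desc error: &err];
        [queue addResidencySet: set];   // mirrors MVKDevice::addResidencySet()

        id<MTLBuffer> buf = [dev newBufferWithLength: 1024 options: MTLResourceStorageModePrivate];
        [set addAllocation: buf];       // mirrors makeResident()
        [set commit];
        // ... encode GPU work that reaches buf only through argument buffers ...
        [set removeAllocation: buf];    // mirrors removeResidency()
        [set commit];
    }
    return 0;
}
```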
diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
index c8ec8ebe7..e0df5d362 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKDevice.mm
@@ -2501,6 +2501,9 @@
     }
 #endif
 
+#if MVK_XCODE_16 && MVK_MACOS
+    _metalFeatures.residencySets = mvkOSVersionIsAtLeast(15) && supportsMTLGPUFamily(Apple6);
+#endif
 }
 
 // Initializes the physical device features of this instance.
@@ -4792,6 +4795,16 @@ static uint32_t mvkGetEntryProperty(io_registry_entry_t entry, CFStringRef prope
 
 #pragma mark Construction
 
+static NSString* mvkBarrierStageName(MVKBarrierStage stage) {
+    switch (stage) {
+        case kMVKBarrierStageVertex:   return @"Vertex";
+        case kMVKBarrierStageFragment: return @"Fragment";
+        case kMVKBarrierStageCompute:  return @"Compute";
+        case kMVKBarrierStageCopy:     return @"Copy";
+        default: return [NSString stringWithFormat: @"Invalid (%d)", stage];
+    }
+}
+
 MVKDevice::MVKDevice(MVKPhysicalDevice* physicalDevice, const VkDeviceCreateInfo* pCreateInfo) : _enabledExtensions(this) {
 
     // If the physical device is lost, bail.
@@ -4808,6 +4821,17 @@ static uint32_t mvkGetEntryProperty(io_registry_entry_t entry, CFStringRef prope
     initQueues(pCreateInfo);
     reservePrivateData(pCreateInfo);
 
+    // Initialize fences for execution barriers.
+    @autoreleasepool {
+        for (int stage = 0; stage < kMVKBarrierStageCount; ++stage) {
+            for (int index = 0; index < kMVKBarrierFenceCount; ++index) {
+                auto& fence = _barrierFences[stage][index];
+                fence = [_physicalDevice->getMTLDevice() newFence];
+                [fence setLabel: [NSString stringWithFormat: @"%@ Fence %d", mvkBarrierStageName((MVKBarrierStage)stage), index]];
+            }
+        }
+    }
+
 #if MVK_MACOS
     // After enableExtensions
     // If the VK_KHR_swapchain extension is enabled, we expect to render to the screen.
@@ -5110,6 +5134,23 @@ static uint32_t mvkGetEntryProperty(io_registry_entry_t entry, CFStringRef prope
 
 // Create the command queues
 void MVKDevice::initQueues(const VkDeviceCreateInfo* pCreateInfo) {
+#if MVK_XCODE_16
+    if (_physicalDevice->_isUsingMetalArgumentBuffers && _physicalDevice->_metalFeatures.residencySets) {
+        MTLResidencySetDescriptor* setDescriptor = [MTLResidencySetDescriptor new];
+        setDescriptor.label = @"Primary residency set";
+        setDescriptor.initialCapacity = 256;
+
+        NSError* error = nil;
+        _residencySet = [_physicalDevice->getMTLDevice() newResidencySetWithDescriptor: setDescriptor
                                                                                  error: &error];
+        if (error) {
+            reportMessage(MVK_CONFIG_LOG_LEVEL_ERROR, "Error allocating residency set: %s", error.description.UTF8String);
+        }
+        [setDescriptor release];
+    }
+#endif
+
     auto qFams = _physicalDevice->getQueueFamilies();
     uint32_t qrCnt = pCreateInfo->queueCreateInfoCount;
     for (uint32_t qrIdx = 0; qrIdx < qrCnt; qrIdx++) {
@@ -5173,6 +5214,11 @@ static uint32_t mvkGetEntryProperty(io_registry_entry_t entry, CFStringRef prope
     if (_commandResourceFactory) { _commandResourceFactory->destroy(); }
 
+    for (auto& fences : _barrierFences) { for (auto fence : fences) { [fence release]; } }
+
+#if MVK_XCODE_16
+    [_residencySet release];
+#endif
     [_globalVisibilityResultMTLBuffer release];
     [_defaultMTLSamplerState release];
     [_dummyBlitMTLBuffer release];

diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.mm b/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.mm
index 7323691f6..8169ccde5 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKDeviceMemory.mm
@@ -246,7 +246,7 @@
     }
     if (!_mtlBuffer) { return false; }
     _pMemory = isMemoryHostAccessible() ? _mtlBuffer.contents : nullptr;
-
+    getDevice()->makeResident(_mtlBuffer);
     propagateDebugName();
     return true;
 }
@@ -425,6 +425,7 @@
     auto imgCopies = _imageMemoryBindings;
     for (auto& img : imgCopies) { img->bindDeviceMemory(nullptr, 0); }
 
+    if (_mtlBuffer) getDevice()->removeResidency(_mtlBuffer);
     [_mtlBuffer release];
     _mtlBuffer = nil;

diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKImage.mm b/MoltenVK/MoltenVK/GPUObjects/MVKImage.mm
index 6bedb4a7b..3807f5b43 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKImage.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKImage.mm
@@ -76,7 +76,7 @@
         }
         [mtlTexDesc release];    // temp release
-
+        _image->getDevice()->makeResident(_mtlTexture);
         propagateDebugName();
     }
     return _mtlTexture;
@@ -101,6 +101,7 @@
 }
 
 void MVKImagePlane::releaseMTLTexture() {
+    if (_mtlTexture) _image->getDevice()->removeResidency(_mtlTexture);
     [_mtlTexture release];
     _mtlTexture = nil;
@@ -436,7 +437,8 @@
         }
         if (!_mtlTexelBuffer) {
             return reportError(VK_ERROR_OUT_OF_DEVICE_MEMORY, "Could not create an MTLBuffer for an image that requires a buffer backing store. Images that can be used for atomic accesses must have a texel buffer backing them.");
-        }
+        }
+        getDevice()->makeResident(_mtlTexelBuffer);
         _mtlTexelBufferOffset = 0;
         _ownsTexelBuffer = true;
     }
@@ -444,7 +446,6 @@
         _mtlTexelBuffer = _deviceMemory->_mtlBuffer;
         _mtlTexelBufferOffset = getDeviceMemoryOffset();
     }
-    flushToDevice(getDeviceMemoryOffset(), getByteCount());
     return _deviceMemory->addImageMemoryBinding(this);
 }
@@ -541,7 +542,10 @@
 
 MVKImageMemoryBinding::~MVKImageMemoryBinding() {
     if (_deviceMemory) { _deviceMemory->removeImageMemoryBinding(this); }
-    if (_ownsTexelBuffer) { [_mtlTexelBuffer release]; }
+    if (_ownsTexelBuffer) {
+        _image->getDevice()->removeResidency(_mtlTexelBuffer);
+        [_mtlTexelBuffer release];
+    }
 }

diff --git a/MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm b/MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm
index 8fe78d30e..db363634d 100644
--- a/MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm
+++ b/MoltenVK/MoltenVK/GPUObjects/MVKQueue.mm
@@ -339,6 +339,7 @@
 // Retrieves and initializes the Metal command queue and Xcode GPU capture scopes
 void MVKQueue::initMTLCommandQueue() {
     _mtlQueue = _queueFamily->getMTLCommandQueue(_index);    // not retained (cached in queue family)
+    _device->addResidencySet(_mtlQueue);
 
     _submissionCaptureScope = new MVKGPUCaptureScope(this);
     if (_queueFamily->getIndex() == getMVKConfig().defaultGPUCaptureScopeQueueFamilyIndex &&
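Usage note: because the capability is surfaced through MVKPhysicalDeviceMetalFeatures, a client can detect it through the private API. A sketch, assuming the vkGetPhysicalDeviceMetalFeaturesMVK entry point (declared in MoltenVK's private/deprecated API headers, depending on version) and an existing VkPhysicalDevice handle:

```cpp
#include "mvk_private_api.h"

bool deviceUsesResidencySets(VkPhysicalDevice physicalDevice) {
    MVKPhysicalDeviceMetalFeatures mtlFeats = {};
    size_t mtlFeatsSize = sizeof(mtlFeats);
    return vkGetPhysicalDeviceMetalFeaturesMVK(physicalDevice, &mtlFeats, &mtlFeatsSize) == VK_SUCCESS
        && mtlFeats.residencySets;
}
```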