From 97a35a4ef109796ce21ac94aa665fe8559ac78f0 Mon Sep 17 00:00:00 2001 From: Hans-Kristian Arntzen Date: Thu, 1 Aug 2024 14:39:02 +0200 Subject: [PATCH] meta: Add meta-ops for workgraph emulation. Details of implementation strategy is explained in docs/workgraphs.md. Signed-off-by: Hans-Kristian Arntzen --- libs/vkd3d/meson.build | 5 + libs/vkd3d/meta.c | 148 ++++++++++ .../cs_workgraph_complete_compaction.comp | 52 ++++ .../shaders/cs_workgraph_data_structures.h | 18 ++ ..._workgraph_distribute_payload_offsets.comp | 265 ++++++++++++++++++ .../cs_workgraph_distribute_workgroups.comp | 203 ++++++++++++++ .../shaders/cs_workgraph_setup_gpu_input.comp | 93 ++++++ libs/vkd3d/vkd3d_private.h | 86 ++++++ libs/vkd3d/vkd3d_shaders.h | 4 + 9 files changed, 874 insertions(+) create mode 100644 libs/vkd3d/shaders/cs_workgraph_complete_compaction.comp create mode 100644 libs/vkd3d/shaders/cs_workgraph_data_structures.h create mode 100644 libs/vkd3d/shaders/cs_workgraph_distribute_payload_offsets.comp create mode 100644 libs/vkd3d/shaders/cs_workgraph_distribute_workgroups.comp create mode 100644 libs/vkd3d/shaders/cs_workgraph_setup_gpu_input.comp diff --git a/libs/vkd3d/meson.build b/libs/vkd3d/meson.build index d110285356..637a4f8ef2 100644 --- a/libs/vkd3d/meson.build +++ b/libs/vkd3d/meson.build @@ -53,6 +53,11 @@ vkd3d_shaders =[ 'shaders/cs_resolve_color_float.comp', 'shaders/cs_resolve_color_uint.comp', 'shaders/cs_resolve_color_sint.comp', + + 'shaders/cs_workgraph_distribute_workgroups.comp', + 'shaders/cs_workgraph_distribute_payload_offsets.comp', + 'shaders/cs_workgraph_complete_compaction.comp', + 'shaders/cs_workgraph_setup_gpu_input.comp', ] vkd3d_src = [ diff --git a/libs/vkd3d/meta.c b/libs/vkd3d/meta.c index 729688044b..aa2542c0d7 100644 --- a/libs/vkd3d/meta.c +++ b/libs/vkd3d/meta.c @@ -2054,6 +2054,102 @@ static HRESULT vkd3d_sampler_feedback_ops_init(struct vkd3d_sampler_feedback_res return S_OK; } +static HRESULT vkd3d_workgraph_ops_init(struct vkd3d_workgraph_indirect_ops *workgraph_ops, + struct d3d12_device *device) +{ + VkPipelineShaderStageRequiredSubgroupSizeCreateInfo required; + VkSpecializationMapEntry map_entries[4]; + VkPushConstantRange push_range; + VkSpecializationInfo spec_info; + uint32_t spec_data[4]; + unsigned int i; + VkResult vr; + + if (!device->device_info.vulkan_1_2_features.vulkanMemoryModel || + !device->device_info.vulkan_1_3_features.subgroupSizeControl || + !(device->device_info.vulkan_1_3_properties.requiredSubgroupSizeStages & VK_SHADER_STAGE_COMPUTE_BIT)) + return S_OK; + + push_range.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + push_range.offset = 0; + + push_range.size = sizeof(struct vkd3d_workgraph_workgroups_args); + if ((vr = vkd3d_meta_create_pipeline_layout(device, + 0, NULL, 1, &push_range, + &workgraph_ops->vk_workgroup_layout))) + return hresult_from_vk_result(vr); + + push_range.size = sizeof(struct vkd3d_workgraph_payload_offsets_args); + if ((vr = vkd3d_meta_create_pipeline_layout(device, + 0, NULL, 1, &push_range, + &workgraph_ops->vk_payload_offset_layout))) + return hresult_from_vk_result(vr); + + push_range.size = sizeof(struct vkd3d_workgraph_complete_compaction_args); + if ((vr = vkd3d_meta_create_pipeline_layout(device, + 0, NULL, 1, &push_range, + &workgraph_ops->vk_complete_compaction_layout))) + return hresult_from_vk_result(vr); + + push_range.size = sizeof(struct vkd3d_workgraph_setup_gpu_input_args); + if ((vr = vkd3d_meta_create_pipeline_layout(device, + 0, NULL, 1, &push_range, + 
&workgraph_ops->vk_setup_gpu_input_layout))) + return hresult_from_vk_result(vr); + + for (i = 0; i < ARRAY_SIZE(map_entries); i++) + { + map_entries[i].offset = sizeof(uint32_t) * i; + map_entries[i].size = sizeof(uint32_t); + map_entries[i].constantID = i; + } + + spec_info.mapEntryCount = ARRAY_SIZE(map_entries); + spec_info.pMapEntries = map_entries; + spec_info.pData = spec_data; + spec_info.dataSize = ARRAY_SIZE(map_entries) * sizeof(uint32_t); + spec_data[0] = device->device_info.vulkan_1_3_properties.maxSubgroupSize; + spec_data[1] = device->device_info.vulkan_1_3_properties.maxSubgroupSize; + spec_data[2] = 0; + spec_data[3] = device->device_info.properties2.properties.limits.maxComputeWorkGroupCount[0] >= + VKD3D_WORKGRAPH_MAX_WGX_NO_PRIMARY_EXECUTION_THRESHOLD; + + memset(&required, 0, sizeof(required)); + required.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO; + required.requiredSubgroupSize = device->device_info.vulkan_1_3_properties.maxSubgroupSize; + + if ((vr = vkd3d_meta_create_compute_pipeline(device, sizeof(cs_workgraph_distribute_workgroups), + cs_workgraph_distribute_workgroups, workgraph_ops->vk_workgroup_layout, + &spec_info, true, &required, &workgraph_ops->vk_payload_workgroup_pipeline[0]))) + return hresult_from_vk_result(vr); + + spec_data[2] = 1; + if ((vr = vkd3d_meta_create_compute_pipeline(device, sizeof(cs_workgraph_distribute_workgroups), + cs_workgraph_distribute_workgroups, workgraph_ops->vk_workgroup_layout, + &spec_info, true, &required, &workgraph_ops->vk_payload_workgroup_pipeline[1]))) + return hresult_from_vk_result(vr); + + if ((vr = vkd3d_meta_create_compute_pipeline(device, sizeof(cs_workgraph_complete_compaction), + cs_workgraph_complete_compaction, workgraph_ops->vk_complete_compaction_layout, + NULL, true, NULL, &workgraph_ops->vk_complete_compaction_pipeline))) + return hresult_from_vk_result(vr); + + spec_info.mapEntryCount = 1; + spec_info.dataSize = sizeof(uint32_t); + if ((vr = vkd3d_meta_create_compute_pipeline(device, sizeof(cs_workgraph_distribute_payload_offsets), + cs_workgraph_distribute_payload_offsets, workgraph_ops->vk_payload_offset_layout, + &spec_info, true, NULL, &workgraph_ops->vk_payload_offset_pipeline))) + return hresult_from_vk_result(vr); + + spec_data[0] = spec_data[3]; + if ((vr = vkd3d_meta_create_compute_pipeline(device, sizeof(cs_workgraph_setup_gpu_input), + cs_workgraph_setup_gpu_input, workgraph_ops->vk_setup_gpu_input_layout, + &spec_info, true, NULL, &workgraph_ops->vk_setup_gpu_input_pipeline))) + return hresult_from_vk_result(vr); + + return S_OK; +} + void vkd3d_meta_get_sampler_feedback_resolve_pipeline(struct vkd3d_meta_ops *meta_ops, enum vkd3d_sampler_feedback_resolve_type type, struct vkd3d_sampler_feedback_resolve_info *info) { @@ -2092,6 +2188,52 @@ static void vkd3d_sampler_feedback_ops_cleanup(struct vkd3d_sampler_feedback_res VK_CALL(vkDestroyPipeline(device->vk_device, sampler_feedback_ops->vk_pipelines[i], NULL)); } +static void vkd3d_workgraph_ops_cleanup(struct vkd3d_workgraph_indirect_ops *workgraph_ops, + struct d3d12_device *device) +{ + const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs; + unsigned int i; + + VK_CALL(vkDestroyPipelineLayout(device->vk_device, workgraph_ops->vk_payload_offset_layout, NULL)); + VK_CALL(vkDestroyPipelineLayout(device->vk_device, workgraph_ops->vk_workgroup_layout, NULL)); + VK_CALL(vkDestroyPipelineLayout(device->vk_device, workgraph_ops->vk_setup_gpu_input_layout, NULL)); + 
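+    /* If vkd3d_workgraph_ops_init() hit its early-out (no Vulkan memory model or subgroup
+     * size control), none of these objects were ever created; assuming the meta ops struct
+     * is zero-initialized, the handles are VK_NULL_HANDLE, which vkDestroyPipelineLayout()
+     * and vkDestroyPipeline() silently ignore. */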
VK_CALL(vkDestroyPipelineLayout(device->vk_device, workgraph_ops->vk_complete_compaction_layout, NULL)); + + for (i = 0; i < ARRAY_SIZE(workgraph_ops->vk_payload_workgroup_pipeline); i++) + VK_CALL(vkDestroyPipeline(device->vk_device, workgraph_ops->vk_payload_workgroup_pipeline[i], NULL)); + VK_CALL(vkDestroyPipeline(device->vk_device, workgraph_ops->vk_setup_gpu_input_pipeline, NULL)); + VK_CALL(vkDestroyPipeline(device->vk_device, workgraph_ops->vk_payload_offset_pipeline, NULL)); + VK_CALL(vkDestroyPipeline(device->vk_device, workgraph_ops->vk_complete_compaction_pipeline, NULL)); +} + +void vkd3d_meta_get_workgraph_workgroup_pipeline(struct vkd3d_meta_ops *meta_ops, + struct vkd3d_workgraph_meta_pipeline_info *info, bool broadcast_compacting) +{ + info->vk_pipeline_layout = meta_ops->workgraph.vk_workgroup_layout; + info->vk_pipeline = meta_ops->workgraph.vk_payload_workgroup_pipeline[broadcast_compacting]; +} + +void vkd3d_meta_get_workgraph_setup_gpu_input_pipeline(struct vkd3d_meta_ops *meta_ops, + struct vkd3d_workgraph_meta_pipeline_info *info) +{ + info->vk_pipeline_layout = meta_ops->workgraph.vk_setup_gpu_input_layout; + info->vk_pipeline = meta_ops->workgraph.vk_setup_gpu_input_pipeline; +} + +void vkd3d_meta_get_workgraph_payload_offset_pipeline(struct vkd3d_meta_ops *meta_ops, + struct vkd3d_workgraph_meta_pipeline_info *info) +{ + info->vk_pipeline_layout = meta_ops->workgraph.vk_payload_offset_layout; + info->vk_pipeline = meta_ops->workgraph.vk_payload_offset_pipeline; +} + +void vkd3d_meta_get_workgraph_complete_compaction_pipeline(struct vkd3d_meta_ops *meta_ops, + struct vkd3d_workgraph_meta_pipeline_info *info) +{ + info->vk_pipeline_layout = meta_ops->workgraph.vk_complete_compaction_layout; + info->vk_pipeline = meta_ops->workgraph.vk_complete_compaction_pipeline; +} + HRESULT vkd3d_meta_ops_init(struct vkd3d_meta_ops *meta_ops, struct d3d12_device *device) { HRESULT hr; @@ -2132,8 +2274,13 @@ HRESULT vkd3d_meta_ops_init(struct vkd3d_meta_ops *meta_ops, struct d3d12_device if (FAILED(hr = vkd3d_sampler_feedback_ops_init(&meta_ops->sampler_feedback, device))) goto fail_sampler_feedback; + if (FAILED(hr = vkd3d_workgraph_ops_init(&meta_ops->workgraph, device))) + goto fail_workgraphs; + return S_OK; +fail_workgraphs: + vkd3d_sampler_feedback_ops_cleanup(&meta_ops->sampler_feedback, device); fail_sampler_feedback: vkd3d_dstorage_ops_cleanup(&meta_ops->dstorage, device); fail_dstorage_ops: @@ -2160,6 +2307,7 @@ HRESULT vkd3d_meta_ops_init(struct vkd3d_meta_ops *meta_ops, struct d3d12_device HRESULT vkd3d_meta_ops_cleanup(struct vkd3d_meta_ops *meta_ops, struct d3d12_device *device) { + vkd3d_workgraph_ops_cleanup(&meta_ops->workgraph, device); vkd3d_sampler_feedback_ops_cleanup(&meta_ops->sampler_feedback, device); vkd3d_dstorage_ops_cleanup(&meta_ops->dstorage, device); vkd3d_multi_dispatch_indirect_ops_cleanup(&meta_ops->multi_dispatch_indirect, device); diff --git a/libs/vkd3d/shaders/cs_workgraph_complete_compaction.comp b/libs/vkd3d/shaders/cs_workgraph_complete_compaction.comp new file mode 100644 index 0000000000..bf73314cd4 --- /dev/null +++ b/libs/vkd3d/shaders/cs_workgraph_complete_compaction.comp @@ -0,0 +1,52 @@ +#version 450 +#extension GL_EXT_buffer_reference : require +#extension GL_GOOGLE_include_directive : require + +layout(local_size_x = 32) in; + +#include "cs_workgraph_data_structures.h" + +layout(buffer_reference, buffer_reference_align = 16, std430) buffer IndirectCommandsBuffer +{ + layout(offset = 16) IndirectCommands indirect_commands[]; +}; + 
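+// Same per-node metadata layout as cs_workgraph_distribute_payload_offsets.comp uses:
+// packed_control carries four byte-wide fields (grid component count, u32 vs. u16 grid
+// components, RW group tracking, compact-broadcast); this shader only tests bits 24..31
+// to decide whether the node is a broadcast node eligible for compaction.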
+struct NodeMeta +{ + uint packed_control; + uint payload_stride_grid_offset_or_count; +}; + +layout(buffer_reference, buffer_reference_align = 8, std430) restrict readonly buffer NodeTypeMeta +{ + NodeMeta data[]; +}; + +layout(push_constant, std430) uniform Registers +{ + IndirectCommandsBuffer commands; + NodeTypeMeta meta; + uint num_nodes; +} registers; + +void main() +{ + uint node_index = gl_GlobalInvocationID.x; + if (node_index >= registers.num_nodes) + return; + + bool should_compact_broadcast = bitfieldExtract(registers.meta.data[node_index].packed_control, 24, 8) != 0; + if (should_compact_broadcast) + should_compact_broadcast = registers.commands.indirect_commands[node_index].primary_execute.y == 0u; + + if (should_compact_broadcast) + { + uint total_groups = registers.commands.indirect_commands[node_index].expander_total_groups; + registers.commands.indirect_commands[node_index].expander_total_groups = 0u; + uint wgx = registers.commands.indirect_commands[node_index].secondary_execute.x; + uint average_amplification = min(1024u, uint(float(total_groups) / float(max(1u, wgx)))); + uint current_amplification = registers.commands.indirect_commands[node_index].secondary_execute.z; + if (average_amplification > current_amplification) + registers.commands.indirect_commands[node_index].secondary_execute.z = average_amplification; + } +} diff --git a/libs/vkd3d/shaders/cs_workgraph_data_structures.h b/libs/vkd3d/shaders/cs_workgraph_data_structures.h new file mode 100644 index 0000000000..a0370d5f3c --- /dev/null +++ b/libs/vkd3d/shaders/cs_workgraph_data_structures.h @@ -0,0 +1,18 @@ +#ifndef CS_WORKGRAPH_DATA_STRUCTURES_H_ +#define CS_WORKGRAPH_DATA_STRUCTURES_H_ + +// 48 bytes per node. +struct IndirectCommands +{ + uvec3 primary_execute; + uint primary_linear_offset; // Read by node as input metadata. + uvec3 secondary_execute; + uint secondary_linear_offset; // Read by node as input metadata. + uint end_elements; // Read by node as input metadata in coalesce / thread mode. + uint linear_offset_atomic; // Used by expander to write unrolled data. 
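+    // Running total of dispatch-grid workgroups, accumulated atomically by the payload
+    // expander for compacted broadcast nodes; cs_workgraph_complete_compaction.comp reads
+    // it to derive an average amplification factor and then clears it again.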
+ uint expander_total_groups; + uint padding0; +}; + +#endif + diff --git a/libs/vkd3d/shaders/cs_workgraph_distribute_payload_offsets.comp b/libs/vkd3d/shaders/cs_workgraph_distribute_payload_offsets.comp new file mode 100644 index 0000000000..0577f8c58c --- /dev/null +++ b/libs/vkd3d/shaders/cs_workgraph_distribute_payload_offsets.comp @@ -0,0 +1,265 @@ +#version 450 +#extension GL_KHR_shader_subgroup_basic : require +#extension GL_KHR_shader_subgroup_ballot : require +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_EXT_shader_16bit_storage : require +#extension GL_EXT_buffer_reference : require +#extension GL_GOOGLE_include_directive : require + +layout(local_size_x_id = 0) in; + +layout(buffer_reference, buffer_reference_align = 8, std430) readonly buffer NodePayloadOffsetCount +{ + uvec2 data[]; +}; + +layout(buffer_reference, buffer_reference_align = 4, std430) writeonly buffer UnrolledOffsets +{ + uint data[]; +}; + +#include "cs_workgraph_data_structures.h" + +// Abuse aliasing rules to make sure that we can get scalar loads while doing +// atomics to part of the buffer :v +layout(buffer_reference, buffer_reference_align = 16, std430) buffer IndirectCommandsBufferAtomic +{ + layout(offset = 16) IndirectCommands indirect_commands_atomic[]; +}; + +layout(buffer_reference, buffer_reference_align = 16, std430) restrict readonly buffer IndirectCommandsBufferRO +{ + layout(offset = 12) uint total_fused_elements; + IndirectCommands indirect_commands_read[]; +}; + +// For patching in sharing count. +layout(buffer_reference, buffer_reference_align = 4, std430) buffer Payload32 +{ + uint data[]; +}; + +layout(buffer_reference, buffer_reference_align = 4, std430) buffer Payload16 +{ + uint16_t data[]; +}; + +struct NodeMeta +{ + uint packed_control; + uint payload_stride_grid_offset_or_count; +}; + +layout(buffer_reference, buffer_reference_align = 8, std430) restrict readonly buffer NodeTypeMeta +{ + NodeMeta data[]; +}; + +layout(push_constant, std430) uniform Registers +{ + NodePayloadOffsetCount packed_offset_counts; + UnrolledOffsets unrolled_offsets; + IndirectCommandsBufferRO commands; + Payload32 payload; + NodeTypeMeta meta; +} registers; + +void process_node_index(uint node_index, uint payload_offset, uint count) +{ + NodeMeta node_meta = registers.meta.data[node_index]; + int group_components = int(bitfieldExtract(node_meta.packed_control, 0, 8)); + bool group_components_u32 = bitfieldExtract(node_meta.packed_control, 8, 8) != 0; + bool rw_group_tracking = bitfieldExtract(node_meta.packed_control, 16, 8) != 0; + bool group_compact_broadcast = bitfieldExtract(node_meta.packed_control, 24, 8) != 0; + int grid_offset_or_count = bitfieldExtract(int(node_meta.payload_stride_grid_offset_or_count), 16, 16); + uint payload_stride = bitfieldExtract(node_meta.payload_stride_grid_offset_or_count, 0, 16); + + bool should_compact_broadcast = group_compact_broadcast; + if (should_compact_broadcast) + should_compact_broadcast = registers.commands.indirect_commands_read[node_index].primary_execute.y == 0u; + + // If we're not going to compact, we can allocate atomically once. 
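+    // A single subgroupAdd() over every lane's record count tells us how many unrolled
+    // entries this wave emits for the node, so one atomicAdd() on linear_offset_atomic
+    // reserves the whole contiguous range; the elected lane's result is broadcast back
+    // to the rest of the wave.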
+ uint output_offset; + if (!should_compact_broadcast) + { + uint total_scan = subgroupAdd(count); + if (subgroupElect()) + output_offset = atomicAdd(IndirectCommandsBufferAtomic(registers.commands).indirect_commands_atomic[node_index].linear_offset_atomic, total_scan); + output_offset = subgroupBroadcastFirst(output_offset); + } + + uvec4 ballot = subgroupBallot(count != 0u); + + while (any(notEqual(ballot, uvec4(0)))) + { + uint lane = subgroupBallotFindLSB(ballot); + + // TODO: Is there a more elegant way that is just as fast and fully portable? + if (gl_SubgroupSize == 128) + { + if (lane >= 3 * 32) + ballot.w &= ballot.w - 1u; + else if (lane >= 2 * 32) + ballot.z &= ballot.z - 1u; + else if (lane >= 32) + ballot.y &= ballot.y - 1u; + else + ballot.x &= ballot.x - 1u; + } + else if (gl_SubgroupSize == 64) + { + if (lane >= 32) + ballot.y &= ballot.y - 1u; + else + ballot.x &= ballot.x - 1u; + } + else + ballot.x &= ballot.x - 1u; + + uint wave_payload_offset = subgroupBroadcast(payload_offset, lane); + uint wave_count = subgroupBroadcast(count, lane); + + if (should_compact_broadcast) + { + // Need to do atomics per iteration since we don't want to scan the payloads twice. + + for (uint base_index = 0; base_index < wave_count; base_index += gl_SubgroupSize) + { + uint packed_index = base_index + gl_SubgroupInvocationID; + uint unrolled_offset = wave_payload_offset + payload_stride * packed_index; + bool is_active_payload = false; + uint grid_count = 0u; + + if (packed_index < wave_count) + { + // We only take this path for broadcast nodes with MaxGrid size. + grid_count = 1u; + + if (grid_offset_or_count >= 0) + { + // For [NodeMaxDispatchGrid]. + if (group_components_u32) + { + uint u32_grid_offset = (unrolled_offset + grid_offset_or_count) >> 2u; + for (int i = 0; i < group_components; i++) + grid_count *= registers.payload.data[u32_grid_offset + i]; + } + else + { + uint u16_grid_offset = (unrolled_offset + grid_offset_or_count) >> 1u; + for (int i = 0; i < group_components; i++) + grid_count *= uint(Payload16(registers.payload).data[u16_grid_offset + i]); + } + } + else + { + // For [NodeDispatchGrid]. Ignore any grids. 
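+                        // A negative grid_offset_or_count encodes [NodeDispatchGrid]:
+                        // its magnitude is the fixed per-record group count, so nothing
+                        // needs to be read back from the payload record.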
+ grid_count = -grid_offset_or_count; + } + + if (rw_group_tracking) + registers.payload.data[(unrolled_offset + payload_stride - 4u) >> 2u] = grid_count; + } + + bool is_active_broadcast = grid_count > 0u; + uvec4 active_ballot = subgroupBallot(is_active_broadcast); + uint compacted_offset = subgroupBallotExclusiveBitCount(active_ballot); + uint total_compacted = subgroupBallotBitCount(active_ballot); + uint total_workgroup_iterations = subgroupAdd(grid_count); + + uint atomic_offset = 0; + if (subgroupElect()) + { + if (total_compacted != 0) + { + restrict IndirectCommandsBufferAtomic atomics = IndirectCommandsBufferAtomic(registers.commands); + atomic_offset = atomicAdd(atomics.indirect_commands_atomic[node_index].secondary_execute.x, total_compacted); + atomicAdd(atomics.indirect_commands_atomic[node_index].expander_total_groups, total_workgroup_iterations); + } + } + + atomic_offset = subgroupBroadcastFirst(atomic_offset); + atomic_offset += registers.commands.indirect_commands_read[node_index].secondary_linear_offset; + + if (is_active_broadcast) + registers.unrolled_offsets.data[atomic_offset + compacted_offset] = unrolled_offset; + } + } + else + { + for (uint packed_index = gl_SubgroupInvocationID; packed_index < wave_count; packed_index += gl_SubgroupSize) + { + uint compacted_index = packed_index; + uint unrolled_offset = wave_payload_offset + payload_stride * packed_index; + registers.unrolled_offsets.data[output_offset + packed_index] = unrolled_offset; + + if (group_components > 0) + { + uint grid_count = 1u; + if (grid_offset_or_count >= 0) + { + // For [NodeMaxDispatchGrid]. + if (group_components_u32) + { + uint u32_grid_offset = (unrolled_offset + grid_offset_or_count) >> 2u; + for (int i = 0; i < group_components; i++) + grid_count *= registers.payload.data[u32_grid_offset + i]; + } + else + { + uint u16_grid_offset = (unrolled_offset + grid_offset_or_count) >> 1u; + for (int i = 0; i < group_components; i++) + grid_count *= uint(Payload16(registers.payload).data[u16_grid_offset + i]); + } + } + else + { + // For [NodeDispatchGrid]. Ignore any grids. + grid_count = -grid_offset_or_count; + } + + if (rw_group_tracking) + registers.payload.data[(unrolled_offset + payload_stride - 4u) >> 2u] = grid_count; + } + } + + output_offset += wave_count; + } + } +} + +void main() +{ + uint total_fused_elements = registers.commands.total_fused_elements; + uint total_fused_groups = (total_fused_elements + gl_WorkGroupSize.x - 1) / gl_WorkGroupSize.x; + + for (uint i = gl_WorkGroupID.x; i < total_fused_groups; i += gl_NumWorkGroups.x) + { + uint packed_offset_index = i * gl_WorkGroupSize.x + gl_SubgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; + uint payload_offset = 0; + uint node_index = 0; + uint count = 0; + + if (packed_offset_index < total_fused_elements) + { + uvec2 words = registers.packed_offset_counts.data[packed_offset_index]; + node_index = bitfieldExtract(words.x, 8, 24); + count = bitfieldExtract(words.x, 0, 8) + 1; + payload_offset = words.y; + } + + // An altered waterfall loop. All threads need to participate in the inner loop due to expansion. + // We just need to mask off work depending on which node index we're processing. 
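+            // Each pass picks the node index held by the lowest live lane. Every lane still
+            // calls process_node_index() so the subgroup operations inside stay uniform;
+            // lanes that hold a different node pass count == 0 this round and stay in the
+            // ballot, while the lanes that matched are retired from it.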
+ uvec4 node_index_ballot = subgroupBallot(count != 0); + + while (any(notEqual(node_index_ballot, uvec4(0)))) + { + uint bit = subgroupBallotFindLSB(node_index_ballot); + uint next_node_index = subgroupBroadcast(node_index, bit); + bool contributes = next_node_index == node_index; + process_node_index(next_node_index, payload_offset, contributes ? count : 0); + node_index_ballot &= subgroupBallot(!contributes); + } + } +} + diff --git a/libs/vkd3d/shaders/cs_workgraph_distribute_workgroups.comp b/libs/vkd3d/shaders/cs_workgraph_distribute_workgroups.comp new file mode 100644 index 0000000000..efd0808fe7 --- /dev/null +++ b/libs/vkd3d/shaders/cs_workgraph_distribute_workgroups.comp @@ -0,0 +1,203 @@ +#version 450 +#extension GL_KHR_shader_subgroup_basic : require +#extension GL_KHR_shader_subgroup_ballot : require +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_KHR_shader_subgroup_vote : require +#extension GL_EXT_buffer_reference : require +#extension GL_GOOGLE_include_directive : require + +#extension GL_KHR_memory_scope_semantics : require +#pragma use_vulkan_memory_model + +layout(local_size_x_id = 0) in; +layout(constant_id = 1) const uint FUSED_DIVIDER = 0; +layout(constant_id = 2) const bool COMPACT_BROADCAST_NODES = false; + +// If maxComputeWorkGroups[0] is huge, we don't have to worry. +// AMD and NV support that at least. +// When we introduce mesh shaders, we'll have to be more conservative. +// However, mesh nodes will require some esoteric MDI handling anyway, so ... eh. +layout(constant_id = 3) const bool REQUIRE_WG_DIVIDER = true; + +#include "cs_workgraph_data_structures.h" + +layout(buffer_reference, buffer_reference_align = 8, std430) buffer NodeAtomics +{ + uint payload_atomic; + uint fused_atomic; + uint node_counts[]; +}; + +// Sharing across threads requires coherent, even within a subgroup. +layout(buffer_reference, buffer_reference_align = 16, std430) subgroupcoherent buffer IndirectCommandsBuffer +{ + uvec4 expander_execute_total_elements; + IndirectCommands indirect_commands[]; +}; + +layout(buffer_reference, buffer_reference_align = 4, std430) readonly buffer DividersOrAmplification +{ + int data[]; +}; + +layout(buffer_reference, buffer_reference_align = 4, std430) readonly buffer NodeShareMapping +{ + uint data[]; +}; + +layout(push_constant, std430) uniform Registers +{ + NodeAtomics atomics; + IndirectCommandsBuffer commands; + DividersOrAmplification dividers; + NodeShareMapping node_share_mapping; + uint num_nodes; +} registers; + +// Assumption is that number of nodes is fairly small and can reasonably be iterated over one wave. +// We do very little work here, just parcel out memory regions and have fun. +// Here, we could in theory detect OOM, report the failure and nop out the indirects. + +const uint WG_DIVIDER = 32 * 1024; +// Arbitrary tuneable. +const uint AMPLIFICATION_EXTRA_SHIFT = 2; + +void main() +{ + uint linear_offset = 0; + bool has_share_input = false; + + for (uint i = 0; i < registers.num_nodes; i += gl_SubgroupSize) + { + uint node_index = i + gl_SubgroupInvocationID; + uint counts = 0; + uint total_wgs; + + if (node_index < registers.num_nodes) + { + uint sharing_index = registers.node_share_mapping.data[node_index]; + if (sharing_index == ~0u) + { + counts = registers.atomics.node_counts[node_index]; + total_wgs = counts; + } + else + { + // Don't contribute to the prefix sum. + // It is not allowed for a node to be both a real input while also sharing input of another node. 
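+                // counts stays 0 for a sharing node: it reuses the payload range of the node
+                // it shares input with, so it must not reserve its own slice of the linear
+                // offset space; only the workgroup count comes from the shared node's counter.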
+ total_wgs = registers.atomics.node_counts[sharing_index]; + has_share_input = true; + } + } + + uint scan = subgroupInclusiveAdd(counts); + uint total_scan = subgroupBroadcast(scan, gl_SubgroupSize - 1); + scan -= counts; + + if (node_index < registers.num_nodes) + { + uint node_linear_offset = scan + linear_offset; + int coalesce_divider = registers.dividers.data[node_index]; + + // Could make this multiplier-based if we need to. + if (coalesce_divider > 0) + total_wgs = (total_wgs + uint(coalesce_divider) - 1) / uint(coalesce_divider); + + uint coalesce_mult = coalesce_divider > 0 ? uint(coalesce_divider) : 1u; + uint amplification = coalesce_divider < 0 ? uint(-coalesce_divider) : 1; + + bool dynamic_amplification = amplification <= 0xffffu; + amplification &= 0xffffu; + + // Try to balance work we spawn on the GPU. + amplification = max(1u, amplification >> findMSB(max(total_wgs >> AMPLIFICATION_EXTRA_SHIFT, 1u))); + + IndirectCommands cmd; + uint spilled_wgs; + + if (REQUIRE_WG_DIVIDER) + { + cmd.primary_execute = uvec3(WG_DIVIDER, total_wgs / WG_DIVIDER, amplification); + spilled_wgs = total_wgs % WG_DIVIDER; + } + else + { + cmd.primary_execute = uvec3(0); + spilled_wgs = total_wgs; + } + + cmd.primary_linear_offset = node_linear_offset; + + // If we only do secondary executions we can compact empty broadcasts easily. + // This is relevant for AMD Compute Rasterizer demo for whatever reason. *shrug* + uint secondary_executions = COMPACT_BROADCAST_NODES && + dynamic_amplification && coalesce_divider < 0 && + cmd.primary_execute.y == 0 ? 0u : spilled_wgs; + + cmd.secondary_execute = uvec3(secondary_executions, 1, amplification); + cmd.secondary_linear_offset = node_linear_offset + cmd.primary_execute.y * WG_DIVIDER * coalesce_mult; + cmd.end_elements = counts + node_linear_offset; + cmd.linear_offset_atomic = node_linear_offset; + + registers.commands.indirect_commands[node_index] = cmd; + + // Reset the counters so we don't have to do an extra pass on next iteration. + registers.atomics.node_counts[node_index] = 0; + } + + // Wave-uniform accumulate. + linear_offset += total_scan; + } + + if (subgroupAny(has_share_input)) + { + subgroupMemoryBarrierBuffer(); + subgroupBarrier(); + + // There cannot be chains of sharing, i.e. you cannot have A sharing with B, and B sharing with C, + // so this cannot cause any weird WAR hazard. + + for (uint i = gl_SubgroupInvocationID; i < registers.num_nodes; i += gl_SubgroupSize) + { + uint sharing_index = registers.node_share_mapping.data[i]; + if (sharing_index != ~0u) + { + // We need to know the primary_linear_offset to set up the indirect properly. + IndirectCommands other_cmd = registers.commands.indirect_commands[sharing_index]; + IndirectCommands cmd = registers.commands.indirect_commands[i]; + + int coalesce_divider = registers.dividers.data[i]; + uint coalesce_mult = coalesce_divider != 0 ? coalesce_divider : 1u; + uint amplification = coalesce_divider < 0 ? uint(-coalesce_divider) : 1; + + amplification &= 0xffffu; + + uint total_wgs = other_cmd.end_elements - other_cmd.primary_linear_offset; + // Try to balance work we spawn on the GPU. 
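+                // Scale the requested amplification down as the real workgroup count grows,
+                // e.g. amplification 64 with total_wgs == 100 becomes
+                // 64 >> findMSB(100 >> AMPLIFICATION_EXTRA_SHIFT) = 64 >> findMSB(25) = 4,
+                // so the total amount of spawned work stays roughly level.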
+ amplification = max(1u, amplification >> findMSB(max(total_wgs >> AMPLIFICATION_EXTRA_SHIFT, 1u))); + + registers.commands.indirect_commands[i].primary_execute.z = amplification; + registers.commands.indirect_commands[i].secondary_execute.z = amplification; + registers.commands.indirect_commands[i].primary_linear_offset = other_cmd.primary_linear_offset; + registers.commands.indirect_commands[i].secondary_linear_offset = other_cmd.primary_linear_offset + cmd.primary_execute.y * WG_DIVIDER * coalesce_mult; + registers.commands.indirect_commands[i].end_elements = other_cmd.end_elements; + } + } + } + + // Reset the counters so we don't have to do an extra pass on next iteration. + // Also, have a single thread emit the fused expander kernel. + if (subgroupElect()) + { + uint fused = registers.atomics.fused_atomic; + + const uint FUSE_LIMIT = REQUIRE_WG_DIVIDER ? 0xffffu : 0xffffffu; + + // Do a single ubershader that does payload expansion. Goes as wide as reasonably possible. + registers.commands.expander_execute_total_elements = + uvec4(min(FUSE_LIMIT, (fused + FUSED_DIVIDER - 1) / FUSED_DIVIDER), 1, 1, fused); + + registers.atomics.payload_atomic = 0; + registers.atomics.fused_atomic = 0; + } +} diff --git a/libs/vkd3d/shaders/cs_workgraph_setup_gpu_input.comp b/libs/vkd3d/shaders/cs_workgraph_setup_gpu_input.comp new file mode 100644 index 0000000000..fa99ac6fff --- /dev/null +++ b/libs/vkd3d/shaders/cs_workgraph_setup_gpu_input.comp @@ -0,0 +1,93 @@ +#version 450 +#extension GL_EXT_buffer_reference : require + +layout(local_size_x = 32) in; +layout(constant_id = 0) const bool REQUIRE_WG_DIVIDER = true; + +// D3D12_NODE_GPU_INPUT +layout(buffer_reference, buffer_reference_align = 4, std430) readonly buffer GPUInputNumNodes +{ + uint entry_point_index; + uint num_nodes; +}; + +struct IndirectCommand +{ + uvec4 primary_dispatch_and_offset; + uvec4 secondary_dispatch_and_offset; +}; + +layout(buffer_reference, buffer_reference_align = 16, std430) writeonly buffer Indirects +{ + IndirectCommand commands[]; +}; + +layout(buffer_reference, buffer_reference_align = 4, std430) readonly buffer DividersOrAmplification +{ + int data[]; +}; + +layout(buffer_reference, buffer_reference_align = 4, std430) readonly buffer EntryPoints +{ + uint data[]; +}; + +layout(push_constant) uniform Registers +{ + GPUInputNumNodes node_input; + Indirects indirects; + DividersOrAmplification coalesce_dividers; + EntryPoints entry_point_mapping; + uint num_entry_points; +} registers; + +// Arbitrary tuneable. +const uint AMPLIFICATION_EXTRA_SHIFT = 2; + +void main() +{ + uint id = gl_GlobalInvocationID.x; + if (id < registers.num_entry_points) + { + // For shared input nodes, there can be multiple nodes executing for any given input. + if (registers.entry_point_mapping.data[id] == registers.node_input.entry_point_index) + { + const uint WG_DIVIDER = 32 * 1024; + uint count = registers.node_input.num_nodes; + uint wg_count; + + int divider = registers.coalesce_dividers.data[id]; + + if (divider > 1) + wg_count = (count + uint(divider) - 1) / uint(divider); + else + wg_count = count; + + uint coalesce_mult = divider > 0 ? uint(divider) : 1u; + uint amplification = divider < 0 ? uint(-divider) : 1u; + + // Try to balance work we spawn on the GPU. 
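+            // Same balancing heuristic as cs_workgraph_distribute_workgroups.comp, applied
+            // here because the record count in D3D12_NODE_GPU_INPUT is only known on the GPU.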
+ amplification = max(1u, amplification >> findMSB(max(wg_count >> AMPLIFICATION_EXTRA_SHIFT, 1u))); + + IndirectCommand cmd; + + if (REQUIRE_WG_DIVIDER) + { + cmd.primary_dispatch_and_offset = uvec4(WG_DIVIDER, wg_count / WG_DIVIDER, amplification, 0); + cmd.secondary_dispatch_and_offset = uvec4(wg_count % WG_DIVIDER, 1, amplification, + cmd.primary_dispatch_and_offset.y * WG_DIVIDER * coalesce_mult); + } + else + { + cmd.primary_dispatch_and_offset = uvec4(0); + cmd.secondary_dispatch_and_offset = uvec4(wg_count, 1, amplification, 0); + } + registers.indirects.commands[id] = cmd; + } + else + { + registers.indirects.commands[id] = IndirectCommand(uvec4(0), uvec4(0)); + } + } +} + diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h index 3028e41e88..30606b38c0 100644 --- a/libs/vkd3d/vkd3d_private.h +++ b/libs/vkd3d/vkd3d_private.h @@ -4458,6 +4458,66 @@ struct vkd3d_sampler_feedback_resolve_ops VkPipeline vk_pipelines[VKD3D_SAMPLER_FEEDBACK_RESOLVE_COUNT]; }; +struct vkd3d_workgraph_payload_offsets_args +{ + VkDeviceAddress packed_offset_counts; + VkDeviceAddress unrolled_offsets; + VkDeviceAddress commands; + VkDeviceAddress payload; + VkDeviceAddress meta; +}; + +struct vkd3d_workgraph_complete_compaction_args +{ + VkDeviceAddress commands; + VkDeviceAddress meta; + uint32_t node_count; +}; + +struct vkd3d_workgraph_workgroups_args +{ + VkDeviceAddress node_atomics_va; + VkDeviceAddress commands_va; + VkDeviceAddress dividers_va; + VkDeviceAddress node_share_mapping_va; + uint32_t num_nodes; +}; + +/* If the implementation supports 16M workgroups (arbitrarily chosen large number), + * we don't have to split execution into primary and secondary. + * Reduces number of indirect node dispatches by a factor of 2 since the primary will always be empty. 
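+ * The Vulkan-required minimum for maxComputeWorkGroupCount is only 65535 per dimension, which
+ * is why the split exists at all: the primary dispatch covers WG_DIVIDER (32k) workgroups in x
+ * times total_wgs / WG_DIVIDER lines in y, and the remainder is issued by the secondary dispatch.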
*/ +#define VKD3D_WORKGRAPH_MAX_WGX_NO_PRIMARY_EXECUTION_THRESHOLD 0xffffffu + +struct vkd3d_workgraph_setup_gpu_input_args +{ + VkDeviceAddress gpu_input_va; + VkDeviceAddress indirect_commands_va; + VkDeviceAddress coalesce_divider_va; + VkDeviceAddress entry_point_mapping_va; + uint32_t num_entry_points; +}; + +struct vkd3d_workgraph_indirect_pipeline +{ + uint32_t component_count; + uint32_t component_bits; + bool group_tracking; + bool group_compact; + VkPipeline vk_pipeline; +}; + +struct vkd3d_workgraph_indirect_ops +{ + VkPipelineLayout vk_setup_gpu_input_layout; + VkPipelineLayout vk_complete_compaction_layout; + VkPipelineLayout vk_workgroup_layout; + VkPipelineLayout vk_payload_offset_layout; + VkPipeline vk_payload_workgroup_pipeline[2]; + VkPipeline vk_setup_gpu_input_pipeline; + VkPipeline vk_payload_offset_pipeline; + VkPipeline vk_complete_compaction_pipeline; +}; + struct vkd3d_meta_ops { struct d3d12_device *device; @@ -4472,6 +4532,7 @@ struct vkd3d_meta_ops struct vkd3d_multi_dispatch_indirect_ops multi_dispatch_indirect; struct vkd3d_dstorage_ops dstorage; struct vkd3d_sampler_feedback_resolve_ops sampler_feedback; + struct vkd3d_workgraph_indirect_ops workgraph; }; HRESULT vkd3d_meta_ops_init(struct vkd3d_meta_ops *meta_ops, struct d3d12_device *device); @@ -4528,6 +4589,31 @@ static inline VkExtent3D vkd3d_meta_get_sampler_feedback_workgroup_size(void) return result; } +struct vkd3d_workgraph_meta_pipeline_info +{ + VkPipeline vk_pipeline; + VkPipelineLayout vk_pipeline_layout; +}; + +void vkd3d_meta_get_workgraph_workgroup_pipeline(struct vkd3d_meta_ops *meta_ops, + struct vkd3d_workgraph_meta_pipeline_info *info, bool broadcast_compacting); +void vkd3d_meta_get_workgraph_setup_gpu_input_pipeline(struct vkd3d_meta_ops *meta_ops, + struct vkd3d_workgraph_meta_pipeline_info *info); +void vkd3d_meta_get_workgraph_payload_offset_pipeline(struct vkd3d_meta_ops *meta_ops, + struct vkd3d_workgraph_meta_pipeline_info *info); +void vkd3d_meta_get_workgraph_complete_compaction_pipeline(struct vkd3d_meta_ops *meta_ops, + struct vkd3d_workgraph_meta_pipeline_info *info); + +static inline uint32_t vkd3d_meta_get_workgraph_setup_gpu_input_workgroup_size(void) +{ + return 32; +} + +static inline uint32_t vkd3d_meta_get_workgraph_complete_compaction_workgroup_size(void) +{ + return 32; +} + enum vkd3d_time_domain_flag { VKD3D_TIME_DOMAIN_DEVICE = 0x00000001u, diff --git a/libs/vkd3d/vkd3d_shaders.h b/libs/vkd3d/vkd3d_shaders.h index a6857fedf3..29c2942ade 100644 --- a/libs/vkd3d/vkd3d_shaders.h +++ b/libs/vkd3d/vkd3d_shaders.h @@ -76,5 +76,9 @@ enum vkd3d_meta_copy_mode #include #include #include +#include +#include +#include +#include #endif /* __VKD3D_SPV_SHADERS_H */