Skip to content

Commit

Permalink
meta: Add meta-ops for workgraph emulation.
Browse files Browse the repository at this point in the history
Details of the implementation strategy are explained in docs/workgraphs.md.

Signed-off-by: Hans-Kristian Arntzen <[email protected]>
  • Loading branch information
HansKristian-Work committed Jan 15, 2025
1 parent e23577d commit 8c4f4ff
Show file tree
Hide file tree
Showing 9 changed files with 865 additions and 0 deletions.
5 changes: 5 additions & 0 deletions libs/vkd3d/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,11 @@ vkd3d_shaders =[
'shaders/cs_resolve_color_float.comp',
'shaders/cs_resolve_color_uint.comp',
'shaders/cs_resolve_color_sint.comp',

'shaders/cs_workgraph_distribute_workgroups.comp',
'shaders/cs_workgraph_distribute_payload_offsets.comp',
'shaders/cs_workgraph_complete_compaction.comp',
'shaders/cs_workgraph_setup_gpu_input.comp',
]

vkd3d_src = [
Expand Down
138 changes: 138 additions & 0 deletions libs/vkd3d/meta.c
Original file line number Diff line number Diff line change
Expand Up @@ -2044,6 +2044,92 @@ static HRESULT vkd3d_sampler_feedback_ops_init(struct vkd3d_sampler_feedback_res
return S_OK;
}

/* Creates the pipeline layouts and compute pipelines used by the work graph
 * emulation meta-ops (see docs/workgraphs.md per the commit message).
 * Each meta shader takes its arguments purely through push constants; no
 * descriptor sets are used. Returns S_OK on success, or an HRESULT translated
 * from the first failing Vulkan call.
 *
 * NOTE(review): on a mid-function failure, objects created by the earlier
 * successful calls in this function are not destroyed here. Presumably the
 * caller tears the device down on failure, but vkd3d_meta_ops_init's
 * fail_workgraphs path does not call vkd3d_workgraph_ops_cleanup — confirm
 * this does not leak on init failure. */
static HRESULT vkd3d_workgraph_ops_init(struct vkd3d_workgraph_indirect_ops *workgraph_ops,
        struct d3d12_device *device)
{
    VkSpecializationMapEntry map_entries[4];
    VkPushConstantRange push_range;
    VkSpecializationInfo spec_info;
    uint32_t spec_data[4];
    unsigned int i;
    VkResult vr;

    /* One compute-stage push constant range per layout; only the size
     * differs between the four layouts below. */
    push_range.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
    push_range.offset = 0;

    push_range.size = sizeof(struct vkd3d_workgraph_workgroups_args);
    if ((vr = vkd3d_meta_create_pipeline_layout(device,
            0, NULL, 1, &push_range,
            &workgraph_ops->vk_workgroup_layout)))
        return hresult_from_vk_result(vr);

    push_range.size = sizeof(struct vkd3d_workgraph_payload_offsets_args);
    if ((vr = vkd3d_meta_create_pipeline_layout(device,
            0, NULL, 1, &push_range,
            &workgraph_ops->vk_payload_offset_layout)))
        return hresult_from_vk_result(vr);

    push_range.size = sizeof(struct vkd3d_workgraph_complete_compaction_args);
    if ((vr = vkd3d_meta_create_pipeline_layout(device,
            0, NULL, 1, &push_range,
            &workgraph_ops->vk_complete_compaction_layout)))
        return hresult_from_vk_result(vr);

    push_range.size = sizeof(struct vkd3d_workgraph_setup_gpu_input_args);
    if ((vr = vkd3d_meta_create_pipeline_layout(device,
            0, NULL, 1, &push_range,
            &workgraph_ops->vk_setup_gpu_input_layout)))
        return hresult_from_vk_result(vr);

    /* Specialization constants 0..3 map 1:1 onto consecutive uint32_t slots
     * in spec_data. */
    for (i = 0; i < ARRAY_SIZE(map_entries); i++)
    {
        map_entries[i].offset = sizeof(uint32_t) * i;
        map_entries[i].size = sizeof(uint32_t);
        map_entries[i].constantID = i;
    }

    spec_info.mapEntryCount = ARRAY_SIZE(map_entries);
    spec_info.pMapEntries = map_entries;
    spec_info.pData = spec_data;
    spec_info.dataSize = ARRAY_SIZE(map_entries) * sizeof(uint32_t);
    /* Constants 0 and 1 are both the device subgroup size; constant 2 selects
     * between the two distribute_workgroups variants (0 here, 1 below).
     * Constant 3 is a boolean: whether maxComputeWorkGroupCount[0] is large
     * enough to skip the "primary execution" path — presumably matching a
     * check in the shader; see cs_workgraph_distribute_workgroups.comp. */
    spec_data[0] = device->device_info.vulkan_1_1_properties.subgroupSize;
    spec_data[1] = device->device_info.vulkan_1_1_properties.subgroupSize;
    spec_data[2] = 0;
    spec_data[3] = device->device_info.properties2.properties.limits.maxComputeWorkGroupCount[0] >=
            VKD3D_WORKGRAPH_MAX_WGX_NO_PRIMARY_EXECUTION_THRESHOLD;

    /* Two variants of the workgroup distribution pipeline, indexed by
     * constant 2 (see vkd3d_meta_get_workgraph_workgroup_pipeline, which
     * selects by broadcast_compacting). */
    if ((vr = vkd3d_meta_create_compute_pipeline(device, sizeof(cs_workgraph_distribute_workgroups),
            cs_workgraph_distribute_workgroups, workgraph_ops->vk_workgroup_layout,
            &spec_info, true, &workgraph_ops->vk_payload_workgroup_pipeline[0])))
        return hresult_from_vk_result(vr);

    spec_data[2] = 1;
    if ((vr = vkd3d_meta_create_compute_pipeline(device, sizeof(cs_workgraph_distribute_workgroups),
            cs_workgraph_distribute_workgroups, workgraph_ops->vk_workgroup_layout,
            &spec_info, true, &workgraph_ops->vk_payload_workgroup_pipeline[1])))
        return hresult_from_vk_result(vr);

    /* complete_compaction uses no specialization constants. */
    if ((vr = vkd3d_meta_create_compute_pipeline(device, sizeof(cs_workgraph_complete_compaction),
            cs_workgraph_complete_compaction, workgraph_ops->vk_complete_compaction_layout,
            NULL, true, &workgraph_ops->vk_complete_compaction_pipeline)))
        return hresult_from_vk_result(vr);

    /* The remaining pipelines consume a single specialization constant
     * (constant 0): subgroup size for payload_offset (spec_data[0] still
     * holds subgroupSize here), then the large-WGX boolean for
     * setup_gpu_input via the spec_data[0] = spec_data[3] copy below. */
    spec_info.mapEntryCount = 1;
    spec_info.dataSize = sizeof(uint32_t);
    if ((vr = vkd3d_meta_create_compute_pipeline(device, sizeof(cs_workgraph_distribute_payload_offsets),
            cs_workgraph_distribute_payload_offsets, workgraph_ops->vk_payload_offset_layout,
            &spec_info, true, &workgraph_ops->vk_payload_offset_pipeline)))
        return hresult_from_vk_result(vr);

    spec_data[0] = spec_data[3];
    if ((vr = vkd3d_meta_create_compute_pipeline(device, sizeof(cs_workgraph_setup_gpu_input),
            cs_workgraph_setup_gpu_input, workgraph_ops->vk_setup_gpu_input_layout,
            &spec_info, true, &workgraph_ops->vk_setup_gpu_input_pipeline)))
        return hresult_from_vk_result(vr);

    return S_OK;
}

void vkd3d_meta_get_sampler_feedback_resolve_pipeline(struct vkd3d_meta_ops *meta_ops,
enum vkd3d_sampler_feedback_resolve_type type, struct vkd3d_sampler_feedback_resolve_info *info)
{
Expand Down Expand Up @@ -2082,6 +2168,52 @@ static void vkd3d_sampler_feedback_ops_cleanup(struct vkd3d_sampler_feedback_res
VK_CALL(vkDestroyPipeline(device->vk_device, sampler_feedback_ops->vk_pipelines[i], NULL));
}

/* Destroys every Vulkan object owned by the work graph meta-ops.
 * Per the Vulkan spec, vkDestroyPipeline / vkDestroyPipelineLayout accept
 * VK_NULL_HANDLE as a no-op, so calling this on a zero-initialized struct
 * is harmless. */
static void vkd3d_workgraph_ops_cleanup(struct vkd3d_workgraph_indirect_ops *workgraph_ops,
        struct d3d12_device *device)
{
    const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs;
    unsigned int pipeline_index;

    /* Pipelines first, then their layouts. Vulkan does not require this
     * ordering (pipelines do not keep their layout alive), it just reads
     * as reverse creation order. */
    for (pipeline_index = 0; pipeline_index < ARRAY_SIZE(workgraph_ops->vk_payload_workgroup_pipeline); pipeline_index++)
        VK_CALL(vkDestroyPipeline(device->vk_device, workgraph_ops->vk_payload_workgroup_pipeline[pipeline_index], NULL));

    VK_CALL(vkDestroyPipeline(device->vk_device, workgraph_ops->vk_complete_compaction_pipeline, NULL));
    VK_CALL(vkDestroyPipeline(device->vk_device, workgraph_ops->vk_payload_offset_pipeline, NULL));
    VK_CALL(vkDestroyPipeline(device->vk_device, workgraph_ops->vk_setup_gpu_input_pipeline, NULL));

    VK_CALL(vkDestroyPipelineLayout(device->vk_device, workgraph_ops->vk_complete_compaction_layout, NULL));
    VK_CALL(vkDestroyPipelineLayout(device->vk_device, workgraph_ops->vk_setup_gpu_input_layout, NULL));
    VK_CALL(vkDestroyPipelineLayout(device->vk_device, workgraph_ops->vk_workgroup_layout, NULL));
    VK_CALL(vkDestroyPipelineLayout(device->vk_device, workgraph_ops->vk_payload_offset_layout, NULL));
}

/* Returns the workgroup-distribution pipeline and its layout.
 * broadcast_compacting selects between the two specialization variants
 * created in vkd3d_workgraph_ops_init. */
void vkd3d_meta_get_workgraph_workgroup_pipeline(struct vkd3d_meta_ops *meta_ops,
        struct vkd3d_workgraph_meta_pipeline_info *info, bool broadcast_compacting)
{
    const struct vkd3d_workgraph_indirect_ops *workgraph = &meta_ops->workgraph;
    unsigned int variant = broadcast_compacting ? 1 : 0;

    info->vk_pipeline = workgraph->vk_payload_workgroup_pipeline[variant];
    info->vk_pipeline_layout = workgraph->vk_workgroup_layout;
}

void vkd3d_meta_get_workgraph_setup_gpu_input_pipeline(struct vkd3d_meta_ops *meta_ops,
struct vkd3d_workgraph_meta_pipeline_info *info)
{
info->vk_pipeline_layout = meta_ops->workgraph.vk_setup_gpu_input_layout;
info->vk_pipeline = meta_ops->workgraph.vk_setup_gpu_input_pipeline;
}

void vkd3d_meta_get_workgraph_payload_offset_pipeline(struct vkd3d_meta_ops *meta_ops,
struct vkd3d_workgraph_meta_pipeline_info *info)
{
info->vk_pipeline_layout = meta_ops->workgraph.vk_payload_offset_layout;
info->vk_pipeline = meta_ops->workgraph.vk_payload_offset_pipeline;
}

void vkd3d_meta_get_workgraph_complete_compaction_pipeline(struct vkd3d_meta_ops *meta_ops,
struct vkd3d_workgraph_meta_pipeline_info *info)
{
info->vk_pipeline_layout = meta_ops->workgraph.vk_complete_compaction_layout;
info->vk_pipeline = meta_ops->workgraph.vk_complete_compaction_pipeline;
}

HRESULT vkd3d_meta_ops_init(struct vkd3d_meta_ops *meta_ops, struct d3d12_device *device)
{
HRESULT hr;
Expand Down Expand Up @@ -2122,8 +2254,13 @@ HRESULT vkd3d_meta_ops_init(struct vkd3d_meta_ops *meta_ops, struct d3d12_device
if (FAILED(hr = vkd3d_sampler_feedback_ops_init(&meta_ops->sampler_feedback, device)))
goto fail_sampler_feedback;

if (FAILED(hr = vkd3d_workgraph_ops_init(&meta_ops->workgraph, device)))
goto fail_workgraphs;

return S_OK;

fail_workgraphs:
vkd3d_sampler_feedback_ops_cleanup(&meta_ops->sampler_feedback, device);
fail_sampler_feedback:
vkd3d_dstorage_ops_cleanup(&meta_ops->dstorage, device);
fail_dstorage_ops:
Expand All @@ -2150,6 +2287,7 @@ HRESULT vkd3d_meta_ops_init(struct vkd3d_meta_ops *meta_ops, struct d3d12_device

HRESULT vkd3d_meta_ops_cleanup(struct vkd3d_meta_ops *meta_ops, struct d3d12_device *device)
{
vkd3d_workgraph_ops_cleanup(&meta_ops->workgraph, device);
vkd3d_sampler_feedback_ops_cleanup(&meta_ops->sampler_feedback, device);
vkd3d_dstorage_ops_cleanup(&meta_ops->dstorage, device);
vkd3d_multi_dispatch_indirect_ops_cleanup(&meta_ops->multi_dispatch_indirect, device);
Expand Down
52 changes: 52 additions & 0 deletions libs/vkd3d/shaders/cs_workgraph_complete_compaction.comp
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/* cs_workgraph_complete_compaction.comp
 * One thread per node: if a node's broadcast dispatch is flagged for
 * compaction and its primary execute path ended up empty, fold the total
 * expanded group count into the secondary execute's amplification factor. */
#version 450
#extension GL_EXT_buffer_reference : require
#extension GL_GOOGLE_include_directive : require

layout(local_size_x = 32) in;

#include "cs_workgraph_data_structures.h"

/* Indirect command array lives 16 bytes into the buffer; presumably the
 * first 16 bytes hold a header written elsewhere — confirm against the
 * producer of this buffer. */
layout(buffer_reference, buffer_reference_align = 16, std430) buffer IndirectCommandsBuffer
{
	layout(offset = 16) IndirectCommands indirect_commands[];
};

struct NodeMeta
{
	/* Bits 24..31 act as the "should compact broadcast" flag (see
	 * bitfieldExtract below); the meaning of the low 24 bits is not
	 * visible in this shader. */
	uint packed_control;
	uint payload_stride_grid_offset_or_count;
};

layout(buffer_reference, buffer_reference_align = 8, std430) restrict readonly buffer NodeTypeMeta
{
	NodeMeta data[];
};

layout(push_constant, std430) uniform Registers
{
	IndirectCommandsBuffer commands;
	NodeTypeMeta meta;
	uint num_nodes;
} registers;

void main()
{
	/* local_size_x = 32, so global X directly indexes nodes. */
	uint node_index = gl_GlobalInvocationID.x;
	if (node_index >= registers.num_nodes)
		return;

	/* Compact only when the node opts in (control byte non-zero) AND the
	 * primary execute dispatch is empty (y == 0). */
	bool should_compact_broadcast = bitfieldExtract(registers.meta.data[node_index].packed_control, 24, 8) != 0;
	if (should_compact_broadcast)
		should_compact_broadcast = registers.commands.indirect_commands[node_index].primary_execute.y == 0u;

	if (should_compact_broadcast)
	{
		/* Read-then-zero: consume the accumulated group count so a later
		 * pass does not double-count it. */
		uint total_groups = registers.commands.indirect_commands[node_index].expander_total_groups;
		registers.commands.indirect_commands[node_index].expander_total_groups = 0u;
		uint wgx = registers.commands.indirect_commands[node_index].secondary_execute.x;
		/* Average groups per workgroup, clamped to 1024; max(1u, wgx)
		 * guards the division against an empty secondary dispatch. */
		uint average_amplification = min(1024u, uint(float(total_groups) / float(max(1u, wgx))));
		uint current_amplification = registers.commands.indirect_commands[node_index].secondary_execute.z;
		/* Only ever raise the amplification factor, never lower it. */
		if (average_amplification > current_amplification)
			registers.commands.indirect_commands[node_index].secondary_execute.z = average_amplification;
	}
}
18 changes: 18 additions & 0 deletions libs/vkd3d/shaders/cs_workgraph_data_structures.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#ifndef CS_WORKGRAPH_DATA_STRUCTURES_H_
#define CS_WORKGRAPH_DATA_STRUCTURES_H_

// 64 bytes per node, nicely aligns to a cache line.
// NOTE: this layout is shared between the GLSL meta shaders and the CPU side
// (std430); do not reorder or resize fields without updating both.
struct IndirectCommands
{
	uvec3 primary_execute;         // Dispatch dimensions for the primary execution path.
	uint primary_linear_offset;    // Read by node as input metadata.
	uvec3 secondary_execute;       // Dispatch dimensions for the secondary path;
	                               // .z is the amplification factor adjusted by
	                               // cs_workgraph_complete_compaction.
	uint secondary_linear_offset;  // Read by node as input metadata.
	uint end_elements;             // Read by node as input metadata in coalesce / thread mode.
	uint linear_offset_atomic;     // Used by expander to write unrolled data.
	uint expander_total_groups;    // Accumulated group count; consumed (read then
	                               // zeroed) by cs_workgraph_complete_compaction.
	uint padding0;                 // Pads the struct to 64 bytes.
};

#endif

Loading

0 comments on commit 8c4f4ff

Please sign in to comment.