meta: Add meta-ops for workgraph emulation.
Details of the implementation strategy are explained in docs/workgraphs.md.

Signed-off-by: Hans-Kristian Arntzen <[email protected]>
HansKristian-Work committed Oct 16, 2024
1 parent 486b6c4 commit fe76abb
Showing 8 changed files with 809 additions and 0 deletions.
4 changes: 4 additions & 0 deletions libs/vkd3d/meson.build
@@ -53,6 +53,10 @@ vkd3d_shaders =[
'shaders/cs_resolve_color_float.comp',
'shaders/cs_resolve_color_uint.comp',
'shaders/cs_resolve_color_sint.comp',

'shaders/cs_workgraph_distribute_workgroups.comp',
'shaders/cs_workgraph_distribute_payload_offsets.comp',
'shaders/cs_workgraph_setup_gpu_input.comp',
]

vkd3d_src = [
180 changes: 180 additions & 0 deletions libs/vkd3d/meta.c
@@ -2044,6 +2044,71 @@ static HRESULT vkd3d_sampler_feedback_ops_init(struct vkd3d_sampler_feedback_res
return S_OK;
}

static HRESULT vkd3d_workgraph_ops_init(struct vkd3d_workgraph_indirect_ops *workgraph_ops,
struct d3d12_device *device)
{
VkSpecializationMapEntry map_entries[3];
VkPushConstantRange push_range;
VkSpecializationInfo spec_info;
uint32_t spec_data[3];
unsigned int i;
VkResult vr;

push_range.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
push_range.offset = 0;

push_range.size = sizeof(struct vkd3d_workgraph_workgroups_args);
if ((vr = vkd3d_meta_create_pipeline_layout(device,
0, NULL, 1, &push_range,
&workgraph_ops->vk_workgroup_layout)))
return hresult_from_vk_result(vr);

push_range.size = sizeof(struct vkd3d_workgraph_payload_offsets_args);
if ((vr = vkd3d_meta_create_pipeline_layout(device,
0, NULL, 1, &push_range,
&workgraph_ops->vk_payload_offset_layout)))
return hresult_from_vk_result(vr);

push_range.size = sizeof(struct vkd3d_workgraph_setup_gpu_input_args);
if ((vr = vkd3d_meta_create_pipeline_layout(device,
0, NULL, 1, &push_range,
&workgraph_ops->vk_setup_gpu_input_layout)))
return hresult_from_vk_result(vr);

for (i = 0; i < ARRAY_SIZE(map_entries); i++)
{
map_entries[i].offset = sizeof(uint32_t) * i;
map_entries[i].size = sizeof(uint32_t);
map_entries[i].constantID = i;
}

spec_info.mapEntryCount = ARRAY_SIZE(map_entries);
spec_info.pMapEntries = map_entries;
spec_info.pData = spec_data;
spec_info.dataSize = ARRAY_SIZE(map_entries) * sizeof(uint32_t);
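/* Spec constants 0 and 1 are both fed with the device subgroup size; spec
 * constant 2 selects the distribute_workgroups variant (0 = plain, 1 =
 * broadcast compacting, see vkd3d_meta_get_workgraph_workgroup_pipeline). */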
spec_data[0] = device->device_info.vulkan_1_1_properties.subgroupSize;
spec_data[1] = device->device_info.vulkan_1_1_properties.subgroupSize;
spec_data[2] = 0;

if ((vr = vkd3d_meta_create_compute_pipeline(device, sizeof(cs_workgraph_distribute_workgroups),
cs_workgraph_distribute_workgroups, workgraph_ops->vk_workgroup_layout,
&spec_info, true, &workgraph_ops->vk_payload_workgroup_pipeline[0])))
return hresult_from_vk_result(vr);

spec_data[2] = 1;
if ((vr = vkd3d_meta_create_compute_pipeline(device, sizeof(cs_workgraph_distribute_workgroups),
cs_workgraph_distribute_workgroups, workgraph_ops->vk_workgroup_layout,
&spec_info, true, &workgraph_ops->vk_payload_workgroup_pipeline[1])))
return hresult_from_vk_result(vr);

if ((vr = vkd3d_meta_create_compute_pipeline(device, sizeof(cs_workgraph_setup_gpu_input),
cs_workgraph_setup_gpu_input, workgraph_ops->vk_setup_gpu_input_layout,
NULL, true, &workgraph_ops->vk_setup_gpu_input_pipeline)))
return hresult_from_vk_result(vr);

return S_OK;
}
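
For reference, here is a minimal sketch of the state these meta-ops manage, inferred only from the fields referenced in this file. The actual definitions live in vkd3d_private.h (one of the changed files not shown here), so member order and exact types are assumptions.

struct vkd3d_workgraph_indirect_pipeline
{
    VkPipeline vk_pipeline;
    uint32_t component_bits;
    uint32_t component_count;
    bool group_tracking;
    bool group_compact;
};

struct vkd3d_workgraph_indirect_ops
{
    VkPipelineLayout vk_workgroup_layout;
    VkPipelineLayout vk_payload_offset_layout;
    VkPipelineLayout vk_setup_gpu_input_layout;
    /* Index 1 holds the broadcast-compacting variant of distribute_workgroups. */
    VkPipeline vk_payload_workgroup_pipeline[2];
    VkPipeline vk_setup_gpu_input_pipeline;
    /* Lazily compiled payload-offset variants, guarded by lock. */
    struct vkd3d_workgraph_indirect_pipeline *payload_pipelines;
    size_t payload_pipelines_size;
    size_t payload_pipelines_count;
    pthread_mutex_t lock;
};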

void vkd3d_meta_get_sampler_feedback_resolve_pipeline(struct vkd3d_meta_ops *meta_ops,
enum vkd3d_sampler_feedback_resolve_type type, struct vkd3d_sampler_feedback_resolve_info *info)
{
@@ -2082,6 +2147,115 @@ static void vkd3d_sampler_feedback_ops_cleanup(struct vkd3d_sampler_feedback_res
VK_CALL(vkDestroyPipeline(device->vk_device, sampler_feedback_ops->vk_pipelines[i], NULL));
}

static void vkd3d_workgraph_ops_cleanup(struct vkd3d_workgraph_indirect_ops *workgraph_ops,
struct d3d12_device *device)
{
const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs;
unsigned int i;

VK_CALL(vkDestroyPipelineLayout(device->vk_device, workgraph_ops->vk_payload_offset_layout, NULL));
VK_CALL(vkDestroyPipelineLayout(device->vk_device, workgraph_ops->vk_workgroup_layout, NULL));
VK_CALL(vkDestroyPipelineLayout(device->vk_device, workgraph_ops->vk_setup_gpu_input_layout, NULL));

for (i = 0; i < ARRAY_SIZE(workgraph_ops->vk_payload_workgroup_pipeline); i++)
VK_CALL(vkDestroyPipeline(device->vk_device, workgraph_ops->vk_payload_workgroup_pipeline[i], NULL));
VK_CALL(vkDestroyPipeline(device->vk_device, workgraph_ops->vk_setup_gpu_input_pipeline, NULL));

for (i = 0; i < workgraph_ops->payload_pipelines_count; i++)
VK_CALL(vkDestroyPipeline(device->vk_device, workgraph_ops->payload_pipelines[i].vk_pipeline, NULL));
vkd3d_free(workgraph_ops->payload_pipelines);
}

void vkd3d_meta_get_workgraph_workgroup_pipeline(struct vkd3d_meta_ops *meta_ops,
struct vkd3d_workgraph_meta_pipeline_info *info, bool broadcast_compacting)
{
info->vk_pipeline_layout = meta_ops->workgraph.vk_workgroup_layout;
info->vk_pipeline = meta_ops->workgraph.vk_payload_workgroup_pipeline[broadcast_compacting];
}

void vkd3d_meta_get_workgraph_setup_gpu_input_pipeline(struct vkd3d_meta_ops *meta_ops,
struct vkd3d_workgraph_meta_pipeline_info *info)
{
info->vk_pipeline_layout = meta_ops->workgraph.vk_setup_gpu_input_layout;
info->vk_pipeline = meta_ops->workgraph.vk_setup_gpu_input_pipeline;
}

void vkd3d_meta_get_workgraph_payload_offset_pipeline(struct vkd3d_meta_ops *meta_ops,
uint32_t component_bits, uint32_t component_count, bool group_tracking, bool group_compact,
struct vkd3d_workgraph_meta_pipeline_info *info)
{
struct vkd3d_workgraph_indirect_pipeline *pipeline;
VkSpecializationMapEntry map_entries[5];
VkSpecializationInfo spec_info;
uint32_t spec_data[5];
unsigned int i;
VkResult vr;

info->vk_pipeline_layout = meta_ops->workgraph.vk_payload_offset_layout;
info->vk_pipeline = VK_NULL_HANDLE;

pthread_mutex_lock(&meta_ops->workgraph.lock);

for (i = 0; i < meta_ops->workgraph.payload_pipelines_count; i++)
{
pipeline = &meta_ops->workgraph.payload_pipelines[i];
if (pipeline->component_count == component_count &&
pipeline->component_bits == component_bits &&
pipeline->group_tracking == group_tracking &&
pipeline->group_compact == group_compact)
{
info->vk_pipeline = pipeline->vk_pipeline;
break;
}
}

if (i == meta_ops->workgraph.payload_pipelines_count)
{
vkd3d_array_reserve((void **)&meta_ops->workgraph.payload_pipelines,
&meta_ops->workgraph.payload_pipelines_size,
meta_ops->workgraph.payload_pipelines_count + 1,
sizeof(*meta_ops->workgraph.payload_pipelines));

pipeline = &meta_ops->workgraph.payload_pipelines[meta_ops->workgraph.payload_pipelines_count];

for (i = 0; i < ARRAY_SIZE(map_entries); i++)
{
map_entries[i].offset = sizeof(uint32_t) * i;
map_entries[i].size = sizeof(uint32_t);
map_entries[i].constantID = i;
}

spec_info.mapEntryCount = ARRAY_SIZE(map_entries);
spec_info.pMapEntries = map_entries;
spec_info.pData = spec_data;
spec_info.dataSize = ARRAY_SIZE(map_entries) * sizeof(uint32_t);
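/* Five spec constants: device subgroup size, payload component count, whether
 * components are 32 bits wide, and the group tracking / group compaction toggles. */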
spec_data[0] = meta_ops->device->device_info.vulkan_1_1_properties.subgroupSize;
spec_data[1] = component_count;
spec_data[2] = component_bits == 32 ? 1 : 0;
spec_data[3] = group_tracking ? 1 : 0;
spec_data[4] = group_compact ? 1 : 0;

if ((vr = vkd3d_meta_create_compute_pipeline(meta_ops->device, sizeof(cs_workgraph_distribute_payload_offsets),
cs_workgraph_distribute_payload_offsets, meta_ops->workgraph.vk_payload_offset_layout,
&spec_info, true, &pipeline->vk_pipeline)))
{
ERR("Failed to compile meta pipeline, vr %d.\n", vr);
goto out;
}

info->vk_pipeline = pipeline->vk_pipeline;
pipeline->component_count = component_count;
pipeline->component_bits = component_bits;
pipeline->group_tracking = group_tracking;
pipeline->group_compact = group_compact;

meta_ops->workgraph.payload_pipelines_count++;
}

out:
pthread_mutex_unlock(&meta_ops->workgraph.lock);
}
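
A hypothetical call site, to show how the lookup above composes with the push-constant layout created in vkd3d_workgraph_ops_init. The helper name and the idea of binding straight from a raw VkCommandBuffer are illustrative assumptions, not the actual command-list code added by this commit.

static void example_bind_payload_offset_pipeline(struct vkd3d_meta_ops *meta_ops,
        VkCommandBuffer vk_cmd_buffer,
        const struct vkd3d_workgraph_payload_offsets_args *args,
        uint32_t component_bits, uint32_t component_count)
{
    const struct vkd3d_vk_device_procs *vk_procs = &meta_ops->device->vk_procs;
    struct vkd3d_workgraph_meta_pipeline_info info;

    vkd3d_meta_get_workgraph_payload_offset_pipeline(meta_ops,
            component_bits, component_count, false, false, &info);
    if (info.vk_pipeline == VK_NULL_HANDLE)
        return; /* Pipeline compilation failed; a real caller would report this. */

    VK_CALL(vkCmdBindPipeline(vk_cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, info.vk_pipeline));
    /* The push range for this layout was created with
     * sizeof(struct vkd3d_workgraph_payload_offsets_args). */
    VK_CALL(vkCmdPushConstants(vk_cmd_buffer, info.vk_pipeline_layout,
            VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(*args), args));
    /* The dispatch itself is issued elsewhere; the IndirectCommands struct in
     * cs_workgraph_data_structures.h suggests it is an indirect dispatch. */
}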

HRESULT vkd3d_meta_ops_init(struct vkd3d_meta_ops *meta_ops, struct d3d12_device *device)
{
HRESULT hr;
@@ -2122,8 +2296,13 @@ HRESULT vkd3d_meta_ops_init(struct vkd3d_meta_ops *meta_ops, struct d3d12_device
if (FAILED(hr = vkd3d_sampler_feedback_ops_init(&meta_ops->sampler_feedback, device)))
goto fail_sampler_feedback;

if (FAILED(hr = vkd3d_workgraph_ops_init(&meta_ops->workgraph, device)))
goto fail_workgraphs;

return S_OK;

fail_workgraphs:
vkd3d_sampler_feedback_ops_cleanup(&meta_ops->sampler_feedback, device);
fail_sampler_feedback:
vkd3d_dstorage_ops_cleanup(&meta_ops->dstorage, device);
fail_dstorage_ops:
@@ -2150,6 +2329,7 @@ HRESULT vkd3d_meta_ops_init(struct vkd3d_meta_ops *meta_ops, struct d3d12_device

HRESULT vkd3d_meta_ops_cleanup(struct vkd3d_meta_ops *meta_ops, struct d3d12_device *device)
{
vkd3d_workgraph_ops_cleanup(&meta_ops->workgraph, device);
vkd3d_sampler_feedback_ops_cleanup(&meta_ops->sampler_feedback, device);
vkd3d_dstorage_ops_cleanup(&meta_ops->dstorage, device);
vkd3d_multi_dispatch_indirect_ops_cleanup(&meta_ops->multi_dispatch_indirect, device);
26 changes: 26 additions & 0 deletions libs/vkd3d/shaders/cs_workgraph_data_structures.h
@@ -0,0 +1,26 @@
#ifndef CS_WORKGRAPH_DATA_STRUCTURES_H_
#define CS_WORKGRAPH_DATA_STRUCTURES_H_

struct NodeCounts
{
uint fused;
uint total;
};

// 64 bytes per node, nicely aligns to a cache line.
struct IndirectCommands
{
uvec3 primary_execute;
uint primary_linear_offset; // Read by node as input metadata.
uvec3 secondary_execute;
uint secondary_linear_offset; // Read by node as input metadata.
uvec3 expander_execute;
uint end_elements; // Read by node as input metadata in coalesce / thread mode.
uint linear_offset_atomic; // Used by expander to write unrolled data.
uint total_fused_elements;
uint expander_workgroup_counter; // Used by payload expander.
uint expander_total_groups; // Used by payload expander.
};

#endif
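
As a sanity check on the cache-line comment above: assuming this header is consumed with std430-style packing, where each uvec3 shares a 16-byte slot with the uint declared right after it, the size works out as claimed.

// 3 x (uvec3 execute + packed uint) = 3 x 16 bytes = 48 bytes
// 4 x trailing uint counters        = 4 x  4 bytes = 16 bytes
//                                             total = 64 bytes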

